From 350afa8a1101f62ce31bc4ed6f69cf4b90ec4fa2 Mon Sep 17 00:00:00 2001
From: Ravi Bangoria <ravi.bangoria@amd.com>
Date: Thu, 8 Aug 2024 06:29:34 +0000
Subject: x86/split_lock: Move Split and Bus lock code to a dedicated file

Bus Lock Detect functionality on AMD platforms works identical to Intel.

Move split_lock and bus_lock specific code from intel.c to a dedicated
file so that it can be compiled and supported on non-Intel platforms.

Also, introduce CONFIG_X86_BUS_LOCK_DETECT, make it dependent on
CONFIG_CPU_SUP_INTEL and add compilation dependency of the new bus_lock.c
file on CONFIG_X86_BUS_LOCK_DETECT.

Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://lore.kernel.org/all/20240808062937.1149-2-ravi.bangoria@amd.com
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f8d150343d42..d606b2a1de99 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -976,7 +976,7 @@ struct task_struct {
 #ifdef CONFIG_ARCH_HAS_CPU_PASID
 	unsigned			pasid_activated:1;
 #endif
-#ifdef	CONFIG_CPU_SUP_INTEL
+#ifdef CONFIG_X86_BUS_LOCK_DETECT
 	unsigned			reported_split_lock:1;
 #endif
 #ifdef CONFIG_TASK_DELAY_ACCT
-- 
cgit v1.2.3


From 8cf9a01edc216b16b5839eb793ac544d2c97ce97 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Wed, 11 Sep 2024 15:42:57 -0400
Subject: fs: Introduce FOP_ASYNC_LOCK

Some lock managers (NLM, kNFSD) fastidiously avoid blocking their
kernel threads while servicing blocking locks.  If a filesystem supports
asynchronous lock requests those lock managers can use notifications to
quickly inform clients they have acquired a file lock.

Historically, only posix_lock_file() was capable of supporting asynchronous
locks so the check for support was simply file_operations->lock(), but with
recent changes in DLM, both GFS2 and OCFS2 also support asynchronous locks
and have started signalling their support with EXPORT_OP_ASYNC_LOCK.

We recently noticed that those changes dropped the checks for whether a
filesystem simply defaults to posix_lock_file(), so async lock
notifications have not been attempted for NLM and NFSv4.1+ for most
filesystems.  While trying to fix this it has become clear that testing
both the export flag combined with testing ->lock() creates quite a
layering mess.  It seems appropriate to signal support with a fop_flag.

Add FOP_ASYNC_LOCK so that filesystems with ->lock() can signal their
capability to handle lock requests asynchronously.  Add a helper for
lock managers to properly test that support.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Link: https://lore.kernel.org/r/3330d5a324abe2ce9c1dafe89cacdc6db41945d1.1726083391.git.bcodding@redhat.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/filelock.h | 5 +++++
 include/linux/fs.h       | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index daee999d05f3..58c1120a8253 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -180,6 +180,11 @@ static inline void locks_wake_up(struct file_lock *fl)
 	wake_up(&fl->c.flc_wait);
 }
 
+static inline bool locks_can_async_lock(const struct file_operations *fops)
+{
+	return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK;
+}
+
 /* fs/locks.c */
 void locks_free_lock_context(struct inode *inode);
 void locks_free_lock(struct file_lock *fl);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6ca11e241a24..78221ae589d9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2074,6 +2074,8 @@ struct file_operations {
 #define FOP_DIO_PARALLEL_WRITE	((__force fop_flags_t)(1 << 3))
 /* Contains huge pages */
 #define FOP_HUGE_PAGES		((__force fop_flags_t)(1 << 4))
+/* Supports asynchronous lock callbacks */
+#define FOP_ASYNC_LOCK		((__force fop_flags_t)(1 << 5))
 
 /* Wrap a directory iterator that needs exclusive inode access */
 int wrap_directory_iterator(struct file *, struct dir_context *,
-- 
cgit v1.2.3


From 2b0996c7646293c71c1297c00e07e54cee9bec5c Mon Sep 17 00:00:00 2001
From: Rosen Penev <rosenp@gmail.com>
Date: Fri, 6 Sep 2024 12:53:59 -0700
Subject: wifi: ath9k: remove ath9k_platform_data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Completely unused here in favor of Device Tree based setup. The DT code
in here should currently match what is available with platform files.
Any such lapse can always be added.

Signed-off-by: Rosen Penev <rosenp@gmail.com>
Acked-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: Kalle Valo <quic_kvalo@quicinc.com>
Link: https://patch.msgid.link/20240906195359.6982-4-rosenp@gmail.com
---
 .../net/wireless/ath/ath9k/ath9k_pci_owl_loader.c  |  1 -
 drivers/net/wireless/ath/ath9k/hw.c                |  2 +-
 drivers/net/wireless/ath/ath9k/init.c              | 52 +---------------------
 include/linux/ath9k_platform.h                     | 51 ---------------------
 4 files changed, 3 insertions(+), 103 deletions(-)
 delete mode 100644 include/linux/ath9k_platform.h

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath/ath9k/ath9k_pci_owl_loader.c b/drivers/net/wireless/ath/ath9k/ath9k_pci_owl_loader.c
index 004ca5f536be..fe1013a3a588 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k_pci_owl_loader.c
+++ b/drivers/net/wireless/ath/ath9k/ath9k_pci_owl_loader.c
@@ -18,7 +18,6 @@
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/platform_device.h>
-#include <linux/ath9k_platform.h>
 #include <linux/nvmem-consumer.h>
 #include <linux/workqueue.h>
 
diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
index 04a4b9ea61c3..36db734c74ae 100644
--- a/drivers/net/wireless/ath/ath9k/hw.c
+++ b/drivers/net/wireless/ath/ath9k/hw.c
@@ -490,7 +490,7 @@ static void ath9k_hw_init_macaddr(struct ath_hw *ah)
 	u16 eeval;
 	static const u32 EEP_MAC[] = { EEP_MAC_LSW, EEP_MAC_MID, EEP_MAC_MSW };
 
-	/* MAC address may already be loaded via ath9k_platform_data */
+	/* MAC address may already be loaded via NVMEM */
 	if (is_valid_ether_addr(common->macaddr))
 		return;
 
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index 7fad7e75af6a..f9e77c4624d9 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -18,7 +18,6 @@
 
 #include <linux/dma-mapping.h>
 #include <linux/slab.h>
-#include <linux/ath9k_platform.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_net.h>
@@ -583,8 +582,8 @@ static int ath9k_nvmem_request_eeprom(struct ath_softc *sc)
 
 		/* nvmem cell might not be defined, or the nvmem
 		 * subsystem isn't included. In this case, follow
-		 * the established "just return 0;" convention of
-		 * ath9k_init_platform to say:
+		 * the established "just return 0;" convention
+		 * to say:
 		 * "All good. Nothing to see here. Please go on."
 		 */
 		if (err == -ENOENT || err == -EOPNOTSUPP)
@@ -620,49 +619,6 @@ static int ath9k_nvmem_request_eeprom(struct ath_softc *sc)
 	return 0;
 }
 
-static int ath9k_init_platform(struct ath_softc *sc)
-{
-	struct ath9k_platform_data *pdata = sc->dev->platform_data;
-	struct ath_hw *ah = sc->sc_ah;
-	struct ath_common *common = ath9k_hw_common(ah);
-	int ret;
-
-	if (!pdata)
-		return 0;
-
-	if (!pdata->use_eeprom) {
-		ah->ah_flags &= ~AH_USE_EEPROM;
-		ah->gpio_mask = pdata->gpio_mask;
-		ah->gpio_val = pdata->gpio_val;
-		ah->led_pin = pdata->led_pin;
-		ah->is_clk_25mhz = pdata->is_clk_25mhz;
-		ah->get_mac_revision = pdata->get_mac_revision;
-		ah->external_reset = pdata->external_reset;
-		ah->disable_2ghz = pdata->disable_2ghz;
-		ah->disable_5ghz = pdata->disable_5ghz;
-
-		if (!pdata->endian_check)
-			ah->ah_flags |= AH_NO_EEP_SWAP;
-	}
-
-	if (pdata->eeprom_name) {
-		ret = ath9k_eeprom_request(sc, pdata->eeprom_name);
-		if (ret)
-			return ret;
-	}
-
-	if (pdata->led_active_high)
-		ah->config.led_active_high = true;
-
-	if (pdata->tx_gain_buffalo)
-		ah->config.tx_gain_buffalo = true;
-
-	if (pdata->macaddr)
-		ether_addr_copy(common->macaddr, pdata->macaddr);
-
-	return 0;
-}
-
 static int ath9k_of_init(struct ath_softc *sc)
 {
 	struct device_node *np = sc->dev->of_node;
@@ -748,10 +704,6 @@ static int ath9k_init_softc(u16 devid, struct ath_softc *sc,
 	 */
 	ath9k_init_pcoem_platform(sc);
 
-	ret = ath9k_init_platform(sc);
-	if (ret)
-		return ret;
-
 	ret = ath9k_of_init(sc);
 	if (ret)
 		return ret;
diff --git a/include/linux/ath9k_platform.h b/include/linux/ath9k_platform.h
deleted file mode 100644
index 76860a461ed2..000000000000
--- a/include/linux/ath9k_platform.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2008 Atheros Communications Inc.
- * Copyright (c) 2009 Gabor Juhos <juhosg@openwrt.org>
- * Copyright (c) 2009 Imre Kaloz <kaloz@openwrt.org>
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#ifndef _LINUX_ATH9K_PLATFORM_H
-#define _LINUX_ATH9K_PLATFORM_H
-
-#define ATH9K_PLAT_EEP_MAX_WORDS	2048
-
-struct ath9k_platform_data {
-	const char *eeprom_name;
-
-	u16 eeprom_data[ATH9K_PLAT_EEP_MAX_WORDS];
-	u8 *macaddr;
-
-	int led_pin;
-	u32 gpio_mask;
-	u32 gpio_val;
-
-	u32 bt_active_pin;
-	u32 bt_priority_pin;
-	u32 wlan_active_pin;
-
-	bool endian_check;
-	bool is_clk_25mhz;
-	bool tx_gain_buffalo;
-	bool disable_2ghz;
-	bool disable_5ghz;
-	bool led_active_high;
-
-	int (*get_mac_revision)(void);
-	int (*external_reset)(void);
-
-	bool use_eeprom;
-};
-
-#endif /* _LINUX_ATH9K_PLATFORM_H */
-- 
cgit v1.2.3


From 1cc2e1faafb3b5a2be25112559bdb495736b5af7 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Date: Fri, 20 Sep 2024 10:57:57 +0200
Subject: pwm: Add more locking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This ensures that a pwm_chip that has no corresponding driver isn't used
and that a driver doesn't go away while a callback is still running.

In the presence of device links this isn't necessary yet (so this is no
fix) but for pwm character device support this is needed.

To not serialize all pwm_apply_state() calls, this introduces a per chip
lock. An additional complication is that for atomic chips a mutex cannot
be used (as pwm_apply_atomic() must not sleep) and a spinlock cannot be
held while calling an operation for a sleeping chip. So depending on the
chip being atomic or not a spinlock or a mutex is used.

An additional change implemented here is that on driver remove the
.free() callback is called for each requested pwm_device. This is the
right time because later (e.g. when the consumer calls pwm_put()) the
free function is (maybe) not available any more.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Link: https://lore.kernel.org/r/026aa891c8270a11723a1ba7e4256f456f7e1e86.1726819463.git.u.kleine-koenig@baylibre.com
Signed-off-by: Uwe Kleine-König <ukleinek@kernel.org>
---
 drivers/pwm/core.c  | 100 +++++++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/pwm.h |  13 +++++++
 2 files changed, 105 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 6e752e148b98..5a095eb46b54 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -31,6 +31,24 @@ static DEFINE_MUTEX(pwm_lock);
 
 static DEFINE_IDR(pwm_chips);
 
+static void pwmchip_lock(struct pwm_chip *chip)
+{
+	if (chip->atomic)
+		spin_lock(&chip->atomic_lock);
+	else
+		mutex_lock(&chip->nonatomic_lock);
+}
+
+static void pwmchip_unlock(struct pwm_chip *chip)
+{
+	if (chip->atomic)
+		spin_unlock(&chip->atomic_lock);
+	else
+		mutex_unlock(&chip->nonatomic_lock);
+}
+
+DEFINE_GUARD(pwmchip, struct pwm_chip *, pwmchip_lock(_T), pwmchip_unlock(_T))
+
 static void pwm_apply_debug(struct pwm_device *pwm,
 			    const struct pwm_state *state)
 {
@@ -220,6 +238,7 @@ static int __pwm_apply(struct pwm_device *pwm, const struct pwm_state *state)
 int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state)
 {
 	int err;
+	struct pwm_chip *chip = pwm->chip;
 
 	/*
 	 * Some lowlevel driver's implementations of .apply() make use of
@@ -230,7 +249,12 @@ int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state)
 	 */
 	might_sleep();
 
-	if (IS_ENABLED(CONFIG_PWM_DEBUG) && pwm->chip->atomic) {
+	guard(pwmchip)(chip);
+
+	if (!chip->operational)
+		return -ENODEV;
+
+	if (IS_ENABLED(CONFIG_PWM_DEBUG) && chip->atomic) {
 		/*
 		 * Catch any drivers that have been marked as atomic but
 		 * that will sleep anyway.
@@ -254,9 +278,16 @@ EXPORT_SYMBOL_GPL(pwm_apply_might_sleep);
  */
 int pwm_apply_atomic(struct pwm_device *pwm, const struct pwm_state *state)
 {
-	WARN_ONCE(!pwm->chip->atomic,
+	struct pwm_chip *chip = pwm->chip;
+
+	WARN_ONCE(!chip->atomic,
 		  "sleeping PWM driver used in atomic context\n");
 
+	guard(pwmchip)(chip);
+
+	if (!chip->operational)
+		return -ENODEV;
+
 	return __pwm_apply(pwm, state);
 }
 EXPORT_SYMBOL_GPL(pwm_apply_atomic);
@@ -334,8 +365,18 @@ static int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result,
 	if (!ops->capture)
 		return -ENOSYS;
 
+	/*
+	 * Holding the pwm_lock is probably not needed. If you use pwm_capture()
+	 * and you're interested to speed it up, please convince yourself it's
+	 * really not needed, test and then suggest a patch on the mailing list.
+	 */
 	guard(mutex)(&pwm_lock);
 
+	guard(pwmchip)(chip);
+
+	if (!chip->operational)
+		return -ENODEV;
+
 	return ops->capture(chip, pwm, result, timeout);
 }
 
@@ -368,6 +409,14 @@ static int pwm_device_request(struct pwm_device *pwm, const char *label)
 	if (test_bit(PWMF_REQUESTED, &pwm->flags))
 		return -EBUSY;
 
+	/*
+	 * This function is called while holding pwm_lock. As .operational only
+	 * changes while holding this lock, checking it here without holding the
+	 * chip lock is fine.
+	 */
+	if (!chip->operational)
+		return -ENODEV;
+
 	if (!try_module_get(chip->owner))
 		return -ENODEV;
 
@@ -396,7 +445,9 @@ err_get_device:
 		 */
 		struct pwm_state state = { 0, };
 
-		err = ops->get_state(chip, pwm, &state);
+		scoped_guard(pwmchip, chip)
+			err = ops->get_state(chip, pwm, &state);
+
 		trace_pwm_get(pwm, &state, err);
 
 		if (!err)
@@ -1020,6 +1071,7 @@ struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t
 
 	chip->npwm = npwm;
 	chip->uses_pwmchip_alloc = true;
+	chip->operational = false;
 
 	pwmchip_dev = &chip->dev;
 	device_initialize(pwmchip_dev);
@@ -1125,6 +1177,11 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 
 	chip->owner = owner;
 
+	if (chip->atomic)
+		spin_lock_init(&chip->atomic_lock);
+	else
+		mutex_init(&chip->nonatomic_lock);
+
 	guard(mutex)(&pwm_lock);
 
 	ret = idr_alloc(&pwm_chips, chip, 0, 0, GFP_KERNEL);
@@ -1138,6 +1195,9 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 	if (IS_ENABLED(CONFIG_OF))
 		of_pwmchip_add(chip);
 
+	scoped_guard(pwmchip, chip)
+		chip->operational = true;
+
 	ret = device_add(&chip->dev);
 	if (ret)
 		goto err_device_add;
@@ -1145,6 +1205,9 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 	return 0;
 
 err_device_add:
+	scoped_guard(pwmchip, chip)
+		chip->operational = false;
+
 	if (IS_ENABLED(CONFIG_OF))
 		of_pwmchip_remove(chip);
 
@@ -1164,11 +1227,27 @@ void pwmchip_remove(struct pwm_chip *chip)
 {
 	pwmchip_sysfs_unexport(chip);
 
-	if (IS_ENABLED(CONFIG_OF))
-		of_pwmchip_remove(chip);
+	scoped_guard(mutex, &pwm_lock) {
+		unsigned int i;
+
+		scoped_guard(pwmchip, chip)
+			chip->operational = false;
+
+		for (i = 0; i < chip->npwm; ++i) {
+			struct pwm_device *pwm = &chip->pwms[i];
+
+			if (test_and_clear_bit(PWMF_REQUESTED, &pwm->flags)) {
+				dev_warn(&chip->dev, "Freeing requested PWM #%u\n", i);
+				if (pwm->chip->ops->free)
+					pwm->chip->ops->free(pwm->chip, pwm);
+			}
+		}
+
+		if (IS_ENABLED(CONFIG_OF))
+			of_pwmchip_remove(chip);
 
-	scoped_guard(mutex, &pwm_lock)
 		idr_remove(&pwm_chips, chip->id);
+	}
 
 	device_del(&chip->dev);
 }
@@ -1538,12 +1617,17 @@ void pwm_put(struct pwm_device *pwm)
 
 	guard(mutex)(&pwm_lock);
 
-	if (!test_and_clear_bit(PWMF_REQUESTED, &pwm->flags)) {
+	/*
+	 * Trigger a warning if a consumer called pwm_put() twice.
+	 * If the chip isn't operational, PWMF_REQUESTED was already cleared in
+	 * pwmchip_remove(). So don't warn in this case.
+	 */
+	if (chip->operational && !test_and_clear_bit(PWMF_REQUESTED, &pwm->flags)) {
 		pr_warn("PWM device already freed\n");
 		return;
 	}
 
-	if (chip->ops->free)
+	if (chip->operational && chip->ops->free)
 		pwm->chip->ops->free(pwm->chip, pwm);
 
 	pwm->label = NULL;
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 8acd60b53f58..3ea73e075abe 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -275,6 +275,9 @@ struct pwm_ops {
  * @of_xlate: request a PWM device given a device tree PWM specifier
  * @atomic: can the driver's ->apply() be called in atomic context
  * @uses_pwmchip_alloc: signals if pwmchip_allow was used to allocate this chip
+ * @operational: signals if the chip can be used (or is already deregistered)
+ * @nonatomic_lock: mutex for nonatomic chips
+ * @atomic_lock: mutex for atomic chips
  * @pwms: array of PWM devices allocated by the framework
  */
 struct pwm_chip {
@@ -290,6 +293,16 @@ struct pwm_chip {
 
 	/* only used internally by the PWM framework */
 	bool uses_pwmchip_alloc;
+	bool operational;
+	union {
+		/*
+		 * depending on the chip being atomic or not either the mutex or
+		 * the spinlock is used. It protects .operational and
+		 * synchronizes the callbacks in .ops
+		 */
+		struct mutex nonatomic_lock;
+		spinlock_t atomic_lock;
+	};
 	struct pwm_device pwms[] __counted_by(npwm);
 };
 
-- 
cgit v1.2.3


From 17e40c25158f2505cbcdeda96624afcbab4af368 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Date: Fri, 20 Sep 2024 10:57:58 +0200
Subject: pwm: New abstraction for PWM waveforms
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Up to now the configuration of a PWM setting is described exclusively by
a struct pwm_state which contains information about period, duty_cycle,
polarity and if the PWM is enabled. (There is another member usage_power
which doesn't completely fit into pwm_state, I ignore it here for
simplicity.)

Instead of a polarity the new abstraction has a member duty_offset_ns
that defines when the rising edge happens after the period start. This
is more general, as with a pwm_state the rising edge can only happen at
the period's start or such that the falling edge is at the end of the
period (i.e. duty_offset_ns == 0 or duty_offset_ns == period_length_ns -
duty_length_ns).

A disabled PWM is modeled by .period_length_ns = 0. In my eyes this is a
nice usage of that otherwise unusable setting, as it doesn't define
anything about the future which matches the fact that consumers should
consider the state of the output as undefined and it's just there to say
"No further requirements about the output, you can save some power.".

Further I renamed period and duty_cycle to period_length_ns and
duty_length_ns. In the past there was confusion from time to time about
duty_cycle being measured in nanoseconds because people expected a
percentage of period instead. With "length_ns" as suffix the semantic
should be more obvious to people unfamiliar with the pwm subsystem.
period is renamed to period_length_ns for consistency.

The API for consumers doesn't change yet, but lowlevel drivers can
implement callbacks that work with pwm_waveforms instead of pwm_states.
A new thing about these callbacks is that the calculation of hardware
settings needed to implement a certain waveform is separated from
actually writing these settings. The motivation for that is that this
allows a consumer to query the hardware capabilities without actually
modifying the hardware state.

The rounding rules that are expected to be implemented in the
round_waveform_tohw() are: First pick the biggest possible period not
bigger than wf->period_length_ns. For that period pick the biggest
possible duty setting not bigger than wf->duty_length_ns. Third pick the
biggest possible offset not bigger than wf->duty_offset_ns. If the
requested period is too small for the hardware, it's expected that a
setting with the minimal period and duty_length_ns = duty_offset_ns = 0
is returned and this fact is signaled by a return value of 1.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Tested-by: Trevor Gamblin <tgamblin@baylibre.com>
Link: https://lore.kernel.org/r/df0faa33bf9e7c9e2e5eab8d31bbf61e861bd401.1726819463.git.u.kleine-koenig@baylibre.com
[ukleinek: Update pwm_check_rounding() to return bool instead of int.]
Signed-off-by: Uwe Kleine-König <ukleinek@kernel.org>
---
 drivers/pwm/core.c  | 234 +++++++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/pwm.h |  36 ++++++++
 2 files changed, 249 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 5a095eb46b54..bbe7bfdb1549 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -49,6 +49,102 @@ static void pwmchip_unlock(struct pwm_chip *chip)
 
 DEFINE_GUARD(pwmchip, struct pwm_chip *, pwmchip_lock(_T), pwmchip_unlock(_T))
 
+static void pwm_wf2state(const struct pwm_waveform *wf, struct pwm_state *state)
+{
+	if (wf->period_length_ns) {
+		if (wf->duty_length_ns + wf->duty_offset_ns < wf->period_length_ns)
+			*state = (struct pwm_state){
+				.enabled = true,
+				.polarity = PWM_POLARITY_NORMAL,
+				.period = wf->period_length_ns,
+				.duty_cycle = wf->duty_length_ns,
+			};
+		else
+			*state = (struct pwm_state){
+				.enabled = true,
+				.polarity = PWM_POLARITY_INVERSED,
+				.period = wf->period_length_ns,
+				.duty_cycle = wf->period_length_ns - wf->duty_length_ns,
+			};
+	} else {
+		*state = (struct pwm_state){
+			.enabled = false,
+		};
+	}
+}
+
+static void pwm_state2wf(const struct pwm_state *state, struct pwm_waveform *wf)
+{
+	if (state->enabled) {
+		if (state->polarity == PWM_POLARITY_NORMAL)
+			*wf = (struct pwm_waveform){
+				.period_length_ns = state->period,
+				.duty_length_ns = state->duty_cycle,
+				.duty_offset_ns = 0,
+			};
+		else
+			*wf = (struct pwm_waveform){
+				.period_length_ns = state->period,
+				.duty_length_ns = state->period - state->duty_cycle,
+				.duty_offset_ns = state->duty_cycle,
+			};
+	} else {
+		*wf = (struct pwm_waveform){
+			.period_length_ns = 0,
+		};
+	}
+}
+
+static bool pwm_check_rounding(const struct pwm_waveform *wf,
+			       const struct pwm_waveform *wf_rounded)
+{
+	if (!wf->period_length_ns)
+		return true;
+
+	if (wf->period_length_ns < wf_rounded->period_length_ns)
+		return false;
+
+	if (wf->duty_length_ns < wf_rounded->duty_length_ns)
+		return false;
+
+	if (wf->duty_offset_ns < wf_rounded->duty_offset_ns)
+		return false;
+
+	return true;
+}
+
+static int __pwm_round_waveform_tohw(struct pwm_chip *chip, struct pwm_device *pwm,
+				     const struct pwm_waveform *wf, void *wfhw)
+{
+	const struct pwm_ops *ops = chip->ops;
+
+	return ops->round_waveform_tohw(chip, pwm, wf, wfhw);
+}
+
+static int __pwm_round_waveform_fromhw(struct pwm_chip *chip, struct pwm_device *pwm,
+				       const void *wfhw, struct pwm_waveform *wf)
+{
+	const struct pwm_ops *ops = chip->ops;
+
+	return ops->round_waveform_fromhw(chip, pwm, wfhw, wf);
+}
+
+static int __pwm_read_waveform(struct pwm_chip *chip, struct pwm_device *pwm, void *wfhw)
+{
+	const struct pwm_ops *ops = chip->ops;
+
+	return ops->read_waveform(chip, pwm, wfhw);
+}
+
+static int __pwm_write_waveform(struct pwm_chip *chip, struct pwm_device *pwm, const void *wfhw)
+{
+	const struct pwm_ops *ops = chip->ops;
+
+	return ops->write_waveform(chip, pwm, wfhw);
+}
+
+#define WFHWSIZE 20
+
 static void pwm_apply_debug(struct pwm_device *pwm,
 			    const struct pwm_state *state)
 {
@@ -182,6 +278,7 @@ static bool pwm_state_valid(const struct pwm_state *state)
 static int __pwm_apply(struct pwm_device *pwm, const struct pwm_state *state)
 {
 	struct pwm_chip *chip;
+	const struct pwm_ops *ops;
 	int err;
 
 	if (!pwm || !state)
@@ -205,6 +302,7 @@ static int __pwm_apply(struct pwm_device *pwm, const struct pwm_state *state)
 	}
 
 	chip = pwm->chip;
+	ops = chip->ops;
 
 	if (state->period == pwm->state.period &&
 	    state->duty_cycle == pwm->state.duty_cycle &&
@@ -213,18 +311,69 @@ static int __pwm_apply(struct pwm_device *pwm, const struct pwm_state *state)
 	    state->usage_power == pwm->state.usage_power)
 		return 0;
 
-	err = chip->ops->apply(chip, pwm, state);
-	trace_pwm_apply(pwm, state, err);
-	if (err)
-		return err;
+	if (ops->write_waveform) {
+		struct pwm_waveform wf;
+		char wfhw[WFHWSIZE];
 
-	pwm->state = *state;
+		BUG_ON(WFHWSIZE < ops->sizeof_wfhw);
 
-	/*
-	 * only do this after pwm->state was applied as some
-	 * implementations of .get_state depend on this
-	 */
-	pwm_apply_debug(pwm, state);
+		pwm_state2wf(state, &wf);
+
+		/*
+		 * The rounding is wrong here for states with inverted polarity.
+		 * While .apply() rounds down duty_cycle (which represents the
+		 * time from the start of the period to the inner edge),
+		 * .round_waveform_tohw() rounds down the time the PWM is high.
+		 * Can be fixed if the need arises, until reported otherwise
+		 * let's assume that consumers don't care.
+		 */
+
+		err = __pwm_round_waveform_tohw(chip, pwm, &wf, &wfhw);
+		if (err) {
+			if (err > 0)
+				/*
+				 * This signals an invalid request, typically
+				 * the requested period (or duty_offset) is
+				 * smaller than possible with the hardware.
+				 */
+				return -EINVAL;
+
+			return err;
+		}
+
+		if (IS_ENABLED(CONFIG_PWM_DEBUG)) {
+			struct pwm_waveform wf_rounded;
+
+			err = __pwm_round_waveform_fromhw(chip, pwm, &wfhw, &wf_rounded);
+			if (err)
+				return err;
+
+			if (!pwm_check_rounding(&wf, &wf_rounded))
+				dev_err(&chip->dev, "Wrong rounding: requested %llu/%llu [+%llu], result %llu/%llu [+%llu]\n",
+					wf.duty_length_ns, wf.period_length_ns, wf.duty_offset_ns,
+					wf_rounded.duty_length_ns, wf_rounded.period_length_ns, wf_rounded.duty_offset_ns);
+		}
+
+		err = __pwm_write_waveform(chip, pwm, &wfhw);
+		if (err)
+			return err;
+
+		pwm->state = *state;
+
+	} else {
+		err = ops->apply(chip, pwm, state);
+		trace_pwm_apply(pwm, state, err);
+		if (err)
+			return err;
+
+		pwm->state = *state;
+
+		/*
+		 * only do this after pwm->state was applied as some
+		 * implementations of .get_state() depend on this
+		 */
+		pwm_apply_debug(pwm, state);
+	}
 
 	return 0;
 }
@@ -292,6 +441,41 @@ int pwm_apply_atomic(struct pwm_device *pwm, const struct pwm_state *state)
 }
 EXPORT_SYMBOL_GPL(pwm_apply_atomic);
 
+static int pwm_get_state_hw(struct pwm_device *pwm, struct pwm_state *state)
+{
+	struct pwm_chip *chip = pwm->chip;
+	const struct pwm_ops *ops = chip->ops;
+	int ret = -EOPNOTSUPP;
+
+	if (ops->read_waveform) {
+		char wfhw[WFHWSIZE];
+		struct pwm_waveform wf;
+
+		BUG_ON(WFHWSIZE < ops->sizeof_wfhw);
+
+		scoped_guard(pwmchip, chip) {
+
+			ret = __pwm_read_waveform(chip, pwm, &wfhw);
+			if (ret)
+				return ret;
+
+			ret = __pwm_round_waveform_fromhw(chip, pwm, &wfhw, &wf);
+			if (ret)
+				return ret;
+		}
+
+		pwm_wf2state(&wf, state);
+
+	} else if (ops->get_state) {
+		scoped_guard(pwmchip, chip)
+			ret = ops->get_state(chip, pwm, state);
+
+		trace_pwm_get(pwm, state, ret);
+	}
+
+	return ret;
+}
+
 /**
  * pwm_adjust_config() - adjust the current PWM config to the PWM arguments
  * @pwm: PWM device
@@ -435,7 +619,7 @@ err_get_device:
 		}
 	}
 
-	if (ops->get_state) {
+	if (ops->read_waveform || ops->get_state) {
 		/*
 		 * Zero-initialize state because most drivers are unaware of
 		 * .usage_power. The other members of state are supposed to be
@@ -445,11 +629,7 @@ err_get_device:
 		 */
 		struct pwm_state state = { 0, };
 
-		scoped_guard(pwmchip, chip)
-			err = ops->get_state(chip, pwm, &state);
-
-		trace_pwm_get(pwm, &state, err);
-
+		err = pwm_get_state_hw(pwm, &state);
 		if (!err)
 			pwm->state = state;
 
@@ -1136,12 +1316,24 @@ static bool pwm_ops_check(const struct pwm_chip *chip)
 {
 	const struct pwm_ops *ops = chip->ops;
 
-	if (!ops->apply)
-		return false;
+	if (ops->write_waveform) {
+		if (!ops->round_waveform_tohw ||
+		    !ops->round_waveform_fromhw ||
+		    !ops->write_waveform)
+			return false;
 
-	if (IS_ENABLED(CONFIG_PWM_DEBUG) && !ops->get_state)
-		dev_warn(pwmchip_parent(chip),
-			 "Please implement the .get_state() callback\n");
+		if (WFHWSIZE < ops->sizeof_wfhw) {
+			dev_warn(pwmchip_parent(chip), "WFHWSIZE < %zu\n", ops->sizeof_wfhw);
+			return false;
+		}
+	} else {
+		if (!ops->apply)
+			return false;
+
+		if (IS_ENABLED(CONFIG_PWM_DEBUG) && !ops->get_state)
+			dev_warn(pwmchip_parent(chip),
+				 "Please implement the .get_state() callback\n");
+	}
 
 	return true;
 }
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 3ea73e075abe..d8cfe1c9b19d 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -49,6 +49,31 @@ enum {
 	PWMF_EXPORTED = 1,
 };
 
+/**
+ * struct pwm_waveform - description of a PWM waveform
+ * @period_length_ns: PWM period
+ * @duty_length_ns: PWM duty cycle
+ * @duty_offset_ns: offset of the rising edge from the period's start
+ *
+ * This is a representation of a PWM waveform alternative to struct pwm_state
+ * below. It's more expressive than struct pwm_state as it contains a
+ * duty_offset_ns and so can represent offsets other than zero (with .polarity =
+ * PWM_POLARITY_NORMAL) and period - duty_cycle (.polarity =
+ * PWM_POLARITY_INVERSED).
+ *
+ * Note there is no explicit bool for enabled. A "disabled" PWM is represented
+ * by .period_length_ns = 0. Note further that the behaviour of a "disabled" PWM
+ * is undefined. Depending on the hardware's capabilities it might drive the
+ * active or inactive level, go high-z or even continue to toggle.
+ *
+ * The unit for all three members is nanoseconds.
+ */
+struct pwm_waveform {
+	u64 period_length_ns;
+	u64 duty_length_ns;
+	u64 duty_offset_ns;
+};
+
 /*
  * struct pwm_state - state of a PWM channel
  * @period: PWM period (in nanoseconds)
@@ -259,6 +284,17 @@ struct pwm_ops {
 	void (*free)(struct pwm_chip *chip, struct pwm_device *pwm);
 	int (*capture)(struct pwm_chip *chip, struct pwm_device *pwm,
 		       struct pwm_capture *result, unsigned long timeout);
+
+	size_t sizeof_wfhw;
+	int (*round_waveform_tohw)(struct pwm_chip *chip, struct pwm_device *pwm,
+				   const struct pwm_waveform *wf, void *wfhw);
+	int (*round_waveform_fromhw)(struct pwm_chip *chip, struct pwm_device *pwm,
+				     const void *wfhw, struct pwm_waveform *wf);
+	int (*read_waveform)(struct pwm_chip *chip, struct pwm_device *pwm,
+			    void *wfhw);
+	int (*write_waveform)(struct pwm_chip *chip, struct pwm_device *pwm,
+			      const void *wfhw);
+
 	int (*apply)(struct pwm_chip *chip, struct pwm_device *pwm,
 		     const struct pwm_state *state);
 	int (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
-- 
cgit v1.2.3


From 6c5126c6406d1c31e91f5b925c621c1c785366be Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Date: Fri, 20 Sep 2024 10:57:59 +0200
Subject: pwm: Provide new consumer API functions for waveforms
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Provide API functions for consumers to work with waveforms.

Note that one relevant difference between pwm_get_state() and
pwm_get_waveform*() is that the latter yields the actually configured
hardware state, while the former yields the last state passed to
pwm_apply*() and so doesn't account for hardware specific rounding.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Tested-by: Trevor Gamblin <tgamblin@baylibre.com>
Link: https://lore.kernel.org/r/6c97d27682853f603e18e9196043886dd671845d.1726819463.git.u.kleine-koenig@baylibre.com
Signed-off-by: Uwe Kleine-König <ukleinek@kernel.org>
---
 drivers/pwm/core.c  | 261 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pwm.h |   6 +-
 2 files changed, 266 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index bbe7bfdb1549..038f17dd2757 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -49,6 +49,30 @@ static void pwmchip_unlock(struct pwm_chip *chip)
 
 DEFINE_GUARD(pwmchip, struct pwm_chip *, pwmchip_lock(_T), pwmchip_unlock(_T))
 
+static bool pwm_wf_valid(const struct pwm_waveform *wf)
+{
+	/*
+	 * For now restrict waveforms to period_length_ns <= S64_MAX to provide
+	 * some space for future extensions. One possibility is to simplify
+	 * representing waveforms with inverted polarity using negative values
+	 * somehow.
+	 */
+	if (wf->period_length_ns > S64_MAX)
+		return false;
+
+	if (wf->duty_length_ns > wf->period_length_ns)
+		return false;
+
+	/*
+	 * .duty_offset_ns is supposed to be smaller than .period_length_ns, apart
+	 * from the corner case .duty_offset_ns == 0 && .period_length_ns == 0.
+	 */
+	if (wf->duty_offset_ns && wf->duty_offset_ns >= wf->period_length_ns)
+		return false;
+
+	return true;
+}
+
 static void pwm_wf2state(const struct pwm_waveform *wf, struct pwm_state *state)
 {
 	if (wf->period_length_ns) {
@@ -95,6 +119,29 @@ static void pwm_state2wf(const struct pwm_state *state, struct pwm_waveform *wf)
 	}
 }
 
+static int pwmwfcmp(const struct pwm_waveform *a, const struct pwm_waveform *b)
+{
+	if (a->period_length_ns > b->period_length_ns)
+		return 1;
+
+	if (a->period_length_ns < b->period_length_ns)
+		return -1;
+
+	if (a->duty_length_ns > b->duty_length_ns)
+		return 1;
+
+	if (a->duty_length_ns < b->duty_length_ns)
+		return -1;
+
+	if (a->duty_offset_ns > b->duty_offset_ns)
+		return 1;
+
+	if (a->duty_offset_ns < b->duty_offset_ns)
+		return -1;
+
+	return 0;
+}
+
 static bool pwm_check_rounding(const struct pwm_waveform *wf,
 			       const struct pwm_waveform *wf_rounded)
 {
@@ -145,6 +192,220 @@ static int __pwm_write_waveform(struct pwm_chip *chip, struct pwm_device *pwm, c
 
 #define WFHWSIZE 20
 
+/**
+ * pwm_round_waveform_might_sleep - Query hardware capabilities
+ * Cannot be used in atomic context.
+ * @pwm: PWM device
+ * @wf: waveform to round and output parameter
+ *
+ * Typically a given waveform cannot be implemented exactly by hardware, e.g.
+ * because hardware only supports coarse period resolution or no duty_offset.
+ * This function returns the actually implemented waveform if you pass wf to
+ * pwm_set_waveform_might_sleep now.
+ *
+ * Note however that the world doesn't stop turning when you call it, so when
+ * doing
+ *
+ * 	pwm_round_waveform_might_sleep(mypwm, &wf);
+ * 	pwm_set_waveform_might_sleep(mypwm, &wf, true);
+ *
+ * the latter might fail, e.g. because an input clock changed its rate between
+ * these two calls and the waveform determined by
+ * pwm_round_waveform_might_sleep() cannot be implemented any more.
+ *
+ * Returns 0 on success, 1 if there is no valid hardware configuration matching
+ * the input waveform under the PWM rounding rules or a negative errno.
+ */
+int pwm_round_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform *wf)
+{
+	struct pwm_chip *chip = pwm->chip;
+	const struct pwm_ops *ops = chip->ops;
+	struct pwm_waveform wf_req = *wf;
+	char wfhw[WFHWSIZE];
+	int ret_tohw, ret_fromhw;
+
+	BUG_ON(WFHWSIZE < ops->sizeof_wfhw);
+
+	if (!pwm_wf_valid(wf))
+		return -EINVAL;
+
+	guard(pwmchip)(chip);
+
+	if (!chip->operational)
+		return -ENODEV;
+
+	ret_tohw = __pwm_round_waveform_tohw(chip, pwm, wf, wfhw);
+	if (ret_tohw < 0)
+		return ret_tohw;
+
+	if (IS_ENABLED(CONFIG_PWM_DEBUG) && ret_tohw > 1)
+		dev_err(&chip->dev, "Unexpected return value from __pwm_round_waveform_tohw: requested %llu/%llu [+%llu], return value %d\n",
+			wf_req.duty_length_ns, wf_req.period_length_ns, wf_req.duty_offset_ns, ret_tohw);
+
+	ret_fromhw = __pwm_round_waveform_fromhw(chip, pwm, wfhw, wf);
+	if (ret_fromhw < 0)
+		return ret_fromhw;
+
+	if (IS_ENABLED(CONFIG_PWM_DEBUG) && ret_fromhw > 0)
+		dev_err(&chip->dev, "Unexpected return value from __pwm_round_waveform_fromhw: requested %llu/%llu [+%llu], return value %d\n",
+			wf_req.duty_length_ns, wf_req.period_length_ns, wf_req.duty_offset_ns, ret_tohw);
+
+	if (IS_ENABLED(CONFIG_PWM_DEBUG) &&
+	    ret_tohw == 0 && !pwm_check_rounding(&wf_req, wf))
+		dev_err(&chip->dev, "Wrong rounding: requested %llu/%llu [+%llu], result %llu/%llu [+%llu]\n",
+			wf_req.duty_length_ns, wf_req.period_length_ns, wf_req.duty_offset_ns,
+			wf->duty_length_ns, wf->period_length_ns, wf->duty_offset_ns);
+
+	return ret_tohw;
+}
+EXPORT_SYMBOL_GPL(pwm_round_waveform_might_sleep);
+
+/**
+ * pwm_get_waveform_might_sleep - Query hardware about current configuration
+ * Cannot be used in atomic context.
+ * @pwm: PWM device
+ * @wf: output parameter
+ *
+ * Stores the current configuration of the PWM in @wf. Note this is the
+ * equivalent of pwm_get_state_hw() (and not pwm_get_state()) for pwm_waveform.
+ */
+int pwm_get_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform *wf)
+{
+	struct pwm_chip *chip = pwm->chip;
+	const struct pwm_ops *ops = chip->ops;
+	char wfhw[WFHWSIZE];
+	int err;
+
+	BUG_ON(WFHWSIZE < ops->sizeof_wfhw);
+
+	guard(pwmchip)(chip);
+
+	if (!chip->operational)
+		return -ENODEV;
+
+	err = __pwm_read_waveform(chip, pwm, &wfhw);
+	if (err)
+		return err;
+
+	return __pwm_round_waveform_fromhw(chip, pwm, &wfhw, wf);
+}
+EXPORT_SYMBOL_GPL(pwm_get_waveform_might_sleep);
+
+/* Called with the pwmchip lock held */
+static int __pwm_set_waveform(struct pwm_device *pwm,
+			      const struct pwm_waveform *wf,
+			      bool exact)
+{
+	struct pwm_chip *chip = pwm->chip;
+	const struct pwm_ops *ops = chip->ops;
+	char wfhw[WFHWSIZE];
+	struct pwm_waveform wf_rounded;
+	int err;
+
+	BUG_ON(WFHWSIZE < ops->sizeof_wfhw);
+
+	if (!pwm_wf_valid(wf))
+		return -EINVAL;
+
+	err = __pwm_round_waveform_tohw(chip, pwm, wf, &wfhw);
+	if (err)
+		return err;
+
+	if ((IS_ENABLED(CONFIG_PWM_DEBUG) || exact) && wf->period_length_ns) {
+		err = __pwm_round_waveform_fromhw(chip, pwm, &wfhw, &wf_rounded);
+		if (err)
+			return err;
+
+		if (IS_ENABLED(CONFIG_PWM_DEBUG) && !pwm_check_rounding(wf, &wf_rounded))
+			dev_err(&chip->dev, "Wrong rounding: requested %llu/%llu [+%llu], result %llu/%llu [+%llu]\n",
+				wf->duty_length_ns, wf->period_length_ns, wf->duty_offset_ns,
+				wf_rounded.duty_length_ns, wf_rounded.period_length_ns, wf_rounded.duty_offset_ns);
+
+		if (exact && pwmwfcmp(wf, &wf_rounded)) {
+			dev_dbg(&chip->dev, "Requested no rounding, but %llu/%llu [+%llu] -> %llu/%llu [+%llu]\n",
+				wf->duty_length_ns, wf->period_length_ns, wf->duty_offset_ns,
+				wf_rounded.duty_length_ns, wf_rounded.period_length_ns, wf_rounded.duty_offset_ns);
+
+			return 1;
+		}
+	}
+
+	err = __pwm_write_waveform(chip, pwm, &wfhw);
+	if (err)
+		return err;
+
+	/* update .state */
+	pwm_wf2state(wf, &pwm->state);
+
+	if (IS_ENABLED(CONFIG_PWM_DEBUG) && ops->read_waveform && wf->period_length_ns) {
+		struct pwm_waveform wf_set;
+
+		err = __pwm_read_waveform(chip, pwm, &wfhw);
+		if (err)
+			/* maybe ignore? */
+			return err;
+
+		err = __pwm_round_waveform_fromhw(chip, pwm, &wfhw, &wf_set);
+		if (err)
+			/* maybe ignore? */
+			return err;
+
+		if (pwmwfcmp(&wf_set, &wf_rounded) != 0)
+			dev_err(&chip->dev,
+				"Unexpected setting: requested %llu/%llu [+%llu], expected %llu/%llu [+%llu], set %llu/%llu [+%llu]\n",
+				wf->duty_length_ns, wf->period_length_ns, wf->duty_offset_ns,
+				wf_rounded.duty_length_ns, wf_rounded.period_length_ns, wf_rounded.duty_offset_ns,
+				wf_set.duty_length_ns, wf_set.period_length_ns, wf_set.duty_offset_ns);
+	}
+	return 0;
+}
+
+/**
+ * pwm_set_waveform_might_sleep - Apply a new waveform
+ * Cannot be used in atomic context.
+ * @pwm: PWM device
+ * @wf: The waveform to apply
+ * @exact: If true no rounding is allowed
+ *
+ * Typically a requested waveform cannot be implemented exactly, e.g. because
+ * you requested .period_length_ns = 100 ns, but the hardware can only set
+ * periods that are a multiple of 8.5 ns. With that hardware passing exact =
+ * true results in pwm_set_waveform_might_sleep() failing and returning 1. If
+ * exact = false you get a period of 93.5 ns (i.e. the biggest period not bigger
+ * than the requested value).
+ * Note that even with exact = true, some rounding by less than 1 is
+ * possible/needed. In the above example requesting .period_length_ns = 94 and
+ * exact = true, you get the hardware configured with period = 93.5 ns.
+ */
+int pwm_set_waveform_might_sleep(struct pwm_device *pwm,
+				 const struct pwm_waveform *wf, bool exact)
+{
+	struct pwm_chip *chip = pwm->chip;
+	int err;
+
+	might_sleep();
+
+	guard(pwmchip)(chip);
+
+	if (!chip->operational)
+		return -ENODEV;
+
+	if (IS_ENABLED(CONFIG_PWM_DEBUG) && chip->atomic) {
+		/*
+		 * Catch any drivers that have been marked as atomic but
+		 * that will sleep anyway.
+		 */
+		non_block_start();
+		err = __pwm_set_waveform(pwm, wf, exact);
+		non_block_end();
+	} else {
+		err = __pwm_set_waveform(pwm, wf, exact);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(pwm_set_waveform_might_sleep);
+
 static void pwm_apply_debug(struct pwm_device *pwm,
 			    const struct pwm_state *state)
 {
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index d8cfe1c9b19d..c3d9ddeafa65 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -358,7 +358,11 @@ static inline void pwmchip_set_drvdata(struct pwm_chip *chip, void *data)
 }
 
 #if IS_ENABLED(CONFIG_PWM)
-/* PWM user APIs */
+
+/* PWM consumer APIs */
+int pwm_round_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform *wf);
+int pwm_get_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform *wf);
+int pwm_set_waveform_might_sleep(struct pwm_device *pwm, const struct pwm_waveform *wf, bool exact);
 int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state);
 int pwm_apply_atomic(struct pwm_device *pwm, const struct pwm_state *state);
 int pwm_adjust_config(struct pwm_device *pwm);
-- 
cgit v1.2.3


From caf78b0465053c23aa6211b9815dd5433766627d Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 24 Sep 2024 12:08:08 +0200
Subject: regcache: Improve documentation of available cache types

There is some user confusion about which cache types to choose when which
is not helped by the lack of any central documentation providing an
overview of what's available. Provide a short overview in the API header to
try to help reduce this.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://patch.msgid.link/20240924-regcache-document-types-v1-1-e157054e1215@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regmap.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index f9ccad32fc5c..da07584284ab 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -54,7 +54,14 @@ struct sdw_slave;
 #define REGMAP_UPSHIFT(s)	(-(s))
 #define REGMAP_DOWNSHIFT(s)	(s)
 
-/* An enum of all the supported cache types */
+/*
+ * The supported cache types, the default is no cache.  Any new caches
+ * should usually use the maple tree cache unless they specifically
+ * require that there are never any allocations at runtime and can't
+ * provide defaults in which case they should use the flat cache.  The
+ * rbtree cache *may* have some performance advantage for very low end
+ * systems that make heavy use of cache syncs but is mainly legacy.
+ */
 enum regcache_type {
 	REGCACHE_NONE,
 	REGCACHE_RBTREE,
-- 
cgit v1.2.3


From 5441b6975adc26aeaca316b9075e04a98238b1b3 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wenst@chromium.org>
Date: Wed, 25 Sep 2024 17:38:04 +0800
Subject: regulator: Add of_regulator_get_optional() for pure DT regulator
 lookup

The to-be-introduced I2C component prober needs to enable regulator
supplies (and toggle GPIO pins) for the various components it intends
to probe. To support this, a new "pure DT lookup" method for getting
regulator supplies is needed, since the device normally requesting
the supply won't get created until after the component is probed to
be available.

Add a new of_regulator_get_optional() function for this. This mirrors
the existing regulator_get_optional() function, but is OF-specific.
The underlying code that supports the existing regulator_get*()
functions has been reworked in previous patches to support this
specific case.

Also convert an existing usage of "dev && dev->of_node" to
"dev_of_node(dev)".

Link: https://lore.kernel.org/all/20231220203537.83479-2-jernej.skrabec@gmail.com/ [1]
Signed-off-by: Chen-Yu Tsai <wenst@chromium.org>
Link: https://patch.msgid.link/20240925093807.1026949-2-wenst@chromium.org
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c           |  4 +--
 drivers/regulator/internal.h       |  2 ++
 drivers/regulator/of_regulator.c   | 51 +++++++++++++++++++++++++++++++++-----
 include/linux/regulator/consumer.h | 20 +++++++++++++++
 4 files changed, 69 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 1179766811f5..d0b3879f2746 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1959,8 +1959,8 @@ static struct regulator_dev *regulator_dev_lookup(struct device *dev,
 	regulator_supply_alias(&dev, &supply);
 
 	/* first do a dt based lookup */
-	if (dev && dev->of_node) {
-		r = of_regulator_dev_lookup(dev, supply);
+	if (dev_of_node(dev)) {
+		r = of_regulator_dev_lookup(dev, dev_of_node(dev), supply);
 		if (!IS_ERR(r))
 			return r;
 		if (PTR_ERR(r) == -EPROBE_DEFER)
diff --git a/drivers/regulator/internal.h b/drivers/regulator/internal.h
index 5b43f802468d..f62cacbbc729 100644
--- a/drivers/regulator/internal.h
+++ b/drivers/regulator/internal.h
@@ -67,6 +67,7 @@ static inline struct regulator_dev *dev_to_rdev(struct device *dev)
 
 #ifdef CONFIG_OF
 struct regulator_dev *of_regulator_dev_lookup(struct device *dev,
+					      struct device_node *np,
 					      const char *supply);
 struct regulator_init_data *regulator_of_get_init_data(struct device *dev,
 			         const struct regulator_desc *desc,
@@ -82,6 +83,7 @@ bool of_check_coupling_data(struct regulator_dev *rdev);
 
 #else
 static inline struct regulator_dev *of_regulator_dev_lookup(struct device *dev,
+							    struct device_node *np,
 							    const char *supply)
 {
 	return ERR_PTR(-ENODEV);
diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c
index 3f490d81abc2..358c3ed791db 100644
--- a/drivers/regulator/of_regulator.c
+++ b/drivers/regulator/of_regulator.c
@@ -588,7 +588,8 @@ err_node_put:
 
 /**
  * of_get_regulator - get a regulator device node based on supply name
- * @dev: Device pointer for the consumer (of regulator) device
+ * @dev: Device pointer for dev_printk() messages
+ * @node: Device node pointer for supply property lookup
  * @supply: regulator supply name
  *
  * Extract the regulator device node corresponding to the supply name.
@@ -596,15 +597,16 @@ err_node_put:
  * Return: Pointer to the &struct device_node corresponding to the regulator
  *	   if found, or %NULL if not found.
  */
-static struct device_node *of_get_regulator(struct device *dev, const char *supply)
+static struct device_node *of_get_regulator(struct device *dev, struct device_node *node,
+					    const char *supply)
 {
 	struct device_node *regnode = NULL;
 	char prop_name[64]; /* 64 is max size of property name */
 
-	dev_dbg(dev, "Looking up %s-supply from device tree\n", supply);
+	dev_dbg(dev, "Looking up %s-supply from device node %pOF\n", supply, node);
 
 	snprintf(prop_name, 64, "%s-supply", supply);
-	regnode = of_parse_phandle(dev->of_node, prop_name, 0);
+	regnode = of_parse_phandle(node, prop_name, 0);
 	if (regnode)
 		return regnode;
 
@@ -628,6 +630,7 @@ static struct regulator_dev *of_find_regulator_by_node(struct device_node *np)
 /**
  * of_regulator_dev_lookup - lookup a regulator device with device tree only
  * @dev: Device pointer for regulator supply lookup.
+ * @np: Device node pointer for regulator supply lookup.
  * @supply: Supply name or regulator ID.
  *
  * Return: Pointer to the &struct regulator_dev on success, or ERR_PTR()
@@ -642,13 +645,13 @@ static struct regulator_dev *of_find_regulator_by_node(struct device_node *np)
  * * -%ENODEV if lookup fails permanently.
  * * -%EPROBE_DEFER if lookup could succeed in the future.
  */
-struct regulator_dev *of_regulator_dev_lookup(struct device *dev,
+struct regulator_dev *of_regulator_dev_lookup(struct device *dev, struct device_node *np,
 					      const char *supply)
 {
 	struct regulator_dev *r;
 	struct device_node *node;
 
-	node = of_get_regulator(dev, supply);
+	node = of_get_regulator(dev, np, supply);
 	if (node) {
 		r = of_find_regulator_by_node(node);
 		of_node_put(node);
@@ -665,6 +668,42 @@ struct regulator_dev *of_regulator_dev_lookup(struct device *dev,
 	return ERR_PTR(-ENODEV);
 }
 
+static struct regulator *_of_regulator_get(struct device *dev, struct device_node *node,
+					   const char *id, enum regulator_get_type get_type)
+{
+	struct regulator_dev *r;
+	int ret;
+
+	ret = _regulator_get_common_check(dev, id, get_type);
+	if (ret)
+		return ERR_PTR(ret);
+
+	r = of_regulator_dev_lookup(dev, node, id);
+	return _regulator_get_common(r, dev, id, get_type);
+}
+
+/**
+ * of_regulator_get_optional - get optional regulator via device tree lookup
+ * @dev: device used for dev_printk() messages
+ * @node: device node for regulator "consumer"
+ * @id: Supply name
+ *
+ * Return: pointer to struct regulator corresponding to the regulator producer,
+ *	   or PTR_ERR() encoded error number.
+ *
+ * This is intended for use by consumers that want to get a regulator
+ * supply directly from a device node, and can and want to deal with
+ * absence of such supplies. This will _not_ consider supply aliases.
+ * See regulator_dev_lookup().
+ */
+struct regulator *of_regulator_get_optional(struct device *dev,
+					    struct device_node *node,
+					    const char *id)
+{
+	return _of_regulator_get(dev, node, id, OPTIONAL_GET);
+}
+EXPORT_SYMBOL_GPL(of_regulator_get_optional);
+
 /*
  * Returns number of regulators coupled with rdev.
  */
diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index b9ce521910a0..2b22f07e491c 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -168,6 +168,19 @@ int devm_regulator_get_enable_read_voltage(struct device *dev, const char *id);
 void regulator_put(struct regulator *regulator);
 void devm_regulator_put(struct regulator *regulator);
 
+#if IS_ENABLED(CONFIG_OF)
+struct regulator *__must_check of_regulator_get_optional(struct device *dev,
+							 struct device_node *node,
+							 const char *id);
+#else
+static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev,
+								       struct device_node *node,
+								       const char *id)
+{
+	return ERR_PTR(-ENODEV);
+}
+#endif
+
 int regulator_register_supply_alias(struct device *dev, const char *id,
 				    struct device *alias_dev,
 				    const char *alias_id);
@@ -350,6 +363,13 @@ devm_regulator_get_optional(struct device *dev, const char *id)
 	return ERR_PTR(-ENODEV);
 }
 
+static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev,
+								       struct device_node *node,
+								       const char *id)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline void regulator_put(struct regulator *regulator)
 {
 }
-- 
cgit v1.2.3


From 36ec3f437227470568e5f460997f367f5446a34d Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wenst@chromium.org>
Date: Wed, 25 Sep 2024 17:38:05 +0800
Subject: regulator: Add devres version of of_regulator_get_optional()

There are existing uses for a devres version of of_regulator_get_optional()
in power domain drivers. On MediaTek platforms, power domains may have
regulator supplies tied to them. The driver currently tries to use
devm_regulator_get() to not have to manage the lifecycle, but ends up
doing it in a very hacky way by replacing the device node of the power
domain controller device to the device node of the power domain that is
currently being registered, getting the supply, and reverting the device
node.

Provide a better API so that the hack can be replaced.

Signed-off-by: Chen-Yu Tsai <wenst@chromium.org>
Link: https://patch.msgid.link/20240925093807.1026949-3-wenst@chromium.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/devres.c         | 39 ++++++++++++++++++++++++++++++++++++++
 drivers/regulator/internal.h       | 16 ++++++++++------
 drivers/regulator/of_regulator.c   |  4 ++--
 include/linux/regulator/consumer.h | 17 +++++++++++++++++
 4 files changed, 68 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/devres.c b/drivers/regulator/devres.c
index 1b893cdd1aad..36164aec30e8 100644
--- a/drivers/regulator/devres.c
+++ b/drivers/regulator/devres.c
@@ -749,3 +749,42 @@ void *devm_regulator_irq_helper(struct device *dev,
 	return ptr;
 }
 EXPORT_SYMBOL_GPL(devm_regulator_irq_helper);
+
+#if IS_ENABLED(CONFIG_OF)
+static struct regulator *_devm_of_regulator_get(struct device *dev, struct device_node *node,
+						const char *id, int get_type)
+{
+	struct regulator **ptr, *regulator;
+
+	ptr = devres_alloc(devm_regulator_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	regulator = _of_regulator_get(dev, node, id, get_type);
+	if (!IS_ERR(regulator)) {
+		*ptr = regulator;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return regulator;
+}
+
+/**
+ * devm_of_regulator_get_optional - Resource managed of_regulator_get_optional()
+ * @dev: device used for dev_printk() messages and resource lifetime management
+ * @node: device node for regulator "consumer"
+ * @id:  supply name or regulator ID.
+ *
+ * Managed regulator_get_optional(). Regulators returned from this
+ * function are automatically regulator_put() on driver detach. See
+ * of_regulator_get_optional() for more information.
+ */
+struct regulator *devm_of_regulator_get_optional(struct device *dev, struct device_node *node,
+						 const char *id)
+{
+	return _devm_of_regulator_get(dev, node, id, OPTIONAL_GET);
+}
+EXPORT_SYMBOL_GPL(devm_of_regulator_get_optional);
+#endif
diff --git a/drivers/regulator/internal.h b/drivers/regulator/internal.h
index f62cacbbc729..b3d48dc38bc4 100644
--- a/drivers/regulator/internal.h
+++ b/drivers/regulator/internal.h
@@ -65,6 +65,13 @@ static inline struct regulator_dev *dev_to_rdev(struct device *dev)
 	return container_of(dev, struct regulator_dev, dev);
 }
 
+enum regulator_get_type {
+	NORMAL_GET,
+	EXCLUSIVE_GET,
+	OPTIONAL_GET,
+	MAX_GET_TYPE
+};
+
 #ifdef CONFIG_OF
 struct regulator_dev *of_regulator_dev_lookup(struct device *dev,
 					      struct device_node *np,
@@ -74,6 +81,9 @@ struct regulator_init_data *regulator_of_get_init_data(struct device *dev,
 				 struct regulator_config *config,
 				 struct device_node **node);
 
+struct regulator *_of_regulator_get(struct device *dev, struct device_node *node,
+				    const char *id, enum regulator_get_type get_type);
+
 struct regulator_dev *of_parse_coupled_regulator(struct regulator_dev *rdev,
 						 int index);
 
@@ -116,12 +126,6 @@ static inline bool of_check_coupling_data(struct regulator_dev *rdev)
 }
 
 #endif
-enum regulator_get_type {
-	NORMAL_GET,
-	EXCLUSIVE_GET,
-	OPTIONAL_GET,
-	MAX_GET_TYPE
-};
 
 int _regulator_get_common_check(struct device *dev, const char *id,
 				enum regulator_get_type get_type);
diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c
index 358c3ed791db..3d85762beda6 100644
--- a/drivers/regulator/of_regulator.c
+++ b/drivers/regulator/of_regulator.c
@@ -668,8 +668,8 @@ struct regulator_dev *of_regulator_dev_lookup(struct device *dev, struct device_
 	return ERR_PTR(-ENODEV);
 }
 
-static struct regulator *_of_regulator_get(struct device *dev, struct device_node *node,
-					   const char *id, enum regulator_get_type get_type)
+struct regulator *_of_regulator_get(struct device *dev, struct device_node *node,
+				    const char *id, enum regulator_get_type get_type)
 {
 	struct regulator_dev *r;
 	int ret;
diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 2b22f07e491c..8c3c372ad735 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -172,6 +172,9 @@ void devm_regulator_put(struct regulator *regulator);
 struct regulator *__must_check of_regulator_get_optional(struct device *dev,
 							 struct device_node *node,
 							 const char *id);
+struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev,
+							      struct device_node *node,
+							      const char *id);
 #else
 static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev,
 								       struct device_node *node,
@@ -179,6 +182,13 @@ static inline struct regulator *__must_check of_regulator_get_optional(struct de
 {
 	return ERR_PTR(-ENODEV);
 }
+
+static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev,
+									    struct device_node *node,
+									    const char *id)
+{
+	return ERR_PTR(-ENODEV);
+}
 #endif
 
 int regulator_register_supply_alias(struct device *dev, const char *id,
@@ -370,6 +380,13 @@ static inline struct regulator *__must_check of_regulator_get_optional(struct de
 	return ERR_PTR(-ENODEV);
 }
 
+static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev,
+									    struct device_node *node,
+									    const char *id)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline void regulator_put(struct regulator *regulator)
 {
 }
-- 
cgit v1.2.3


From 0809a9ccac4a2ffdfd1561bb551aec6099775545 Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Mon, 2 Sep 2024 20:59:47 +0800
Subject: spi: remove {devm_}spi_alloc_master/slave()

All the {devm_}spi_alloc_master/slave() have been replaced,
so they can be removed and replaced in doc and comment.

No functional changed.

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Link: https://patch.msgid.link/20240902125947.1368-8-yangyingliang@huaweicloud.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/driver-api/driver-model/devres.rst |  4 ++--
 drivers/spi/spi.c                                | 14 +++++------
 include/linux/spi/spi.h                          | 30 ------------------------
 3 files changed, 9 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 5f2ee8d717b1..ebbf8e4cc85f 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -462,8 +462,8 @@ SLAVE DMA ENGINE
   devm_acpi_dma_controller_free()
 
 SPI
-  devm_spi_alloc_master()
-  devm_spi_alloc_slave()
+  devm_spi_alloc_host()
+  devm_spi_alloc_target()
   devm_spi_optimize_message()
   devm_spi_register_controller()
   devm_spi_register_host()
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index c1dad30a4528..073ffae97767 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -3238,9 +3238,9 @@ static int spi_controller_id_alloc(struct spi_controller *ctlr, int start, int e
 }
 
 /**
- * spi_register_controller - register SPI master or slave controller
- * @ctlr: initialized master, originally from spi_alloc_master() or
- *	spi_alloc_slave()
+ * spi_register_controller - register SPI host or target controller
+ * @ctlr: initialized controller, originally from spi_alloc_host() or
+ *	spi_alloc_target()
  * Context: can sleep
  *
  * SPI controllers connect to their drivers using some non-SPI bus,
@@ -3390,11 +3390,11 @@ static void devm_spi_unregister(struct device *dev, void *res)
 }
 
 /**
- * devm_spi_register_controller - register managed SPI master or slave
+ * devm_spi_register_controller - register managed SPI host or target
  *	controller
  * @dev:    device managing SPI controller
- * @ctlr: initialized controller, originally from spi_alloc_master() or
- *	spi_alloc_slave()
+ * @ctlr: initialized controller, originally from spi_alloc_host() or
+ *	spi_alloc_target()
  * Context: can sleep
  *
  * Register a SPI device as with spi_register_controller() which will
@@ -3478,7 +3478,7 @@ void spi_unregister_controller(struct spi_controller *ctlr)
 
 	/*
 	 * Release the last reference on the controller if its driver
-	 * has not yet been converted to devm_spi_alloc_master/slave().
+	 * has not yet been converted to devm_spi_alloc_host/target().
 	 */
 	if (!ctlr->devm_allocated)
 		put_device(&ctlr->dev);
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 4b95663163e0..8497f4747e24 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -824,21 +824,6 @@ void spi_take_timestamp_post(struct spi_controller *ctlr,
 extern struct spi_controller *__spi_alloc_controller(struct device *host,
 						unsigned int size, bool slave);
 
-static inline struct spi_controller *spi_alloc_master(struct device *host,
-						      unsigned int size)
-{
-	return __spi_alloc_controller(host, size, false);
-}
-
-static inline struct spi_controller *spi_alloc_slave(struct device *host,
-						     unsigned int size)
-{
-	if (!IS_ENABLED(CONFIG_SPI_SLAVE))
-		return NULL;
-
-	return __spi_alloc_controller(host, size, true);
-}
-
 static inline struct spi_controller *spi_alloc_host(struct device *dev,
 						    unsigned int size)
 {
@@ -858,21 +843,6 @@ struct spi_controller *__devm_spi_alloc_controller(struct device *dev,
 						   unsigned int size,
 						   bool slave);
 
-static inline struct spi_controller *devm_spi_alloc_master(struct device *dev,
-							   unsigned int size)
-{
-	return __devm_spi_alloc_controller(dev, size, false);
-}
-
-static inline struct spi_controller *devm_spi_alloc_slave(struct device *dev,
-							  unsigned int size)
-{
-	if (!IS_ENABLED(CONFIG_SPI_SLAVE))
-		return NULL;
-
-	return __devm_spi_alloc_controller(dev, size, true);
-}
-
 static inline struct spi_controller *devm_spi_alloc_host(struct device *dev,
 							 unsigned int size)
 {
-- 
cgit v1.2.3


From 6074e905023d09f64f2c896f475820a5623deb2c Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javierm@redhat.com>
Date: Mon, 16 Sep 2024 13:00:25 +0200
Subject: firmware: sysfb: Add a sysfb_handles_screen_info() helper function

That can be used by drivers to check if the Generic System Framebuffers
(sysfb) support can handle the data contained in the global screen_info.

Drivers might need this information to know if have to setup the system
framebuffer, or if they have to delegate this action to sysfb instead.

Suggested-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: https://lore.kernel.org/r/20240916110040.1688511-2-javierm@redhat.com
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
---
 drivers/firmware/sysfb.c | 19 +++++++++++++++++++
 include/linux/sysfb.h    |  7 +++++++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/sysfb.c b/drivers/firmware/sysfb.c
index 02a07d3d0d40..770e74be14f3 100644
--- a/drivers/firmware/sysfb.c
+++ b/drivers/firmware/sysfb.c
@@ -77,6 +77,25 @@ void sysfb_disable(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(sysfb_disable);
 
+/**
+ * sysfb_handles_screen_info() - reports if sysfb handles the global screen_info
+ *
+ * Callers can use sysfb_handles_screen_info() to determine whether the Generic
+ * System Framebuffers (sysfb) can handle the global screen_info data structure
+ * or not. Drivers might need this information to know if they have to setup the
+ * system framebuffer, or if they have to delegate this action to sysfb instead.
+ *
+ * Returns:
+ * True if sysfb handles the global screen_info data structure.
+ */
+bool sysfb_handles_screen_info(void)
+{
+	const struct screen_info *si = &screen_info;
+
+	return !!screen_info_video_type(si);
+}
+EXPORT_SYMBOL_GPL(sysfb_handles_screen_info);
+
 #if defined(CONFIG_PCI)
 static bool sysfb_pci_dev_is_enabled(struct pci_dev *pdev)
 {
diff --git a/include/linux/sysfb.h b/include/linux/sysfb.h
index bef5f06a91de..07cbab516942 100644
--- a/include/linux/sysfb.h
+++ b/include/linux/sysfb.h
@@ -60,12 +60,19 @@ struct efifb_dmi_info {
 
 void sysfb_disable(struct device *dev);
 
+bool sysfb_handles_screen_info(void);
+
 #else /* CONFIG_SYSFB */
 
 static inline void sysfb_disable(struct device *dev)
 {
 }
 
+static inline bool sysfb_handles_screen_info(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_SYSFB */
 
 #ifdef CONFIG_EFI
-- 
cgit v1.2.3


From e4ca0e59c39442546866f3dd514a3a5956577daf Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 3 Sep 2024 20:59:04 +0300
Subject: types: Complement the aligned types with signed 64-bit one

Some user may want to use aligned signed 64-bit type.
Provide it for them.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20240903180218.3640501-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/types.h      | 3 ++-
 include/uapi/linux/types.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/types.h b/include/linux/types.h
index 2bc8766ba20c..2d7b9ae8714c 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -115,8 +115,9 @@ typedef u64			u_int64_t;
 typedef s64			int64_t;
 #endif
 
-/* this is a special 64bit data type that is 8-byte aligned */
+/* These are the special 64-bit data types that are 8-byte aligned */
 #define aligned_u64		__aligned_u64
+#define aligned_s64		__aligned_s64
 #define aligned_be64		__aligned_be64
 #define aligned_le64		__aligned_le64
 
diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h
index 6375a0684052..48b933938877 100644
--- a/include/uapi/linux/types.h
+++ b/include/uapi/linux/types.h
@@ -53,6 +53,7 @@ typedef __u32 __bitwise __wsum;
  * No conversions are necessary between 32-bit user-space and a 64-bit kernel.
  */
 #define __aligned_u64 __u64 __attribute__((aligned(8)))
+#define __aligned_s64 __s64 __attribute__((aligned(8)))
 #define __aligned_be64 __be64 __attribute__((aligned(8)))
 #define __aligned_le64 __le64 __attribute__((aligned(8)))
 
-- 
cgit v1.2.3


From faf178607772f28006e403d7bab6c4217d4ee447 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 7 Sep 2024 19:24:46 +0200
Subject: iio: adc: Constify struct iio_map

'struct iio_map' are not modified in these drivers.

Constifying this structure moves some data to a read-only section, so
increase overall security.

In order to do it, the prototype of iio_map_array_register() and
devm_iio_map_array_register(), and a few structures that hold a
"struct iio_map *" need to be adjusted.

On a x86_64, with allmodconfig, as an example:
Before:
======
   text	   data	    bss	    dec	    hex	filename
  21086	    760	      0	  21846	   5556	drivers/iio/adc/axp20x_adc.o

After:
=====
   text	   data	    bss	    dec	    hex	filename
  21470	    360	      0	  21830	   5546	drivers/iio/adc/axp20x_adc.o
  33842	   1697	    384	  35923	   8c53	drivers/iio/addac/ad74413r.o

--
Compile tested only

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://patch.msgid.link/5729dc3cc3892ecf0d8ea28c5f7307b34e27493e.1725729801.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/axp20x_adc.c      | 6 +++---
 drivers/iio/adc/axp288_adc.c      | 2 +-
 drivers/iio/adc/da9150-gpadc.c    | 2 +-
 drivers/iio/adc/intel_mrfld_adc.c | 2 +-
 drivers/iio/adc/lp8788_adc.c      | 6 +++---
 drivers/iio/adc/mp2629_adc.c      | 2 +-
 drivers/iio/adc/rn5t618-adc.c     | 2 +-
 drivers/iio/adc/sun4i-gpadc-iio.c | 2 +-
 drivers/iio/inkern.c              | 7 ++++---
 include/linux/iio/driver.h        | 5 +++--
 10 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/adc/axp20x_adc.c b/drivers/iio/adc/axp20x_adc.c
index d43c8d124a0c..940c92f2792a 100644
--- a/drivers/iio/adc/axp20x_adc.c
+++ b/drivers/iio/adc/axp20x_adc.c
@@ -155,7 +155,7 @@ enum axp813_adc_channel_v {
 	AXP813_BATT_V,
 };
 
-static struct iio_map axp20x_maps[] = {
+static const struct iio_map axp20x_maps[] = {
 	{
 		.consumer_dev_name = "axp20x-usb-power-supply",
 		.consumer_channel = "vbus_v",
@@ -187,7 +187,7 @@ static struct iio_map axp20x_maps[] = {
 	}, { /* sentinel */ }
 };
 
-static struct iio_map axp22x_maps[] = {
+static const struct iio_map axp22x_maps[] = {
 	{
 		.consumer_dev_name = "axp20x-battery-power-supply",
 		.consumer_channel = "batt_v",
@@ -1044,7 +1044,7 @@ struct axp_data {
 	unsigned long			adc_en2_mask;
 	int				(*adc_rate)(struct axp20x_adc_iio *info,
 						    int rate);
-	struct iio_map			*maps;
+	const struct iio_map		*maps;
 };
 
 static const struct axp_data axp192_data = {
diff --git a/drivers/iio/adc/axp288_adc.c b/drivers/iio/adc/axp288_adc.c
index 8c3acc0cd7e9..45542efc3ece 100644
--- a/drivers/iio/adc/axp288_adc.c
+++ b/drivers/iio/adc/axp288_adc.c
@@ -103,7 +103,7 @@ static const struct iio_chan_spec axp288_adc_channels[] = {
 };
 
 /* for consumer drivers */
-static struct iio_map axp288_adc_default_maps[] = {
+static const struct iio_map axp288_adc_default_maps[] = {
 	IIO_MAP("TS_PIN", "axp288-batt", "axp288-batt-temp"),
 	IIO_MAP("PMIC_TEMP", "axp288-pmic", "axp288-pmic-temp"),
 	IIO_MAP("GPADC", "axp288-gpadc", "axp288-system-temp"),
diff --git a/drivers/iio/adc/da9150-gpadc.c b/drivers/iio/adc/da9150-gpadc.c
index 8f0d3fb63b67..82628746ba8e 100644
--- a/drivers/iio/adc/da9150-gpadc.c
+++ b/drivers/iio/adc/da9150-gpadc.c
@@ -291,7 +291,7 @@ static const struct iio_chan_spec da9150_gpadc_channels[] = {
 };
 
 /* Default maps used by da9150-charger */
-static struct iio_map da9150_gpadc_default_maps[] = {
+static const struct iio_map da9150_gpadc_default_maps[] = {
 	{
 		.consumer_dev_name = "da9150-charger",
 		.consumer_channel = "CHAN_IBUS",
diff --git a/drivers/iio/adc/intel_mrfld_adc.c b/drivers/iio/adc/intel_mrfld_adc.c
index 0590a126f321..30c8c09e3716 100644
--- a/drivers/iio/adc/intel_mrfld_adc.c
+++ b/drivers/iio/adc/intel_mrfld_adc.c
@@ -164,7 +164,7 @@ static const struct iio_chan_spec mrfld_adc_channels[] = {
 	BCOVE_ADC_CHANNEL(IIO_TEMP,       8, "CH8", 0xC6),
 };
 
-static struct iio_map iio_maps[] = {
+static const struct iio_map iio_maps[] = {
 	IIO_MAP("CH0", "bcove-battery", "VBATRSLT"),
 	IIO_MAP("CH1", "bcove-battery", "BATTID"),
 	IIO_MAP("CH2", "bcove-battery", "IBATRSLT"),
diff --git a/drivers/iio/adc/lp8788_adc.c b/drivers/iio/adc/lp8788_adc.c
index 6d9b354bc705..0d49be0061a2 100644
--- a/drivers/iio/adc/lp8788_adc.c
+++ b/drivers/iio/adc/lp8788_adc.c
@@ -26,7 +26,7 @@
 
 struct lp8788_adc {
 	struct lp8788 *lp;
-	struct iio_map *map;
+	const struct iio_map *map;
 	struct mutex lock;
 };
 
@@ -149,7 +149,7 @@ static const struct iio_chan_spec lp8788_adc_channels[] = {
 };
 
 /* default maps used by iio consumer (lp8788-charger driver) */
-static struct iio_map lp8788_default_iio_maps[] = {
+static const struct iio_map lp8788_default_iio_maps[] = {
 	{
 		.consumer_dev_name = "lp8788-charger",
 		.consumer_channel = "lp8788_vbatt_5p0",
@@ -168,7 +168,7 @@ static int lp8788_iio_map_register(struct device *dev,
 				struct lp8788_platform_data *pdata,
 				struct lp8788_adc *adc)
 {
-	struct iio_map *map;
+	const struct iio_map *map;
 	int ret;
 
 	map = (!pdata || !pdata->adc_pdata) ?
diff --git a/drivers/iio/adc/mp2629_adc.c b/drivers/iio/adc/mp2629_adc.c
index 5fbf9b6abd9c..921d3e193752 100644
--- a/drivers/iio/adc/mp2629_adc.c
+++ b/drivers/iio/adc/mp2629_adc.c
@@ -52,7 +52,7 @@ static struct iio_chan_spec mp2629_channels[] = {
 	MP2629_ADC_CHAN(INPUT_CURRENT, IIO_CURRENT)
 };
 
-static struct iio_map mp2629_adc_maps[] = {
+static const struct iio_map mp2629_adc_maps[] = {
 	MP2629_MAP(BATT_VOLT, "batt-volt"),
 	MP2629_MAP(SYSTEM_VOLT, "system-volt"),
 	MP2629_MAP(INPUT_VOLT, "input-volt"),
diff --git a/drivers/iio/adc/rn5t618-adc.c b/drivers/iio/adc/rn5t618-adc.c
index ce5f3011fe00..b33536157adc 100644
--- a/drivers/iio/adc/rn5t618-adc.c
+++ b/drivers/iio/adc/rn5t618-adc.c
@@ -185,7 +185,7 @@ static const struct iio_chan_spec rn5t618_adc_iio_channels[] = {
 	RN5T618_ADC_CHANNEL(AIN0, IIO_VOLTAGE, "AIN0")
 };
 
-static struct iio_map rn5t618_maps[] = {
+static const struct iio_map rn5t618_maps[] = {
 	IIO_MAP("VADP", "rn5t618-power", "vadp"),
 	IIO_MAP("VUSB", "rn5t618-power", "vusb"),
 	{ /* sentinel */ }
diff --git a/drivers/iio/adc/sun4i-gpadc-iio.c b/drivers/iio/adc/sun4i-gpadc-iio.c
index 100ecced5fc1..5d459f050634 100644
--- a/drivers/iio/adc/sun4i-gpadc-iio.c
+++ b/drivers/iio/adc/sun4i-gpadc-iio.c
@@ -114,7 +114,7 @@ struct sun4i_gpadc_iio {
 	.datasheet_name = _name,				\
 }
 
-static struct iio_map sun4i_gpadc_hwmon_maps[] = {
+static const struct iio_map sun4i_gpadc_hwmon_maps[] = {
 	{
 		.adc_channel_label = "temp_adc",
 		.consumer_dev_name = "iio_hwmon.0",
diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c
index 151099be2863..7f325b3ed08f 100644
--- a/drivers/iio/inkern.c
+++ b/drivers/iio/inkern.c
@@ -20,7 +20,7 @@
 
 struct iio_map_internal {
 	struct iio_dev *indio_dev;
-	struct iio_map *map;
+	const struct iio_map *map;
 	struct list_head l;
 };
 
@@ -42,7 +42,7 @@ static int iio_map_array_unregister_locked(struct iio_dev *indio_dev)
 	return ret;
 }
 
-int iio_map_array_register(struct iio_dev *indio_dev, struct iio_map *maps)
+int iio_map_array_register(struct iio_dev *indio_dev, const struct iio_map *maps)
 {
 	struct iio_map_internal *mapi;
 	int i = 0;
@@ -86,7 +86,8 @@ static void iio_map_array_unregister_cb(void *indio_dev)
 	iio_map_array_unregister(indio_dev);
 }
 
-int devm_iio_map_array_register(struct device *dev, struct iio_dev *indio_dev, struct iio_map *maps)
+int devm_iio_map_array_register(struct device *dev, struct iio_dev *indio_dev,
+				const struct iio_map *maps)
 {
 	int ret;
 
diff --git a/include/linux/iio/driver.h b/include/linux/iio/driver.h
index 7a157ed218f6..7f8b55551ed0 100644
--- a/include/linux/iio/driver.h
+++ b/include/linux/iio/driver.h
@@ -18,7 +18,7 @@ struct iio_map;
  * @map:	array of mappings specifying association of channel with client
  */
 int iio_map_array_register(struct iio_dev *indio_dev,
-			   struct iio_map *map);
+			   const struct iio_map *map);
 
 /**
  * iio_map_array_unregister() - tell the core to remove consumer mappings for
@@ -38,6 +38,7 @@ int iio_map_array_unregister(struct iio_dev *indio_dev);
  * handle de-registration of the IIO map object when the device's refcount goes to
  * zero.
  */
-int devm_iio_map_array_register(struct device *dev, struct iio_dev *indio_dev, struct iio_map *maps);
+int devm_iio_map_array_register(struct device *dev, struct iio_dev *indio_dev,
+				const struct iio_map *maps);
 
 #endif
-- 
cgit v1.2.3


From 0f702757c68b9790a844e5c07073d3d1c777b13a Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Mon, 26 Aug 2024 11:34:11 +0800
Subject: ARM: samsung: Remove obsoleted declaration for s3c_hwmon_set_platdata

The s3c_hwmon_set_platdata() have been removed since
commit 0d297df03890 ("ARM: s3c: simplify platform code"), and now
it is useless, so remove it.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Link: https://lore.kernel.org/r/20240826033411.4022822-1-cuigaosheng1@huawei.com
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 include/linux/platform_data/hwmon-s3c.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/hwmon-s3c.h b/include/linux/platform_data/hwmon-s3c.h
index 1707ad4147df..7d21e0c41037 100644
--- a/include/linux/platform_data/hwmon-s3c.h
+++ b/include/linux/platform_data/hwmon-s3c.h
@@ -33,14 +33,4 @@ struct s3c_hwmon_pdata {
 	struct s3c_hwmon_chcfg	*in[8];
 };
 
-/**
- * s3c_hwmon_set_platdata - Set platform data for S3C HWMON device
- * @pd: Platform data to register to device.
- *
- * Register the given platform data for use with the S3C HWMON device.
- * The call will copy the platform data, so the board definitions can
- * make the structure itself __initdata.
- */
-extern void __init s3c_hwmon_set_platdata(struct s3c_hwmon_pdata *pd);
-
 #endif /* __HWMON_S3C_H__ */
-- 
cgit v1.2.3


From 3a6ad95d97eb62a7b7c804ef7eeb329a1f697d00 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Mon, 26 Aug 2024 11:31:18 +0800
Subject: ASoC: samsung: Remove obsoleted declaration for
 s3c64xx_ac97_setup_gpio

The s3c64xx_ac97_setup_gpio() have been removed since
commit 0d297df03890 ("ARM: s3c: simplify platform code"), and now
it is useless, so remove it.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Link: https://lore.kernel.org/r/20240826033118.4021727-1-cuigaosheng1@huawei.com
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 include/linux/platform_data/asoc-s3c.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/asoc-s3c.h b/include/linux/platform_data/asoc-s3c.h
index f9c00f839e9f..085dd8e8af76 100644
--- a/include/linux/platform_data/asoc-s3c.h
+++ b/include/linux/platform_data/asoc-s3c.h
@@ -13,8 +13,6 @@
 
 #include <linux/dmaengine.h>
 
-extern void s3c64xx_ac97_setup_gpio(int);
-
 struct samsung_i2s_type {
 /* If the Primary DAI has 5.1 Channels */
 #define QUIRK_PRI_6CHAN		(1 << 0)
-- 
cgit v1.2.3


From a370b72ec7165ebe1230d0225cbe66f6526e68ef Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Sun, 18 Aug 2024 21:48:13 +0900
Subject: tracing: Add a comment about ftrace_regs definition

To clarify what will be expected on ftrace_regs, add a comment to the
architecture independent definition of the ftrace_regs.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index fd5e84d0ec47..42106b3de396 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -117,6 +117,32 @@ extern int ftrace_enabled;
 
 #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
 
+/**
+ * ftrace_regs - ftrace partial/optimal register set
+ *
+ * ftrace_regs represents a group of registers which is used at the
+ * function entry and exit. There are three types of registers.
+ *
+ * - Registers for passing the parameters to callee, including the stack
+ *   pointer. (e.g. rcx, rdx, rdi, rsi, r8, r9 and rsp on x86_64)
+ * - Registers for passing the return values to caller.
+ *   (e.g. rax and rdx on x86_64)
+ * - Registers for hooking the function call and return including the
+ *   frame pointer (the frame pointer is architecture/config dependent)
+ *   (e.g. rip, rbp and rsp for x86_64)
+ *
+ * Also, architecture dependent fields can be used for internal process.
+ * (e.g. orig_ax on x86_64)
+ *
+ * On the function entry, those registers will be restored except for
+ * the stack pointer, so that user can change the function parameters
+ * and instruction pointer (e.g. live patching.)
+ * On the function exit, only registers which is used for return values
+ * are restored.
+ *
+ * NOTE: user *must not* access regs directly, only do it via APIs, because
+ * the member can be changed according to the architecture.
+ */
 struct ftrace_regs {
 	struct pt_regs		regs;
 };
-- 
cgit v1.2.3


From a312a0f7834e605e7c41570f0e9525d0fc4a70a4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sat, 14 Sep 2024 17:48:06 -0400
Subject: fgraph: Use fgraph data to store subtime for profiler

Instead of having the "subtime" for the function profiler in the
infrastructure ftrace_ret_stack structure, have it use the fgraph data
reserve and retrieve functions.

This will keep the limited shadow stack from wasting 8 bytes for something
that is seldom used.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Link: https://lore.kernel.org/20240914214826.780323141@goodmis.org
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  4 +---
 kernel/trace/fgraph.c  | 64 ++++++++++++++++++++++++++++++++++++++------------
 kernel/trace/ftrace.c  | 23 +++++++++---------
 3 files changed, 62 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 42106b3de396..aabd348cad4a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1081,6 +1081,7 @@ struct fgraph_ops {
 
 void *fgraph_reserve_data(int idx, int size_bytes);
 void *fgraph_retrieve_data(int idx, int *size_bytes);
+void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth);
 
 /*
  * Stack of return addresses for functions
@@ -1091,9 +1092,6 @@ struct ftrace_ret_stack {
 	unsigned long ret;
 	unsigned long func;
 	unsigned long long calltime;
-#ifdef CONFIG_FUNCTION_PROFILER
-	unsigned long long subtime;
-#endif
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
 	unsigned long fp;
 #endif
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d7d4fb403f6f..095ceb752b28 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -390,21 +390,7 @@ void *fgraph_reserve_data(int idx, int size_bytes)
  */
 void *fgraph_retrieve_data(int idx, int *size_bytes)
 {
-	int offset = current->curr_ret_stack - 1;
-	unsigned long val;
-
-	val = get_fgraph_entry(current, offset);
-	while (__get_type(val) == FGRAPH_TYPE_DATA) {
-		if (__get_data_index(val) == idx)
-			goto found;
-		offset -= __get_data_size(val) + 1;
-		val = get_fgraph_entry(current, offset);
-	}
-	return NULL;
-found:
-	if (size_bytes)
-		*size_bytes = __get_data_size(val) * sizeof(long);
-	return get_data_type_data(current, offset);
+	return fgraph_retrieve_parent_data(idx, size_bytes, 0);
 }
 
 /**
@@ -460,6 +446,54 @@ get_ret_stack(struct task_struct *t, int offset, int *frame_offset)
 	return RET_STACK(t, offset);
 }
 
+/**
+ * fgraph_retrieve_parent_data - get data from a parent function
+ * @idx: The index into the fgraph_array (fgraph_ops::idx)
+ * @size_bytes: A pointer to retrieved data size
+ * @depth: The depth to find the parent (0 is the current function)
+ *
+ * This is similar to fgraph_retrieve_data() but can be used to retrieve
+ * data from a parent caller function.
+ *
+ * Return: a pointer to the specified parent data or NULL if not found
+ */
+void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth)
+{
+	struct ftrace_ret_stack *ret_stack = NULL;
+	int offset = current->curr_ret_stack;
+	unsigned long val;
+
+	if (offset <= 0)
+		return NULL;
+
+	for (;;) {
+		int next_offset;
+
+		ret_stack = get_ret_stack(current, offset, &next_offset);
+		if (!ret_stack || --depth < 0)
+			break;
+		offset = next_offset;
+	}
+
+	if (!ret_stack)
+		return NULL;
+
+	offset--;
+
+	val = get_fgraph_entry(current, offset);
+	while (__get_type(val) == FGRAPH_TYPE_DATA) {
+		if (__get_data_index(val) == idx)
+			goto found;
+		offset -= __get_data_size(val) + 1;
+		val = get_fgraph_entry(current, offset);
+	}
+	return NULL;
+found:
+	if (size_bytes)
+		*size_bytes = __get_data_size(val) * sizeof(long);
+	return get_data_type_data(current, offset);
+}
+
 /* Both enabled by default (can be cleared by function_graph tracer flags */
 static bool fgraph_sleep_time = true;
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d2dd71d04b8a..bac1f2ee1983 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -823,7 +823,7 @@ void ftrace_graph_graph_time_control(bool enable)
 static int profile_graph_entry(struct ftrace_graph_ent *trace,
 			       struct fgraph_ops *gops)
 {
-	struct ftrace_ret_stack *ret_stack;
+	unsigned long long *subtime;
 
 	function_profile_call(trace->func, 0, NULL, NULL);
 
@@ -831,9 +831,9 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace,
 	if (!current->ret_stack)
 		return 0;
 
-	ret_stack = ftrace_graph_get_ret_stack(current, 0);
-	if (ret_stack)
-		ret_stack->subtime = 0;
+	subtime = fgraph_reserve_data(gops->idx, sizeof(*subtime));
+	if (subtime)
+		*subtime = 0;
 
 	return 1;
 }
@@ -841,11 +841,12 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace,
 static void profile_graph_return(struct ftrace_graph_ret *trace,
 				 struct fgraph_ops *gops)
 {
-	struct ftrace_ret_stack *ret_stack;
 	struct ftrace_profile_stat *stat;
 	unsigned long long calltime;
+	unsigned long long *subtime;
 	struct ftrace_profile *rec;
 	unsigned long flags;
+	int size;
 
 	local_irq_save(flags);
 	stat = this_cpu_ptr(&ftrace_profile_stats);
@@ -861,13 +862,13 @@ static void profile_graph_return(struct ftrace_graph_ret *trace,
 	if (!fgraph_graph_time) {
 
 		/* Append this call time to the parent time to subtract */
-		ret_stack = ftrace_graph_get_ret_stack(current, 1);
-		if (ret_stack)
-			ret_stack->subtime += calltime;
+		subtime = fgraph_retrieve_parent_data(gops->idx, &size, 1);
+		if (subtime)
+			*subtime += calltime;
 
-		ret_stack = ftrace_graph_get_ret_stack(current, 0);
-		if (ret_stack && ret_stack->subtime < calltime)
-			calltime -= ret_stack->subtime;
+		subtime = fgraph_retrieve_data(gops->idx, &size);
+		if (subtime && *subtime && *subtime < calltime)
+			calltime -= *subtime;
 		else
 			calltime = 0;
 	}
-- 
cgit v1.2.3


From 3c9880f3ab52b52b5b4e1850a70e80dd7329cb4c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sat, 14 Sep 2024 17:48:07 -0400
Subject: ftrace: Use a running sleeptime instead of saving on shadow stack

The fgraph "sleep-time" option tells the function graph tracer and the
profiler whether to include the time a function "sleeps" (is scheduled off
the CPU) in its duration for the function. By default it is true, which
means the duration of a function is calculated by the timestamp of when the
function was entered to the timestamp of when it exits.

If the "sleep-time" option is disabled, it needs to remove the time that the
task was not running on the CPU during the function. Currently it is done in
a sched_switch tracepoint probe where it moves the "calltime" (time of entry
of the function) forward by the sleep time calculated. It updates all the
calltime in the shadow stack.

This is time consuming for those users of the function graph tracer that
does not care about the sleep time. Instead, add a "ftrace_sleeptime" to the
task_struct that gets the sleep time added each time the task wakes up. Then
have the function entry save the current "ftrace_sleeptime" and on function
exit, move the calltime forward by the difference of the current
"ftrace_sleeptime" from the saved sleeptime.

This removes one dependency of "calltime" needed to be on the shadow stack.
It also simplifies the code that removes the sleep time of functions.

TODO: Only enable the sched_switch tracepoint when this is needed.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Link: https://lore.kernel.org/20240914214826.938908568@goodmis.org
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/sched.h                |  1 +
 kernel/trace/fgraph.c                | 16 ++-------------
 kernel/trace/ftrace.c                | 39 ++++++++++++++++++++++++++----------
 kernel/trace/trace.h                 |  1 +
 kernel/trace/trace_functions_graph.c | 28 ++++++++++++++++++++++++++
 5 files changed, 60 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e6ee4258169a..c08f3bdb11a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1441,6 +1441,7 @@ struct task_struct {
 
 	/* Timestamp for last schedule: */
 	unsigned long long		ftrace_timestamp;
+	unsigned long long		ftrace_sleeptime;
 
 	/*
 	 * Number of functions that haven't been traced
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 095ceb752b28..b2e95bf82211 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -495,7 +495,7 @@ found:
 }
 
 /* Both enabled by default (can be cleared by function_graph tracer flags */
-static bool fgraph_sleep_time = true;
+bool fgraph_sleep_time = true;
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 /*
@@ -1046,9 +1046,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
 				struct task_struct *next,
 				unsigned int prev_state)
 {
-	struct ftrace_ret_stack *ret_stack;
 	unsigned long long timestamp;
-	int offset;
 
 	/*
 	 * Does the user want to count the time a function was asleep.
@@ -1065,17 +1063,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
 	if (!next->ftrace_timestamp)
 		return;
 
-	/*
-	 * Update all the counters in next to make up for the
-	 * time next was sleeping.
-	 */
-	timestamp -= next->ftrace_timestamp;
-
-	for (offset = next->curr_ret_stack; offset > 0; ) {
-		ret_stack = get_ret_stack(next, offset, &offset);
-		if (ret_stack)
-			ret_stack->calltime += timestamp;
-	}
+	next->ftrace_sleeptime += timestamp - next->ftrace_timestamp;
 }
 
 static DEFINE_PER_CPU(unsigned long *, idle_ret_stack);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bac1f2ee1983..90b3975d5315 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -820,10 +820,15 @@ void ftrace_graph_graph_time_control(bool enable)
 	fgraph_graph_time = enable;
 }
 
+struct profile_fgraph_data {
+	unsigned long long		subtime;
+	unsigned long long		sleeptime;
+};
+
 static int profile_graph_entry(struct ftrace_graph_ent *trace,
 			       struct fgraph_ops *gops)
 {
-	unsigned long long *subtime;
+	struct profile_fgraph_data *profile_data;
 
 	function_profile_call(trace->func, 0, NULL, NULL);
 
@@ -831,9 +836,12 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace,
 	if (!current->ret_stack)
 		return 0;
 
-	subtime = fgraph_reserve_data(gops->idx, sizeof(*subtime));
-	if (subtime)
-		*subtime = 0;
+	profile_data = fgraph_reserve_data(gops->idx, sizeof(*profile_data));
+	if (!profile_data)
+		return 0;
+
+	profile_data->subtime = 0;
+	profile_data->sleeptime = current->ftrace_sleeptime;
 
 	return 1;
 }
@@ -841,9 +849,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace,
 static void profile_graph_return(struct ftrace_graph_ret *trace,
 				 struct fgraph_ops *gops)
 {
+	struct profile_fgraph_data *profile_data;
+	struct profile_fgraph_data *parent_data;
 	struct ftrace_profile_stat *stat;
 	unsigned long long calltime;
-	unsigned long long *subtime;
 	struct ftrace_profile *rec;
 	unsigned long flags;
 	int size;
@@ -859,16 +868,24 @@ static void profile_graph_return(struct ftrace_graph_ret *trace,
 
 	calltime = trace->rettime - trace->calltime;
 
+	if (!fgraph_sleep_time) {
+		profile_data = fgraph_retrieve_data(gops->idx, &size);
+		if (profile_data && current->ftrace_sleeptime)
+			calltime -= current->ftrace_sleeptime - profile_data->sleeptime;
+	}
+
 	if (!fgraph_graph_time) {
 
 		/* Append this call time to the parent time to subtract */
-		subtime = fgraph_retrieve_parent_data(gops->idx, &size, 1);
-		if (subtime)
-			*subtime += calltime;
+		parent_data = fgraph_retrieve_parent_data(gops->idx, &size, 1);
+		if (parent_data)
+			parent_data->subtime += calltime;
+
+		if (!profile_data)
+			profile_data = fgraph_retrieve_data(gops->idx, &size);
 
-		subtime = fgraph_retrieve_data(gops->idx, &size);
-		if (subtime && *subtime && *subtime < calltime)
-			calltime -= *subtime;
+		if (profile_data && profile_data->subtime && profile_data->subtime < calltime)
+			calltime -= profile_data->subtime;
 		else
 			calltime = 0;
 	}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c866991b9c78..2f8017f8d34d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1048,6 +1048,7 @@ static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftra
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 extern unsigned int fgraph_max_depth;
+extern bool fgraph_sleep_time;
 
 static inline bool
 ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a569daaac4c4..bbd898f5a73c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -133,6 +133,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
 	unsigned long *task_var = fgraph_get_task_var(gops);
 	struct trace_array *tr = gops->private;
 	struct trace_array_cpu *data;
+	unsigned long *sleeptime;
 	unsigned long flags;
 	unsigned int trace_ctx;
 	long disabled;
@@ -167,6 +168,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
 	if (ftrace_graph_ignore_irqs())
 		return 0;
 
+	/* save the current sleep time if we are to ignore it */
+	if (!fgraph_sleep_time) {
+		sleeptime = fgraph_reserve_data(gops->idx, sizeof(*sleeptime));
+		if (sleeptime)
+			*sleeptime = current->ftrace_sleeptime;
+	}
+
 	/*
 	 * Stop here if tracing_threshold is set. We only write function return
 	 * events to the ring buffer.
@@ -238,6 +246,22 @@ void __trace_graph_return(struct trace_array *tr,
 		trace_buffer_unlock_commit_nostack(buffer, event);
 }
 
+static void handle_nosleeptime(struct ftrace_graph_ret *trace,
+			       struct fgraph_ops *gops)
+{
+	unsigned long long *sleeptime;
+	int size;
+
+	if (fgraph_sleep_time)
+		return;
+
+	sleeptime = fgraph_retrieve_data(gops->idx, &size);
+	if (!sleeptime)
+		return;
+
+	trace->calltime += current->ftrace_sleeptime - *sleeptime;
+}
+
 void trace_graph_return(struct ftrace_graph_ret *trace,
 			struct fgraph_ops *gops)
 {
@@ -256,6 +280,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
 		return;
 	}
 
+	handle_nosleeptime(trace, gops);
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = per_cpu_ptr(tr->array_buffer.data, cpu);
@@ -278,6 +304,8 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
 		return;
 	}
 
+	handle_nosleeptime(trace, gops);
+
 	if (tracing_thresh &&
 	    (trace->rettime - trace->calltime < tracing_thresh))
 		return;
-- 
cgit v1.2.3


From f1f36e22bee967db5e812a65e24389e54c46f3c2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sat, 14 Sep 2024 17:48:08 -0400
Subject: ftrace: Have calltime be saved in the fgraph storage

The calltime field in the shadow stack frame is only used by the function
graph tracer and profiler. But now that there's other users of the function
graph infrastructure, this adds overhead and wastes space on the shadow
stack. Move the calltime to the fgraph data storage, where the function
graph and profiler entry functions will save it in its own graph storage and
retrieve it in its exit functions.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Link: https://lore.kernel.org/20240914214827.096968730@goodmis.org
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h               |  1 -
 kernel/trace/fgraph.c                |  5 ---
 kernel/trace/ftrace.c                | 19 ++++++------
 kernel/trace/trace_functions_graph.c | 60 ++++++++++++++++++++++++------------
 4 files changed, 51 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index aabd348cad4a..e684addf6508 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1091,7 +1091,6 @@ void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth);
 struct ftrace_ret_stack {
 	unsigned long ret;
 	unsigned long func;
-	unsigned long long calltime;
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
 	unsigned long fp;
 #endif
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index b2e95bf82211..58a28ec35dab 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -558,7 +558,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
 			 int fgraph_idx)
 {
 	struct ftrace_ret_stack *ret_stack;
-	unsigned long long calltime;
 	unsigned long val;
 	int offset;
 
@@ -588,8 +587,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
 		return -EBUSY;
 	}
 
-	calltime = trace_clock_local();
-
 	offset = READ_ONCE(current->curr_ret_stack);
 	ret_stack = RET_STACK(current, offset);
 	offset += FGRAPH_FRAME_OFFSET;
@@ -623,7 +620,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
 
 	ret_stack->ret = ret;
 	ret_stack->func = func;
-	ret_stack->calltime = calltime;
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
 	ret_stack->fp = frame_pointer;
 #endif
@@ -757,7 +753,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
 	*offset += FGRAPH_FRAME_OFFSET;
 	*ret = ret_stack->ret;
 	trace->func = ret_stack->func;
-	trace->calltime = ret_stack->calltime;
 	trace->overrun = atomic_read(&current->trace_overrun);
 	trace->depth = current->curr_ret_depth;
 	/*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 90b3975d5315..cae388122ca8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -821,6 +821,7 @@ void ftrace_graph_graph_time_control(bool enable)
 }
 
 struct profile_fgraph_data {
+	unsigned long long		calltime;
 	unsigned long long		subtime;
 	unsigned long long		sleeptime;
 };
@@ -842,6 +843,7 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace,
 
 	profile_data->subtime = 0;
 	profile_data->sleeptime = current->ftrace_sleeptime;
+	profile_data->calltime = trace_clock_local();
 
 	return 1;
 }
@@ -850,9 +852,9 @@ static void profile_graph_return(struct ftrace_graph_ret *trace,
 				 struct fgraph_ops *gops)
 {
 	struct profile_fgraph_data *profile_data;
-	struct profile_fgraph_data *parent_data;
 	struct ftrace_profile_stat *stat;
 	unsigned long long calltime;
+	unsigned long long rettime = trace_clock_local();
 	struct ftrace_profile *rec;
 	unsigned long flags;
 	int size;
@@ -862,29 +864,28 @@ static void profile_graph_return(struct ftrace_graph_ret *trace,
 	if (!stat->hash || !ftrace_profile_enabled)
 		goto out;
 
+	profile_data = fgraph_retrieve_data(gops->idx, &size);
+
 	/* If the calltime was zero'd ignore it */
-	if (!trace->calltime)
+	if (!profile_data || !profile_data->calltime)
 		goto out;
 
-	calltime = trace->rettime - trace->calltime;
+	calltime = rettime - profile_data->calltime;
 
 	if (!fgraph_sleep_time) {
-		profile_data = fgraph_retrieve_data(gops->idx, &size);
-		if (profile_data && current->ftrace_sleeptime)
+		if (current->ftrace_sleeptime)
 			calltime -= current->ftrace_sleeptime - profile_data->sleeptime;
 	}
 
 	if (!fgraph_graph_time) {
+		struct profile_fgraph_data *parent_data;
 
 		/* Append this call time to the parent time to subtract */
 		parent_data = fgraph_retrieve_parent_data(gops->idx, &size, 1);
 		if (parent_data)
 			parent_data->subtime += calltime;
 
-		if (!profile_data)
-			profile_data = fgraph_retrieve_data(gops->idx, &size);
-
-		if (profile_data && profile_data->subtime && profile_data->subtime < calltime)
+		if (profile_data->subtime && profile_data->subtime < calltime)
 			calltime -= profile_data->subtime;
 		else
 			calltime = 0;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index bbd898f5a73c..5c1b150fbba3 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -127,13 +127,18 @@ static inline int ftrace_graph_ignore_irqs(void)
 	return in_hardirq();
 }
 
+struct fgraph_times {
+	unsigned long long		calltime;
+	unsigned long long		sleeptime; /* may be optional! */
+};
+
 int trace_graph_entry(struct ftrace_graph_ent *trace,
 		      struct fgraph_ops *gops)
 {
 	unsigned long *task_var = fgraph_get_task_var(gops);
 	struct trace_array *tr = gops->private;
 	struct trace_array_cpu *data;
-	unsigned long *sleeptime;
+	struct fgraph_times *ftimes;
 	unsigned long flags;
 	unsigned int trace_ctx;
 	long disabled;
@@ -168,12 +173,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
 	if (ftrace_graph_ignore_irqs())
 		return 0;
 
-	/* save the current sleep time if we are to ignore it */
-	if (!fgraph_sleep_time) {
-		sleeptime = fgraph_reserve_data(gops->idx, sizeof(*sleeptime));
-		if (sleeptime)
-			*sleeptime = current->ftrace_sleeptime;
+	if (fgraph_sleep_time) {
+		/* Only need to record the calltime */
+		ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime));
+	} else {
+		ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes));
+		if (ftimes)
+			ftimes->sleeptime = current->ftrace_sleeptime;
 	}
+	if (!ftimes)
+		return 0;
+
+	ftimes->calltime = trace_clock_local();
 
 	/*
 	 * Stop here if tracing_threshold is set. We only write function return
@@ -247,19 +258,13 @@ void __trace_graph_return(struct trace_array *tr,
 }
 
 static void handle_nosleeptime(struct ftrace_graph_ret *trace,
-			       struct fgraph_ops *gops)
+			       struct fgraph_times *ftimes,
+			       int size)
 {
-	unsigned long long *sleeptime;
-	int size;
-
-	if (fgraph_sleep_time)
-		return;
-
-	sleeptime = fgraph_retrieve_data(gops->idx, &size);
-	if (!sleeptime)
+	if (fgraph_sleep_time || size < sizeof(*ftimes))
 		return;
 
-	trace->calltime += current->ftrace_sleeptime - *sleeptime;
+	ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime;
 }
 
 void trace_graph_return(struct ftrace_graph_ret *trace,
@@ -268,9 +273,11 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
 	unsigned long *task_var = fgraph_get_task_var(gops);
 	struct trace_array *tr = gops->private;
 	struct trace_array_cpu *data;
+	struct fgraph_times *ftimes;
 	unsigned long flags;
 	unsigned int trace_ctx;
 	long disabled;
+	int size;
 	int cpu;
 
 	ftrace_graph_addr_finish(gops, trace);
@@ -280,7 +287,13 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
 		return;
 	}
 
-	handle_nosleeptime(trace, gops);
+	ftimes = fgraph_retrieve_data(gops->idx, &size);
+	if (!ftimes)
+		return;
+
+	handle_nosleeptime(trace, ftimes, size);
+
+	trace->calltime = ftimes->calltime;
 
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
@@ -297,6 +310,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
 static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
 				      struct fgraph_ops *gops)
 {
+	struct fgraph_times *ftimes;
+	int size;
+
 	ftrace_graph_addr_finish(gops, trace);
 
 	if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
@@ -304,10 +320,16 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
 		return;
 	}
 
-	handle_nosleeptime(trace, gops);
+	ftimes = fgraph_retrieve_data(gops->idx, &size);
+	if (!ftimes)
+		return;
+
+	handle_nosleeptime(trace, ftimes, size);
+
+	trace->calltime = ftimes->calltime;
 
 	if (tracing_thresh &&
-	    (trace->rettime - trace->calltime < tracing_thresh))
+	    (trace->rettime - ftimes->calltime < tracing_thresh))
 		return;
 	else
 		trace_graph_return(trace, gops);
-- 
cgit v1.2.3


From 26228256b796eb0145bdfb2ae34ec8c4c0ef1319 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Fri, 6 Sep 2024 09:52:16 +0200
Subject: backlight: lcd: Test against struct fb_info.lcd_dev

Add struct fb_info.lcd_dev for fbdev drivers to store a reference to
their lcd device. Update the lcd's fb_notifier_callback() to test for
this field. The lcd module can now detect if an lcd device belongs to
an fbdev device.

This works similar to the bl_dev for backlights and will allow for
the removal of the check_fb callback from several fbdev driver's lcd
devices.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Link: https://lore.kernel.org/r/20240906075439.98476-3-tzimmermann@suse.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/lcd.c |  3 +++
 include/linux/fb.h            | 13 +++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index 2f57d6867d42..c69407aed296 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -30,6 +30,7 @@ static int fb_notifier_callback(struct notifier_block *self,
 	struct lcd_device *ld = container_of(self, struct lcd_device, fb_notif);
 	struct fb_event *evdata = data;
 	struct fb_info *info = evdata->info;
+	struct lcd_device *fb_lcd = fb_lcd_device(info);
 
 	guard(mutex)(&ld->ops_lock);
 
@@ -37,6 +38,8 @@ static int fb_notifier_callback(struct notifier_block *self,
 		return 0;
 	if (ld->ops->check_fb && !ld->ops->check_fb(ld, info))
 		return 0;
+	if (fb_lcd && fb_lcd != ld)
+		return 0;
 
 	if (event == FB_EVENT_BLANK) {
 		if (ld->ops->set_power)
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 267b59ead432..5ba187e08cf7 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -21,6 +21,7 @@ struct fb_info;
 struct file;
 struct i2c_adapter;
 struct inode;
+struct lcd_device;
 struct module;
 struct notifier_block;
 struct page;
@@ -480,6 +481,13 @@ struct fb_info {
 	struct mutex bl_curve_mutex;
 	u8 bl_curve[FB_BACKLIGHT_LEVELS];
 #endif
+
+	/*
+	 * Assigned LCD device; set before framebuffer
+	 * registration, remove after unregister
+	 */
+	struct lcd_device *lcd_dev;
+
 #ifdef CONFIG_FB_DEFERRED_IO
 	struct delayed_work deferred_work;
 	unsigned long npagerefs;
@@ -754,6 +762,11 @@ static inline struct backlight_device *fb_bl_device(struct fb_info *info)
 }
 #endif
 
+static inline struct lcd_device *fb_lcd_device(struct fb_info *info)
+{
+	return info->lcd_dev;
+}
+
 /* fbmon.c */
 #define FB_MAXTIMINGS		0
 #define FB_VSYNCTIMINGS		1
-- 
cgit v1.2.3


From 48ffe2074c2864ab64ee2004e7ebf3d6a6730fbf Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Fri, 6 Sep 2024 09:52:17 +0200
Subject: backlight: lcd: Add LCD_POWER_ constants for power states

Duplicate FB_BLANK_ constants as LCD_POWER_ constants in the lcd
header file. Allows lcd drivers to avoid including the fbdev header
file and removes a compile-time dependency between the two subsystems.

The new LCD_POWER_ constants have the same values as their
FB_BLANK_ counterparts. Hence semantics does not change and the lcd
drivers can be converted one by one. Each instance of FB_BLANK_UNBLANK
becomes LCD_POWER_ON, each of FB_BLANK_POWERDOWN becomes LCD_POWER_OFF,
FB_BLANK_NORMAL becomes LCD_POWER_REDUCED and FB_BLANK_VSYNC_SUSPEND
becomes LCD_POWER_REDUCED_VSYNC_SUSPEND.

Lcd code or drivers do not use FB_BLANK_HSYNC_SUSPEND, so no
new constants for this is being added. The tokens LCD_POWER_REDUCED
and LCD_POWER_REDUCED_VSYNC_SUSPEND are deprecated and drivers should
replace them with LCD_POWER_ON and LCD_POWER_OFF.

See also commit a1cacb8a8e70 ("backlight: Add BACKLIGHT_POWER_ constants
for power states"), which added similar constants for backlight drivers.

v2:
- fix typo in commit description

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Link: https://lore.kernel.org/r/20240906075439.98476-4-tzimmermann@suse.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/lcd.c | 22 +++++++++++++++++++++-
 include/linux/lcd.h           |  5 +++++
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index c69407aed296..713f7fb8b10a 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -20,6 +20,24 @@
 
 #if defined(CONFIG_FB) || (defined(CONFIG_FB_MODULE) && \
 			   defined(CONFIG_LCD_CLASS_DEVICE_MODULE))
+static int to_lcd_power(int fb_blank)
+{
+	switch (fb_blank) {
+	case FB_BLANK_UNBLANK:
+		return LCD_POWER_ON;
+	/* deprecated; TODO: should become 'off' */
+	case FB_BLANK_NORMAL:
+		return LCD_POWER_REDUCED;
+	case FB_BLANK_VSYNC_SUSPEND:
+		return LCD_POWER_REDUCED_VSYNC_SUSPEND;
+	/* 'off' */
+	case FB_BLANK_HSYNC_SUSPEND:
+	case FB_BLANK_POWERDOWN:
+	default:
+		return LCD_POWER_OFF;
+	}
+}
+
 /* This callback gets called when something important happens inside a
  * framebuffer driver. We're looking if that important event is blanking,
  * and if it is, we're switching lcd power as well ...
@@ -42,8 +60,10 @@ static int fb_notifier_callback(struct notifier_block *self,
 		return 0;
 
 	if (event == FB_EVENT_BLANK) {
+		int power = to_lcd_power(*(int *)evdata->data);
+
 		if (ld->ops->set_power)
-			ld->ops->set_power(ld, *(int *)evdata->data);
+			ld->ops->set_power(ld, power);
 	} else {
 		if (ld->ops->set_mode)
 			ld->ops->set_mode(ld, evdata->data);
diff --git a/include/linux/lcd.h b/include/linux/lcd.h
index 68703a51dc53..dfcc54d327f5 100644
--- a/include/linux/lcd.h
+++ b/include/linux/lcd.h
@@ -14,6 +14,11 @@
 #include <linux/notifier.h>
 #include <linux/fb.h>
 
+#define LCD_POWER_ON			(0)
+#define LCD_POWER_REDUCED		(1) // deprecated; don't use in new code
+#define LCD_POWER_REDUCED_VSYNC_SUSPEND	(2) // deprecated; don't use in new code
+#define LCD_POWER_OFF			(4)
+
 /* Notes on locking:
  *
  * lcd_device->ops_lock is an internal backlight lock protecting the ops
-- 
cgit v1.2.3


From 43e1120deb3768c86aa3875c7073658e44a30ea5 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Fri, 6 Sep 2024 09:52:40 +0200
Subject: backlight: lcd: Replace check_fb with controls_device

Rename check_fb in struct lcd_ops to controls_device. The callback
is now independent from fbdev's struct fb_info and tests if an lcd
device controls a hardware display device. The new naming and semantics
follow similar functionality for backlight devices.

v2:
- fix typos in commit description (Daniel)

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Link: https://lore.kernel.org/r/20240906075439.98476-27-tzimmermann@suse.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/lcd.c          |  2 +-
 drivers/video/backlight/platform_lcd.c | 11 +++++------
 include/linux/lcd.h                    | 16 ++++++++++++----
 3 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index 713f7fb8b10a..dd175b446180 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -54,7 +54,7 @@ static int fb_notifier_callback(struct notifier_block *self,
 
 	if (!ld->ops)
 		return 0;
-	if (ld->ops->check_fb && !ld->ops->check_fb(ld, info))
+	if (ld->ops->controls_device && !ld->ops->controls_device(ld, info->device))
 		return 0;
 	if (fb_lcd && fb_lcd != ld)
 		return 0;
diff --git a/drivers/video/backlight/platform_lcd.c b/drivers/video/backlight/platform_lcd.c
index 69a22d1a8a35..c9fe50f4d8ed 100644
--- a/drivers/video/backlight/platform_lcd.c
+++ b/drivers/video/backlight/platform_lcd.c
@@ -9,7 +9,6 @@
 
 #include <linux/module.h>
 #include <linux/platform_device.h>
-#include <linux/fb.h>
 #include <linux/lcd.h>
 #include <linux/slab.h>
 
@@ -50,17 +49,17 @@ static int platform_lcd_set_power(struct lcd_device *lcd, int power)
 	return 0;
 }
 
-static int platform_lcd_match(struct lcd_device *lcd, struct fb_info *info)
+static bool platform_lcd_controls_device(struct lcd_device *lcd, struct device *display_device)
 {
 	struct platform_lcd *plcd = to_our_lcd(lcd);
 
-	return plcd->us->parent == info->device;
+	return plcd->us->parent == display_device;
 }
 
 static const struct lcd_ops platform_lcd_ops = {
-	.get_power	= platform_lcd_get_power,
-	.set_power	= platform_lcd_set_power,
-	.check_fb	= platform_lcd_match,
+	.get_power		= platform_lcd_get_power,
+	.set_power		= platform_lcd_set_power,
+	.controls_device	= platform_lcd_controls_device,
 };
 
 static int platform_lcd_probe(struct platform_device *pdev)
diff --git a/include/linux/lcd.h b/include/linux/lcd.h
index dfcc54d327f5..8399b5ed48f2 100644
--- a/include/linux/lcd.h
+++ b/include/linux/lcd.h
@@ -35,7 +35,6 @@
  */
 
 struct lcd_device;
-struct fb_info;
 
 struct lcd_properties {
 	/* The maximum value for contrast (read-only) */
@@ -54,9 +53,18 @@ struct lcd_ops {
         int (*set_contrast)(struct lcd_device *, int contrast);
 	/* Set LCD panel mode (resolutions ...) */
 	int (*set_mode)(struct lcd_device *, struct fb_videomode *);
-	/* Check if given framebuffer device is the one LCD is bound to;
-	   return 0 if not, !=0 if it is. If NULL, lcd always matches the fb. */
-	int (*check_fb)(struct lcd_device *, struct fb_info *);
+
+	/*
+	 * Check if the LCD controls the given display device. This
+	 * operation is optional and if not implemented it is assumed that
+	 * the display is always the one controlled by the LCD.
+	 *
+	 * RETURNS:
+	 *
+	 * If display_dev is NULL or display_dev matches the device controlled by
+	 * the LCD, return true. Otherwise return false.
+	 */
+	bool (*controls_device)(struct lcd_device *lcd, struct device *display_device);
 };
 
 struct lcd_device {
-- 
cgit v1.2.3


From 02e224d096ef58fe59e96609de6018e133f33512 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Fri, 6 Sep 2024 09:52:41 +0200
Subject: backlight: lcd: Remove struct fb_videomode from set_mode callback

Implementations of struct lcd_ops.set_mode only require the resolution
from struct fb_videomode. Pass the xres and yres fields, but remove the
dependency on the fbdev data structure.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Link: https://lore.kernel.org/r/20240906075439.98476-28-tzimmermann@suse.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/corgi_lcd.c | 5 ++---
 drivers/video/backlight/lcd.c       | 4 +++-
 drivers/video/backlight/tdo24m.c    | 5 ++---
 include/linux/lcd.h                 | 7 +++++--
 4 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/backlight/corgi_lcd.c b/drivers/video/backlight/corgi_lcd.c
index 35c3fd3281ca..69f49371ea35 100644
--- a/drivers/video/backlight/corgi_lcd.c
+++ b/drivers/video/backlight/corgi_lcd.c
@@ -17,7 +17,6 @@
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/gpio/consumer.h>
-#include <linux/fb.h>
 #include <linux/lcd.h>
 #include <linux/spi/spi.h>
 #include <linux/spi/corgi_lcd.h>
@@ -332,12 +331,12 @@ static void corgi_lcd_power_off(struct corgi_lcd *lcd)
 			POWER1_VW_OFF | POWER1_GVSS_OFF | POWER1_VDD_OFF);
 }
 
-static int corgi_lcd_set_mode(struct lcd_device *ld, struct fb_videomode *m)
+static int corgi_lcd_set_mode(struct lcd_device *ld, u32 xres, u32 yres)
 {
 	struct corgi_lcd *lcd = lcd_get_data(ld);
 	int mode = CORGI_LCD_MODE_QVGA;
 
-	if (m->xres == 640 || m->xres == 480)
+	if (xres == 640 || xres == 480)
 		mode = CORGI_LCD_MODE_VGA;
 
 	if (lcd->mode == mode)
diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index dd175b446180..3267acf8dc5b 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -65,8 +65,10 @@ static int fb_notifier_callback(struct notifier_block *self,
 		if (ld->ops->set_power)
 			ld->ops->set_power(ld, power);
 	} else {
+		const struct fb_videomode *videomode = evdata->data;
+
 		if (ld->ops->set_mode)
-			ld->ops->set_mode(ld, evdata->data);
+			ld->ops->set_mode(ld, videomode->xres, videomode->yres);
 	}
 
 	return 0;
diff --git a/drivers/video/backlight/tdo24m.c b/drivers/video/backlight/tdo24m.c
index a14a94114e9d..c04ee3d04d87 100644
--- a/drivers/video/backlight/tdo24m.c
+++ b/drivers/video/backlight/tdo24m.c
@@ -12,7 +12,6 @@
 #include <linux/device.h>
 #include <linux/spi/spi.h>
 #include <linux/spi/tdo24m.h>
-#include <linux/fb.h>
 #include <linux/lcd.h>
 #include <linux/slab.h>
 
@@ -308,12 +307,12 @@ static int tdo24m_get_power(struct lcd_device *ld)
 	return lcd->power;
 }
 
-static int tdo24m_set_mode(struct lcd_device *ld, struct fb_videomode *m)
+static int tdo24m_set_mode(struct lcd_device *ld, u32 xres, u32 yres)
 {
 	struct tdo24m *lcd = lcd_get_data(ld);
 	int mode = MODE_QVGA;
 
-	if (m->xres == 640 || m->xres == 480)
+	if (xres == 640 || xres == 480)
 		mode = MODE_VGA;
 
 	if (lcd->mode == mode)
diff --git a/include/linux/lcd.h b/include/linux/lcd.h
index 8399b5ed48f2..59a80b396a71 100644
--- a/include/linux/lcd.h
+++ b/include/linux/lcd.h
@@ -51,8 +51,11 @@ struct lcd_ops {
 	int (*get_contrast)(struct lcd_device *);
 	/* Set LCD panel contrast */
         int (*set_contrast)(struct lcd_device *, int contrast);
-	/* Set LCD panel mode (resolutions ...) */
-	int (*set_mode)(struct lcd_device *, struct fb_videomode *);
+
+	/*
+	 * Set LCD panel mode (resolutions ...)
+	 */
+	int (*set_mode)(struct lcd_device *lcd, u32 xres, u32 yres);
 
 	/*
 	 * Check if the LCD controls the given display device. This
-- 
cgit v1.2.3


From 0d580d99749e759b62dc8e28f511310e9235da7a Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Fri, 6 Sep 2024 09:52:42 +0200
Subject: backlight: lcd: Do not include <linux/fb.h> in lcd header

With the exception of fb_notifier_callback(), none of the lcd code
uses fbdev; especially not the lcd drivers. Remove the include
statement for <linux/fb.h> from the public lcd header.

v2:
- fix typos in commit description

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Link: https://lore.kernel.org/r/20240906075439.98476-29-tzimmermann@suse.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/lcd.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lcd.h b/include/linux/lcd.h
index 59a80b396a71..c3ccdff4519a 100644
--- a/include/linux/lcd.h
+++ b/include/linux/lcd.h
@@ -12,7 +12,6 @@
 #include <linux/device.h>
 #include <linux/mutex.h>
 #include <linux/notifier.h>
-#include <linux/fb.h>
 
 #define LCD_POWER_ON			(0)
 #define LCD_POWER_REDUCED		(1) // deprecated; don't use in new code
-- 
cgit v1.2.3


From 86b9ce0a8a6cf9397503920cd5412d207a93fae9 Mon Sep 17 00:00:00 2001
From: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
Date: Fri, 6 Sep 2024 16:31:12 +0530
Subject: firmware: xilinx: Add Pinctrl Get Attribute ID

Add Pinctrl Get Attribute ID to the query ids list.

Signed-off-by: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
Link: https://lore.kernel.org/20240906110113.3154327-3-sai.krishna.potthuri@amd.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index d7d07afc0532..3b4ce4ec5d3f 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -238,6 +238,7 @@ enum pm_query_id {
 	PM_QID_PINCTRL_GET_PIN_GROUPS = 11,
 	PM_QID_CLOCK_GET_NUM_CLOCKS = 12,
 	PM_QID_CLOCK_GET_MAX_DIVISOR = 13,
+	PM_QID_PINCTRL_GET_ATTRIBUTES = 15,
 };
 
 enum rpu_oper_mode {
-- 
cgit v1.2.3


From b875bd5b381e114115922944f7a01e31f8b07c2a Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Wed, 11 Sep 2024 15:43:00 -0400
Subject: exportfs: Remove EXPORT_OP_ASYNC_LOCK

Now that GFS2 and OCFS2 are signalling async ->lock() support with
FOP_ASYNC_LOCK and checks for support are converted, we can remove
EXPORT_OP_ASYNC_LOCK.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Link: https://lore.kernel.org/r/0a114db814fec3086f937ae3d44a086f13b8de26.1726083391.git.bcodding@redhat.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/nfs/exporting.rst |  7 -------
 fs/gfs2/export.c                            |  1 -
 fs/ocfs2/export.c                           |  1 -
 include/linux/exportfs.h                    | 13 -------------
 4 files changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst
index f04ce1215a03..de64d2d002a2 100644
--- a/Documentation/filesystems/nfs/exporting.rst
+++ b/Documentation/filesystems/nfs/exporting.rst
@@ -238,10 +238,3 @@ following flags are defined:
     all of an inode's dirty data on last close. Exports that behave this
     way should set EXPORT_OP_FLUSH_ON_CLOSE so that NFSD knows to skip
     waiting for writeback when closing such files.
-
-  EXPORT_OP_ASYNC_LOCK - Indicates a capable filesystem to do async lock
-    requests from lockd. Only set EXPORT_OP_ASYNC_LOCK if the filesystem has
-    it's own ->lock() functionality as core posix_lock_file() implementation
-    has no async lock request handling yet. For more information about how to
-    indicate an async lock request from a ->lock() file_operations struct, see
-    fs/locks.c and comment for the function vfs_lock_file().
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d418d8b5367f..3334c394ce9c 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -190,6 +190,5 @@ const struct export_operations gfs2_export_ops = {
 	.fh_to_parent = gfs2_fh_to_parent,
 	.get_name = gfs2_get_name,
 	.get_parent = gfs2_get_parent,
-	.flags = EXPORT_OP_ASYNC_LOCK,
 };
 
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 96b684763b39..b95724b767e1 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -280,5 +280,4 @@ const struct export_operations ocfs2_export_ops = {
 	.fh_to_dentry	= ocfs2_fh_to_dentry,
 	.fh_to_parent	= ocfs2_fh_to_parent,
 	.get_parent	= ocfs2_get_parent,
-	.flags		= EXPORT_OP_ASYNC_LOCK,
 };
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 893a1d21dc1c..1ab165c2939f 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -250,19 +250,6 @@ struct export_operations {
 	unsigned long	flags;
 };
 
-/**
- * exportfs_lock_op_is_async() - export op supports async lock operation
- * @export_ops:	the nfs export operations to check
- *
- * Returns true if the nfs export_operations structure has
- * EXPORT_OP_ASYNC_LOCK in their flags set
- */
-static inline bool
-exportfs_lock_op_is_async(const struct export_operations *export_ops)
-{
-	return export_ops->flags & EXPORT_OP_ASYNC_LOCK;
-}
-
 extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
 				    int *max_len, struct inode *parent,
 				    int flags);
-- 
cgit v1.2.3


From dad35f7d2fc14e446669d4cab100597a6798eae5 Mon Sep 17 00:00:00 2001
From: Philipp Zabel <p.zabel@pengutronix.de>
Date: Wed, 25 Sep 2024 18:40:09 +0200
Subject: reset: replace boolean parameters with flags parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce enum reset_control_flags and replace the list of boolean
parameters to the internal reset_control_get functions with a single
flags parameter, before adding more boolean options.

The separate boolean parameters have been shown to be error prone in
the past. See for example commit a57f68ddc886 ("reset: Fix devm bulk
optional exclusive control getter").

Acked-by: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Link: https://lore.kernel.org/r/20240925-reset-get-deasserted-v2-1-b3601bbd0458@pengutronix.de
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/reset/core.c  |  71 ++++++++++++----------
 include/linux/reset.h | 161 +++++++++++++++++++++++++++++++-------------------
 2 files changed, 139 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/reset/core.c b/drivers/reset/core.c
index 4d509d41456a..682d61812852 100644
--- a/drivers/reset/core.c
+++ b/drivers/reset/core.c
@@ -773,12 +773,19 @@ EXPORT_SYMBOL_GPL(reset_control_bulk_release);
 
 static struct reset_control *
 __reset_control_get_internal(struct reset_controller_dev *rcdev,
-			     unsigned int index, bool shared, bool acquired)
+			     unsigned int index, enum reset_control_flags flags)
 {
+	bool shared = flags & RESET_CONTROL_FLAGS_BIT_SHARED;
+	bool acquired = flags & RESET_CONTROL_FLAGS_BIT_ACQUIRED;
 	struct reset_control *rstc;
 
 	lockdep_assert_held(&reset_list_mutex);
 
+	/* Expect callers to filter out OPTIONAL and DEASSERTED bits */
+	if (WARN_ON(flags & ~(RESET_CONTROL_FLAGS_BIT_SHARED |
+			      RESET_CONTROL_FLAGS_BIT_ACQUIRED)))
+		return ERR_PTR(-EINVAL);
+
 	list_for_each_entry(rstc, &rcdev->reset_control_head, list) {
 		if (rstc->id == index) {
 			/*
@@ -994,8 +1001,9 @@ static struct reset_controller_dev *__reset_find_rcdev(const struct of_phandle_a
 
 struct reset_control *
 __of_reset_control_get(struct device_node *node, const char *id, int index,
-		       bool shared, bool optional, bool acquired)
+		       enum reset_control_flags flags)
 {
+	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
 	bool gpio_fallback = false;
 	struct reset_control *rstc;
 	struct reset_controller_dev *rcdev;
@@ -1059,8 +1067,10 @@ __of_reset_control_get(struct device_node *node, const char *id, int index,
 		goto out_unlock;
 	}
 
+	flags &= ~RESET_CONTROL_FLAGS_BIT_OPTIONAL;
+
 	/* reset_list_mutex also protects the rcdev's reset_control list */
-	rstc = __reset_control_get_internal(rcdev, rstc_id, shared, acquired);
+	rstc = __reset_control_get_internal(rcdev, rstc_id, flags);
 
 out_unlock:
 	mutex_unlock(&reset_list_mutex);
@@ -1091,8 +1101,9 @@ __reset_controller_by_name(const char *name)
 
 static struct reset_control *
 __reset_control_get_from_lookup(struct device *dev, const char *con_id,
-				bool shared, bool optional, bool acquired)
+				enum reset_control_flags flags)
 {
+	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
 	const struct reset_control_lookup *lookup;
 	struct reset_controller_dev *rcdev;
 	const char *dev_id = dev_name(dev);
@@ -1116,9 +1127,11 @@ __reset_control_get_from_lookup(struct device *dev, const char *con_id,
 				return ERR_PTR(-EPROBE_DEFER);
 			}
 
+			flags &= ~RESET_CONTROL_FLAGS_BIT_OPTIONAL;
+
 			rstc = __reset_control_get_internal(rcdev,
 							    lookup->index,
-							    shared, acquired);
+							    flags);
 			mutex_unlock(&reset_list_mutex);
 			break;
 		}
@@ -1133,30 +1146,29 @@ __reset_control_get_from_lookup(struct device *dev, const char *con_id,
 }
 
 struct reset_control *__reset_control_get(struct device *dev, const char *id,
-					  int index, bool shared, bool optional,
-					  bool acquired)
+					  int index, enum reset_control_flags flags)
 {
+	bool shared = flags & RESET_CONTROL_FLAGS_BIT_SHARED;
+	bool acquired = flags & RESET_CONTROL_FLAGS_BIT_ACQUIRED;
+
 	if (WARN_ON(shared && acquired))
 		return ERR_PTR(-EINVAL);
 
 	if (dev->of_node)
-		return __of_reset_control_get(dev->of_node, id, index, shared,
-					      optional, acquired);
+		return __of_reset_control_get(dev->of_node, id, index, flags);
 
-	return __reset_control_get_from_lookup(dev, id, shared, optional,
-					       acquired);
+	return __reset_control_get_from_lookup(dev, id, flags);
 }
 EXPORT_SYMBOL_GPL(__reset_control_get);
 
 int __reset_control_bulk_get(struct device *dev, int num_rstcs,
 			     struct reset_control_bulk_data *rstcs,
-			     bool shared, bool optional, bool acquired)
+			     enum reset_control_flags flags)
 {
 	int ret, i;
 
 	for (i = 0; i < num_rstcs; i++) {
-		rstcs[i].rstc = __reset_control_get(dev, rstcs[i].id, 0,
-						    shared, optional, acquired);
+		rstcs[i].rstc = __reset_control_get(dev, rstcs[i].id, 0, flags);
 		if (IS_ERR(rstcs[i].rstc)) {
 			ret = PTR_ERR(rstcs[i].rstc);
 			goto err;
@@ -1226,7 +1238,7 @@ static void devm_reset_control_release(struct device *dev, void *res)
 
 struct reset_control *
 __devm_reset_control_get(struct device *dev, const char *id, int index,
-			 bool shared, bool optional, bool acquired)
+			 enum reset_control_flags flags)
 {
 	struct reset_control **ptr, *rstc;
 
@@ -1235,7 +1247,7 @@ __devm_reset_control_get(struct device *dev, const char *id, int index,
 	if (!ptr)
 		return ERR_PTR(-ENOMEM);
 
-	rstc = __reset_control_get(dev, id, index, shared, optional, acquired);
+	rstc = __reset_control_get(dev, id, index, flags);
 	if (IS_ERR_OR_NULL(rstc)) {
 		devres_free(ptr);
 		return rstc;
@@ -1262,7 +1274,7 @@ static void devm_reset_control_bulk_release(struct device *dev, void *res)
 
 int __devm_reset_control_bulk_get(struct device *dev, int num_rstcs,
 				  struct reset_control_bulk_data *rstcs,
-				  bool shared, bool optional, bool acquired)
+				  enum reset_control_flags flags)
 {
 	struct reset_control_bulk_devres *ptr;
 	int ret;
@@ -1272,7 +1284,7 @@ int __devm_reset_control_bulk_get(struct device *dev, int num_rstcs,
 	if (!ptr)
 		return -ENOMEM;
 
-	ret = __reset_control_bulk_get(dev, num_rstcs, rstcs, shared, optional, acquired);
+	ret = __reset_control_bulk_get(dev, num_rstcs, rstcs, flags);
 	if (ret < 0) {
 		devres_free(ptr);
 		return ret;
@@ -1298,6 +1310,7 @@ EXPORT_SYMBOL_GPL(__devm_reset_control_bulk_get);
  */
 int __device_reset(struct device *dev, bool optional)
 {
+	enum reset_control_flags flags;
 	struct reset_control *rstc;
 	int ret;
 
@@ -1313,7 +1326,8 @@ int __device_reset(struct device *dev, bool optional)
 	}
 #endif
 
-	rstc = __reset_control_get(dev, NULL, 0, 0, optional, true);
+	flags = optional ? RESET_CONTROL_OPTIONAL_EXCLUSIVE : RESET_CONTROL_EXCLUSIVE;
+	rstc = __reset_control_get(dev, NULL, 0, flags);
 	if (IS_ERR(rstc))
 		return PTR_ERR(rstc);
 
@@ -1356,17 +1370,14 @@ static int of_reset_control_get_count(struct device_node *node)
  *				device node.
  *
  * @np: device node for the device that requests the reset controls array
- * @shared: whether reset controls are shared or not
- * @optional: whether it is optional to get the reset controls
- * @acquired: only one reset control may be acquired for a given controller
- *            and ID
+ * @flags: whether reset controls are shared, optional, acquired
  *
  * Returns pointer to allocated reset_control on success or error on failure
  */
 struct reset_control *
-of_reset_control_array_get(struct device_node *np, bool shared, bool optional,
-			   bool acquired)
+of_reset_control_array_get(struct device_node *np, enum reset_control_flags flags)
 {
+	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
 	struct reset_control_array *resets;
 	struct reset_control *rstc;
 	int num, i;
@@ -1381,8 +1392,7 @@ of_reset_control_array_get(struct device_node *np, bool shared, bool optional,
 	resets->num_rstcs = num;
 
 	for (i = 0; i < num; i++) {
-		rstc = __of_reset_control_get(np, NULL, i, shared, optional,
-					      acquired);
+		rstc = __of_reset_control_get(np, NULL, i, flags);
 		if (IS_ERR(rstc))
 			goto err_rst;
 		resets->rstc[i] = rstc;
@@ -1407,8 +1417,7 @@ EXPORT_SYMBOL_GPL(of_reset_control_array_get);
  * devm_reset_control_array_get - Resource managed reset control array get
  *
  * @dev: device that requests the list of reset controls
- * @shared: whether reset controls are shared or not
- * @optional: whether it is optional to get the reset controls
+ * @flags: whether reset controls are shared, optional, acquired
  *
  * The reset control array APIs are intended for a list of resets
  * that just have to be asserted or deasserted, without any
@@ -1417,7 +1426,7 @@ EXPORT_SYMBOL_GPL(of_reset_control_array_get);
  * Returns pointer to allocated reset_control on success or error on failure
  */
 struct reset_control *
-devm_reset_control_array_get(struct device *dev, bool shared, bool optional)
+devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags)
 {
 	struct reset_control **ptr, *rstc;
 
@@ -1426,7 +1435,7 @@ devm_reset_control_array_get(struct device *dev, bool shared, bool optional)
 	if (!ptr)
 		return ERR_PTR(-ENOMEM);
 
-	rstc = of_reset_control_array_get(dev->of_node, shared, optional, true);
+	rstc = of_reset_control_array_get(dev->of_node, flags);
 	if (IS_ERR_OR_NULL(rstc)) {
 		devres_free(ptr);
 		return rstc;
diff --git a/include/linux/reset.h b/include/linux/reset.h
index 514ddf003efc..99296af98f81 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -25,6 +25,33 @@ struct reset_control_bulk_data {
 	struct reset_control		*rstc;
 };
 
+#define RESET_CONTROL_FLAGS_BIT_SHARED		BIT(0)	/* not exclusive */
+#define RESET_CONTROL_FLAGS_BIT_OPTIONAL	BIT(1)
+#define RESET_CONTROL_FLAGS_BIT_ACQUIRED	BIT(2)	/* iff exclusive, not released */
+
+/**
+ * enum reset_control_flags - Flags that can be passed to the reset_control_get functions
+ *                    to determine the type of reset control.
+ *                    These values cannot be OR'd.
+ *
+ * @RESET_CONTROL_EXCLUSIVE:				exclusive, acquired,
+ * @RESET_CONTROL_EXCLUSIVE_RELEASED:			exclusive, released,
+ * @RESET_CONTROL_SHARED:				shared
+ * @RESET_CONTROL_OPTIONAL_EXCLUSIVE:			optional, exclusive, acquired
+ * @RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED:		optional, exclusive, released
+ * @RESET_CONTROL_OPTIONAL_SHARED:			optional, shared
+ */
+enum reset_control_flags {
+	RESET_CONTROL_EXCLUSIVE				= RESET_CONTROL_FLAGS_BIT_ACQUIRED,
+	RESET_CONTROL_EXCLUSIVE_RELEASED		= 0,
+	RESET_CONTROL_SHARED				= RESET_CONTROL_FLAGS_BIT_SHARED,
+	RESET_CONTROL_OPTIONAL_EXCLUSIVE		= RESET_CONTROL_FLAGS_BIT_OPTIONAL |
+							  RESET_CONTROL_FLAGS_BIT_ACQUIRED,
+	RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED	= RESET_CONTROL_FLAGS_BIT_OPTIONAL,
+	RESET_CONTROL_OPTIONAL_SHARED			= RESET_CONTROL_FLAGS_BIT_OPTIONAL |
+							  RESET_CONTROL_FLAGS_BIT_SHARED,
+};
+
 #ifdef CONFIG_RESET_CONTROLLER
 
 int reset_control_reset(struct reset_control *rstc);
@@ -42,30 +69,25 @@ int reset_control_bulk_acquire(int num_rstcs, struct reset_control_bulk_data *rs
 void reset_control_bulk_release(int num_rstcs, struct reset_control_bulk_data *rstcs);
 
 struct reset_control *__of_reset_control_get(struct device_node *node,
-				     const char *id, int index, bool shared,
-				     bool optional, bool acquired);
+				     const char *id, int index, enum reset_control_flags flags);
 struct reset_control *__reset_control_get(struct device *dev, const char *id,
-					  int index, bool shared,
-					  bool optional, bool acquired);
+					  int index, enum reset_control_flags flags);
 void reset_control_put(struct reset_control *rstc);
 int __reset_control_bulk_get(struct device *dev, int num_rstcs,
 			     struct reset_control_bulk_data *rstcs,
-			     bool shared, bool optional, bool acquired);
+			     enum reset_control_flags flags);
 void reset_control_bulk_put(int num_rstcs, struct reset_control_bulk_data *rstcs);
 
 int __device_reset(struct device *dev, bool optional);
 struct reset_control *__devm_reset_control_get(struct device *dev,
-				     const char *id, int index, bool shared,
-				     bool optional, bool acquired);
+				     const char *id, int index, enum reset_control_flags flags);
 int __devm_reset_control_bulk_get(struct device *dev, int num_rstcs,
 				  struct reset_control_bulk_data *rstcs,
-				  bool shared, bool optional, bool acquired);
+				  enum reset_control_flags flags);
 
 struct reset_control *devm_reset_control_array_get(struct device *dev,
-						   bool shared, bool optional);
-struct reset_control *of_reset_control_array_get(struct device_node *np,
-						 bool shared, bool optional,
-						 bool acquired);
+						   enum reset_control_flags flags);
+struct reset_control *of_reset_control_array_get(struct device_node *np, enum reset_control_flags);
 
 int reset_control_get_count(struct device *dev);
 
@@ -116,17 +138,19 @@ static inline int __device_reset(struct device *dev, bool optional)
 
 static inline struct reset_control *__of_reset_control_get(
 					struct device_node *node,
-					const char *id, int index, bool shared,
-					bool optional, bool acquired)
+					const char *id, int index, enum reset_control_flags flags)
 {
+	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
+
 	return optional ? NULL : ERR_PTR(-ENOTSUPP);
 }
 
 static inline struct reset_control *__reset_control_get(
 					struct device *dev, const char *id,
-					int index, bool shared, bool optional,
-					bool acquired)
+					int index, enum reset_control_flags flags)
 {
+	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
+
 	return optional ? NULL : ERR_PTR(-ENOTSUPP);
 }
 
@@ -162,8 +186,10 @@ reset_control_bulk_release(int num_rstcs, struct reset_control_bulk_data *rstcs)
 static inline int
 __reset_control_bulk_get(struct device *dev, int num_rstcs,
 			 struct reset_control_bulk_data *rstcs,
-			 bool shared, bool optional, bool acquired)
+			 enum reset_control_flags flags)
 {
+	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
+
 	return optional ? 0 : -EOPNOTSUPP;
 }
 
@@ -174,30 +200,36 @@ reset_control_bulk_put(int num_rstcs, struct reset_control_bulk_data *rstcs)
 
 static inline struct reset_control *__devm_reset_control_get(
 					struct device *dev, const char *id,
-					int index, bool shared, bool optional,
-					bool acquired)
+					int index, enum reset_control_flags flags)
 {
+	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
+
 	return optional ? NULL : ERR_PTR(-ENOTSUPP);
 }
 
 static inline int
 __devm_reset_control_bulk_get(struct device *dev, int num_rstcs,
 			      struct reset_control_bulk_data *rstcs,
-			      bool shared, bool optional, bool acquired)
+			      enum reset_control_flags flags)
 {
+	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
+
 	return optional ? 0 : -EOPNOTSUPP;
 }
 
 static inline struct reset_control *
-devm_reset_control_array_get(struct device *dev, bool shared, bool optional)
+devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags)
 {
+	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
+
 	return optional ? NULL : ERR_PTR(-ENOTSUPP);
 }
 
 static inline struct reset_control *
-of_reset_control_array_get(struct device_node *np, bool shared, bool optional,
-			   bool acquired)
+of_reset_control_array_get(struct device_node *np, enum reset_control_flags flags)
 {
+	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
+
 	return optional ? NULL : ERR_PTR(-ENOTSUPP);
 }
 
@@ -236,7 +268,7 @@ static inline int device_reset_optional(struct device *dev)
 static inline struct reset_control *
 __must_check reset_control_get_exclusive(struct device *dev, const char *id)
 {
-	return __reset_control_get(dev, id, 0, false, false, true);
+	return __reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE);
 }
 
 /**
@@ -253,7 +285,7 @@ static inline int __must_check
 reset_control_bulk_get_exclusive(struct device *dev, int num_rstcs,
 				 struct reset_control_bulk_data *rstcs)
 {
-	return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, true);
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_EXCLUSIVE);
 }
 
 /**
@@ -274,7 +306,7 @@ static inline struct reset_control *
 __must_check reset_control_get_exclusive_released(struct device *dev,
 						  const char *id)
 {
-	return __reset_control_get(dev, id, 0, false, false, false);
+	return __reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE_RELEASED);
 }
 
 /**
@@ -295,7 +327,7 @@ static inline int __must_check
 reset_control_bulk_get_exclusive_released(struct device *dev, int num_rstcs,
 					  struct reset_control_bulk_data *rstcs)
 {
-	return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, false);
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_EXCLUSIVE_RELEASED);
 }
 
 /**
@@ -316,7 +348,8 @@ static inline int __must_check
 reset_control_bulk_get_optional_exclusive_released(struct device *dev, int num_rstcs,
 						   struct reset_control_bulk_data *rstcs)
 {
-	return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, false);
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs,
+					RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED);
 }
 
 /**
@@ -344,7 +377,7 @@ reset_control_bulk_get_optional_exclusive_released(struct device *dev, int num_r
 static inline struct reset_control *reset_control_get_shared(
 					struct device *dev, const char *id)
 {
-	return __reset_control_get(dev, id, 0, true, false, false);
+	return __reset_control_get(dev, id, 0, RESET_CONTROL_SHARED);
 }
 
 /**
@@ -361,7 +394,7 @@ static inline int __must_check
 reset_control_bulk_get_shared(struct device *dev, int num_rstcs,
 			      struct reset_control_bulk_data *rstcs)
 {
-	return __reset_control_bulk_get(dev, num_rstcs, rstcs, true, false, false);
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_SHARED);
 }
 
 /**
@@ -378,7 +411,7 @@ reset_control_bulk_get_shared(struct device *dev, int num_rstcs,
 static inline struct reset_control *reset_control_get_optional_exclusive(
 					struct device *dev, const char *id)
 {
-	return __reset_control_get(dev, id, 0, false, true, true);
+	return __reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE);
 }
 
 /**
@@ -398,7 +431,7 @@ static inline int __must_check
 reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs,
 					  struct reset_control_bulk_data *rstcs)
 {
-	return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, true);
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_OPTIONAL_EXCLUSIVE);
 }
 
 /**
@@ -415,7 +448,7 @@ reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs,
 static inline struct reset_control *reset_control_get_optional_shared(
 					struct device *dev, const char *id)
 {
-	return __reset_control_get(dev, id, 0, true, true, false);
+	return __reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_SHARED);
 }
 
 /**
@@ -435,7 +468,7 @@ static inline int __must_check
 reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs,
 				       struct reset_control_bulk_data *rstcs)
 {
-	return __reset_control_bulk_get(dev, num_rstcs, rstcs, true, true, false);
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_OPTIONAL_SHARED);
 }
 
 /**
@@ -451,7 +484,7 @@ reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs,
 static inline struct reset_control *of_reset_control_get_exclusive(
 				struct device_node *node, const char *id)
 {
-	return __of_reset_control_get(node, id, 0, false, false, true);
+	return __of_reset_control_get(node, id, 0, RESET_CONTROL_EXCLUSIVE);
 }
 
 /**
@@ -471,7 +504,7 @@ static inline struct reset_control *of_reset_control_get_exclusive(
 static inline struct reset_control *of_reset_control_get_optional_exclusive(
 				struct device_node *node, const char *id)
 {
-	return __of_reset_control_get(node, id, 0, false, true, true);
+	return __of_reset_control_get(node, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE);
 }
 
 /**
@@ -496,7 +529,7 @@ static inline struct reset_control *of_reset_control_get_optional_exclusive(
 static inline struct reset_control *of_reset_control_get_shared(
 				struct device_node *node, const char *id)
 {
-	return __of_reset_control_get(node, id, 0, true, false, false);
+	return __of_reset_control_get(node, id, 0, RESET_CONTROL_SHARED);
 }
 
 /**
@@ -513,7 +546,7 @@ static inline struct reset_control *of_reset_control_get_shared(
 static inline struct reset_control *of_reset_control_get_exclusive_by_index(
 					struct device_node *node, int index)
 {
-	return __of_reset_control_get(node, NULL, index, false, false, true);
+	return __of_reset_control_get(node, NULL, index, RESET_CONTROL_EXCLUSIVE);
 }
 
 /**
@@ -541,7 +574,7 @@ static inline struct reset_control *of_reset_control_get_exclusive_by_index(
 static inline struct reset_control *of_reset_control_get_shared_by_index(
 					struct device_node *node, int index)
 {
-	return __of_reset_control_get(node, NULL, index, true, false, false);
+	return __of_reset_control_get(node, NULL, index, RESET_CONTROL_SHARED);
 }
 
 /**
@@ -560,7 +593,7 @@ static inline struct reset_control *
 __must_check devm_reset_control_get_exclusive(struct device *dev,
 					      const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, false, false, true);
+	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE);
 }
 
 /**
@@ -580,7 +613,8 @@ static inline int __must_check
 devm_reset_control_bulk_get_exclusive(struct device *dev, int num_rstcs,
 				      struct reset_control_bulk_data *rstcs)
 {
-	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, true);
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs,
+					     RESET_CONTROL_EXCLUSIVE);
 }
 
 /**
@@ -599,7 +633,7 @@ static inline struct reset_control *
 __must_check devm_reset_control_get_exclusive_released(struct device *dev,
 						       const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, false, false, false);
+	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE_RELEASED);
 }
 
 /**
@@ -619,7 +653,8 @@ static inline int __must_check
 devm_reset_control_bulk_get_exclusive_released(struct device *dev, int num_rstcs,
 					       struct reset_control_bulk_data *rstcs)
 {
-	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, false);
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs,
+					     RESET_CONTROL_EXCLUSIVE_RELEASED);
 }
 
 /**
@@ -638,7 +673,7 @@ static inline struct reset_control *
 __must_check devm_reset_control_get_optional_exclusive_released(struct device *dev,
 								const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, false, true, false);
+	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED);
 }
 
 /**
@@ -658,7 +693,8 @@ static inline int __must_check
 devm_reset_control_bulk_get_optional_exclusive_released(struct device *dev, int num_rstcs,
 							struct reset_control_bulk_data *rstcs)
 {
-	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, false);
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs,
+					     RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED);
 }
 
 /**
@@ -673,7 +709,7 @@ devm_reset_control_bulk_get_optional_exclusive_released(struct device *dev, int
 static inline struct reset_control *devm_reset_control_get_shared(
 					struct device *dev, const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, true, false, false);
+	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_SHARED);
 }
 
 /**
@@ -693,7 +729,7 @@ static inline int __must_check
 devm_reset_control_bulk_get_shared(struct device *dev, int num_rstcs,
 				   struct reset_control_bulk_data *rstcs)
 {
-	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, true, false, false);
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_SHARED);
 }
 
 /**
@@ -711,7 +747,7 @@ devm_reset_control_bulk_get_shared(struct device *dev, int num_rstcs,
 static inline struct reset_control *devm_reset_control_get_optional_exclusive(
 					struct device *dev, const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, false, true, true);
+	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE);
 }
 
 /**
@@ -731,7 +767,8 @@ static inline int __must_check
 devm_reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs,
 					       struct reset_control_bulk_data *rstcs)
 {
-	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, true);
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs,
+					     RESET_CONTROL_OPTIONAL_EXCLUSIVE);
 }
 
 /**
@@ -749,7 +786,7 @@ devm_reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs
 static inline struct reset_control *devm_reset_control_get_optional_shared(
 					struct device *dev, const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, true, true, false);
+	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_SHARED);
 }
 
 /**
@@ -769,7 +806,7 @@ static inline int __must_check
 devm_reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs,
 					    struct reset_control_bulk_data *rstcs)
 {
-	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, true, true, false);
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_OPTIONAL_SHARED);
 }
 
 /**
@@ -787,7 +824,7 @@ devm_reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs,
 static inline struct reset_control *
 devm_reset_control_get_exclusive_by_index(struct device *dev, int index)
 {
-	return __devm_reset_control_get(dev, NULL, index, false, false, true);
+	return __devm_reset_control_get(dev, NULL, index, RESET_CONTROL_EXCLUSIVE);
 }
 
 /**
@@ -803,7 +840,7 @@ devm_reset_control_get_exclusive_by_index(struct device *dev, int index)
 static inline struct reset_control *
 devm_reset_control_get_shared_by_index(struct device *dev, int index)
 {
-	return __devm_reset_control_get(dev, NULL, index, true, false, false);
+	return __devm_reset_control_get(dev, NULL, index, RESET_CONTROL_SHARED);
 }
 
 /*
@@ -851,54 +888,54 @@ static inline struct reset_control *devm_reset_control_get_by_index(
 static inline struct reset_control *
 devm_reset_control_array_get_exclusive(struct device *dev)
 {
-	return devm_reset_control_array_get(dev, false, false);
+	return devm_reset_control_array_get(dev, RESET_CONTROL_EXCLUSIVE);
 }
 
 static inline struct reset_control *
 devm_reset_control_array_get_shared(struct device *dev)
 {
-	return devm_reset_control_array_get(dev, true, false);
+	return devm_reset_control_array_get(dev, RESET_CONTROL_SHARED);
 }
 
 static inline struct reset_control *
 devm_reset_control_array_get_optional_exclusive(struct device *dev)
 {
-	return devm_reset_control_array_get(dev, false, true);
+	return devm_reset_control_array_get(dev, RESET_CONTROL_OPTIONAL_EXCLUSIVE);
 }
 
 static inline struct reset_control *
 devm_reset_control_array_get_optional_shared(struct device *dev)
 {
-	return devm_reset_control_array_get(dev, true, true);
+	return devm_reset_control_array_get(dev, RESET_CONTROL_OPTIONAL_SHARED);
 }
 
 static inline struct reset_control *
 of_reset_control_array_get_exclusive(struct device_node *node)
 {
-	return of_reset_control_array_get(node, false, false, true);
+	return of_reset_control_array_get(node, RESET_CONTROL_EXCLUSIVE);
 }
 
 static inline struct reset_control *
 of_reset_control_array_get_exclusive_released(struct device_node *node)
 {
-	return of_reset_control_array_get(node, false, false, false);
+	return of_reset_control_array_get(node, RESET_CONTROL_EXCLUSIVE_RELEASED);
 }
 
 static inline struct reset_control *
 of_reset_control_array_get_shared(struct device_node *node)
 {
-	return of_reset_control_array_get(node, true, false, true);
+	return of_reset_control_array_get(node, RESET_CONTROL_SHARED);
 }
 
 static inline struct reset_control *
 of_reset_control_array_get_optional_exclusive(struct device_node *node)
 {
-	return of_reset_control_array_get(node, false, true, true);
+	return of_reset_control_array_get(node, RESET_CONTROL_OPTIONAL_EXCLUSIVE);
 }
 
 static inline struct reset_control *
 of_reset_control_array_get_optional_shared(struct device_node *node)
 {
-	return of_reset_control_array_get(node, true, true, true);
+	return of_reset_control_array_get(node, RESET_CONTROL_OPTIONAL_SHARED);
 }
 #endif
-- 
cgit v1.2.3


From d872bed85036f5e60c66b0dd0994346b4ea6470c Mon Sep 17 00:00:00 2001
From: Philipp Zabel <p.zabel@pengutronix.de>
Date: Wed, 25 Sep 2024 18:40:10 +0200
Subject: reset: Add devres helpers to request pre-deasserted reset controls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add devres helpers

 - devm_reset_control_bulk_get_exclusive_deasserted
 - devm_reset_control_bulk_get_optional_exclusive_deasserted
 - devm_reset_control_bulk_get_optional_shared_deasserted
 - devm_reset_control_bulk_get_shared_deasserted
 - devm_reset_control_get_exclusive_deasserted
 - devm_reset_control_get_optional_exclusive_deasserted
 - devm_reset_control_get_optional_shared_deasserted
 - devm_reset_control_get_shared_deasserted

to request and immediately deassert reset controls. During cleanup,
reset_control_assert() will be called automatically on the returned
reset controls.

Acked-by: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Link: https://lore.kernel.org/r/20240925-reset-get-deasserted-v2-2-b3601bbd0458@pengutronix.de
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/reset/core.c  |  48 ++++++++++++++++++++-
 include/linux/reset.h | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/reset/core.c b/drivers/reset/core.c
index 682d61812852..22f67fc77ae5 100644
--- a/drivers/reset/core.c
+++ b/drivers/reset/core.c
@@ -1236,23 +1236,46 @@ static void devm_reset_control_release(struct device *dev, void *res)
 	reset_control_put(*(struct reset_control **)res);
 }
 
+static void devm_reset_control_release_deasserted(struct device *dev, void *res)
+{
+	struct reset_control *rstc = *(struct reset_control **)res;
+
+	reset_control_assert(rstc);
+	reset_control_put(rstc);
+}
+
 struct reset_control *
 __devm_reset_control_get(struct device *dev, const char *id, int index,
 			 enum reset_control_flags flags)
 {
 	struct reset_control **ptr, *rstc;
+	bool deasserted = flags & RESET_CONTROL_FLAGS_BIT_DEASSERTED;
 
-	ptr = devres_alloc(devm_reset_control_release, sizeof(*ptr),
+	ptr = devres_alloc(deasserted ? devm_reset_control_release_deasserted :
+			   devm_reset_control_release, sizeof(*ptr),
 			   GFP_KERNEL);
 	if (!ptr)
 		return ERR_PTR(-ENOMEM);
 
+	flags &= ~RESET_CONTROL_FLAGS_BIT_DEASSERTED;
+
 	rstc = __reset_control_get(dev, id, index, flags);
 	if (IS_ERR_OR_NULL(rstc)) {
 		devres_free(ptr);
 		return rstc;
 	}
 
+	if (deasserted) {
+		int ret;
+
+		ret = reset_control_deassert(rstc);
+		if (ret) {
+			reset_control_put(rstc);
+			devres_free(ptr);
+			return ERR_PTR(ret);
+		}
+	}
+
 	*ptr = rstc;
 	devres_add(dev, ptr);
 
@@ -1272,24 +1295,45 @@ static void devm_reset_control_bulk_release(struct device *dev, void *res)
 	reset_control_bulk_put(devres->num_rstcs, devres->rstcs);
 }
 
+static void devm_reset_control_bulk_release_deasserted(struct device *dev, void *res)
+{
+	struct reset_control_bulk_devres *devres = res;
+
+	reset_control_bulk_assert(devres->num_rstcs, devres->rstcs);
+	reset_control_bulk_put(devres->num_rstcs, devres->rstcs);
+}
+
 int __devm_reset_control_bulk_get(struct device *dev, int num_rstcs,
 				  struct reset_control_bulk_data *rstcs,
 				  enum reset_control_flags flags)
 {
 	struct reset_control_bulk_devres *ptr;
+	bool deasserted = flags & RESET_CONTROL_FLAGS_BIT_DEASSERTED;
 	int ret;
 
-	ptr = devres_alloc(devm_reset_control_bulk_release, sizeof(*ptr),
+	ptr = devres_alloc(deasserted ? devm_reset_control_bulk_release_deasserted :
+			   devm_reset_control_bulk_release, sizeof(*ptr),
 			   GFP_KERNEL);
 	if (!ptr)
 		return -ENOMEM;
 
+	flags &= ~RESET_CONTROL_FLAGS_BIT_DEASSERTED;
+
 	ret = __reset_control_bulk_get(dev, num_rstcs, rstcs, flags);
 	if (ret < 0) {
 		devres_free(ptr);
 		return ret;
 	}
 
+	if (deasserted) {
+		ret = reset_control_bulk_deassert(num_rstcs, rstcs);
+		if (ret) {
+			reset_control_bulk_put(num_rstcs, rstcs);
+			devres_free(ptr);
+			return ret;
+		}
+	}
+
 	ptr->num_rstcs = num_rstcs;
 	ptr->rstcs = rstcs;
 	devres_add(dev, ptr);
diff --git a/include/linux/reset.h b/include/linux/reset.h
index 99296af98f81..2986ced69a02 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -28,6 +28,7 @@ struct reset_control_bulk_data {
 #define RESET_CONTROL_FLAGS_BIT_SHARED		BIT(0)	/* not exclusive */
 #define RESET_CONTROL_FLAGS_BIT_OPTIONAL	BIT(1)
 #define RESET_CONTROL_FLAGS_BIT_ACQUIRED	BIT(2)	/* iff exclusive, not released */
+#define RESET_CONTROL_FLAGS_BIT_DEASSERTED	BIT(3)
 
 /**
  * enum reset_control_flags - Flags that can be passed to the reset_control_get functions
@@ -35,21 +36,35 @@ struct reset_control_bulk_data {
  *                    These values cannot be OR'd.
  *
  * @RESET_CONTROL_EXCLUSIVE:				exclusive, acquired,
+ * @RESET_CONTROL_EXCLUSIVE_DEASSERTED:			exclusive, acquired, deasserted
  * @RESET_CONTROL_EXCLUSIVE_RELEASED:			exclusive, released,
  * @RESET_CONTROL_SHARED:				shared
+ * @RESET_CONTROL_SHARED_DEASSERTED:			shared, deasserted
  * @RESET_CONTROL_OPTIONAL_EXCLUSIVE:			optional, exclusive, acquired
+ * @RESET_CONTROL_OPTIONAL_EXCLUSIVE_DEASSERTED:	optional, exclusive, acquired, deasserted
  * @RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED:		optional, exclusive, released
  * @RESET_CONTROL_OPTIONAL_SHARED:			optional, shared
+ * @RESET_CONTROL_OPTIONAL_SHARED_DEASSERTED:		optional, shared, deasserted
  */
 enum reset_control_flags {
 	RESET_CONTROL_EXCLUSIVE				= RESET_CONTROL_FLAGS_BIT_ACQUIRED,
+	RESET_CONTROL_EXCLUSIVE_DEASSERTED		= RESET_CONTROL_FLAGS_BIT_ACQUIRED |
+							  RESET_CONTROL_FLAGS_BIT_DEASSERTED,
 	RESET_CONTROL_EXCLUSIVE_RELEASED		= 0,
 	RESET_CONTROL_SHARED				= RESET_CONTROL_FLAGS_BIT_SHARED,
+	RESET_CONTROL_SHARED_DEASSERTED			= RESET_CONTROL_FLAGS_BIT_SHARED |
+							  RESET_CONTROL_FLAGS_BIT_DEASSERTED,
 	RESET_CONTROL_OPTIONAL_EXCLUSIVE		= RESET_CONTROL_FLAGS_BIT_OPTIONAL |
 							  RESET_CONTROL_FLAGS_BIT_ACQUIRED,
+	RESET_CONTROL_OPTIONAL_EXCLUSIVE_DEASSERTED	= RESET_CONTROL_FLAGS_BIT_OPTIONAL |
+							  RESET_CONTROL_FLAGS_BIT_ACQUIRED |
+							  RESET_CONTROL_FLAGS_BIT_DEASSERTED,
 	RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED	= RESET_CONTROL_FLAGS_BIT_OPTIONAL,
 	RESET_CONTROL_OPTIONAL_SHARED			= RESET_CONTROL_FLAGS_BIT_OPTIONAL |
 							  RESET_CONTROL_FLAGS_BIT_SHARED,
+	RESET_CONTROL_OPTIONAL_SHARED_DEASSERTED	= RESET_CONTROL_FLAGS_BIT_OPTIONAL |
+							  RESET_CONTROL_FLAGS_BIT_SHARED |
+							  RESET_CONTROL_FLAGS_BIT_DEASSERTED,
 };
 
 #ifdef CONFIG_RESET_CONTROLLER
@@ -596,6 +611,25 @@ __must_check devm_reset_control_get_exclusive(struct device *dev,
 	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE);
 }
 
+/**
+ * devm_reset_control_get_exclusive_deasserted - resource managed
+ *                                    reset_control_get_exclusive() +
+ *                                    reset_control_deassert()
+ * @dev: device to be reset by the controller
+ * @id: reset line name
+ *
+ * Managed reset_control_get_exclusive() + reset_control_deassert(). For reset
+ * controllers returned from this function, reset_control_assert() +
+ * reset_control_put() is called automatically on driver detach.
+ *
+ * See reset_control_get_exclusive() for more information.
+ */
+static inline struct reset_control * __must_check
+devm_reset_control_get_exclusive_deasserted(struct device *dev, const char *id)
+{
+	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE_DEASSERTED);
+}
+
 /**
  * devm_reset_control_bulk_get_exclusive - resource managed
  *                                         reset_control_bulk_get_exclusive()
@@ -712,6 +746,25 @@ static inline struct reset_control *devm_reset_control_get_shared(
 	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_SHARED);
 }
 
+/**
+ * devm_reset_control_get_shared_deasserted - resource managed
+ *                                            reset_control_get_shared() +
+ *                                            reset_control_deassert()
+ * @dev: device to be reset by the controller
+ * @id: reset line name
+ *
+ * Managed reset_control_get_shared() + reset_control_deassert(). For reset
+ * controllers returned from this function, reset_control_assert() +
+ * reset_control_put() is called automatically on driver detach.
+ *
+ * See devm_reset_control_get_shared() for more information.
+ */
+static inline struct reset_control * __must_check
+devm_reset_control_get_shared_deasserted(struct device *dev, const char *id)
+{
+	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_SHARED_DEASSERTED);
+}
+
 /**
  * devm_reset_control_bulk_get_shared - resource managed
  *                                      reset_control_bulk_get_shared()
@@ -732,6 +785,28 @@ devm_reset_control_bulk_get_shared(struct device *dev, int num_rstcs,
 	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_SHARED);
 }
 
+/**
+ * devm_reset_control_bulk_get_shared_deasserted - resource managed
+ *                                                 reset_control_bulk_get_shared() +
+ *                                                 reset_control_bulk_deassert()
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Managed reset_control_bulk_get_shared() + reset_control_bulk_deassert(). For
+ * reset controllers returned from this function, reset_control_bulk_assert() +
+ * reset_control_bulk_put() are called automatically on driver detach.
+ *
+ * See devm_reset_control_bulk_get_shared() for more information.
+ */
+static inline int __must_check
+devm_reset_control_bulk_get_shared_deasserted(struct device *dev, int num_rstcs,
+					      struct reset_control_bulk_data *rstcs)
+{
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs,
+					     RESET_CONTROL_SHARED_DEASSERTED);
+}
+
 /**
  * devm_reset_control_get_optional_exclusive - resource managed
  *                                             reset_control_get_optional_exclusive()
@@ -750,6 +825,25 @@ static inline struct reset_control *devm_reset_control_get_optional_exclusive(
 	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE);
 }
 
+/**
+ * devm_reset_control_get_optional_exclusive_deasserted - resource managed
+ *                                                        reset_control_get_optional_exclusive() +
+ *                                                        reset_control_deassert()
+ * @dev: device to be reset by the controller
+ * @id: reset line name
+ *
+ * Managed reset_control_get_optional_exclusive() + reset_control_deassert().
+ * For reset controllers returned from this function, reset_control_assert() +
+ * reset_control_put() is called automatically on driver detach.
+ *
+ * See devm_reset_control_get_optional_exclusive() for more information.
+ */
+static inline struct reset_control *
+devm_reset_control_get_optional_exclusive_deasserted(struct device *dev, const char *id)
+{
+	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE_DEASSERTED);
+}
+
 /**
  * devm_reset_control_bulk_get_optional_exclusive - resource managed
  *                                                  reset_control_bulk_get_optional_exclusive()
@@ -789,6 +883,25 @@ static inline struct reset_control *devm_reset_control_get_optional_shared(
 	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_SHARED);
 }
 
+/**
+ * devm_reset_control_get_optional_shared_deasserted - resource managed
+ *                                                     reset_control_get_optional_shared() +
+ *                                                     reset_control_deassert()
+ * @dev: device to be reset by the controller
+ * @id: reset line name
+ *
+ * Managed reset_control_get_optional_shared() + reset_control_deassert(). For
+ * reset controllers returned from this function, reset_control_assert() +
+ * reset_control_put() is called automatically on driver detach.
+ *
+ * See devm_reset_control_get_optional_shared() for more information.
+ */
+static inline struct reset_control *
+devm_reset_control_get_optional_shared_deasserted(struct device *dev, const char *id)
+{
+	return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_SHARED_DEASSERTED);
+}
+
 /**
  * devm_reset_control_bulk_get_optional_shared - resource managed
  *                                               reset_control_bulk_get_optional_shared()
-- 
cgit v1.2.3


From dab9cd4b8e7f5fce4e7a0424991ec4714a780f3f Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Date: Tue, 1 Oct 2024 10:51:39 +0200
Subject: pwm: Add kernel doc for members added to pwm_ops recently
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The callbacks for lowlevel pwm drivers were expanded to handle the new
waveform abstraction. When doing that I missed to expand the kernel doc
description. This is catched up here.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/linux-next/20241001135207.125ca7af@canb.auug.org.au
Fixes: 17e40c25158f ("pwm: New abstraction for PWM waveforms")
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Link: https://lore.kernel.org/r/20241001085138.1025818-2-u.kleine-koenig@baylibre.com
Signed-off-by: Uwe Kleine-König <ukleinek@kernel.org>
---
 include/linux/pwm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index c3d9ddeafa65..f1cb1e5b0a36 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -276,6 +276,11 @@ struct pwm_capture {
  * @request: optional hook for requesting a PWM
  * @free: optional hook for freeing a PWM
  * @capture: capture and report PWM signal
+ * @sizeof_wfhw: size (in bytes) of driver specific waveform presentation
+ * @round_waveform_tohw: convert a struct pwm_waveform to driver specific presentation
+ * @round_waveform_fromhw: convert a driver specific waveform presentation to struct pwm_waveform
+ * @read_waveform: read driver specific waveform presentation from hardware
+ * @write_waveform: write driver specific waveform presentation to hardware
  * @apply: atomically apply a new PWM config
  * @get_state: get the current PWM state.
  */
-- 
cgit v1.2.3


From 030ace430afcf847f537227afceb22dfe8fb8fc8 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@linaro.org>
Date: Thu, 26 Sep 2024 22:19:52 +0800
Subject: spi: spi-mem: Allow specifying the byte order in Octal DTR mode

There are NOR flashes (Macronix) that swap the bytes on a 16-bit
boundary when configured in Octal DTR mode. The byte order of
16-bit words is swapped when read or written in Octal Double
Transfer Rate (DTR) mode compared to Single Transfer Rate (STR)
modes. If one writes D0 D1 D2 D3 bytes using 1-1-1 mode, and uses
8D-8D-8D SPI mode for reading, it will read back D1 D0 D3 D2.
Swapping the bytes may introduce some endianness problems. It can
affect the boot sequence if the entire boot sequence is not handled
in either 8D-8D-8D mode or 1-1-1 mode. Therefore, it is necessary
to swap the bytes back to ensure the same byte order as in STR modes.
Fortunately there are controllers that could swap the bytes back at
runtime, addressing the flash's endianness requirements. Provide a
way for the upper layers to specify the byte order in Octal DTR mode.

Merge Tudor's patch and add modifications for suiting newer version
of Linux kernel.

Suggested-by: Michael Walle <mwalle@kernel.org>
Signed-off-by: JaimeLiao <jaimeliao@mxic.com.tw>
Signed-off-by: AlvinZhou <alvinzhou@mxic.com.tw>
Acked-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20240926141956.2386374-3-alvinzhou.tw@gmail.com
Signed-off-by: Tudor Ambarus <tudor.ambarus@linaro.org>
---
 drivers/spi/spi-mem.c       | 3 +++
 include/linux/spi/spi-mem.h | 8 +++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
index 17b8baf749e6..abc6792e738c 100644
--- a/drivers/spi/spi-mem.c
+++ b/drivers/spi/spi-mem.c
@@ -172,6 +172,9 @@ bool spi_mem_default_supports_op(struct spi_mem *mem,
 		if (!spi_mem_controller_is_capable(ctlr, dtr))
 			return false;
 
+		if (op->data.swap16 && !spi_mem_controller_is_capable(ctlr, swap16))
+			return false;
+
 		if (op->cmd.nbytes != 2)
 			return false;
 	} else {
diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
index f866d5c8ed32..c46d2b8029be 100644
--- a/include/linux/spi/spi-mem.h
+++ b/include/linux/spi/spi-mem.h
@@ -90,6 +90,8 @@ enum spi_mem_data_dir {
  * @data.buswidth: number of IO lanes used to send/receive the data
  * @data.dtr: whether the data should be sent in DTR mode or not
  * @data.ecc: whether error correction is required or not
+ * @data.swap16: whether the byte order of 16-bit words is swapped when read
+ *		 or written in Octal DTR mode compared to STR mode.
  * @data.dir: direction of the transfer
  * @data.nbytes: number of data bytes to send/receive. Can be zero if the
  *		 operation does not involve transferring data
@@ -124,7 +126,8 @@ struct spi_mem_op {
 		u8 buswidth;
 		u8 dtr : 1;
 		u8 ecc : 1;
-		u8 __pad : 6;
+		u8 swap16 : 1;
+		u8 __pad : 5;
 		enum spi_mem_data_dir dir;
 		unsigned int nbytes;
 		union {
@@ -297,10 +300,13 @@ struct spi_controller_mem_ops {
  * struct spi_controller_mem_caps - SPI memory controller capabilities
  * @dtr: Supports DTR operations
  * @ecc: Supports operations with error correction
+ * @swap16: Supports swapping bytes on a 16 bit boundary when configured in
+ *	    Octal DTR
  */
 struct spi_controller_mem_caps {
 	bool dtr;
 	bool ecc;
+	bool swap16;
 };
 
 #define spi_mem_controller_is_capable(ctlr, cap)	\
-- 
cgit v1.2.3


From 848f2bbb363d4cdb4202db328a103fd3c34e21a2 Mon Sep 17 00:00:00 2001
From: Ronak Jain <ronak.jain@amd.com>
Date: Tue, 30 Jul 2024 01:43:42 -0700
Subject: firmware: xilinx: Add missing debug firmware interfaces

Add missing PM EEMI APIs interface in debug firmware driver.

The debugfs firmware driver interface is intended for testing and
debugging the EEMI APIs only. This interface does not contain any
checking regarding improper usage, and the number, type and valid
ranges of the arguments. This interface must be used with a lot of
care. In fact, accessing this interface during normal PM operation
will very likely cause unexpected problems.

The debugfs interface shouldn't be used in the production system and
hence it is disabled by default in defconfig.

Signed-off-by: Ronak Jain <ronak.jain@amd.com>
Signed-off-by: Radhey Shyam Pandey <radhey.shyam.pandey@amd.com>
Signed-off-by: Michal Simek <michal.simek@amd.com>
Link: https://lore.kernel.org/r/20240730084342.1683231-1-ronak.jain@amd.com
---
 drivers/firmware/xilinx/zynqmp-debug.c | 162 ++++++++++++++++++++++++++++++++-
 include/linux/firmware/xlnx-zynqmp.h   |   4 +
 2 files changed, 165 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/xilinx/zynqmp-debug.c b/drivers/firmware/xilinx/zynqmp-debug.c
index 8528850af889..22853ae0efdf 100644
--- a/drivers/firmware/xilinx/zynqmp-debug.c
+++ b/drivers/firmware/xilinx/zynqmp-debug.c
@@ -31,12 +31,50 @@ static char debugfs_buf[PAGE_SIZE];
 
 #define PM_API(id)		 {id, #id, strlen(#id)}
 static struct pm_api_info pm_api_list[] = {
+	PM_API(PM_FORCE_POWERDOWN),
+	PM_API(PM_REQUEST_WAKEUP),
+	PM_API(PM_SYSTEM_SHUTDOWN),
+	PM_API(PM_REQUEST_NODE),
+	PM_API(PM_RELEASE_NODE),
+	PM_API(PM_SET_REQUIREMENT),
 	PM_API(PM_GET_API_VERSION),
+	PM_API(PM_REGISTER_NOTIFIER),
+	PM_API(PM_RESET_ASSERT),
+	PM_API(PM_RESET_GET_STATUS),
+	PM_API(PM_GET_CHIPID),
+	PM_API(PM_PINCTRL_SET_FUNCTION),
+	PM_API(PM_PINCTRL_CONFIG_PARAM_GET),
+	PM_API(PM_PINCTRL_CONFIG_PARAM_SET),
+	PM_API(PM_IOCTL),
+	PM_API(PM_CLOCK_ENABLE),
+	PM_API(PM_CLOCK_DISABLE),
+	PM_API(PM_CLOCK_GETSTATE),
+	PM_API(PM_CLOCK_SETDIVIDER),
+	PM_API(PM_CLOCK_GETDIVIDER),
+	PM_API(PM_CLOCK_SETPARENT),
+	PM_API(PM_CLOCK_GETPARENT),
 	PM_API(PM_QUERY_DATA),
 };
 
 static struct dentry *firmware_debugfs_root;
 
+/**
+ * zynqmp_pm_ioctl - PM IOCTL for device control and configs
+ * @node:	Node ID of the device
+ * @ioctl:	ID of the requested IOCTL
+ * @arg1:	Argument 1 of requested IOCTL call
+ * @arg2:	Argument 2 of requested IOCTL call
+ * @arg3:	Argument 3 of requested IOCTL call
+ * @out:	Returned output value
+ *
+ * Return:	Returns status, either success or error+reason
+ */
+static int zynqmp_pm_ioctl(const u32 node, const u32 ioctl, const u32 arg1,
+			   const u32 arg2, const u32 arg3, u32 *out)
+{
+	return zynqmp_pm_invoke_fn(PM_IOCTL, out, 5, node, ioctl, arg1, arg2, arg3);
+}
+
 /**
  * zynqmp_pm_argument_value() - Extract argument value from a PM-API request
  * @arg:	Entered PM-API argument in string format
@@ -95,6 +133,128 @@ static int process_api_request(u32 pm_id, u64 *pm_api_arg, u32 *pm_api_ret)
 		sprintf(debugfs_buf, "PM-API Version = %d.%d\n",
 			pm_api_version >> 16, pm_api_version & 0xffff);
 		break;
+	case PM_FORCE_POWERDOWN:
+		ret = zynqmp_pm_force_pwrdwn(pm_api_arg[0],
+					     pm_api_arg[1] ? pm_api_arg[1] :
+					     ZYNQMP_PM_REQUEST_ACK_NO);
+		break;
+	case PM_REQUEST_WAKEUP:
+		ret = zynqmp_pm_request_wake(pm_api_arg[0],
+					     pm_api_arg[1], pm_api_arg[2],
+					     pm_api_arg[3] ? pm_api_arg[3] :
+					     ZYNQMP_PM_REQUEST_ACK_NO);
+		break;
+	case PM_SYSTEM_SHUTDOWN:
+		ret = zynqmp_pm_system_shutdown(pm_api_arg[0], pm_api_arg[1]);
+		break;
+	case PM_REQUEST_NODE:
+		ret = zynqmp_pm_request_node(pm_api_arg[0],
+					     pm_api_arg[1] ? pm_api_arg[1] :
+					     ZYNQMP_PM_CAPABILITY_ACCESS,
+					     pm_api_arg[2] ? pm_api_arg[2] : 0,
+					     pm_api_arg[3] ? pm_api_arg[3] :
+					     ZYNQMP_PM_REQUEST_ACK_BLOCKING);
+		break;
+	case PM_RELEASE_NODE:
+		ret = zynqmp_pm_release_node(pm_api_arg[0]);
+		break;
+	case PM_SET_REQUIREMENT:
+		ret = zynqmp_pm_set_requirement(pm_api_arg[0],
+						pm_api_arg[1] ? pm_api_arg[1] :
+						ZYNQMP_PM_CAPABILITY_CONTEXT,
+						pm_api_arg[2] ?
+						pm_api_arg[2] : 0,
+						pm_api_arg[3] ? pm_api_arg[3] :
+						ZYNQMP_PM_REQUEST_ACK_BLOCKING);
+		break;
+	case PM_REGISTER_NOTIFIER:
+		ret = zynqmp_pm_register_notifier(pm_api_arg[0],
+						  pm_api_arg[1] ?
+						  pm_api_arg[1] : 0,
+						  pm_api_arg[2] ?
+						  pm_api_arg[2] : 0,
+						  pm_api_arg[3] ?
+						  pm_api_arg[3] : 0);
+		break;
+	case PM_RESET_ASSERT:
+		ret = zynqmp_pm_reset_assert(pm_api_arg[0], pm_api_arg[1]);
+		break;
+	case PM_RESET_GET_STATUS:
+		ret = zynqmp_pm_reset_get_status(pm_api_arg[0], &pm_api_ret[0]);
+		if (!ret)
+			sprintf(debugfs_buf, "Reset status: %u\n",
+				pm_api_ret[0]);
+		break;
+	case PM_GET_CHIPID:
+		ret = zynqmp_pm_get_chipid(&pm_api_ret[0], &pm_api_ret[1]);
+		if (!ret)
+			sprintf(debugfs_buf, "Idcode: %#x, Version:%#x\n",
+				pm_api_ret[0], pm_api_ret[1]);
+		break;
+	case PM_PINCTRL_SET_FUNCTION:
+		ret = zynqmp_pm_pinctrl_set_function(pm_api_arg[0],
+						     pm_api_arg[1]);
+		break;
+	case PM_PINCTRL_CONFIG_PARAM_GET:
+		ret = zynqmp_pm_pinctrl_get_config(pm_api_arg[0], pm_api_arg[1],
+						   &pm_api_ret[0]);
+		if (!ret)
+			sprintf(debugfs_buf,
+				"Pin: %llu, Param: %llu, Value: %u\n",
+				pm_api_arg[0], pm_api_arg[1],
+				pm_api_ret[0]);
+		break;
+	case PM_PINCTRL_CONFIG_PARAM_SET:
+		ret = zynqmp_pm_pinctrl_set_config(pm_api_arg[0],
+						   pm_api_arg[1],
+						   pm_api_arg[2]);
+		break;
+	case PM_IOCTL:
+		ret = zynqmp_pm_ioctl(pm_api_arg[0], pm_api_arg[1],
+				      pm_api_arg[2], pm_api_arg[3],
+				      pm_api_arg[4], &pm_api_ret[0]);
+		if (!ret && (pm_api_arg[1] == IOCTL_GET_RPU_OPER_MODE ||
+			     pm_api_arg[1] == IOCTL_GET_PLL_FRAC_MODE ||
+			     pm_api_arg[1] == IOCTL_GET_PLL_FRAC_DATA ||
+			     pm_api_arg[1] == IOCTL_READ_GGS ||
+			     pm_api_arg[1] == IOCTL_READ_PGGS ||
+			     pm_api_arg[1] == IOCTL_READ_REG))
+			sprintf(debugfs_buf, "IOCTL return value: %u\n",
+				pm_api_ret[1]);
+		if (!ret && pm_api_arg[1] == IOCTL_GET_QOS)
+			sprintf(debugfs_buf, "Default QoS: %u\nCurrent QoS: %u\n",
+				pm_api_ret[1], pm_api_ret[2]);
+		break;
+	case PM_CLOCK_ENABLE:
+		ret = zynqmp_pm_clock_enable(pm_api_arg[0]);
+		break;
+	case PM_CLOCK_DISABLE:
+		ret = zynqmp_pm_clock_disable(pm_api_arg[0]);
+		break;
+	case PM_CLOCK_GETSTATE:
+		ret = zynqmp_pm_clock_getstate(pm_api_arg[0], &pm_api_ret[0]);
+		if (!ret)
+			sprintf(debugfs_buf, "Clock state: %u\n",
+				pm_api_ret[0]);
+		break;
+	case PM_CLOCK_SETDIVIDER:
+		ret = zynqmp_pm_clock_setdivider(pm_api_arg[0], pm_api_arg[1]);
+		break;
+	case PM_CLOCK_GETDIVIDER:
+		ret = zynqmp_pm_clock_getdivider(pm_api_arg[0], &pm_api_ret[0]);
+		if (!ret)
+			sprintf(debugfs_buf, "Divider Value: %d\n",
+				pm_api_ret[0]);
+		break;
+	case PM_CLOCK_SETPARENT:
+		ret = zynqmp_pm_clock_setparent(pm_api_arg[0], pm_api_arg[1]);
+		break;
+	case PM_CLOCK_GETPARENT:
+		ret = zynqmp_pm_clock_getparent(pm_api_arg[0], &pm_api_ret[0]);
+		if (!ret)
+			sprintf(debugfs_buf,
+				"Clock parent Index: %u\n", pm_api_ret[0]);
+		break;
 	case PM_QUERY_DATA:
 		qdata.qid = pm_api_arg[0];
 		qdata.arg1 = pm_api_arg[1];
@@ -150,7 +310,7 @@ static ssize_t zynqmp_pm_debugfs_api_write(struct file *file,
 	char *kern_buff, *tmp_buff;
 	char *pm_api_req;
 	u32 pm_id = 0;
-	u64 pm_api_arg[4] = {0, 0, 0, 0};
+	u64 pm_api_arg[5] = {0, 0, 0, 0, 0};
 	/* Return values from PM APIs calls */
 	u32 pm_api_ret[4] = {0, 0, 0, 0};
 
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index d7d07afc0532..563382cf16f2 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -218,9 +218,13 @@ enum pm_ioctl_id {
 	/* Runtime feature configuration */
 	IOCTL_SET_FEATURE_CONFIG = 26,
 	IOCTL_GET_FEATURE_CONFIG = 27,
+	/* IOCTL for Secure Read/Write Interface */
+	IOCTL_READ_REG = 28,
 	/* Dynamic SD/GEM configuration */
 	IOCTL_SET_SD_CONFIG = 30,
 	IOCTL_SET_GEM_CONFIG = 31,
+	/* IOCTL to get default/current QoS */
+	IOCTL_GET_QOS = 34,
 };
 
 enum pm_query_id {
-- 
cgit v1.2.3


From f33d6099edf78e3c97900c0173fedbfecc025a9e Mon Sep 17 00:00:00 2001
From: Ronak Jain <ronak.jain@amd.com>
Date: Fri, 30 Aug 2024 03:00:42 -0700
Subject: firmware: xilinx: use u32 for reset ID in reset APIs

Refactors the reset handling mechanisms by replacing the reset ID's
enum type with a u32. This update improves flexibility, allowing the
reset ID to accommodate a broader range of values, including those
that may not fit into predefined enum values.

The use of u32 for reset ID enhances extensibility, especially for
hardware platforms or features where more granular control of reset
operations is required. By shifting to a general integer type, this
change reduces constraints and simplifies integration with other
system components that rely on non-enum-based reset IDs.

Signed-off-by: Ronak Jain <ronak.jain@amd.com>
Link: https://lore.kernel.org/r/20240830100042.3163511-1-ronak.jain@amd.com
Signed-off-by: Michal Simek <michal.simek@amd.com>
---
 drivers/firmware/xilinx/zynqmp.c     | 4 ++--
 include/linux/firmware/xlnx-zynqmp.h | 9 ++++-----
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index add8acf66a9c..c8be32d9c6af 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -920,7 +920,7 @@ int zynqmp_pm_set_boot_health_status(u32 value)
  *
  * Return: Returns status, either success or error+reason
  */
-int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset,
+int zynqmp_pm_reset_assert(const u32 reset,
 			   const enum zynqmp_pm_reset_action assert_flag)
 {
 	return zynqmp_pm_invoke_fn(PM_RESET_ASSERT, NULL, 2, reset, assert_flag);
@@ -934,7 +934,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_reset_assert);
  *
  * Return: Returns status, either success or error+reason
  */
-int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status)
+int zynqmp_pm_reset_get_status(const u32 reset, u32 *status)
 {
 	u32 ret_payload[PAYLOAD_ARG_CNT];
 	int ret;
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 563382cf16f2..5b938fc2adad 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -557,9 +557,9 @@ int zynqmp_pm_get_pll_frac_data(u32 clk_id, u32 *data);
 int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value);
 int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type);
 int zynqmp_pm_ospi_mux_select(u32 dev_id, u32 select);
-int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset,
+int zynqmp_pm_reset_assert(const u32 reset,
 			   const enum zynqmp_pm_reset_action assert_flag);
-int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status);
+int zynqmp_pm_reset_get_status(const u32 reset, u32 *status);
 unsigned int zynqmp_pm_bootmode_read(u32 *ps_mode);
 int zynqmp_pm_bootmode_write(u32 ps_mode);
 int zynqmp_pm_init_finalize(void);
@@ -702,14 +702,13 @@ static inline int zynqmp_pm_ospi_mux_select(u32 dev_id, u32 select)
 	return -ENODEV;
 }
 
-static inline int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset,
+static inline int zynqmp_pm_reset_assert(const u32 reset,
 					 const enum zynqmp_pm_reset_action assert_flag)
 {
 	return -ENODEV;
 }
 
-static inline int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset,
-					     u32 *status)
+static inline int zynqmp_pm_reset_get_status(const u32 reset, u32 *status)
 {
 	return -ENODEV;
 }
-- 
cgit v1.2.3


From 92fb71333d5737d0296fb968a653dfda4b225175 Mon Sep 17 00:00:00 2001
From: Ronak Jain <ronak.jain@amd.com>
Date: Thu, 19 Sep 2024 22:55:01 -0700
Subject: firmware: xilinx: add support for new SMC call format

Added zynqmp_pm_invoke_fw_fn() to use new SMC format in which
lower 12 bits of SMC id are fixed and firmware header is moved to
subsequent SMC arguments. The new SMC format supports full request and
response buffers.

Added zynqmp_pm_get_sip_svc_version() to get SiP SVC version
number to check if TF-A is newer or older and use the SMC format
accordingly to handle backward compatibility.

Used new SMC format for PM_QUERY_DATA API as more response values are
required in it.

Signed-off-by: Ronak Jain <ronak.jain@amd.com>
Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20240920055501.2658642-1-ronak.jain@amd.com
Signed-off-by: Michal Simek <michal.simek@amd.com>
---
 drivers/firmware/xilinx/zynqmp.c     | 137 ++++++++++++++++++++++++++++++++++-
 include/linux/firmware/xlnx-zynqmp.h |  26 ++++++-
 2 files changed, 157 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 920aba92feee..5022bd717054 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -3,7 +3,7 @@
  * Xilinx Zynq MPSoC Firmware layer
  *
  *  Copyright (C) 2014-2022 Xilinx, Inc.
- *  Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc.
+ *  Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc.
  *
  *  Michal Simek <michal.simek@amd.com>
  *  Davorin Mista <davorin.mista@aggios.com>
@@ -46,6 +46,7 @@ static DEFINE_HASHTABLE(pm_api_features_map, PM_API_FEATURE_CHECK_MAX_ORDER);
 static u32 ioctl_features[FEATURE_PAYLOAD_SIZE];
 static u32 query_features[FEATURE_PAYLOAD_SIZE];
 
+static u32 sip_svc_version;
 static struct platform_device *em_dev;
 
 /**
@@ -151,6 +152,9 @@ static noinline int do_fw_call_smc(u32 *ret_payload, u32 num_args, ...)
 		ret_payload[1] = upper_32_bits(res.a0);
 		ret_payload[2] = lower_32_bits(res.a1);
 		ret_payload[3] = upper_32_bits(res.a1);
+		ret_payload[4] = lower_32_bits(res.a2);
+		ret_payload[5] = upper_32_bits(res.a2);
+		ret_payload[6] = lower_32_bits(res.a3);
 	}
 
 	return zynqmp_pm_ret_code((enum pm_ret_status)res.a0);
@@ -191,6 +195,9 @@ static noinline int do_fw_call_hvc(u32 *ret_payload, u32 num_args, ...)
 		ret_payload[1] = upper_32_bits(res.a0);
 		ret_payload[2] = lower_32_bits(res.a1);
 		ret_payload[3] = upper_32_bits(res.a1);
+		ret_payload[4] = lower_32_bits(res.a2);
+		ret_payload[5] = upper_32_bits(res.a2);
+		ret_payload[6] = lower_32_bits(res.a3);
 	}
 
 	return zynqmp_pm_ret_code((enum pm_ret_status)res.a0);
@@ -331,6 +338,70 @@ int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id)
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_is_function_supported);
 
+/**
+ * zynqmp_pm_invoke_fw_fn() - Invoke the system-level platform management layer
+ *			caller function depending on the configuration
+ * @pm_api_id:		Requested PM-API call
+ * @ret_payload:	Returned value array
+ * @num_args:		Number of arguments to requested PM-API call
+ *
+ * Invoke platform management function for SMC or HVC call, depending on
+ * configuration.
+ * Following SMC Calling Convention (SMCCC) for SMC64:
+ * Pm Function Identifier,
+ * PM_SIP_SVC + PASS_THROUGH_FW_CMD_ID =
+ *	((SMC_TYPE_FAST << FUNCID_TYPE_SHIFT)
+ *	((SMC_64) << FUNCID_CC_SHIFT)
+ *	((SIP_START) << FUNCID_OEN_SHIFT)
+ *	(PASS_THROUGH_FW_CMD_ID))
+ *
+ * PM_SIP_SVC - Registered ZynqMP SIP Service Call.
+ * PASS_THROUGH_FW_CMD_ID - Fixed SiP SVC call ID for FW specific calls.
+ *
+ * Return: Returns status, either success or error+reason
+ */
+int zynqmp_pm_invoke_fw_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...)
+{
+	/*
+	 * Added SIP service call Function Identifier
+	 * Make sure to stay in x0 register
+	 */
+	u64 smc_arg[SMC_ARG_CNT_64];
+	int ret, i;
+	va_list arg_list;
+	u32 args[SMC_ARG_CNT_32] = {0};
+	u32 module_id;
+
+	if (num_args > SMC_ARG_CNT_32)
+		return -EINVAL;
+
+	va_start(arg_list, num_args);
+
+	/* Check if feature is supported or not */
+	ret = zynqmp_pm_feature(pm_api_id);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < num_args; i++)
+		args[i] = va_arg(arg_list, u32);
+
+	va_end(arg_list);
+
+	module_id = FIELD_GET(PLM_MODULE_ID_MASK, pm_api_id);
+
+	if (module_id == 0)
+		module_id = XPM_MODULE_ID;
+
+	smc_arg[0] = PM_SIP_SVC | PASS_THROUGH_FW_CMD_ID;
+	smc_arg[1] = ((u64)args[0] << 32U) | FIELD_PREP(PLM_MODULE_ID_MASK, module_id) |
+		      (pm_api_id & API_ID_MASK);
+	for (i = 1; i < (SMC_ARG_CNT_64 - 1); i++)
+		smc_arg[i + 1] = ((u64)args[(i * 2)] << 32U) | args[(i * 2) - 1];
+
+	return do_fw_call(ret_payload, 8, smc_arg[0], smc_arg[1], smc_arg[2], smc_arg[3],
+			  smc_arg[4], smc_arg[5], smc_arg[6], smc_arg[7]);
+}
+
 /**
  * zynqmp_pm_invoke_fn() - Invoke the system-level platform management layer
  *			   caller function depending on the configuration
@@ -488,6 +559,35 @@ int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily)
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_get_family_info);
 
+/**
+ * zynqmp_pm_get_sip_svc_version() - Get SiP service call version
+ * @version:	Returned version value
+ *
+ * Return: Returns status, either success or error+reason
+ */
+static int zynqmp_pm_get_sip_svc_version(u32 *version)
+{
+	struct arm_smccc_res res;
+	u64 args[SMC_ARG_CNT_64] = {0};
+
+	if (!version)
+		return -EINVAL;
+
+	/* Check if SiP SVC version already verified */
+	if (sip_svc_version > 0) {
+		*version = sip_svc_version;
+		return 0;
+	}
+
+	args[0] = GET_SIP_SVC_VERSION;
+
+	arm_smccc_smc(args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], &res);
+
+	*version = ((lower_32_bits(res.a0) << 16U) | lower_32_bits(res.a1));
+
+	return zynqmp_pm_ret_code(XST_PM_SUCCESS);
+}
+
 /**
  * zynqmp_pm_get_trustzone_version() - Get secure trustzone firmware version
  * @version:	Returned version value
@@ -552,10 +652,34 @@ static int get_set_conduit_method(struct device_node *np)
  */
 int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out)
 {
-	int ret;
+	int ret, i = 0;
+	u32 ret_payload[PAYLOAD_ARG_CNT] = {0};
+
+	if (sip_svc_version >= SIP_SVC_PASSTHROUGH_VERSION) {
+		ret = zynqmp_pm_invoke_fw_fn(PM_QUERY_DATA, ret_payload, 4,
+					     qdata.qid, qdata.arg1,
+					     qdata.arg2, qdata.arg3);
+		/* To support backward compatibility */
+		if (!ret && !ret_payload[0]) {
+			/*
+			 * TF-A passes return status on 0th index but
+			 * api to get clock name reads data from 0th
+			 * index so pass data at 0th index instead of
+			 * return status
+			 */
+			if (qdata.qid == PM_QID_CLOCK_GET_NAME ||
+			    qdata.qid == PM_QID_PINCTRL_GET_FUNCTION_NAME)
+				i = 1;
+
+			for (; i < PAYLOAD_ARG_CNT; i++, out++)
+				*out = ret_payload[i];
 
-	ret = zynqmp_pm_invoke_fn(PM_QUERY_DATA, out, 4, qdata.qid, qdata.arg1, qdata.arg2,
-				  qdata.arg3);
+			return ret;
+		}
+	}
+
+	ret = zynqmp_pm_invoke_fn(PM_QUERY_DATA, out, 4, qdata.qid,
+				  qdata.arg1, qdata.arg2, qdata.arg3);
 
 	/*
 	 * For clock name query, all bytes in SMC response are clock name
@@ -1890,6 +2014,11 @@ static int zynqmp_firmware_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	/* Get SiP SVC version number */
+	ret = zynqmp_pm_get_sip_svc_version(&sip_svc_version);
+	if (ret)
+		return ret;
+
 	ret = do_feature_check_call(PM_FEATURE_CHECK);
 	if (ret >= 0 && ((ret & FIRMWARE_VERSION_MASK) >= PM_API_VERSION_1))
 		feature_check_enabled = true;
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 5b938fc2adad..76d85ad82ec0 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -3,7 +3,7 @@
  * Xilinx Zynq MPSoC Firmware layer
  *
  *  Copyright (C) 2014-2021 Xilinx
- *  Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc.
+ *  Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc.
  *
  *  Michal Simek <michal.simek@amd.com>
  *  Davorin Mista <davorin.mista@aggios.com>
@@ -32,6 +32,19 @@
 /* SMC SIP service Call Function Identifier Prefix */
 #define PM_SIP_SVC			0xC2000000
 
+/* SMC function ID to get SiP SVC version */
+#define GET_SIP_SVC_VERSION	(0x8200ff03U)
+
+/* SiP Service Calls version numbers */
+#define SIP_SVC_VERSION_MAJOR		(0U)
+#define SIP_SVC_VERSION_MINOR		(2U)
+
+#define SIP_SVC_PASSTHROUGH_VERSION	((SIP_SVC_VERSION_MAJOR << 16) | \
+					 SIP_SVC_VERSION_MINOR)
+
+/* Fixed ID for FW specific APIs */
+#define PASS_THROUGH_FW_CMD_ID	GENMASK(11, 0)
+
 /* PM API versions */
 #define PM_API_VERSION_1	1
 #define PM_API_VERSION_2	2
@@ -51,6 +64,7 @@
 
 #define API_ID_MASK		GENMASK(7, 0)
 #define MODULE_ID_MASK		GENMASK(11, 8)
+#define PLM_MODULE_ID_MASK	GENMASK(15, 8)
 
 /* Firmware feature check version mask */
 #define FIRMWARE_VERSION_MASK		0xFFFFU
@@ -62,7 +76,13 @@
 #define GET_CALLBACK_DATA		0xa01
 
 /* Number of 32bits values in payload */
-#define PAYLOAD_ARG_CNT	4U
+#define PAYLOAD_ARG_CNT	7U
+
+/* Number of 64bits arguments for SMC call */
+#define SMC_ARG_CNT_64	8U
+
+/* Number of 32bits arguments for SMC call */
+#define SMC_ARG_CNT_32	13U
 
 /* Number of arguments for a callback */
 #define CB_ARG_CNT     4
@@ -130,6 +150,7 @@
 
 enum pm_module_id {
 	PM_MODULE_ID = 0x0,
+	XPM_MODULE_ID = 0x2,
 	XSEM_MODULE_ID = 0x3,
 	TF_A_MODULE_ID = 0xa,
 };
@@ -537,6 +558,7 @@ struct zynqmp_pm_query_data {
 };
 
 int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...);
+int zynqmp_pm_invoke_fw_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...);
 
 #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE)
 int zynqmp_pm_get_api_version(u32 *version);
-- 
cgit v1.2.3


From a849a0273d0f73a252d14d31c5003a8059ea51fc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 11 Sep 2024 15:17:37 +0200
Subject: ntp: Remove unused tick_nsec

tick_nsec is only updated in the NTP core, but there are no users.

Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-1-2d52f4e13476@linutronix.de
---
 arch/x86/include/asm/timer.h | 2 --
 include/linux/timex.h        | 1 -
 kernel/time/ntp.c            | 8 ++------
 3 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 7365dd4acffb..23baf8c9b34c 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -6,8 +6,6 @@
 #include <linux/interrupt.h>
 #include <linux/math64.h>
 
-#define TICK_SIZE (tick_nsec / 1000)
-
 unsigned long long native_sched_clock(void);
 extern void recalibrate_cpu_khz(void);
 
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 3871b06bd302..7f7a12fd8200 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -145,7 +145,6 @@ unsigned long random_get_entropy_fallback(void);
  * estimated error = NTP dispersion.
  */
 extern unsigned long tick_usec;		/* USER_HZ period (usec) */
-extern unsigned long tick_nsec;		/* SHIFTED_HZ period (nsec) */
 
 /* Required to safely shift negative values */
 #define shift_right(x, s) ({	\
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 802b336f4b8c..c17cc9d857bc 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -33,9 +33,6 @@
 /* USER_HZ period (usecs): */
 unsigned long			tick_usec = USER_TICK_USEC;
 
-/* SHIFTED_HZ period (nsecs): */
-unsigned long			tick_nsec;
-
 static u64			tick_length;
 static u64			tick_length_base;
 
@@ -253,8 +250,8 @@ static inline int ntp_synced(void)
  */
 
 /*
- * Update (tick_length, tick_length_base, tick_nsec), based
- * on (tick_usec, ntp_tick_adj, time_freq):
+ * Update tick_length and tick_length_base, based on tick_usec, ntp_tick_adj and
+ * time_freq:
  */
 static void ntp_update_frequency(void)
 {
@@ -267,7 +264,6 @@ static void ntp_update_frequency(void)
 	second_length		+= ntp_tick_adj;
 	second_length		+= time_freq;
 
-	tick_nsec		 = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
 	new_base		 = div_u64(second_length, NTP_INTERVAL_FREQ);
 
 	/*
-- 
cgit v1.2.3


From 66606a93849bfe3cbe9f0b801b40f60b87c54e11 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 11 Sep 2024 15:17:38 +0200
Subject: ntp: Make tick_usec static

There are no users of tick_usec outside of the NTP core code. Therefore
make tick_usec static.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-2-2d52f4e13476@linutronix.de
---
 include/linux/timex.h | 7 -------
 kernel/time/ntp.c     | 5 ++++-
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 7f7a12fd8200..4ee32eff3f22 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -139,13 +139,6 @@ unsigned long random_get_entropy_fallback(void);
 #define MAXSEC 2048		/* max interval between updates (s) */
 #define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */
 
-/*
- * kernel variables
- * Note: maximum error = NTP sync distance = dispersion + delay / 2;
- * estimated error = NTP dispersion.
- */
-extern unsigned long tick_usec;		/* USER_HZ period (usec) */
-
 /* Required to safely shift negative values */
 #define shift_right(x, s) ({	\
 	__typeof__(x) __x = (x);	\
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c17cc9d857bc..ed15ec993a82 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,7 +31,7 @@
 
 
 /* USER_HZ period (usecs): */
-unsigned long			tick_usec = USER_TICK_USEC;
+static unsigned long		tick_usec = USER_TICK_USEC;
 
 static u64			tick_length;
 static u64			tick_length_base;
@@ -44,6 +44,9 @@ static u64			tick_length_base;
 
 /*
  * phase-lock loop variables
+ *
+ * Note: maximum error = NTP sync distance = dispersion + delay / 2;
+ * estimated error = NTP dispersion.
  */
 
 /*
-- 
cgit v1.2.3


From 8102c4daf44ab86c2d2226a8136bec905d6e2bd1 Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vdonnefort@google.com>
Date: Wed, 11 Sep 2024 10:30:20 +0100
Subject: timekeeping: Add the boot clock to system time snapshot

For tracing purpose, the boot clock is interesting as it doesn't stop on
suspend. Export it as part of the time snapshot. This will later allow
the hypervisor to add boot clock timestamps to its events.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20240911093029.3279154-5-vdonnefort@google.com
---
 include/linux/timekeeping.h | 2 ++
 kernel/time/timekeeping.c   | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index fc12a9ba2c88..e85c27347e44 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -275,6 +275,7 @@ struct ktime_timestamps {
  *				 counter value
  * @cycles:	Clocksource counter value to produce the system times
  * @real:	Realtime system time
+ * @boot:	Boot time
  * @raw:	Monotonic raw system time
  * @cs_id:	Clocksource ID
  * @clock_was_set_seq:	The sequence number of clock-was-set events
@@ -283,6 +284,7 @@ struct ktime_timestamps {
 struct system_time_snapshot {
 	u64			cycles;
 	ktime_t			real;
+	ktime_t			boot;
 	ktime_t			raw;
 	enum clocksource_ids	cs_id;
 	unsigned int		clock_was_set_seq;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e6f409bf311..47e44b9d2671 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1060,6 +1060,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
 	unsigned int seq;
 	ktime_t base_raw;
 	ktime_t base_real;
+	ktime_t base_boot;
 	u64 nsec_raw;
 	u64 nsec_real;
 	u64 now;
@@ -1074,6 +1075,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
 		systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
 		base_real = ktime_add(tk->tkr_mono.base,
 				      tk_core.timekeeper.offs_real);
+		base_boot = ktime_add(tk->tkr_mono.base,
+				      tk_core.timekeeper.offs_boot);
 		base_raw = tk->tkr_raw.base;
 		nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
 		nsec_raw  = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
@@ -1081,6 +1084,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
 
 	systime_snapshot->cycles = now;
 	systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
+	systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real);
 	systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
 }
 EXPORT_SYMBOL_GPL(ktime_get_snapshot);
-- 
cgit v1.2.3


From f69767a1ada3ac74be2e1ac0795a05e1d1384eff Mon Sep 17 00:00:00 2001
From: Wei Huang <wei.huang2@amd.com>
Date: Wed, 2 Oct 2024 11:59:50 -0500
Subject: PCI: Add TLP Processing Hints (TPH) support

Add support for PCIe TLP Processing Hints (TPH) support (see PCIe r6.2,
sec 6.17).

Add TPH register definitions in pci_regs.h, including the TPH Requester
capability register, TPH Requester control register, TPH Completer
capability, and the ST fields of MSI-X entry.

Introduce pcie_enable_tph() and pcie_disable_tph(), enabling drivers to
toggle TPH support and configure specific ST mode as needed. Also add a new
kernel parameter, "pci=notph", allowing users to disable TPH support across
the entire system.

Link: https://lore.kernel.org/r/20241002165954.128085-2-wei.huang2@amd.com
Co-developed-by: Jing Liu <jing2.liu@intel.com>
Co-developed-by: Paul Luse <paul.e.luse@linux.intel.com>
Co-developed-by: Eric Van Tassell <Eric.VanTassell@amd.com>
Signed-off-by: Jing Liu <jing2.liu@intel.com>
Signed-off-by: Paul Luse <paul.e.luse@linux.intel.com>
Signed-off-by: Eric Van Tassell <Eric.VanTassell@amd.com>
Signed-off-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Lukas Wunner <lukas@wunner.de>
---
 Documentation/admin-guide/kernel-parameters.txt |   4 +
 drivers/pci/Kconfig                             |   9 ++
 drivers/pci/Makefile                            |   1 +
 drivers/pci/pci.c                               |   4 +
 drivers/pci/pci.h                               |  12 ++
 drivers/pci/probe.c                             |   1 +
 drivers/pci/tph.c                               | 197 ++++++++++++++++++++++++
 include/linux/pci-tph.h                         |  21 +++
 include/linux/pci.h                             |   7 +
 include/uapi/linux/pci_regs.h                   |  37 ++++-
 10 files changed, 285 insertions(+), 8 deletions(-)
 create mode 100644 drivers/pci/tph.c
 create mode 100644 include/linux/pci-tph.h

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1518343bbe22..178995b07451 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4678,6 +4678,10 @@
 		nomio		[S390] Do not use MIO instructions.
 		norid		[S390] ignore the RID field and force use of
 				one PCI domain per PCI function
+		notph		[PCIE] If the PCIE_TPH kernel config parameter
+				is enabled, this kernel boot option can be used
+				to disable PCIe TLP Processing Hints support
+				system-wide.
 
 	pcie_aspm=	[PCIE] Forcibly enable or ignore PCIe Active State Power
 			Management.
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 0d94e4a967d8..2f270e4414b3 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -173,6 +173,15 @@ config PCI_PASID
 
 	  If unsure, say N.
 
+config PCIE_TPH
+	bool "TLP Processing Hints"
+	help
+	  This option adds support for PCIe TLP Processing Hints (TPH).
+	  TPH allows endpoint devices to provide optimization hints, such as
+	  desired caching behavior, for requests that target memory space.
+	  These hints, called Steering Tags, can empower the system hardware
+	  to optimize the utilization of platform resources.
+
 config PCI_P2PDMA
 	bool "PCI peer-to-peer transfer support"
 	depends on ZONE_DEVICE
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 374c5c06d92f..b2a100f2e24a 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_VGA_ARB)		+= vgaarb.o
 obj-$(CONFIG_PCI_DOE)		+= doe.o
 obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o
 obj-$(CONFIG_PCI_NPEM)		+= npem.o
+obj-$(CONFIG_PCIE_TPH)		+= tph.o
 
 # Endpoint library must be initialized before its users
 obj-$(CONFIG_PCI_ENDPOINT)	+= endpoint/
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7d85c04fbba2..89dafecc869b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1828,6 +1828,7 @@ int pci_save_state(struct pci_dev *dev)
 	pci_save_dpc_state(dev);
 	pci_save_aer_state(dev);
 	pci_save_ptm_state(dev);
+	pci_save_tph_state(dev);
 	return pci_save_vc_state(dev);
 }
 EXPORT_SYMBOL(pci_save_state);
@@ -1933,6 +1934,7 @@ void pci_restore_state(struct pci_dev *dev)
 	pci_restore_rebar_state(dev);
 	pci_restore_dpc_state(dev);
 	pci_restore_ptm_state(dev);
+	pci_restore_tph_state(dev);
 
 	pci_aer_clear_status(dev);
 	pci_restore_aer_state(dev);
@@ -6896,6 +6898,8 @@ static int __init pci_setup(char *str)
 				pci_no_domains();
 			} else if (!strncmp(str, "noari", 5)) {
 				pcie_ari_disabled = true;
+			} else if (!strncmp(str, "notph", 5)) {
+				pci_no_tph();
 			} else if (!strncmp(str, "cbiosize=", 9)) {
 				pci_cardbus_io_size = memparse(str + 9, &str);
 			} else if (!strncmp(str, "cbmemsize=", 10)) {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 14d00ce45bfa..d89fdbf04f36 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -597,6 +597,18 @@ static inline int pci_iov_bus_range(struct pci_bus *bus)
 
 #endif /* CONFIG_PCI_IOV */
 
+#ifdef CONFIG_PCIE_TPH
+void pci_restore_tph_state(struct pci_dev *dev);
+void pci_save_tph_state(struct pci_dev *dev);
+void pci_no_tph(void);
+void pci_tph_init(struct pci_dev *dev);
+#else
+static inline void pci_restore_tph_state(struct pci_dev *dev) { }
+static inline void pci_save_tph_state(struct pci_dev *dev) { }
+static inline void pci_no_tph(void) { }
+static inline void pci_tph_init(struct pci_dev *dev) { }
+#endif
+
 #ifdef CONFIG_PCIE_PTM
 void pci_ptm_init(struct pci_dev *dev);
 void pci_save_ptm_state(struct pci_dev *dev);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4f68414c3086..b086d53a9048 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2495,6 +2495,7 @@ static void pci_init_capabilities(struct pci_dev *dev)
 	pci_dpc_init(dev);		/* Downstream Port Containment */
 	pci_rcec_init(dev);		/* Root Complex Event Collector */
 	pci_doe_init(dev);		/* Data Object Exchange */
+	pci_tph_init(dev);		/* TLP Processing Hints */
 
 	pcie_report_downtraining(dev);
 	pci_init_reset_methods(dev);
diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c
new file mode 100644
index 000000000000..4d6317cbf8a6
--- /dev/null
+++ b/drivers/pci/tph.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * TPH (TLP Processing Hints) support
+ *
+ * Copyright (C) 2024 Advanced Micro Devices, Inc.
+ *     Eric Van Tassell <Eric.VanTassell@amd.com>
+ *     Wei Huang <wei.huang2@amd.com>
+ */
+#include <linux/pci.h>
+#include <linux/bitfield.h>
+#include <linux/pci-tph.h>
+
+#include "pci.h"
+
+/* System-wide TPH disabled */
+static bool pci_tph_disabled;
+
+static u8 get_st_modes(struct pci_dev *pdev)
+{
+	u32 reg;
+
+	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CAP, &reg);
+	reg &= PCI_TPH_CAP_ST_NS | PCI_TPH_CAP_ST_IV | PCI_TPH_CAP_ST_DS;
+
+	return reg;
+}
+
+/* Return device's Root Port completer capability */
+static u8 get_rp_completer_type(struct pci_dev *pdev)
+{
+	struct pci_dev *rp;
+	u32 reg;
+	int ret;
+
+	rp = pcie_find_root_port(pdev);
+	if (!rp)
+		return 0;
+
+	ret = pcie_capability_read_dword(rp, PCI_EXP_DEVCAP2, &reg);
+	if (ret)
+		return 0;
+
+	return FIELD_GET(PCI_EXP_DEVCAP2_TPH_COMP_MASK, reg);
+}
+
+/**
+ * pcie_disable_tph - Turn off TPH support for device
+ * @pdev: PCI device
+ *
+ * Return: none
+ */
+void pcie_disable_tph(struct pci_dev *pdev)
+{
+	if (!pdev->tph_cap)
+		return;
+
+	if (!pdev->tph_enabled)
+		return;
+
+	pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, 0);
+
+	pdev->tph_mode = 0;
+	pdev->tph_req_type = 0;
+	pdev->tph_enabled = 0;
+}
+EXPORT_SYMBOL(pcie_disable_tph);
+
+/**
+ * pcie_enable_tph - Enable TPH support for device using a specific ST mode
+ * @pdev: PCI device
+ * @mode: ST mode to enable. Current supported modes include:
+ *
+ *   - PCI_TPH_ST_NS_MODE: NO ST Mode
+ *   - PCI_TPH_ST_IV_MODE: Interrupt Vector Mode
+ *   - PCI_TPH_ST_DS_MODE: Device Specific Mode
+ *
+ * Check whether the mode is actually supported by the device before enabling
+ * and return an error if not. Additionally determine what types of requests,
+ * TPH or extended TPH, can be issued by the device based on its TPH requester
+ * capability and the Root Port's completer capability.
+ *
+ * Return: 0 on success, otherwise negative value (-errno)
+ */
+int pcie_enable_tph(struct pci_dev *pdev, int mode)
+{
+	u32 reg;
+	u8 dev_modes;
+	u8 rp_req_type;
+
+	/* Honor "notph" kernel parameter */
+	if (pci_tph_disabled)
+		return -EINVAL;
+
+	if (!pdev->tph_cap)
+		return -EINVAL;
+
+	if (pdev->tph_enabled)
+		return -EBUSY;
+
+	/* Sanitize and check ST mode compatibility */
+	mode &= PCI_TPH_CTRL_MODE_SEL_MASK;
+	dev_modes = get_st_modes(pdev);
+	if (!((1 << mode) & dev_modes))
+		return -EINVAL;
+
+	pdev->tph_mode = mode;
+
+	/* Get req_type supported by device and its Root Port */
+	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CAP, &reg);
+	if (FIELD_GET(PCI_TPH_CAP_EXT_TPH, reg))
+		pdev->tph_req_type = PCI_TPH_REQ_EXT_TPH;
+	else
+		pdev->tph_req_type = PCI_TPH_REQ_TPH_ONLY;
+
+	rp_req_type = get_rp_completer_type(pdev);
+
+	/* Final req_type is the smallest value of two */
+	pdev->tph_req_type = min(pdev->tph_req_type, rp_req_type);
+
+	if (pdev->tph_req_type == PCI_TPH_REQ_DISABLE)
+		return -EINVAL;
+
+	/* Write them into TPH control register */
+	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, &reg);
+
+	reg &= ~PCI_TPH_CTRL_MODE_SEL_MASK;
+	reg |= FIELD_PREP(PCI_TPH_CTRL_MODE_SEL_MASK, pdev->tph_mode);
+
+	reg &= ~PCI_TPH_CTRL_REQ_EN_MASK;
+	reg |= FIELD_PREP(PCI_TPH_CTRL_REQ_EN_MASK, pdev->tph_req_type);
+
+	pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, reg);
+
+	pdev->tph_enabled = 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(pcie_enable_tph);
+
+void pci_restore_tph_state(struct pci_dev *pdev)
+{
+	struct pci_cap_saved_state *save_state;
+	u32 *cap;
+
+	if (!pdev->tph_cap)
+		return;
+
+	if (!pdev->tph_enabled)
+		return;
+
+	save_state = pci_find_saved_ext_cap(pdev, PCI_EXT_CAP_ID_TPH);
+	if (!save_state)
+		return;
+
+	/* Restore control register and all ST entries */
+	cap = &save_state->cap.data[0];
+	pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, *cap++);
+}
+
+void pci_save_tph_state(struct pci_dev *pdev)
+{
+	struct pci_cap_saved_state *save_state;
+	u32 *cap;
+
+	if (!pdev->tph_cap)
+		return;
+
+	if (!pdev->tph_enabled)
+		return;
+
+	save_state = pci_find_saved_ext_cap(pdev, PCI_EXT_CAP_ID_TPH);
+	if (!save_state)
+		return;
+
+	/* Save control register */
+	cap = &save_state->cap.data[0];
+	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, cap++);
+}
+
+void pci_no_tph(void)
+{
+	pci_tph_disabled = true;
+
+	pr_info("PCIe TPH is disabled\n");
+}
+
+void pci_tph_init(struct pci_dev *pdev)
+{
+	u32 save_size;
+
+	pdev->tph_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_TPH);
+	if (!pdev->tph_cap)
+		return;
+
+	save_size = sizeof(u32);
+	pci_add_ext_cap_save_buffer(pdev, PCI_EXT_CAP_ID_TPH, save_size);
+}
diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h
new file mode 100644
index 000000000000..58654a334ffb
--- /dev/null
+++ b/include/linux/pci-tph.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * TPH (TLP Processing Hints)
+ *
+ * Copyright (C) 2024 Advanced Micro Devices, Inc.
+ *     Eric Van Tassell <Eric.VanTassell@amd.com>
+ *     Wei Huang <wei.huang2@amd.com>
+ */
+#ifndef LINUX_PCI_TPH_H
+#define LINUX_PCI_TPH_H
+
+#ifdef CONFIG_PCIE_TPH
+void pcie_disable_tph(struct pci_dev *pdev);
+int pcie_enable_tph(struct pci_dev *pdev, int mode);
+#else
+static inline void pcie_disable_tph(struct pci_dev *pdev) { }
+static inline int pcie_enable_tph(struct pci_dev *pdev, int mode)
+{ return -EINVAL; }
+#endif
+
+#endif /* LINUX_PCI_TPH_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 573b4c4c2be6..8351d76b6e12 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -434,6 +434,7 @@ struct pci_dev {
 	unsigned int	ats_enabled:1;		/* Address Translation Svc */
 	unsigned int	pasid_enabled:1;	/* Process Address Space ID */
 	unsigned int	pri_enabled:1;		/* Page Request Interface */
+	unsigned int	tph_enabled:1;		/* TLP Processing Hints */
 	unsigned int	is_managed:1;		/* Managed via devres */
 	unsigned int	is_msi_managed:1;	/* MSI release via devres installed */
 	unsigned int	needs_freset:1;		/* Requires fundamental reset */
@@ -534,6 +535,12 @@ struct pci_dev {
 
 	/* These methods index pci_reset_fn_methods[] */
 	u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */
+
+#ifdef CONFIG_PCIE_TPH
+	u16		tph_cap;	/* TPH capability offset */
+	u8		tph_mode;	/* TPH mode */
+	u8		tph_req_type;	/* TPH requester type */
+#endif
 };
 
 static inline struct pci_dev *pci_physfn(struct pci_dev *dev)
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 12323b3334a9..155dea741615 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -340,7 +340,8 @@
 #define PCI_MSIX_ENTRY_UPPER_ADDR	0x4  /* Message Upper Address */
 #define PCI_MSIX_ENTRY_DATA		0x8  /* Message Data */
 #define PCI_MSIX_ENTRY_VECTOR_CTRL	0xc  /* Vector Control */
-#define  PCI_MSIX_ENTRY_CTRL_MASKBIT	0x00000001
+#define  PCI_MSIX_ENTRY_CTRL_MASKBIT	0x00000001  /* Mask Bit */
+#define  PCI_MSIX_ENTRY_CTRL_ST		0xffff0000  /* Steering Tag */
 
 /* CompactPCI Hotswap Register */
 
@@ -659,6 +660,7 @@
 #define  PCI_EXP_DEVCAP2_ATOMIC_COMP64	0x00000100 /* 64b AtomicOp completion */
 #define  PCI_EXP_DEVCAP2_ATOMIC_COMP128	0x00000200 /* 128b AtomicOp completion */
 #define  PCI_EXP_DEVCAP2_LTR		0x00000800 /* Latency tolerance reporting */
+#define  PCI_EXP_DEVCAP2_TPH_COMP_MASK	0x00003000 /* TPH completer support */
 #define  PCI_EXP_DEVCAP2_OBFF_MASK	0x000c0000 /* OBFF support mechanism */
 #define  PCI_EXP_DEVCAP2_OBFF_MSG	0x00040000 /* New message signaling */
 #define  PCI_EXP_DEVCAP2_OBFF_WAKE	0x00080000 /* Re-use WAKE# for OBFF */
@@ -1023,15 +1025,34 @@
 #define  PCI_DPA_CAP_SUBSTATE_MASK	0x1F	/* # substates - 1 */
 #define PCI_DPA_BASE_SIZEOF	16	/* size with 0 substates */
 
+/* TPH Completer Support */
+#define PCI_EXP_DEVCAP2_TPH_COMP_NONE		0x0 /* None */
+#define PCI_EXP_DEVCAP2_TPH_COMP_TPH_ONLY	0x1 /* TPH only */
+#define PCI_EXP_DEVCAP2_TPH_COMP_EXT_TPH	0x3 /* TPH and Extended TPH */
+
 /* TPH Requester */
 #define PCI_TPH_CAP		4	/* capability register */
-#define  PCI_TPH_CAP_LOC_MASK	0x600	/* location mask */
-#define   PCI_TPH_LOC_NONE	0x000	/* no location */
-#define   PCI_TPH_LOC_CAP	0x200	/* in capability */
-#define   PCI_TPH_LOC_MSIX	0x400	/* in MSI-X */
-#define PCI_TPH_CAP_ST_MASK	0x07FF0000	/* ST table mask */
-#define PCI_TPH_CAP_ST_SHIFT	16	/* ST table shift */
-#define PCI_TPH_BASE_SIZEOF	0xc	/* size with no ST table */
+#define  PCI_TPH_CAP_ST_NS	0x00000001 /* No ST Mode Supported */
+#define  PCI_TPH_CAP_ST_IV	0x00000002 /* Interrupt Vector Mode Supported */
+#define  PCI_TPH_CAP_ST_DS	0x00000004 /* Device Specific Mode Supported */
+#define  PCI_TPH_CAP_EXT_TPH	0x00000100 /* Ext TPH Requester Supported */
+#define  PCI_TPH_CAP_LOC_MASK	0x00000600 /* ST Table Location */
+#define   PCI_TPH_LOC_NONE	0x00000000 /* Not present */
+#define   PCI_TPH_LOC_CAP	0x00000200 /* In capability */
+#define   PCI_TPH_LOC_MSIX	0x00000400 /* In MSI-X */
+#define  PCI_TPH_CAP_ST_MASK	0x07FF0000 /* ST Table Size */
+#define  PCI_TPH_CAP_ST_SHIFT	16	/* ST Table Size shift */
+#define PCI_TPH_BASE_SIZEOF	0xc	/* Size with no ST table */
+
+#define PCI_TPH_CTRL		8	/* control register */
+#define  PCI_TPH_CTRL_MODE_SEL_MASK	0x00000007 /* ST Mode Select */
+#define   PCI_TPH_ST_NS_MODE		0x0 /* No ST Mode */
+#define   PCI_TPH_ST_IV_MODE		0x1 /* Interrupt Vector Mode */
+#define   PCI_TPH_ST_DS_MODE		0x2 /* Device Specific Mode */
+#define  PCI_TPH_CTRL_REQ_EN_MASK	0x00000300 /* TPH Requester Enable */
+#define   PCI_TPH_REQ_DISABLE		0x0 /* No TPH requests allowed */
+#define   PCI_TPH_REQ_TPH_ONLY		0x1 /* TPH only requests allowed */
+#define   PCI_TPH_REQ_EXT_TPH		0x3 /* Extended TPH requests allowed */
 
 /* Downstream Port Containment */
 #define PCI_EXP_DPC_CAP			0x04	/* DPC Capability */
-- 
cgit v1.2.3


From d2e8a34876ce69b27f450eebfc550ab8e316f752 Mon Sep 17 00:00:00 2001
From: Wei Huang <wei.huang2@amd.com>
Date: Wed, 2 Oct 2024 11:59:51 -0500
Subject: PCI/TPH: Add Steering Tag support

Add pcie_tph_get_cpu_st() to allow a caller to retrieve Steering Tags for a
target memory associated with a specific CPU. The ST tag is retrieved by
invoking PCI ACPI "_DSM to Query Cache Locality TPH Features" method
(rev=0x7, func=0xF) of the device's Root Port device.

Add pcie_tph_set_st_entry() to update the device's Steering Tags. The tags
will be written into the device's MSI-X table or the ST table located in
the TPH Extended Capability space.

Co-developed-by: Eric Van Tassell <Eric.VanTassell@amd.com>
Link: https://lore.kernel.org/r/20241002165954.128085-3-wei.huang2@amd.com
Signed-off-by: Eric Van Tassell <Eric.VanTassell@amd.com>
Signed-off-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
---
 drivers/pci/tph.c       | 352 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/pci-tph.h |  23 ++++
 2 files changed, 374 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c
index 4d6317cbf8a6..1e604fbbda65 100644
--- a/drivers/pci/tph.c
+++ b/drivers/pci/tph.c
@@ -7,6 +7,8 @@
  *     Wei Huang <wei.huang2@amd.com>
  */
 #include <linux/pci.h>
+#include <linux/pci-acpi.h>
+#include <linux/msi.h>
 #include <linux/bitfield.h>
 #include <linux/pci-tph.h>
 
@@ -15,6 +17,134 @@
 /* System-wide TPH disabled */
 static bool pci_tph_disabled;
 
+#ifdef CONFIG_ACPI
+/*
+ * The st_info struct defines the Steering Tag (ST) info returned by the
+ * firmware PCI ACPI _DSM method (rev=0x7, func=0xF, "_DSM to Query Cache
+ * Locality TPH Features"), as specified in the approved ECN for PCI Firmware
+ * Spec and available at https://members.pcisig.com/wg/PCI-SIG/document/15470.
+ *
+ * @vm_st_valid:  8-bit ST for volatile memory is valid
+ * @vm_xst_valid: 16-bit extended ST for volatile memory is valid
+ * @vm_ph_ignore: 1 => PH was and will be ignored, 0 => PH should be supplied
+ * @vm_st:        8-bit ST for volatile mem
+ * @vm_xst:       16-bit extended ST for volatile mem
+ * @pm_st_valid:  8-bit ST for persistent memory is valid
+ * @pm_xst_valid: 16-bit extended ST for persistent memory is valid
+ * @pm_ph_ignore: 1 => PH was and will be ignored, 0 => PH should be supplied
+ * @pm_st:        8-bit ST for persistent mem
+ * @pm_xst:       16-bit extended ST for persistent mem
+ */
+union st_info {
+	struct {
+		u64 vm_st_valid : 1;
+		u64 vm_xst_valid : 1;
+		u64 vm_ph_ignore : 1;
+		u64 rsvd1 : 5;
+		u64 vm_st : 8;
+		u64 vm_xst : 16;
+		u64 pm_st_valid : 1;
+		u64 pm_xst_valid : 1;
+		u64 pm_ph_ignore : 1;
+		u64 rsvd2 : 5;
+		u64 pm_st : 8;
+		u64 pm_xst : 16;
+	};
+	u64 value;
+};
+
+static u16 tph_extract_tag(enum tph_mem_type mem_type, u8 req_type,
+			   union st_info *info)
+{
+	switch (req_type) {
+	case PCI_TPH_REQ_TPH_ONLY: /* 8-bit tag */
+		switch (mem_type) {
+		case TPH_MEM_TYPE_VM:
+			if (info->vm_st_valid)
+				return info->vm_st;
+			break;
+		case TPH_MEM_TYPE_PM:
+			if (info->pm_st_valid)
+				return info->pm_st;
+			break;
+		}
+		break;
+	case PCI_TPH_REQ_EXT_TPH: /* 16-bit tag */
+		switch (mem_type) {
+		case TPH_MEM_TYPE_VM:
+			if (info->vm_xst_valid)
+				return info->vm_xst;
+			break;
+		case TPH_MEM_TYPE_PM:
+			if (info->pm_xst_valid)
+				return info->pm_xst;
+			break;
+		}
+		break;
+	default:
+		return 0;
+	}
+
+	return 0;
+}
+
+#define TPH_ST_DSM_FUNC_INDEX	0xF
+static acpi_status tph_invoke_dsm(acpi_handle handle, u32 cpu_uid,
+				  union st_info *st_out)
+{
+	union acpi_object arg3[3], in_obj, *out_obj;
+
+	if (!acpi_check_dsm(handle, &pci_acpi_dsm_guid, 7,
+			    BIT(TPH_ST_DSM_FUNC_INDEX)))
+		return AE_ERROR;
+
+	/* DWORD: feature ID (0 for processor cache ST query) */
+	arg3[0].integer.type = ACPI_TYPE_INTEGER;
+	arg3[0].integer.value = 0;
+
+	/* DWORD: target UID */
+	arg3[1].integer.type = ACPI_TYPE_INTEGER;
+	arg3[1].integer.value = cpu_uid;
+
+	/* QWORD: properties, all 0's */
+	arg3[2].integer.type = ACPI_TYPE_INTEGER;
+	arg3[2].integer.value = 0;
+
+	in_obj.type = ACPI_TYPE_PACKAGE;
+	in_obj.package.count = ARRAY_SIZE(arg3);
+	in_obj.package.elements = arg3;
+
+	out_obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 7,
+				    TPH_ST_DSM_FUNC_INDEX, &in_obj);
+	if (!out_obj)
+		return AE_ERROR;
+
+	if (out_obj->type != ACPI_TYPE_BUFFER) {
+		ACPI_FREE(out_obj);
+		return AE_ERROR;
+	}
+
+	st_out->value = *((u64 *)(out_obj->buffer.pointer));
+
+	ACPI_FREE(out_obj);
+
+	return AE_OK;
+}
+#endif
+
+/* Update the TPH Requester Enable field of TPH Control Register */
+static void set_ctrl_reg_req_en(struct pci_dev *pdev, u8 req_type)
+{
+	u32 reg;
+
+	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, &reg);
+
+	reg &= ~PCI_TPH_CTRL_REQ_EN_MASK;
+	reg |= FIELD_PREP(PCI_TPH_CTRL_REQ_EN_MASK, req_type);
+
+	pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, reg);
+}
+
 static u8 get_st_modes(struct pci_dev *pdev)
 {
 	u32 reg;
@@ -25,6 +155,37 @@ static u8 get_st_modes(struct pci_dev *pdev)
 	return reg;
 }
 
+static u32 get_st_table_loc(struct pci_dev *pdev)
+{
+	u32 reg;
+
+	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CAP, &reg);
+
+	return FIELD_GET(PCI_TPH_CAP_LOC_MASK, reg);
+}
+
+/*
+ * Return the size of ST table. If ST table is not in TPH Requester Extended
+ * Capability space, return 0. Otherwise return the ST Table Size + 1.
+ */
+static u16 get_st_table_size(struct pci_dev *pdev)
+{
+	u32 reg;
+	u32 loc;
+
+	/* Check ST table location first */
+	loc = get_st_table_loc(pdev);
+
+	/* Convert loc to match with PCI_TPH_LOC_* defined in pci_regs.h */
+	loc = FIELD_PREP(PCI_TPH_CAP_LOC_MASK, loc);
+	if (loc != PCI_TPH_LOC_CAP)
+		return 0;
+
+	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CAP, &reg);
+
+	return FIELD_GET(PCI_TPH_CAP_ST_MASK, reg) + 1;
+}
+
 /* Return device's Root Port completer capability */
 static u8 get_rp_completer_type(struct pci_dev *pdev)
 {
@@ -43,6 +204,171 @@ static u8 get_rp_completer_type(struct pci_dev *pdev)
 	return FIELD_GET(PCI_EXP_DEVCAP2_TPH_COMP_MASK, reg);
 }
 
+/* Write ST to MSI-X vector control reg - Return 0 if OK, otherwise -errno */
+static int write_tag_to_msix(struct pci_dev *pdev, int msix_idx, u16 tag)
+{
+#ifdef CONFIG_PCI_MSI
+	struct msi_desc *msi_desc = NULL;
+	void __iomem *vec_ctrl;
+	u32 val;
+	int err = 0;
+
+	msi_lock_descs(&pdev->dev);
+
+	/* Find the msi_desc entry with matching msix_idx */
+	msi_for_each_desc(msi_desc, &pdev->dev, MSI_DESC_ASSOCIATED) {
+		if (msi_desc->msi_index == msix_idx)
+			break;
+	}
+
+	if (!msi_desc) {
+		err = -ENXIO;
+		goto err_out;
+	}
+
+	/* Get the vector control register (offset 0xc) pointed by msix_idx */
+	vec_ctrl = pdev->msix_base + msix_idx * PCI_MSIX_ENTRY_SIZE;
+	vec_ctrl += PCI_MSIX_ENTRY_VECTOR_CTRL;
+
+	val = readl(vec_ctrl);
+	val &= ~PCI_MSIX_ENTRY_CTRL_ST;
+	val |= FIELD_PREP(PCI_MSIX_ENTRY_CTRL_ST, tag);
+	writel(val, vec_ctrl);
+
+	/* Read back to flush the update */
+	val = readl(vec_ctrl);
+
+err_out:
+	msi_unlock_descs(&pdev->dev);
+	return err;
+#else
+	return -ENODEV;
+#endif
+}
+
+/* Write tag to ST table - Return 0 if OK, otherwise -errno */
+static int write_tag_to_st_table(struct pci_dev *pdev, int index, u16 tag)
+{
+	int st_table_size;
+	int offset;
+
+	/* Check if index is out of bound */
+	st_table_size = get_st_table_size(pdev);
+	if (index >= st_table_size)
+		return -ENXIO;
+
+	offset = pdev->tph_cap + PCI_TPH_BASE_SIZEOF + index * sizeof(u16);
+
+	return pci_write_config_word(pdev, offset, tag);
+}
+
+/**
+ * pcie_tph_get_cpu_st() - Retrieve Steering Tag for a target memory associated
+ * with a specific CPU
+ * @pdev: PCI device
+ * @mem_type: target memory type (volatile or persistent RAM)
+ * @cpu_uid: associated CPU id
+ * @tag: Steering Tag to be returned
+ *
+ * Return the Steering Tag for a target memory that is associated with a
+ * specific CPU as indicated by cpu_uid.
+ *
+ * Return: 0 if success, otherwise negative value (-errno)
+ */
+int pcie_tph_get_cpu_st(struct pci_dev *pdev, enum tph_mem_type mem_type,
+			unsigned int cpu_uid, u16 *tag)
+{
+#ifdef CONFIG_ACPI
+	struct pci_dev *rp;
+	acpi_handle rp_acpi_handle;
+	union st_info info;
+
+	rp = pcie_find_root_port(pdev);
+	if (!rp || !rp->bus || !rp->bus->bridge)
+		return -ENODEV;
+
+	rp_acpi_handle = ACPI_HANDLE(rp->bus->bridge);
+
+	if (tph_invoke_dsm(rp_acpi_handle, cpu_uid, &info) != AE_OK) {
+		*tag = 0;
+		return -EINVAL;
+	}
+
+	*tag = tph_extract_tag(mem_type, pdev->tph_req_type, &info);
+
+	pci_dbg(pdev, "get steering tag: mem_type=%s, cpu_uid=%d, tag=%#04x\n",
+		(mem_type == TPH_MEM_TYPE_VM) ? "volatile" : "persistent",
+		cpu_uid, *tag);
+
+	return 0;
+#else
+	return -ENODEV;
+#endif
+}
+EXPORT_SYMBOL(pcie_tph_get_cpu_st);
+
+/**
+ * pcie_tph_set_st_entry() - Set Steering Tag in the ST table entry
+ * @pdev: PCI device
+ * @index: ST table entry index
+ * @tag: Steering Tag to be written
+ *
+ * Figure out the proper location of ST table, either in the MSI-X table or
+ * in the TPH Extended Capability space, and write the Steering Tag into
+ * the ST entry pointed by index.
+ *
+ * Return: 0 if success, otherwise negative value (-errno)
+ */
+int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag)
+{
+	u32 loc;
+	int err = 0;
+
+	if (!pdev->tph_cap)
+		return -EINVAL;
+
+	if (!pdev->tph_enabled)
+		return -EINVAL;
+
+	/* No need to write tag if device is in "No ST Mode" */
+	if (pdev->tph_mode == PCI_TPH_ST_NS_MODE)
+		return 0;
+
+	/*
+	 * Disable TPH before updating ST to avoid potential instability as
+	 * cautioned in PCIe r6.2, sec 6.17.3, "ST Modes of Operation"
+	 */
+	set_ctrl_reg_req_en(pdev, PCI_TPH_REQ_DISABLE);
+
+	loc = get_st_table_loc(pdev);
+	/* Convert loc to match with PCI_TPH_LOC_* */
+	loc = FIELD_PREP(PCI_TPH_CAP_LOC_MASK, loc);
+
+	switch (loc) {
+	case PCI_TPH_LOC_MSIX:
+		err = write_tag_to_msix(pdev, index, tag);
+		break;
+	case PCI_TPH_LOC_CAP:
+		err = write_tag_to_st_table(pdev, index, tag);
+		break;
+	default:
+		err = -EINVAL;
+	}
+
+	if (err) {
+		pcie_disable_tph(pdev);
+		return err;
+	}
+
+	set_ctrl_reg_req_en(pdev, pdev->tph_mode);
+
+	pci_dbg(pdev, "set steering tag: %s table, index=%d, tag=%#04x\n",
+		(loc == PCI_TPH_LOC_MSIX) ? "MSI-X" : "ST", index, tag);
+
+	return 0;
+}
+EXPORT_SYMBOL(pcie_tph_set_st_entry);
+
 /**
  * pcie_disable_tph - Turn off TPH support for device
  * @pdev: PCI device
@@ -140,6 +466,8 @@ EXPORT_SYMBOL(pcie_enable_tph);
 void pci_restore_tph_state(struct pci_dev *pdev)
 {
 	struct pci_cap_saved_state *save_state;
+	int num_entries, i, offset;
+	u16 *st_entry;
 	u32 *cap;
 
 	if (!pdev->tph_cap)
@@ -155,11 +483,21 @@ void pci_restore_tph_state(struct pci_dev *pdev)
 	/* Restore control register and all ST entries */
 	cap = &save_state->cap.data[0];
 	pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, *cap++);
+	st_entry = (u16 *)cap;
+	offset = PCI_TPH_BASE_SIZEOF;
+	num_entries = get_st_table_size(pdev);
+	for (i = 0; i < num_entries; i++) {
+		pci_write_config_word(pdev, pdev->tph_cap + offset,
+				      *st_entry++);
+		offset += sizeof(u16);
+	}
 }
 
 void pci_save_tph_state(struct pci_dev *pdev)
 {
 	struct pci_cap_saved_state *save_state;
+	int num_entries, i, offset;
+	u16 *st_entry;
 	u32 *cap;
 
 	if (!pdev->tph_cap)
@@ -175,6 +513,16 @@ void pci_save_tph_state(struct pci_dev *pdev)
 	/* Save control register */
 	cap = &save_state->cap.data[0];
 	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, cap++);
+
+	/* Save all ST entries in extended capability structure */
+	st_entry = (u16 *)cap;
+	offset = PCI_TPH_BASE_SIZEOF;
+	num_entries = get_st_table_size(pdev);
+	for (i = 0; i < num_entries; i++) {
+		pci_read_config_word(pdev, pdev->tph_cap + offset,
+				     st_entry++);
+		offset += sizeof(u16);
+	}
 }
 
 void pci_no_tph(void)
@@ -186,12 +534,14 @@ void pci_no_tph(void)
 
 void pci_tph_init(struct pci_dev *pdev)
 {
+	int num_entries;
 	u32 save_size;
 
 	pdev->tph_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_TPH);
 	if (!pdev->tph_cap)
 		return;
 
-	save_size = sizeof(u32);
+	num_entries = get_st_table_size(pdev);
+	save_size = sizeof(u32) + num_entries * sizeof(u16);
 	pci_add_ext_cap_save_buffer(pdev, PCI_EXT_CAP_ID_TPH, save_size);
 }
diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h
index 58654a334ffb..c3e806c13d64 100644
--- a/include/linux/pci-tph.h
+++ b/include/linux/pci-tph.h
@@ -9,10 +9,33 @@
 #ifndef LINUX_PCI_TPH_H
 #define LINUX_PCI_TPH_H
 
+/*
+ * According to the ECN for PCI Firmware Spec, Steering Tag can be different
+ * depending on the memory type: Volatile Memory or Persistent Memory. When a
+ * caller query about a target's Steering Tag, it must provide the target's
+ * tph_mem_type. ECN link: https://members.pcisig.com/wg/PCI-SIG/document/15470.
+ */
+enum tph_mem_type {
+	TPH_MEM_TYPE_VM,	/* volatile memory */
+	TPH_MEM_TYPE_PM		/* persistent memory */
+};
+
 #ifdef CONFIG_PCIE_TPH
+int pcie_tph_set_st_entry(struct pci_dev *pdev,
+			  unsigned int index, u16 tag);
+int pcie_tph_get_cpu_st(struct pci_dev *dev,
+			enum tph_mem_type mem_type,
+			unsigned int cpu_uid, u16 *tag);
 void pcie_disable_tph(struct pci_dev *pdev);
 int pcie_enable_tph(struct pci_dev *pdev, int mode);
 #else
+static inline int pcie_tph_set_st_entry(struct pci_dev *pdev,
+					unsigned int index, u16 tag)
+{ return -EINVAL; }
+static inline int pcie_tph_get_cpu_st(struct pci_dev *dev,
+				      enum tph_mem_type mem_type,
+				      unsigned int cpu_uid, u16 *tag)
+{ return -EINVAL; }
 static inline void pcie_disable_tph(struct pci_dev *pdev) { }
 static inline int pcie_enable_tph(struct pci_dev *pdev, int mode)
 { return -EINVAL; }
-- 
cgit v1.2.3


From 277b339c4ba5c3d51f86892efd7bfbb012318ed8 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 1 Oct 2024 17:04:10 +0100
Subject: net: pcs: xpcs: move PCS reset to .pcs_pre_config()

Move the PCS reset to .pcs_pre_config() rather than at creation time,
which means we call the reset function with the interface that we're
actually going to be using to talk to the downstream device.

Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com> # sja1105
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: for them?
Link: https://patch.msgid.link/E1svfMA-005ZI3-Va@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/pcs/pcs-xpcs.c   | 39 +++++++++++++++++++++++++++++----------
 include/linux/pcs/pcs-xpcs.h |  1 +
 2 files changed, 30 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 82463f9d50c8..7c6c40ddf722 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -659,6 +659,30 @@ int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable)
 }
 EXPORT_SYMBOL_GPL(xpcs_config_eee);
 
+static void xpcs_pre_config(struct phylink_pcs *pcs, phy_interface_t interface)
+{
+	struct dw_xpcs *xpcs = phylink_pcs_to_xpcs(pcs);
+	const struct dw_xpcs_compat *compat;
+	int ret;
+
+	if (!xpcs->need_reset)
+		return;
+
+	compat = xpcs_find_compat(xpcs->desc, interface);
+	if (!compat) {
+		dev_err(&xpcs->mdiodev->dev, "unsupported interface %s\n",
+			phy_modes(interface));
+		return;
+	}
+
+	ret = xpcs_soft_reset(xpcs, compat);
+	if (ret)
+		dev_err(&xpcs->mdiodev->dev, "soft reset failed: %pe\n",
+			ERR_PTR(ret));
+
+	xpcs->need_reset = false;
+}
+
 static int xpcs_config_aneg_c37_sgmii(struct dw_xpcs *xpcs,
 				      unsigned int neg_mode)
 {
@@ -1365,6 +1389,7 @@ static const struct dw_xpcs_desc xpcs_desc_list[] = {
 
 static const struct phylink_pcs_ops xpcs_phylink_ops = {
 	.pcs_validate = xpcs_validate,
+	.pcs_pre_config = xpcs_pre_config,
 	.pcs_config = xpcs_config,
 	.pcs_get_state = xpcs_get_state,
 	.pcs_an_restart = xpcs_an_restart,
@@ -1460,18 +1485,12 @@ static int xpcs_init_id(struct dw_xpcs *xpcs)
 
 static int xpcs_init_iface(struct dw_xpcs *xpcs, phy_interface_t interface)
 {
-	const struct dw_xpcs_compat *compat;
-
-	compat = xpcs_find_compat(xpcs->desc, interface);
-	if (!compat)
-		return -EINVAL;
-
-	if (xpcs->info.pma == WX_TXGBE_XPCS_PMA_10G_ID) {
+	if (xpcs->info.pma == WX_TXGBE_XPCS_PMA_10G_ID)
 		xpcs->pcs.poll = false;
-		return 0;
-	}
+	else
+		xpcs->need_reset = true;
 
-	return xpcs_soft_reset(xpcs, compat);
+	return 0;
 }
 
 static struct dw_xpcs *xpcs_create(struct mdio_device *mdiodev,
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index b4a4eb6c8866..fd75d0605bb6 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -61,6 +61,7 @@ struct dw_xpcs {
 	struct clk_bulk_data clks[DW_XPCS_NUM_CLKS];
 	struct phylink_pcs pcs;
 	phy_interface_t interface;
+	bool need_reset;
 };
 
 int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface);
-- 
cgit v1.2.3


From bedea1539acb808c870ad85b16f8d8f0bef5e0bb Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 1 Oct 2024 17:04:26 +0100
Subject: net: pcs: xpcs: add xpcs_destroy_pcs() and xpcs_create_pcs_mdiodev()

Provide xpcs create/destroy functions that return and take a phylink_pcs
pointer instead of an xpcs pointer. This will be used by drivers that
have been converted to use phylink_pcs pointers internally, rather than
dw_xpcs pointers.

As xpcs_create_mdiodev() no longer makes use of its interface argument,
pass PHY_INTERFACE_MODE_NA into xpcs_create_mdiodev() until it is
removed later in the series.

Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1svfMQ-005ZIL-Bi@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/pcs/pcs-xpcs.c   | 18 ++++++++++++++++++
 include/linux/pcs/pcs-xpcs.h |  3 +++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 8765b01c0b5d..9b61f97222b9 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -1550,6 +1550,18 @@ struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr,
 }
 EXPORT_SYMBOL_GPL(xpcs_create_mdiodev);
 
+struct phylink_pcs *xpcs_create_pcs_mdiodev(struct mii_bus *bus, int addr)
+{
+	struct dw_xpcs *xpcs;
+
+	xpcs = xpcs_create_mdiodev(bus, addr, PHY_INTERFACE_MODE_NA);
+	if (IS_ERR(xpcs))
+		return ERR_CAST(xpcs);
+
+	return &xpcs->pcs;
+}
+EXPORT_SYMBOL_GPL(xpcs_create_pcs_mdiodev);
+
 /**
  * xpcs_create_fwnode() - Create a DW xPCS instance from @fwnode
  * @fwnode: fwnode handle poining to the DW XPCS device
@@ -1599,5 +1611,11 @@ void xpcs_destroy(struct dw_xpcs *xpcs)
 }
 EXPORT_SYMBOL_GPL(xpcs_destroy);
 
+void xpcs_destroy_pcs(struct phylink_pcs *pcs)
+{
+	xpcs_destroy(phylink_pcs_to_xpcs(pcs));
+}
+EXPORT_SYMBOL_GPL(xpcs_destroy_pcs);
+
 MODULE_DESCRIPTION("Synopsys DesignWare XPCS library");
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index fd75d0605bb6..a4e2243ce647 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -78,4 +78,7 @@ struct dw_xpcs *xpcs_create_fwnode(struct fwnode_handle *fwnode,
 				   phy_interface_t interface);
 void xpcs_destroy(struct dw_xpcs *xpcs);
 
+struct phylink_pcs *xpcs_create_pcs_mdiodev(struct mii_bus *bus, int addr);
+void xpcs_destroy_pcs(struct phylink_pcs *pcs);
+
 #endif /* __LINUX_PCS_XPCS_H */
-- 
cgit v1.2.3


From bf5a61645bb2d51be53da5b26f948045b571147c Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 1 Oct 2024 17:04:51 +0100
Subject: net: pcs: xpcs: drop interface argument from xpcs_create*()

The XPCS sub-driver no longer uses the "interface" argument to the
xpcs_create_mdiodev() and xpcs_create_fwnode() functions. Remove
this now unnecessary argument, updating the stmmac driver
appropriately.

Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1svfMp-005ZIp-UX@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c |  7 +++----
 drivers/net/pcs/pcs-xpcs.c                        | 10 +++-------
 include/linux/pcs/pcs-xpcs.h                      |  6 ++----
 3 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 03f90676b3ad..0c7d81ddd440 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -500,23 +500,22 @@ int stmmac_pcs_setup(struct net_device *ndev)
 	struct fwnode_handle *devnode, *pcsnode;
 	struct dw_xpcs *xpcs = NULL;
 	struct stmmac_priv *priv;
-	int addr, mode, ret;
+	int addr, ret;
 
 	priv = netdev_priv(ndev);
-	mode = priv->plat->phy_interface;
 	devnode = priv->plat->port_node;
 
 	if (priv->plat->pcs_init) {
 		ret = priv->plat->pcs_init(priv);
 	} else if (fwnode_property_present(devnode, "pcs-handle")) {
 		pcsnode = fwnode_find_reference(devnode, "pcs-handle", 0);
-		xpcs = xpcs_create_fwnode(pcsnode, mode);
+		xpcs = xpcs_create_fwnode(pcsnode);
 		fwnode_handle_put(pcsnode);
 		ret = PTR_ERR_OR_ZERO(xpcs);
 	} else if (priv->plat->mdio_bus_data &&
 		   priv->plat->mdio_bus_data->pcs_mask) {
 		addr = ffs(priv->plat->mdio_bus_data->pcs_mask) - 1;
-		xpcs = xpcs_create_mdiodev(priv->mii, addr, mode);
+		xpcs = xpcs_create_mdiodev(priv->mii, addr);
 		ret = PTR_ERR_OR_ZERO(xpcs);
 	} else {
 		return 0;
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 9b61f97222b9..f25e7afdfdf5 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -1520,14 +1520,12 @@ out_free_data:
  * xpcs_create_mdiodev() - create a DW xPCS instance with the MDIO @addr
  * @bus: pointer to the MDIO-bus descriptor for the device to be looked at
  * @addr: device MDIO-bus ID
- * @interface: requested PHY interface
  *
  * Return: a pointer to the DW XPCS handle if successful, otherwise -ENODEV if
  * the PCS device couldn't be found on the bus and other negative errno related
  * to the data allocation and MDIO-bus communications.
  */
-struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr,
-				    phy_interface_t interface)
+struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr)
 {
 	struct mdio_device *mdiodev;
 	struct dw_xpcs *xpcs;
@@ -1554,7 +1552,7 @@ struct phylink_pcs *xpcs_create_pcs_mdiodev(struct mii_bus *bus, int addr)
 {
 	struct dw_xpcs *xpcs;
 
-	xpcs = xpcs_create_mdiodev(bus, addr, PHY_INTERFACE_MODE_NA);
+	xpcs = xpcs_create_mdiodev(bus, addr);
 	if (IS_ERR(xpcs))
 		return ERR_CAST(xpcs);
 
@@ -1565,7 +1563,6 @@ EXPORT_SYMBOL_GPL(xpcs_create_pcs_mdiodev);
 /**
  * xpcs_create_fwnode() - Create a DW xPCS instance from @fwnode
  * @fwnode: fwnode handle poining to the DW XPCS device
- * @interface: requested PHY interface
  *
  * Return: a pointer to the DW XPCS handle if successful, otherwise -ENODEV if
  * the fwnode device is unavailable or the PCS device couldn't be found on the
@@ -1573,8 +1570,7 @@ EXPORT_SYMBOL_GPL(xpcs_create_pcs_mdiodev);
  * other negative errno related to the data allocations and MDIO-bus
  * communications.
  */
-struct dw_xpcs *xpcs_create_fwnode(struct fwnode_handle *fwnode,
-				   phy_interface_t interface)
+struct dw_xpcs *xpcs_create_fwnode(struct fwnode_handle *fwnode)
 {
 	struct mdio_device *mdiodev;
 	struct dw_xpcs *xpcs;
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index a4e2243ce647..758daabb76c7 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -72,10 +72,8 @@ int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
 void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces);
 int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns,
 		    int enable);
-struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr,
-				    phy_interface_t interface);
-struct dw_xpcs *xpcs_create_fwnode(struct fwnode_handle *fwnode,
-				   phy_interface_t interface);
+struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr);
+struct dw_xpcs *xpcs_create_fwnode(struct fwnode_handle *fwnode);
 void xpcs_destroy(struct dw_xpcs *xpcs);
 
 struct phylink_pcs *xpcs_create_pcs_mdiodev(struct mii_bus *bus, int addr);
-- 
cgit v1.2.3


From faefc9730d073e88a490e37b00e3e196b823abcb Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 1 Oct 2024 17:04:57 +0100
Subject: net: pcs: xpcs: make xpcs_do_config() and xpcs_link_up() internal

As nothing outside pcs-xpcs.c calls neither xpcs_do_config() nor
xpcs_link_up(), remove their exports and prototypes.

Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1svfMv-005ZIv-2M@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/pcs/pcs-xpcs.c   | 11 +++++------
 include/linux/pcs/pcs-xpcs.h |  4 ----
 2 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index f25e7afdfdf5..0a01c552f591 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -851,8 +851,9 @@ static int xpcs_config_2500basex(struct dw_xpcs *xpcs)
 	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL, ret);
 }
 
-int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
-		   const unsigned long *advertising, unsigned int neg_mode)
+static int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
+			  const unsigned long *advertising,
+			  unsigned int neg_mode)
 {
 	const struct dw_xpcs_compat *compat;
 	int ret;
@@ -905,7 +906,6 @@ int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(xpcs_do_config);
 
 static int xpcs_config(struct phylink_pcs *pcs, unsigned int neg_mode,
 		       phy_interface_t interface,
@@ -1207,8 +1207,8 @@ static void xpcs_link_up_1000basex(struct dw_xpcs *xpcs, unsigned int neg_mode,
 		pr_err("%s: xpcs_write returned %pe\n", __func__, ERR_PTR(ret));
 }
 
-void xpcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
-		  phy_interface_t interface, int speed, int duplex)
+static void xpcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
+			 phy_interface_t interface, int speed, int duplex)
 {
 	struct dw_xpcs *xpcs = phylink_pcs_to_xpcs(pcs);
 
@@ -1219,7 +1219,6 @@ void xpcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
 	if (interface == PHY_INTERFACE_MODE_1000BASEX)
 		return xpcs_link_up_1000basex(xpcs, neg_mode, speed, duplex);
 }
-EXPORT_SYMBOL_GPL(xpcs_link_up);
 
 static void xpcs_an_restart(struct phylink_pcs *pcs)
 {
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 758daabb76c7..abda475111d1 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -65,10 +65,6 @@ struct dw_xpcs {
 };
 
 int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface);
-void xpcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
-		  phy_interface_t interface, int speed, int duplex);
-int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
-		   const unsigned long *advertising, unsigned int neg_mode);
 void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces);
 int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns,
 		    int enable);
-- 
cgit v1.2.3


From ced20ea315fe8591093f19574ec32222c1ab71ba Mon Sep 17 00:00:00 2001
From: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Date: Tue, 24 Sep 2024 13:48:43 +0530
Subject: soundwire: amd: pass acp pci revision id as resource data

Pass ACP pci revision id as resource data and store it in amd SoundWire
manager private data structure. This field will be used to differentiate
ACP variants.

Signed-off-by: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Link: https://lore.kernel.org/r/20240924081846.1834612-2-Vijendar.Mukunda@amd.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/amd_init.c      | 1 +
 drivers/soundwire/amd_manager.c   | 1 +
 include/linux/soundwire/sdw_amd.h | 5 +++++
 3 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/soundwire/amd_init.c b/drivers/soundwire/amd_init.c
index db040f435059..4f6e356e6bd2 100644
--- a/drivers/soundwire/amd_init.c
+++ b/drivers/soundwire/amd_init.c
@@ -121,6 +121,7 @@ static struct sdw_amd_ctx *sdw_amd_probe_controller(struct sdw_amd_res *res)
 
 		sdw_pdata[index].instance = index;
 		sdw_pdata[index].acp_sdw_lock = res->acp_lock;
+		sdw_pdata[index].acp_rev = res->acp_rev;
 		pdevinfo[index].name = "amd_sdw_manager";
 		pdevinfo[index].id = index;
 		pdevinfo[index].parent = res->parent;
diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c
index 0d01849c3586..4a1966fb01f6 100644
--- a/drivers/soundwire/amd_manager.c
+++ b/drivers/soundwire/amd_manager.c
@@ -910,6 +910,7 @@ static int amd_sdw_manager_probe(struct platform_device *pdev)
 	amd_manager->mmio = amd_manager->acp_mmio +
 			    (amd_manager->instance * SDW_MANAGER_REG_OFFSET);
 	amd_manager->acp_sdw_lock = pdata->acp_sdw_lock;
+	amd_manager->acp_rev = pdata->acp_rev;
 	amd_manager->cols_index = sdw_find_col_index(AMD_SDW_DEFAULT_COLUMNS);
 	amd_manager->rows_index = sdw_find_row_index(AMD_SDW_DEFAULT_ROWS);
 	amd_manager->dev = dev;
diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h
index 28a4eb77717f..e0abc59d4748 100644
--- a/include/linux/soundwire/sdw_amd.h
+++ b/include/linux/soundwire/sdw_amd.h
@@ -30,6 +30,7 @@
 
 struct acp_sdw_pdata {
 	u16 instance;
+	u32 acp_rev;
 	/* mutex to protect acp common register access */
 	struct mutex *acp_sdw_lock;
 };
@@ -66,6 +67,7 @@ struct sdw_amd_dai_runtime {
  * @instance: SoundWire manager instance
  * @quirks: SoundWire manager quirks
  * @wake_en_mask: wake enable mask per SoundWire manager
+ * @acp_rev: acp pci device revision id
  * @clk_stopped: flag set to true when clock is stopped
  * @power_mode_mask: flag interprets amd SoundWire manager power mode
  * @dai_runtime_array: dai runtime array
@@ -94,6 +96,7 @@ struct amd_sdw_manager {
 	u32 quirks;
 	u32 wake_en_mask;
 	u32 power_mode_mask;
+	u32 acp_rev;
 	bool clk_stopped;
 
 	struct sdw_amd_dai_runtime **dai_runtime_array;
@@ -134,6 +137,7 @@ struct sdw_amd_ctx {
  * struct sdw_amd_res - Soundwire AMD global resource structure,
  * typically populated by the DSP driver/Legacy driver
  *
+ * @acp_rev: acp pci device revision id
  * @addr: acp pci device resource start address
  * @reg_range: ACP register range
  * @link_mask: bit-wise mask listing links selected by the DSP driver/
@@ -146,6 +150,7 @@ struct sdw_amd_ctx {
  * @acp_lock: mutex protecting acp common registers access
  */
 struct sdw_amd_res {
+	u32 acp_rev;
 	u32 addr;
 	u32 reg_range;
 	u32 link_mask;
-- 
cgit v1.2.3


From 7b54323dde29452dd06e6acd2701d9b489c9547d Mon Sep 17 00:00:00 2001
From: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Date: Tue, 24 Sep 2024 13:48:44 +0530
Subject: soundwire: amd: refactor existing code for acp 6.3 platform

Refactor existing code by adding acp pci revision id coditional checks
for ACP 6.3 platform. Rename the macros and structure names with ACP63
tag.

Signed-off-by: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Link: https://lore.kernel.org/r/20240924081846.1834612-3-Vijendar.Mukunda@amd.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/amd_manager.c   | 98 +++++++++++++++++++++++++--------------
 drivers/soundwire/amd_manager.h   | 16 +++----
 include/linux/soundwire/sdw_amd.h |  1 +
 3 files changed, 72 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c
index 4a1966fb01f6..5a4bfaef65fb 100644
--- a/drivers/soundwire/amd_manager.c
+++ b/drivers/soundwire/amd_manager.c
@@ -433,12 +433,18 @@ static int amd_sdw_port_params(struct sdw_bus *bus, struct sdw_port_params *p_pa
 	u32 frame_fmt_reg, dpn_frame_fmt;
 
 	dev_dbg(amd_manager->dev, "p_params->num:0x%x\n", p_params->num);
-	switch (amd_manager->instance) {
-	case ACP_SDW0:
-		frame_fmt_reg = sdw0_manager_dp_reg[p_params->num].frame_fmt_reg;
-		break;
-	case ACP_SDW1:
-		frame_fmt_reg = sdw1_manager_dp_reg[p_params->num].frame_fmt_reg;
+	switch (amd_manager->acp_rev) {
+	case ACP63_PCI_REV_ID:
+		switch (amd_manager->instance) {
+		case ACP_SDW0:
+			frame_fmt_reg = acp63_sdw0_dp_reg[p_params->num].frame_fmt_reg;
+			break;
+		case ACP_SDW1:
+			frame_fmt_reg = acp63_sdw1_dp_reg[p_params->num].frame_fmt_reg;
+			break;
+		default:
+			return -EINVAL;
+		}
 		break;
 	default:
 		return -EINVAL;
@@ -465,20 +471,28 @@ static int amd_sdw_transport_params(struct sdw_bus *bus,
 	u32 frame_fmt_reg, sample_int_reg, hctrl_dp0_reg;
 	u32 offset_reg, lane_ctrl_ch_en_reg;
 
-	switch (amd_manager->instance) {
-	case ACP_SDW0:
-		frame_fmt_reg = sdw0_manager_dp_reg[params->port_num].frame_fmt_reg;
-		sample_int_reg = sdw0_manager_dp_reg[params->port_num].sample_int_reg;
-		hctrl_dp0_reg = sdw0_manager_dp_reg[params->port_num].hctrl_dp0_reg;
-		offset_reg = sdw0_manager_dp_reg[params->port_num].offset_reg;
-		lane_ctrl_ch_en_reg = sdw0_manager_dp_reg[params->port_num].lane_ctrl_ch_en_reg;
-		break;
-	case ACP_SDW1:
-		frame_fmt_reg = sdw1_manager_dp_reg[params->port_num].frame_fmt_reg;
-		sample_int_reg = sdw1_manager_dp_reg[params->port_num].sample_int_reg;
-		hctrl_dp0_reg = sdw1_manager_dp_reg[params->port_num].hctrl_dp0_reg;
-		offset_reg = sdw1_manager_dp_reg[params->port_num].offset_reg;
-		lane_ctrl_ch_en_reg = sdw1_manager_dp_reg[params->port_num].lane_ctrl_ch_en_reg;
+	switch (amd_manager->acp_rev) {
+	case ACP63_PCI_REV_ID:
+		switch (amd_manager->instance) {
+		case ACP_SDW0:
+			frame_fmt_reg = acp63_sdw0_dp_reg[params->port_num].frame_fmt_reg;
+			sample_int_reg = acp63_sdw0_dp_reg[params->port_num].sample_int_reg;
+			hctrl_dp0_reg = acp63_sdw0_dp_reg[params->port_num].hctrl_dp0_reg;
+			offset_reg = acp63_sdw0_dp_reg[params->port_num].offset_reg;
+			lane_ctrl_ch_en_reg =
+					acp63_sdw0_dp_reg[params->port_num].lane_ctrl_ch_en_reg;
+			break;
+		case ACP_SDW1:
+			frame_fmt_reg = acp63_sdw1_dp_reg[params->port_num].frame_fmt_reg;
+			sample_int_reg = acp63_sdw1_dp_reg[params->port_num].sample_int_reg;
+			hctrl_dp0_reg = acp63_sdw1_dp_reg[params->port_num].hctrl_dp0_reg;
+			offset_reg = acp63_sdw1_dp_reg[params->port_num].offset_reg;
+			lane_ctrl_ch_en_reg =
+					acp63_sdw1_dp_reg[params->port_num].lane_ctrl_ch_en_reg;
+			break;
+		default:
+			return -EINVAL;
+		}
 		break;
 	default:
 		return -EINVAL;
@@ -520,12 +534,20 @@ static int amd_sdw_port_enable(struct sdw_bus *bus,
 	u32 dpn_ch_enable;
 	u32 lane_ctrl_ch_en_reg;
 
-	switch (amd_manager->instance) {
-	case ACP_SDW0:
-		lane_ctrl_ch_en_reg = sdw0_manager_dp_reg[enable_ch->port_num].lane_ctrl_ch_en_reg;
-		break;
-	case ACP_SDW1:
-		lane_ctrl_ch_en_reg = sdw1_manager_dp_reg[enable_ch->port_num].lane_ctrl_ch_en_reg;
+	switch (amd_manager->acp_rev) {
+	case ACP63_PCI_REV_ID:
+		switch (amd_manager->instance) {
+		case ACP_SDW0:
+			lane_ctrl_ch_en_reg =
+					acp63_sdw0_dp_reg[enable_ch->port_num].lane_ctrl_ch_en_reg;
+			break;
+		case ACP_SDW1:
+			lane_ctrl_ch_en_reg =
+					acp63_sdw1_dp_reg[enable_ch->port_num].lane_ctrl_ch_en_reg;
+			break;
+		default:
+			return -EINVAL;
+		}
 		break;
 	default:
 		return -EINVAL;
@@ -927,15 +949,21 @@ static int amd_sdw_manager_probe(struct platform_device *pdev)
 	 * information.
 	 */
 	amd_manager->bus.controller_id = 0;
-
-	switch (amd_manager->instance) {
-	case ACP_SDW0:
-		amd_manager->num_dout_ports = AMD_SDW0_MAX_TX_PORTS;
-		amd_manager->num_din_ports = AMD_SDW0_MAX_RX_PORTS;
-		break;
-	case ACP_SDW1:
-		amd_manager->num_dout_ports = AMD_SDW1_MAX_TX_PORTS;
-		amd_manager->num_din_ports = AMD_SDW1_MAX_RX_PORTS;
+	dev_dbg(dev, "acp_rev:0x%x\n", amd_manager->acp_rev);
+	switch (amd_manager->acp_rev) {
+	case ACP63_PCI_REV_ID:
+		switch (amd_manager->instance) {
+		case ACP_SDW0:
+			amd_manager->num_dout_ports = AMD_ACP63_SDW0_MAX_TX_PORTS;
+			amd_manager->num_din_ports = AMD_ACP63_SDW0_MAX_RX_PORTS;
+			break;
+		case ACP_SDW1:
+			amd_manager->num_dout_ports = AMD_ACP63_SDW1_MAX_TX_PORTS;
+			amd_manager->num_din_ports = AMD_ACP63_SDW1_MAX_RX_PORTS;
+			break;
+		default:
+			return -EINVAL;
+		}
 		break;
 	default:
 		return -EINVAL;
diff --git a/drivers/soundwire/amd_manager.h b/drivers/soundwire/amd_manager.h
index 707065468e05..cc2170e4521e 100644
--- a/drivers/soundwire/amd_manager.h
+++ b/drivers/soundwire/amd_manager.h
@@ -155,12 +155,12 @@
 #define AMD_SDW_IRQ_MASK_8TO11		0x000c7777
 #define AMD_SDW_IRQ_ERROR_MASK		0xff
 #define AMD_SDW_MAX_FREQ_NUM		1
-#define AMD_SDW0_MAX_TX_PORTS		3
-#define AMD_SDW0_MAX_RX_PORTS		3
-#define AMD_SDW1_MAX_TX_PORTS		1
-#define AMD_SDW1_MAX_RX_PORTS		1
-#define AMD_SDW0_MAX_DAI		6
-#define AMD_SDW1_MAX_DAI		2
+#define AMD_ACP63_SDW0_MAX_TX_PORTS		3
+#define AMD_ACP63_SDW0_MAX_RX_PORTS		3
+#define AMD_ACP63_SDW1_MAX_TX_PORTS		1
+#define AMD_ACP63_SDW1_MAX_RX_PORTS		1
+#define AMD_ACP63_SDW0_MAX_DAI		6
+#define AMD_ACP63_SDW1_MAX_DAI		2
 #define AMD_SDW_SLAVE_0_ATTACHED	5
 #define AMD_SDW_SSP_COUNTER_VAL		3
 
@@ -222,7 +222,7 @@ struct sdw_manager_dp_reg {
  * in SoundWire DMA driver.
  */
 
-static struct sdw_manager_dp_reg sdw0_manager_dp_reg[AMD_SDW0_MAX_DAI] =  {
+static struct sdw_manager_dp_reg acp63_sdw0_dp_reg[AMD_ACP63_SDW0_MAX_DAI] =  {
 	{ACP_SW_AUDIO0_TX_FRAME_FORMAT, ACP_SW_AUDIO0_TX_SAMPLEINTERVAL, ACP_SW_AUDIO0_TX_HCTRL_DP0,
 	 ACP_SW_AUDIO0_TX_OFFSET_DP0, ACP_SW_AUDIO0_TX_CHANNEL_ENABLE_DP0},
 	{ACP_SW_AUDIO1_TX_FRAME_FORMAT, ACP_SW_AUDIO1_TX_SAMPLEINTERVAL, ACP_SW_AUDIO1_TX_HCTRL,
@@ -237,7 +237,7 @@ static struct sdw_manager_dp_reg sdw0_manager_dp_reg[AMD_SDW0_MAX_DAI] =  {
 	 ACP_SW_AUDIO2_RX_OFFSET, ACP_SW_AUDIO2_RX_CHANNEL_ENABLE_DP0},
 };
 
-static struct sdw_manager_dp_reg sdw1_manager_dp_reg[AMD_SDW1_MAX_DAI] =  {
+static struct sdw_manager_dp_reg acp63_sdw1_dp_reg[AMD_ACP63_SDW1_MAX_DAI] =  {
 	{ACP_SW_AUDIO1_TX_FRAME_FORMAT, ACP_SW_AUDIO1_TX_SAMPLEINTERVAL, ACP_SW_AUDIO1_TX_HCTRL,
 	 ACP_SW_AUDIO1_TX_OFFSET, ACP_SW_AUDIO1_TX_CHANNEL_ENABLE_DP0},
 	{ACP_SW_AUDIO1_RX_FRAME_FORMAT, ACP_SW_AUDIO1_RX_SAMPLEINTERVAL, ACP_SW_AUDIO1_RX_HCTRL,
diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h
index e0abc59d4748..c9586f22c5a9 100644
--- a/include/linux/soundwire/sdw_amd.h
+++ b/include/linux/soundwire/sdw_amd.h
@@ -27,6 +27,7 @@
 #define ACP_SDW0	0
 #define ACP_SDW1	1
 #define AMD_SDW_MAX_MANAGER_COUNT	2
+#define ACP63_PCI_REV_ID		0x63
 
 struct acp_sdw_pdata {
 	u16 instance;
-- 
cgit v1.2.3


From 444d6824a4feca142b0a57095a2f1f1bda98e2ab Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Thu, 3 Oct 2024 15:06:38 +0800
Subject: soundwire: optimize sdw_stream_runtime memory layout

pahole suggestion: swap position of 'm_rt_count'

before: pahole -C sdw_stream_runtime drivers/soundwire/soundwire-bus.ko
struct sdw_stream_runtime {
	const char  *              name;                 /*     0     8 */
	struct sdw_stream_params   params;               /*     8    12 */
	enum sdw_stream_state      state;                /*    20     4 */
	enum sdw_stream_type       type;                 /*    24     4 */

	/* XXX 4 bytes hole, try to pack */

	struct list_head           master_list;          /*    32    16 */
	int                        m_rt_count;           /*    48     4 */

	/* size: 56, cachelines: 1, members: 6 */
	/* sum members: 48, holes: 1, sum holes: 4 */
	/* padding: 4 */
	/* last cacheline: 56 bytes */
};

after: pahole --reorganize -C sdw_stream_runtime drivers/soundwire/soundwire-bus.ko
struct sdw_stream_runtime {
	const char  *              name;                 /*     0     8 */
	struct sdw_stream_params   params;               /*     8    12 */
	enum sdw_stream_state      state;                /*    20     4 */
	enum sdw_stream_type       type;                 /*    24     4 */
	int                        m_rt_count;           /*    28     4 */
	struct list_head           master_list;          /*    32    16 */

	/* size: 48, cachelines: 1, members: 6 */
	/* last cacheline: 48 bytes */
};   /* saved 8 bytes! */

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20241003070650.62787-3-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 5e0dd47a0412..a4fa45132030 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -820,15 +820,15 @@ struct sdw_master_port_ops {
 struct sdw_msg;
 
 /**
- * struct sdw_defer - SDW deffered message
- * @length: message length
+ * struct sdw_defer - SDW deferred message
  * @complete: message completion
  * @msg: SDW message
+ * @length: message length
  */
 struct sdw_defer {
+	struct sdw_msg *msg;
 	int length;
 	struct completion complete;
-	struct sdw_msg *msg;
 };
 
 /**
@@ -1010,18 +1010,18 @@ struct sdw_stream_params {
  * @params: Stream parameters
  * @state: Current state of the stream
  * @type: Stream type PCM or PDM
+ * @m_rt_count: Count of Master runtime(s) in this stream
  * @master_list: List of Master runtime(s) in this stream.
  * master_list can contain only one m_rt per Master instance
  * for a stream
- * @m_rt_count: Count of Master runtime(s) in this stream
  */
 struct sdw_stream_runtime {
 	const char *name;
 	struct sdw_stream_params params;
 	enum sdw_stream_state state;
 	enum sdw_stream_type type;
-	struct list_head master_list;
 	int m_rt_count;
+	struct list_head master_list;
 };
 
 struct sdw_stream_runtime *sdw_alloc_stream(const char *stream_name);
-- 
cgit v1.2.3


From 6cb2c156439430a7f9db2e1f71a7dccf1ca978bf Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Thu, 3 Oct 2024 15:06:39 +0800
Subject: soundwire: optimize sdw_master_prop

Make pahole happy by moving pointers and u64 first instead of
interleaving them.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20241003070650.62787-4-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index a4fa45132030..2caea7345c3e 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -406,13 +406,14 @@ struct sdw_slave_prop {
 
 /**
  * struct sdw_master_prop - Master properties
+ * @clk_gears: Clock gears supported
+ * @clk_freq: Clock frequencies supported, in Hz
+ * @quirks: bitmask identifying optional behavior beyond the scope of the MIPI specification
  * @revision: MIPI spec version of the implementation
  * @clk_stop_modes: Bitmap, bit N set when clock-stop-modeN supported
  * @max_clk_freq: Maximum Bus clock frequency, in Hz
  * @num_clk_gears: Number of clock gears supported
- * @clk_gears: Clock gears supported
  * @num_clk_freq: Number of clock frequencies supported, in Hz
- * @clk_freq: Clock frequencies supported, in Hz
  * @default_frame_rate: Controller default Frame rate, in Hz
  * @default_row: Number of rows
  * @default_col: Number of columns
@@ -421,24 +422,23 @@ struct sdw_slave_prop {
  * command
  * @mclk_freq: clock reference passed to SoundWire Master, in Hz.
  * @hw_disabled: if true, the Master is not functional, typically due to pin-mux
- * @quirks: bitmask identifying optional behavior beyond the scope of the MIPI specification
  */
 struct sdw_master_prop {
+	u32 *clk_gears;
+	u32 *clk_freq;
+	u64 quirks;
 	u32 revision;
 	u32 clk_stop_modes;
 	u32 max_clk_freq;
 	u32 num_clk_gears;
-	u32 *clk_gears;
 	u32 num_clk_freq;
-	u32 *clk_freq;
 	u32 default_frame_rate;
 	u32 default_row;
 	u32 default_col;
-	bool dynamic_frame;
 	u32 err_threshold;
 	u32 mclk_freq;
+	bool dynamic_frame;
 	bool hw_disabled;
-	u64 quirks;
 };
 
 /* Definitions for Master quirks */
-- 
cgit v1.2.3


From 0a323dad1c4e04048988cd04c60eaffd6ae61b1a Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Thu, 3 Oct 2024 15:06:40 +0800
Subject: soundwire: optimize sdw_bus structure

The sdw_bus structure has seen multiple additions over the years. It's
one of the most used structures in this subsystem, so there's merit in
reshuffling the members a bit with 'pahole' to reduce holes and
structures across cache lines.

before:

struct sdw_bus {
	struct device *            dev;                  /*     0     8 */
	struct sdw_master_device * md;                   /*     8     8 */
	int                        controller_id;        /*    16     4 */
	unsigned int               link_id;              /*    20     4 */
	int                        id;                   /*    24     4 */

	/* XXX 4 bytes hole, try to pack */

	struct list_head           slaves;               /*    32    16 */
	long unsigned int          assigned[1];          /*    48     8 */
	struct mutex               bus_lock;             /*    56   160 */
	/* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */
	struct lock_class_key      bus_lock_key;         /*   216    16 */
	struct mutex               msg_lock;             /*   232   160 */
	/* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */
	struct lock_class_key      msg_lock_key;         /*   392    16 */
	int                        (*compute_params)(struct sdw_bus *); /*   408     8 */
	const struct sdw_master_ops  * ops;              /*   416     8 */
	const struct sdw_master_port_ops  * port_ops;    /*   424     8 */
	struct sdw_bus_params      params;               /*   432    36 */

	/* XXX 4 bytes hole, try to pack */

	/* --- cacheline 7 boundary (448 bytes) was 24 bytes ago --- */
	struct sdw_master_prop     prop;                 /*   472    72 */

	/* XXX last struct has 6 bytes of padding */

	/* --- cacheline 8 boundary (512 bytes) was 32 bytes ago --- */
	void *                     vendor_specific_prop; /*   544     8 */
	struct list_head           m_rt_list;            /*   552    16 */
	struct dentry *            debugfs;              /*   568     8 */
	/* --- cacheline 9 boundary (576 bytes) --- */
	struct irq_chip            irq_chip;             /*   576   264 */
	/* --- cacheline 13 boundary (832 bytes) was 8 bytes ago --- */
	struct irq_domain *        domain;               /*   840     8 */
	struct sdw_defer           defer_msg;            /*   848   112 */
	/* --- cacheline 15 boundary (960 bytes) --- */
	unsigned int               clk_stop_timeout;     /*   960     4 */
	u32                        bank_switch_timeout;  /*   964     4 */
	bool                       multi_link;           /*   968     1 */

	/* XXX 3 bytes hole, try to pack */

	int                        hw_sync_min_links;    /*   972     4 */
	int                        stream_refcount;      /*   976     4 */

	/* size: 984, cachelines: 16, members: 27 */
	/* sum members: 969, holes: 3, sum holes: 11 */
	/* padding: 4 */
	/* paddings: 1, sum paddings: 6 */
	/* last cacheline: 24 bytes */
};

after:

struct sdw_bus {
	struct device *            dev;                  /*     0     8 */
	struct sdw_master_device * md;                   /*     8     8 */
	struct lock_class_key      bus_lock_key;         /*    16    16 */
	struct mutex               bus_lock;             /*    32   160 */
	/* --- cacheline 3 boundary (192 bytes) --- */
	struct list_head           slaves;               /*   192    16 */
	struct lock_class_key      msg_lock_key;         /*   208    16 */
	struct mutex               msg_lock;             /*   224   160 */
	/* --- cacheline 6 boundary (384 bytes) --- */
	struct list_head           m_rt_list;            /*   384    16 */
	struct sdw_defer           defer_msg;            /*   400   112 */
	/* --- cacheline 8 boundary (512 bytes) --- */
	struct sdw_bus_params      params;               /*   512    36 */
	int                        stream_refcount;      /*   548     4 */
	const struct sdw_master_ops  * ops;              /*   552     8 */
	const struct sdw_master_port_ops  * port_ops;    /*   560     8 */
	struct sdw_master_prop     prop;                 /*   568    72 */

	/* XXX last struct has 6 bytes of padding */

	/* --- cacheline 10 boundary (640 bytes) --- */
	void *                     vendor_specific_prop; /*   640     8 */
	int                        hw_sync_min_links;    /*   648     4 */
	int                        controller_id;        /*   652     4 */
	unsigned int               link_id;              /*   656     4 */
	int                        id;                   /*   660     4 */
	int                        (*compute_params)(struct sdw_bus *); /*   664     8 */
	long unsigned int          assigned[1];          /*   672     8 */
	unsigned int               clk_stop_timeout;     /*   680     4 */
	u32                        bank_switch_timeout;  /*   684     4 */
	struct irq_chip            irq_chip;             /*   688   264 */
	/* --- cacheline 14 boundary (896 bytes) was 56 bytes ago --- */
	struct irq_domain *        domain;               /*   952     8 */
	/* --- cacheline 15 boundary (960 bytes) --- */
	struct dentry *            debugfs;              /*   960     8 */
	bool                       multi_link;           /*   968     1 */

	/* size: 976, cachelines: 16, members: 27 */
	/* padding: 7 */
	/* paddings: 1, sum paddings: 6 */
	/* last cacheline: 16 bytes */
};

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20241003070650.62787-5-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 77 ++++++++++++++++++++++---------------------
 1 file changed, 40 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 2caea7345c3e..6fcf122c1831 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -871,68 +871,71 @@ struct sdw_master_ops {
  * struct sdw_bus - SoundWire bus
  * @dev: Shortcut to &bus->md->dev to avoid changing the entire code.
  * @md: Master device
- * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used.
- * @link_id: Link id number, can be 0 to N, unique for each Controller
- * @id: bus system-wide unique id
- * @slaves: list of Slaves on this bus
- * @assigned: Bitmap for Slave device numbers.
- * Bit set implies used number, bit clear implies unused number.
+ * @bus_lock_key: bus lock key associated to @bus_lock
  * @bus_lock: bus lock
+ * @slaves: list of Slaves on this bus
+ * @msg_lock_key: message lock key associated to @msg_lock
  * @msg_lock: message lock
- * @compute_params: points to Bus resource management implementation
- * @ops: Master callback ops
- * @port_ops: Master port callback ops
- * @params: Current bus parameters
- * @prop: Master properties
- * @vendor_specific_prop: pointer to non-standard properties
  * @m_rt_list: List of Master instance of all stream(s) running on Bus. This
  * is used to compute and program bus bandwidth, clock, frame shape,
  * transport and port parameters
- * @debugfs: Bus debugfs
- * @domain: IRQ domain
  * @defer_msg: Defer message
- * @clk_stop_timeout: Clock stop timeout computed
- * @bank_switch_timeout: Bank switch timeout computed
- * @multi_link: Store bus property that indicates if multi links
- * are supported. This flag is populated by drivers after reading
- * appropriate firmware (ACPI/DT).
+ * @params: Current bus parameters
+ * @stream_refcount: number of streams currently using this bus
+ * @ops: Master callback ops
+ * @port_ops: Master port callback ops
+ * @prop: Master properties
+ * @vendor_specific_prop: pointer to non-standard properties
  * @hw_sync_min_links: Number of links used by a stream above which
  * hardware-based synchronization is required. This value is only
  * meaningful if multi_link is set. If set to 1, hardware-based
  * synchronization will be used even if a stream only uses a single
  * SoundWire segment.
- * @stream_refcount: number of streams currently using this bus
+ * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used.
+ * @link_id: Link id number, can be 0 to N, unique for each Controller
+ * @id: bus system-wide unique id
+ * @compute_params: points to Bus resource management implementation
+ * @assigned: Bitmap for Slave device numbers.
+ * Bit set implies used number, bit clear implies unused number.
+ * @clk_stop_timeout: Clock stop timeout computed
+ * @bank_switch_timeout: Bank switch timeout computed
+ * @domain: IRQ domain
+ * @irq_chip: IRQ chip
+ * @debugfs: Bus debugfs (optional)
+ * @multi_link: Store bus property that indicates if multi links
+ * are supported. This flag is populated by drivers after reading
+ * appropriate firmware (ACPI/DT).
  */
 struct sdw_bus {
 	struct device *dev;
 	struct sdw_master_device *md;
-	int controller_id;
-	unsigned int link_id;
-	int id;
-	struct list_head slaves;
-	DECLARE_BITMAP(assigned, SDW_MAX_DEVICES);
-	struct mutex bus_lock;
 	struct lock_class_key bus_lock_key;
-	struct mutex msg_lock;
+	struct mutex bus_lock;
+	struct list_head slaves;
 	struct lock_class_key msg_lock_key;
-	int (*compute_params)(struct sdw_bus *bus);
+	struct mutex msg_lock;
+	struct list_head m_rt_list;
+	struct sdw_defer defer_msg;
+	struct sdw_bus_params params;
+	int stream_refcount;
 	const struct sdw_master_ops *ops;
 	const struct sdw_master_port_ops *port_ops;
-	struct sdw_bus_params params;
 	struct sdw_master_prop prop;
 	void *vendor_specific_prop;
-	struct list_head m_rt_list;
+	int hw_sync_min_links;
+	int controller_id;
+	unsigned int link_id;
+	int id;
+	int (*compute_params)(struct sdw_bus *bus);
+	DECLARE_BITMAP(assigned, SDW_MAX_DEVICES);
+	unsigned int clk_stop_timeout;
+	u32 bank_switch_timeout;
+	struct irq_chip irq_chip;
+	struct irq_domain *domain;
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debugfs;
 #endif
-	struct irq_chip irq_chip;
-	struct irq_domain *domain;
-	struct sdw_defer defer_msg;
-	unsigned int clk_stop_timeout;
-	u32 bank_switch_timeout;
 	bool multi_link;
-	int hw_sync_min_links;
-	int stream_refcount;
 };
 
 int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent,
-- 
cgit v1.2.3


From 1c758df5a83ea0c9b5055536336d8a586b5010b0 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Thu, 3 Oct 2024 15:06:41 +0800
Subject: soundwire: optimize sdw_slave_prop

move pointers first, and move booleans together.

before:

struct sdw_slave_prop {
	u32                        mipi_revision;        /*     0     4 */
	bool                       wake_capable;         /*     4     1 */
	bool                       test_mode_capable;    /*     5     1 */
	bool                       clk_stop_mode1;       /*     6     1 */
	bool                       simple_clk_stop_capable; /*     7     1 */
	u32                        clk_stop_timeout;     /*     8     4 */
	u32                        ch_prep_timeout;      /*    12     4 */
	enum sdw_clk_stop_reset_behave reset_behave;     /*    16     4 */
	bool                       high_PHY_capable;     /*    20     1 */
	bool                       paging_support;       /*    21     1 */
	bool                       bank_delay_support;   /*    22     1 */

	/* XXX 1 byte hole, try to pack */

	enum sdw_p15_behave        p15_behave;           /*    24     4 */
	bool                       lane_control_support; /*    28     1 */

	/* XXX 3 bytes hole, try to pack */

	u32                        master_count;         /*    32     4 */
	u32                        source_ports;         /*    36     4 */
	u32                        sink_ports;           /*    40     4 */

	/* XXX 4 bytes hole, try to pack */

	struct sdw_dp0_prop *      dp0_prop;             /*    48     8 */
	struct sdw_dpn_prop *      src_dpn_prop;         /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct sdw_dpn_prop *      sink_dpn_prop;        /*    64     8 */
	u8                         scp_int1_mask;        /*    72     1 */

	/* XXX 3 bytes hole, try to pack */

	u32                        quirks;               /*    76     4 */
	bool                       clock_reg_supported;  /*    80     1 */
	bool                       use_domain_irq;       /*    81     1 */

	/* size: 88, cachelines: 2, members: 23 */
	/* sum members: 71, holes: 4, sum holes: 11 */
	/* padding: 6 */
	/* last cacheline: 24 bytes */
};

after:

truct sdw_slave_prop {
	struct sdw_dp0_prop *      dp0_prop;             /*     0     8 */
	struct sdw_dpn_prop *      src_dpn_prop;         /*     8     8 */
	struct sdw_dpn_prop *      sink_dpn_prop;        /*    16     8 */
	u32                        mipi_revision;        /*    24     4 */
	bool                       wake_capable;         /*    28     1 */
	bool                       test_mode_capable;    /*    29     1 */
	bool                       clk_stop_mode1;       /*    30     1 */
	bool                       simple_clk_stop_capable; /*    31     1 */
	u32                        clk_stop_timeout;     /*    32     4 */
	u32                        ch_prep_timeout;      /*    36     4 */
	enum sdw_clk_stop_reset_behave reset_behave;     /*    40     4 */
	bool                       high_PHY_capable;     /*    44     1 */
	bool                       paging_support;       /*    45     1 */
	bool                       bank_delay_support;   /*    46     1 */
	bool                       lane_control_support; /*    47     1 */
	enum sdw_p15_behave        p15_behave;           /*    48     4 */
	u32                        master_count;         /*    52     4 */
	u32                        source_ports;         /*    56     4 */
	u32                        sink_ports;           /*    60     4 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	u32                        quirks;               /*    64     4 */
	u8                         scp_int1_mask;        /*    68     1 */
	bool                       clock_reg_supported;  /*    69     1 */
	bool                       use_domain_irq;       /*    70     1 */

	/* size: 72, cachelines: 2, members: 23 */
	/* padding: 1 */
	/* last cacheline: 8 bytes */
};

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20241003070650.62787-6-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 6fcf122c1831..38db81f5bdb9 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -344,6 +344,9 @@ struct sdw_dpn_prop {
 
 /**
  * struct sdw_slave_prop - SoundWire Slave properties
+ * @dp0_prop: Data Port 0 properties
+ * @src_dpn_prop: Source Data Port N properties
+ * @sink_dpn_prop: Sink Data Port N properties
  * @mipi_revision: Spec version of the implementation
  * @wake_capable: Wake-up events are supported
  * @test_mode_capable: If test mode is supported
@@ -360,15 +363,12 @@ struct sdw_dpn_prop {
  * SCP_AddrPage2
  * @bank_delay_support: Slave implements bank delay/bridge support registers
  * SCP_BankDelay and SCP_NextFrame
+ * @lane_control_support: Slave supports lane control
  * @p15_behave: Slave behavior when the Master attempts a read to the Port15
  * alias
- * @lane_control_support: Slave supports lane control
  * @master_count: Number of Masters present on this Slave
  * @source_ports: Bitmap identifying source ports
  * @sink_ports: Bitmap identifying sink ports
- * @dp0_prop: Data Port 0 properties
- * @src_dpn_prop: Source Data Port N properties
- * @sink_dpn_prop: Sink Data Port N properties
  * @scp_int1_mask: SCP_INT1_MASK desired settings
  * @quirks: bitmask identifying deltas from the MIPI specification
  * @clock_reg_supported: the Peripheral implements the clock base and scale
@@ -377,6 +377,9 @@ struct sdw_dpn_prop {
  * @use_domain_irq: call actual IRQ handler on slave, as well as callback
  */
 struct sdw_slave_prop {
+	struct sdw_dp0_prop *dp0_prop;
+	struct sdw_dpn_prop *src_dpn_prop;
+	struct sdw_dpn_prop *sink_dpn_prop;
 	u32 mipi_revision;
 	bool wake_capable;
 	bool test_mode_capable;
@@ -388,16 +391,13 @@ struct sdw_slave_prop {
 	bool high_PHY_capable;
 	bool paging_support;
 	bool bank_delay_support;
-	enum sdw_p15_behave p15_behave;
 	bool lane_control_support;
+	enum sdw_p15_behave p15_behave;
 	u32 master_count;
 	u32 source_ports;
 	u32 sink_ports;
-	struct sdw_dp0_prop *dp0_prop;
-	struct sdw_dpn_prop *src_dpn_prop;
-	struct sdw_dpn_prop *sink_dpn_prop;
-	u8 scp_int1_mask;
 	u32 quirks;
+	u8 scp_int1_mask;
 	bool clock_reg_supported;
 	bool use_domain_irq;
 };
-- 
cgit v1.2.3


From 557e28f8b53243097162cf4d3e59bcee9fb9713b Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Thu, 3 Oct 2024 15:06:42 +0800
Subject: soundwire: optimize sdw_dp0_prop

Move pointers and booleans.

Before:

struct sdw_dp0_prop {
	u32                        max_word;             /*     0     4 */
	u32                        min_word;             /*     4     4 */
	u32                        num_words;            /*     8     4 */

	/* XXX 4 bytes hole, try to pack */

	u32 *                      words;                /*    16     8 */
	bool                       BRA_flow_controlled;  /*    24     1 */
	bool                       simple_ch_prep_sm;    /*    25     1 */

	/* XXX 2 bytes hole, try to pack */

	u32                        ch_prep_timeout;      /*    28     4 */
	bool                       imp_def_interrupts;   /*    32     1 */

	/* size: 40, cachelines: 1, members: 8 */
	/* sum members: 27, holes: 2, sum holes: 6 */
	/* padding: 7 */
	/* last cacheline: 40 bytes */
};

after:

struct sdw_dp0_prop {
	u32 *                      words;                /*     0     8 */
	u32                        max_word;             /*     8     4 */
	u32                        min_word;             /*    12     4 */
	u32                        num_words;            /*    16     4 */
	u32                        ch_prep_timeout;      /*    20     4 */
	bool                       BRA_flow_controlled;  /*    24     1 */
	bool                       simple_ch_prep_sm;    /*    25     1 */
	bool                       imp_def_interrupts;   /*    26     1 */

	/* size: 32, cachelines: 1, members: 8 */
	/* padding: 5 */
	/* last cacheline: 32 bytes */
};

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20241003070650.62787-7-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 38db81f5bdb9..c72095137a35 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -226,16 +226,16 @@ enum sdw_clk_stop_mode {
 
 /**
  * struct sdw_dp0_prop - DP0 properties
+ * @words: wordlengths supported
  * @max_word: Maximum number of bits in a Payload Channel Sample, 1 to 64
  * (inclusive)
  * @min_word: Minimum number of bits in a Payload Channel Sample, 1 to 64
  * (inclusive)
  * @num_words: number of wordlengths supported
- * @words: wordlengths supported
+ * @ch_prep_timeout: Port-specific timeout value, in milliseconds
  * @BRA_flow_controlled: Slave implementation results in an OK_NotReady
  * response
  * @simple_ch_prep_sm: If channel prepare sequence is required
- * @ch_prep_timeout: Port-specific timeout value, in milliseconds
  * @imp_def_interrupts: If set, each bit corresponds to support for
  * implementation-defined interrupts
  *
@@ -244,13 +244,13 @@ enum sdw_clk_stop_mode {
  * support
  */
 struct sdw_dp0_prop {
+	u32 *words;
 	u32 max_word;
 	u32 min_word;
 	u32 num_words;
-	u32 *words;
+	u32 ch_prep_timeout;
 	bool BRA_flow_controlled;
 	bool simple_ch_prep_sm;
-	u32 ch_prep_timeout;
 	bool imp_def_interrupts;
 };
 
-- 
cgit v1.2.3


From 9942f90bdcc035eb5f01d7343dac99bd805ef3ec Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Thu, 3 Oct 2024 15:06:43 +0800
Subject: soundwire: optimize sdw_dpn_prop

before:
struct sdw_dpn_prop {
	u32                        num;                  /*     0     4 */
	u32                        max_word;             /*     4     4 */
	u32                        min_word;             /*     8     4 */
	u32                        num_words;            /*    12     4 */
	u32 *                      words;                /*    16     8 */
	enum sdw_dpn_type          type;                 /*    24     4 */
	u32                        max_grouping;         /*    28     4 */
	bool                       simple_ch_prep_sm;    /*    32     1 */

	/* XXX 3 bytes hole, try to pack */

	u32                        ch_prep_timeout;      /*    36     4 */
	u32                        imp_def_interrupts;   /*    40     4 */
	u32                        max_ch;               /*    44     4 */
	u32                        min_ch;               /*    48     4 */
	u32                        num_channels;         /*    52     4 */
	u32 *                      channels;             /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	u32                        num_ch_combinations;  /*    64     4 */

	/* XXX 4 bytes hole, try to pack */

	u32 *                      ch_combinations;      /*    72     8 */
	u32                        modes;                /*    80     4 */
	u32                        max_async_buffer;     /*    84     4 */
	bool                       block_pack_mode;      /*    88     1 */
	bool                       read_only_wordlength; /*    89     1 */

	/* XXX 2 bytes hole, try to pack */

	u32                        port_encoding;        /*    92     4 */
	struct sdw_dpn_audio_mode * audio_modes;         /*    96     8 */

	/* size: 104, cachelines: 2, members: 22 */
	/* sum members: 95, holes: 3, sum holes: 9 */
	/* last cacheline: 40 bytes */
};

after:

struct sdw_dpn_prop {
	struct sdw_dpn_audio_mode * audio_modes;         /*     0     8 */
	u32                        num;                  /*     8     4 */
	u32                        max_word;             /*    12     4 */
	u32                        min_word;             /*    16     4 */
	u32                        num_words;            /*    20     4 */
	u32 *                      words;                /*    24     8 */
	enum sdw_dpn_type          type;                 /*    32     4 */
	u32                        max_grouping;         /*    36     4 */
	u32                        ch_prep_timeout;      /*    40     4 */
	u32                        imp_def_interrupts;   /*    44     4 */
	u32                        max_ch;               /*    48     4 */
	u32                        min_ch;               /*    52     4 */
	u32                        num_channels;         /*    56     4 */
	u32                        num_ch_combinations;  /*    60     4 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	u32 *                      channels;             /*    64     8 */
	u32 *                      ch_combinations;      /*    72     8 */
	u32                        modes;                /*    80     4 */
	u32                        max_async_buffer;     /*    84     4 */
	u32                        port_encoding;        /*    88     4 */
	bool                       block_pack_mode;      /*    92     1 */
	bool                       read_only_wordlength; /*    93     1 */
	bool                       simple_ch_prep_sm;    /*    94     1 */

	/* size: 96, cachelines: 2, members: 22 */
	/* padding: 1 */
	/* last cacheline: 32 bytes */
};

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20241003070650.62787-8-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index c72095137a35..cc0afb8af333 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -288,6 +288,7 @@ struct sdw_dpn_audio_mode {
 
 /**
  * struct sdw_dpn_prop - Data Port DPn properties
+ * @audio_modes: Audio modes supported
  * @num: port number
  * @max_word: Maximum number of bits in a Payload Channel Sample, 1 to 64
  * (inclusive)
@@ -298,26 +299,26 @@ struct sdw_dpn_audio_mode {
  * @type: Data port type. Full, Simplified or Reduced
  * @max_grouping: Maximum number of samples that can be grouped together for
  * a full data port
- * @simple_ch_prep_sm: If the port supports simplified channel prepare state
- * machine
  * @ch_prep_timeout: Port-specific timeout value, in milliseconds
  * @imp_def_interrupts: If set, each bit corresponds to support for
  * implementation-defined interrupts
  * @max_ch: Maximum channels supported
  * @min_ch: Minimum channels supported
  * @num_channels: Number of discrete channels supported
- * @channels: Discrete channels supported
  * @num_ch_combinations: Number of channel combinations supported
+ * @channels: Discrete channels supported
  * @ch_combinations: Channel combinations supported
  * @modes: SDW mode supported
  * @max_async_buffer: Number of samples that this port can buffer in
  * asynchronous modes
+ * @port_encoding: Payload Channel Sample encoding schemes supported
  * @block_pack_mode: Type of block port mode supported
  * @read_only_wordlength: Read Only wordlength field in DPN_BlockCtrl1 register
- * @port_encoding: Payload Channel Sample encoding schemes supported
- * @audio_modes: Audio modes supported
+ * @simple_ch_prep_sm: If the port supports simplified channel prepare state
+ * machine
  */
 struct sdw_dpn_prop {
+	struct sdw_dpn_audio_mode *audio_modes;
 	u32 num;
 	u32 max_word;
 	u32 min_word;
@@ -325,21 +326,20 @@ struct sdw_dpn_prop {
 	u32 *words;
 	enum sdw_dpn_type type;
 	u32 max_grouping;
-	bool simple_ch_prep_sm;
 	u32 ch_prep_timeout;
 	u32 imp_def_interrupts;
 	u32 max_ch;
 	u32 min_ch;
 	u32 num_channels;
-	u32 *channels;
 	u32 num_ch_combinations;
+	u32 *channels;
 	u32 *ch_combinations;
 	u32 modes;
 	u32 max_async_buffer;
+	u32 port_encoding;
 	bool block_pack_mode;
 	bool read_only_wordlength;
-	u32 port_encoding;
-	struct sdw_dpn_audio_mode *audio_modes;
+	bool simple_ch_prep_sm;
 };
 
 /**
-- 
cgit v1.2.3


From 1ae4aa59d79399be0591c8d78c44e280406e2c34 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Thu, 3 Oct 2024 15:06:44 +0800
Subject: soundwire: mipi-disco: remove DPn audio-modes

The concept of DPn audio-modes was never used by anyone, and was
removed from the DisCo for SoundWire 2.0 specification.

Remove the definitions and TODO.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20241003070650.62787-9-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/mipi_disco.c |  2 --
 include/linux/soundwire/sdw.h  | 34 ----------------------------------
 2 files changed, 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/mipi_disco.c b/drivers/soundwire/mipi_disco.c
index fdab3d4a1379..79cf8212f97a 100644
--- a/drivers/soundwire/mipi_disco.c
+++ b/drivers/soundwire/mipi_disco.c
@@ -304,8 +304,6 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave,
 		fwnode_property_read_u32(node, "mipi-sdw-port-encoding-type",
 					 &dpn[i].port_encoding);
 
-		/* TODO: Read audio mode */
-
 		fwnode_handle_put(node);
 
 		i++;
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index cc0afb8af333..66feaa79ecfc 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -254,41 +254,8 @@ struct sdw_dp0_prop {
 	bool imp_def_interrupts;
 };
 
-/**
- * struct sdw_dpn_audio_mode - Audio mode properties for DPn
- * @bus_min_freq: Minimum bus frequency, in Hz
- * @bus_max_freq: Maximum bus frequency, in Hz
- * @bus_num_freq: Number of discrete frequencies supported
- * @bus_freq: Discrete bus frequencies, in Hz
- * @min_freq: Minimum sampling frequency, in Hz
- * @max_freq: Maximum sampling bus frequency, in Hz
- * @num_freq: Number of discrete sampling frequency supported
- * @freq: Discrete sampling frequencies, in Hz
- * @prep_ch_behave: Specifies the dependencies between Channel Prepare
- * sequence and bus clock configuration
- * If 0, Channel Prepare can happen at any Bus clock rate
- * If 1, Channel Prepare sequence shall happen only after Bus clock is
- * changed to a frequency supported by this mode or compatible modes
- * described by the next field
- * @glitchless: Bitmap describing possible glitchless transitions from this
- * Audio Mode to other Audio Modes
- */
-struct sdw_dpn_audio_mode {
-	u32 bus_min_freq;
-	u32 bus_max_freq;
-	u32 bus_num_freq;
-	u32 *bus_freq;
-	u32 max_freq;
-	u32 min_freq;
-	u32 num_freq;
-	u32 *freq;
-	u32 prep_ch_behave;
-	u32 glitchless;
-};
-
 /**
  * struct sdw_dpn_prop - Data Port DPn properties
- * @audio_modes: Audio modes supported
  * @num: port number
  * @max_word: Maximum number of bits in a Payload Channel Sample, 1 to 64
  * (inclusive)
@@ -318,7 +285,6 @@ struct sdw_dpn_audio_mode {
  * machine
  */
 struct sdw_dpn_prop {
-	struct sdw_dpn_audio_mode *audio_modes;
 	u32 num;
 	u32 max_word;
 	u32 min_word;
-- 
cgit v1.2.3


From 543bd28a3bfeff31f748ba83348b63313dd37ff9 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Thu, 3 Oct 2024 15:06:49 +0800
Subject: soundwire: mipi-disco: add new properties from 2.0 spec

The DisCo for SoundWire 2.0 spec adds support for new
'mipi-sdw-sdca-interrupt-register-list' and
'mipi-sdw-commit-register-supported'.

This patch only adds the definitions and property reads, but the use
of these properties will come at some point in the future when needed.

Note a slight conceptual disconnect between the MIPI DisCo definition
of a boolean property and the Linux implementation. The latter only
checks the presence of the property to set its value to 'true',
whereas the MIPI definitions allow for a property with a 'false'
value. This patch uses the new introduced mipi_device_property_read_bool()
to handle it.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20241003070650.62787-14-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/mipi_disco.c | 6 ++++++
 include/linux/soundwire/sdw.h  | 7 ++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/mipi_disco.c b/drivers/soundwire/mipi_disco.c
index d6eb63bf1252..36e734751225 100644
--- a/drivers/soundwire/mipi_disco.c
+++ b/drivers/soundwire/mipi_disco.c
@@ -398,6 +398,12 @@ int sdw_slave_read_prop(struct sdw_slave *slave)
 	device_property_read_u32(dev, "mipi-sdw-sink-port-list",
 				 &prop->sink_ports);
 
+	device_property_read_u32(dev, "mipi-sdw-sdca-interrupt-register-list",
+				 &prop->sdca_interrupt_register_list);
+
+	prop->commit_register_supported = mipi_device_property_read_bool(dev,
+			"mipi-sdw-commit-register-supported");
+
 	/*
 	 * Read dp0 properties - we don't rely on the 'mipi-sdw-dp-0-supported'
 	 * property since the 'mipi-sdw-dp0-subproperties' property is logically
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 66feaa79ecfc..952514f044f0 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -335,8 +335,11 @@ struct sdw_dpn_prop {
  * @master_count: Number of Masters present on this Slave
  * @source_ports: Bitmap identifying source ports
  * @sink_ports: Bitmap identifying sink ports
- * @scp_int1_mask: SCP_INT1_MASK desired settings
  * @quirks: bitmask identifying deltas from the MIPI specification
+ * @sdca_interrupt_register_list: indicates which sets of SDCA interrupt status
+ * and masks are supported
+ * @commit_register_supported: is PCP_Commit register supported
+ * @scp_int1_mask: SCP_INT1_MASK desired settings
  * @clock_reg_supported: the Peripheral implements the clock base and scale
  * registers introduced with the SoundWire 1.2 specification. SDCA devices
  * do not need to set this boolean property as the registers are required.
@@ -363,6 +366,8 @@ struct sdw_slave_prop {
 	u32 source_ports;
 	u32 sink_ports;
 	u32 quirks;
+	u32 sdca_interrupt_register_list;
+	u8 commit_register_supported;
 	u8 scp_int1_mask;
 	bool clock_reg_supported;
 	bool use_domain_irq;
-- 
cgit v1.2.3


From 71b405b184449fffcb76ea0814104b71dfdb2aee Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Thu, 3 Oct 2024 15:06:50 +0800
Subject: soundwire: mipi-disco: add support for DP0/DPn 'lane-list' property

The SoundWire specification did not clearly require that ports could
use all Lanes. Some SoundWire/SDCA peripheral adopters added
restrictions on which lanes can be used by what port, and the DisCo
for SoundWire 2.1 specification added a 'lane-list' property to model
this hardware limitation.

When not specified, the ports can use all Lanes. Otherwise, the
'lane-list' indicates which Lanes can be used, sorted by order of
preference (most-preferred-first).

This patch only reads the properties, the use of this property will
come at a later time with multi-lane support.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20241003070650.62787-15-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/mipi_disco.c | 32 ++++++++++++++++++++++++++++++++
 include/linux/soundwire/sdw.h  |  8 ++++++++
 2 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/soundwire/mipi_disco.c b/drivers/soundwire/mipi_disco.c
index 36e734751225..9d59f486edbe 100644
--- a/drivers/soundwire/mipi_disco.c
+++ b/drivers/soundwire/mipi_disco.c
@@ -197,6 +197,22 @@ static int sdw_slave_read_dp0(struct sdw_slave *slave,
 	dp0->imp_def_interrupts = mipi_fwnode_property_read_bool(port,
 				"mipi-sdw-imp-def-dp0-interrupts-supported");
 
+	nval = fwnode_property_count_u32(port, "mipi-sdw-lane-list");
+	if (nval > 0) {
+		dp0->num_lanes = nval;
+		dp0->lane_list = devm_kcalloc(&slave->dev,
+					      dp0->num_lanes, sizeof(*dp0->lane_list),
+					      GFP_KERNEL);
+		if (!dp0->lane_list)
+			return -ENOMEM;
+
+		ret = fwnode_property_read_u32_array(port,
+					       "mipi-sdw-lane-list",
+					       dp0->lane_list, dp0->num_lanes);
+		if (ret < 0)
+			return ret;
+	}
+
 	return 0;
 }
 
@@ -326,6 +342,22 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave,
 		fwnode_property_read_u32(node, "mipi-sdw-port-encoding-type",
 					 &dpn[i].port_encoding);
 
+		nval = fwnode_property_count_u32(node, "mipi-sdw-lane-list");
+		if (nval > 0) {
+			dpn[i].num_lanes = nval;
+			dpn[i].lane_list = devm_kcalloc(&slave->dev,
+							dpn[i].num_lanes, sizeof(*dpn[i].lane_list),
+							GFP_KERNEL);
+			if (!dpn[i].lane_list)
+				return -ENOMEM;
+
+			ret = fwnode_property_read_u32_array(node,
+						       "mipi-sdw-lane-list",
+						       dpn[i].lane_list, dpn[i].num_lanes);
+			if (ret < 0)
+				return ret;
+		}
+
 		fwnode_handle_put(node);
 
 		i++;
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 952514f044f0..73f655334fe9 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -238,6 +238,8 @@ enum sdw_clk_stop_mode {
  * @simple_ch_prep_sm: If channel prepare sequence is required
  * @imp_def_interrupts: If set, each bit corresponds to support for
  * implementation-defined interrupts
+ * @num_lanes: array size of @lane_list
+ * @lane_list: indicates which Lanes can be used by DP0
  *
  * The wordlengths are specified by Spec as max, min AND number of
  * discrete values, implementation can define based on the wordlengths they
@@ -252,6 +254,8 @@ struct sdw_dp0_prop {
 	bool BRA_flow_controlled;
 	bool simple_ch_prep_sm;
 	bool imp_def_interrupts;
+	int num_lanes;
+	u32 *lane_list;
 };
 
 /**
@@ -275,6 +279,8 @@ struct sdw_dp0_prop {
  * @num_ch_combinations: Number of channel combinations supported
  * @channels: Discrete channels supported
  * @ch_combinations: Channel combinations supported
+ * @lane_list: indicates which Lanes can be used by DPn
+ * @num_lanes: array size of @lane_list
  * @modes: SDW mode supported
  * @max_async_buffer: Number of samples that this port can buffer in
  * asynchronous modes
@@ -300,6 +306,8 @@ struct sdw_dpn_prop {
 	u32 num_ch_combinations;
 	u32 *channels;
 	u32 *ch_combinations;
+	u32 *lane_list;
+	int num_lanes;
 	u32 modes;
 	u32 max_async_buffer;
 	u32 port_encoding;
-- 
cgit v1.2.3


From 5b3fdc9f2ff10d3cee106ddaa0ee6636c7de381e Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 30 Sep 2024 14:33:29 +0200
Subject: random: Do not include <linux/prandom.h> in <linux/random.h>

Files that use prandom infrastructure are now converted to
use <linux/prandom.h> header instead of <linux/random.h>.
Remove the legacy inclusion of <linux/prandom.h> from
<linux/random.h>.

This is the "nice cleanup" part, wished in c0842fbc1b18.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Fixes: c0842fbc1b18 ("random32: move the pseudo-random 32-bit definitions to prandom.h")
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 include/linux/random.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/random.h b/include/linux/random.h
index b0a940af4fff..333cecfca93f 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -145,13 +145,6 @@ declare_get_random_var_wait(u64, u32)
 declare_get_random_var_wait(long, unsigned long)
 #undef declare_get_random_var
 
-/*
- * This is designed to be standalone for just prandom
- * users, but for now we include it from <linux/random.h>
- * for legacy reasons.
- */
-#include <linux/prandom.h>
-
 #ifdef CONFIG_SMP
 int random_prepare_cpu(unsigned int cpu);
 int random_online_cpu(unsigned int cpu);
-- 
cgit v1.2.3


From d18c13697b4dcbf6a8f06c3d8e564c4f5ad1477c Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 30 Sep 2024 14:33:30 +0200
Subject: prandom: Include <linux/percpu.h> in <linux/prandom.h>

<linux/percpu.h> include was removed from <linux/prandom.h>
in d9f29deb7fe8 ("prandom: Remove unused include") because
this inclusion broke arm64 due to a circular dependency
on include files.

__percpu tag is defined in include/linux/compiler_types.h, so there
is currently no direct need for the inclusion of <linux/percpu.h>.
However, in [1] we would like to repurpose __percpu tag as a named
address space qualifier, where __percpu macro uses defines from
<linux/percpu.h>.

The circular dependency was removed in ddd8e37ebaa1 ("random: Do not
include <linux/prandom.h> in <linux/random.h>") and it cleared
the path for the inclusion of <linux/percpu.h> in <linux/prandom.h>.

This patch is basically a revert of d9f29deb7fe8
("prandom: Remove unused include").

[1] https://lore.kernel.org/lkml/20240812115945.484051-4-ubizjak@gmail.com/

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 include/linux/prandom.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/prandom.h b/include/linux/prandom.h
index f7f1e5251c67..f2ed5b72b3d6 100644
--- a/include/linux/prandom.h
+++ b/include/linux/prandom.h
@@ -10,6 +10,7 @@
 
 #include <linux/types.h>
 #include <linux/once.h>
+#include <linux/percpu.h>
 #include <linux/random.h>
 
 struct rnd_state {
-- 
cgit v1.2.3


From 816ad8f1e498fe5e9e992da137316302219f2137 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Oct 2024 14:51:52 -0700
Subject: lib: packing: remove kernel-doc from header file

It is not necessary to have the kernel-doc duplicated both in the
header and in the implementation. It is better to have it near the
implementation of the function, since in C, a function can have N
declarations, but only one definition.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-3-8373e551eae3@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/packing.h | 26 --------------------------
 1 file changed, 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/packing.h b/include/linux/packing.h
index 8d6571feb95d..69baefebcd02 100644
--- a/include/linux/packing.h
+++ b/include/linux/packing.h
@@ -17,32 +17,6 @@ enum packing_op {
 	UNPACK,
 };
 
-/**
- * packing - Convert numbers (currently u64) between a packed and an unpacked
- *	     format. Unpacked means laid out in memory in the CPU's native
- *	     understanding of integers, while packed means anything else that
- *	     requires translation.
- *
- * @pbuf: Pointer to a buffer holding the packed value.
- * @uval: Pointer to an u64 holding the unpacked value.
- * @startbit: The index (in logical notation, compensated for quirks) where
- *	      the packed value starts within pbuf. Must be larger than, or
- *	      equal to, endbit.
- * @endbit: The index (in logical notation, compensated for quirks) where
- *	    the packed value ends within pbuf. Must be smaller than, or equal
- *	    to, startbit.
- * @op: If PACK, then uval will be treated as const pointer and copied (packed)
- *	into pbuf, between startbit and endbit.
- *	If UNPACK, then pbuf will be treated as const pointer and the logical
- *	value between startbit and endbit will be copied (unpacked) to uval.
- * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and
- *	    QUIRK_MSB_ON_THE_RIGHT.
- *
- * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming
- *	   correct usage, return code may be discarded.
- *	   If op is PACK, pbuf is modified.
- *	   If op is UNPACK, uval is modified.
- */
 int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen,
 	    enum packing_op op, u8 quirks);
 
-- 
cgit v1.2.3


From 7263f64e16d90ded44ce211381b2def83db32fd9 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Oct 2024 14:51:53 -0700
Subject: lib: packing: add pack() and unpack() wrappers over packing()

Geert Uytterhoeven described packing() as "really bad API" because of
not being able to enforce const correctness. The same function is used
both when "pbuf" is input and "uval" is output, as in the other way
around.

Create 2 wrapper functions where const correctness can be ensured.
Do ugly type casts inside, to be able to reuse packing() as currently
implemented - which will _not_ modify the input argument.

Also, take the opportunity to change the type of startbit and endbit to
size_t - an unsigned type - in these new function prototypes. When int,
an extra check for negative values is necessary. Hopefully, when
packing() goes away completely, that check can be dropped.

My concern is that code which does rely on the conditional directionality
of packing() is harder to refactor without blowing up in size. So it may
take a while to completely eliminate packing(). But let's make alternatives
available for those who do not need that.

Link: https://lore.kernel.org/netdev/20210223112003.2223332-1-geert+renesas@glider.be/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-4-8373e551eae3@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/packing.h |  6 ++++++
 lib/packing.c           | 54 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/packing.h b/include/linux/packing.h
index 69baefebcd02..ea25cb93cc70 100644
--- a/include/linux/packing.h
+++ b/include/linux/packing.h
@@ -20,4 +20,10 @@ enum packing_op {
 int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen,
 	    enum packing_op op, u8 quirks);
 
+int pack(void *pbuf, const u64 *uval, size_t startbit, size_t endbit,
+	 size_t pbuflen, u8 quirks);
+
+int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit,
+	   size_t pbuflen, u8 quirks);
+
 #endif
diff --git a/lib/packing.c b/lib/packing.c
index 435236a914fe..2922db8a528c 100644
--- a/lib/packing.c
+++ b/lib/packing.c
@@ -90,6 +90,8 @@ static int calculate_box_addr(int box, size_t len, u8 quirks)
  * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and
  *	    QUIRK_MSB_ON_THE_RIGHT.
  *
+ * Note: this is deprecated, prefer to use pack() or unpack() in new code.
+ *
  * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming
  *	   correct usage, return code may be discarded.
  *	   If op is PACK, pbuf is modified.
@@ -216,4 +218,56 @@ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen,
 }
 EXPORT_SYMBOL(packing);
 
+/**
+ * pack - Pack u64 number into bitfield of buffer.
+ *
+ * @pbuf: Pointer to a buffer holding the packed value.
+ * @uval: Pointer to an u64 holding the unpacked value.
+ * @startbit: The index (in logical notation, compensated for quirks) where
+ *	      the packed value starts within pbuf. Must be larger than, or
+ *	      equal to, endbit.
+ * @endbit: The index (in logical notation, compensated for quirks) where
+ *	    the packed value ends within pbuf. Must be smaller than, or equal
+ *	    to, startbit.
+ * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf.
+ * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and
+ *	    QUIRK_MSB_ON_THE_RIGHT.
+ *
+ * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming
+ *	   correct usage, return code may be discarded. The @pbuf memory will
+ *	   be modified on success.
+ */
+int pack(void *pbuf, const u64 *uval, size_t startbit, size_t endbit,
+	 size_t pbuflen, u8 quirks)
+{
+	return packing(pbuf, (u64 *)uval, startbit, endbit, pbuflen, PACK, quirks);
+}
+EXPORT_SYMBOL(pack);
+
+/**
+ * unpack - Unpack u64 number from packed buffer.
+ *
+ * @pbuf: Pointer to a buffer holding the packed value.
+ * @uval: Pointer to an u64 holding the unpacked value.
+ * @startbit: The index (in logical notation, compensated for quirks) where
+ *	      the packed value starts within pbuf. Must be larger than, or
+ *	      equal to, endbit.
+ * @endbit: The index (in logical notation, compensated for quirks) where
+ *	    the packed value ends within pbuf. Must be smaller than, or equal
+ *	    to, startbit.
+ * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf.
+ * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and
+ *	    QUIRK_MSB_ON_THE_RIGHT.
+ *
+ * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming
+ *	   correct usage, return code may be discarded. The @uval will be
+ *	   modified on success.
+ */
+int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit,
+	   size_t pbuflen, u8 quirks)
+{
+	return packing((void *)pbuf, uval, startbit, endbit, pbuflen, UNPACK, quirks);
+}
+EXPORT_SYMBOL(unpack);
+
 MODULE_DESCRIPTION("Generic bitfield packing and unpacking");
-- 
cgit v1.2.3


From 28aec9ca29f05dabb835293acc6e202bf51b8092 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Oct 2024 14:51:54 -0700
Subject: lib: packing: duplicate pack() and unpack() implementations

packing() is now used in some hot paths, and it would be good to get rid
of some ifs and buts that depend on "op", to speed things up a little bit.

With the main implementations now taking size_t endbit, we no longer
have to check for negative values. Update the local integer variables to
also be size_t to match.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-5-8373e551eae3@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/packing.h |   4 +-
 lib/packing.c           | 221 ++++++++++++++++++++++++++++++------------------
 2 files changed, 143 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/packing.h b/include/linux/packing.h
index ea25cb93cc70..5d36dcd06f60 100644
--- a/include/linux/packing.h
+++ b/include/linux/packing.h
@@ -20,8 +20,8 @@ enum packing_op {
 int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen,
 	    enum packing_op op, u8 quirks);
 
-int pack(void *pbuf, const u64 *uval, size_t startbit, size_t endbit,
-	 size_t pbuflen, u8 quirks);
+int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen,
+	 u8 quirks);
 
 int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit,
 	   size_t pbuflen, u8 quirks);
diff --git a/lib/packing.c b/lib/packing.c
index 2922db8a528c..500184530d07 100644
--- a/lib/packing.c
+++ b/lib/packing.c
@@ -9,11 +9,11 @@
 #include <linux/types.h>
 #include <linux/bitrev.h>
 
-static void adjust_for_msb_right_quirk(u64 *to_write, int *box_start_bit,
-				       int *box_end_bit, u8 *box_mask)
+static void adjust_for_msb_right_quirk(u64 *to_write, size_t *box_start_bit,
+				       size_t *box_end_bit, u8 *box_mask)
 {
-	int box_bit_width = *box_start_bit - *box_end_bit + 1;
-	int new_box_start_bit, new_box_end_bit;
+	size_t box_bit_width = *box_start_bit - *box_end_bit + 1;
+	size_t new_box_start_bit, new_box_end_bit;
 
 	*to_write >>= *box_end_bit;
 	*to_write = bitrev8(*to_write) >> (8 - box_bit_width);
@@ -48,7 +48,7 @@ static void adjust_for_msb_right_quirk(u64 *to_write, int *box_start_bit,
  *
  * Return: the physical offset into the buffer corresponding to the logical box.
  */
-static int calculate_box_addr(int box, size_t len, u8 quirks)
+static size_t calculate_box_addr(size_t box, size_t len, u8 quirks)
 {
 	size_t offset_of_group, offset_in_group, this_group = box / 4;
 	size_t group_size;
@@ -69,13 +69,10 @@ static int calculate_box_addr(int box, size_t len, u8 quirks)
 }
 
 /**
- * packing - Convert numbers (currently u64) between a packed and an unpacked
- *	     format. Unpacked means laid out in memory in the CPU's native
- *	     understanding of integers, while packed means anything else that
- *	     requires translation.
+ * pack - Pack u64 number into bitfield of buffer.
  *
  * @pbuf: Pointer to a buffer holding the packed value.
- * @uval: Pointer to an u64 holding the unpacked value.
+ * @uval: CPU-readable unpacked value to pack.
  * @startbit: The index (in logical notation, compensated for quirks) where
  *	      the packed value starts within pbuf. Must be larger than, or
  *	      equal to, endbit.
@@ -83,58 +80,45 @@ static int calculate_box_addr(int box, size_t len, u8 quirks)
  *	    the packed value ends within pbuf. Must be smaller than, or equal
  *	    to, startbit.
  * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf.
- * @op: If PACK, then uval will be treated as const pointer and copied (packed)
- *	into pbuf, between startbit and endbit.
- *	If UNPACK, then pbuf will be treated as const pointer and the logical
- *	value between startbit and endbit will be copied (unpacked) to uval.
  * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and
  *	    QUIRK_MSB_ON_THE_RIGHT.
  *
- * Note: this is deprecated, prefer to use pack() or unpack() in new code.
- *
  * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming
- *	   correct usage, return code may be discarded.
- *	   If op is PACK, pbuf is modified.
- *	   If op is UNPACK, uval is modified.
+ *	   correct usage, return code may be discarded. The @pbuf memory will
+ *	   be modified on success.
  */
-int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen,
-	    enum packing_op op, u8 quirks)
+int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen,
+	 u8 quirks)
 {
-	/* Number of bits for storing "uval"
-	 * also width of the field to access in the pbuf
-	 */
-	u64 value_width;
 	/* Logical byte indices corresponding to the
 	 * start and end of the field.
 	 */
 	int plogical_first_u8, plogical_last_u8, box;
+	/* width of the field to access in the pbuf */
+	u64 value_width;
 
 	/* startbit is expected to be larger than endbit, and both are
 	 * expected to be within the logically addressable range of the buffer.
 	 */
-	if (unlikely(startbit < endbit || startbit >= 8 * pbuflen || endbit < 0))
+	if (unlikely(startbit < endbit || startbit >= 8 * pbuflen))
 		/* Invalid function call */
 		return -EINVAL;
 
 	value_width = startbit - endbit + 1;
-	if (value_width > 64)
+	if (unlikely(value_width > 64))
 		return -ERANGE;
 
 	/* Check if "uval" fits in "value_width" bits.
 	 * If value_width is 64, the check will fail, but any
 	 * 64-bit uval will surely fit.
 	 */
-	if (op == PACK && value_width < 64 && (*uval >= (1ull << value_width)))
+	if (unlikely(value_width < 64 && uval >= (1ull << value_width)))
 		/* Cannot store "uval" inside "value_width" bits.
 		 * Truncating "uval" is most certainly not desirable,
 		 * so simply erroring out is appropriate.
 		 */
 		return -ERANGE;
 
-	/* Initialize parameter */
-	if (op == UNPACK)
-		*uval = 0;
-
 	/* Iterate through an idealistic view of the pbuf as an u64 with
 	 * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low
 	 * logical bit significance. "box" denotes the current logical u8.
@@ -144,11 +128,12 @@ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen,
 
 	for (box = plogical_first_u8; box >= plogical_last_u8; box--) {
 		/* Bit indices into the currently accessed 8-bit box */
-		int box_start_bit, box_end_bit, box_addr;
+		size_t box_start_bit, box_end_bit, box_addr;
 		u8  box_mask;
 		/* Corresponding bits from the unpacked u64 parameter */
-		int proj_start_bit, proj_end_bit;
+		size_t proj_start_bit, proj_end_bit;
 		u64 proj_mask;
+		u64 pval;
 
 		/* This u8 may need to be accessed in its entirety
 		 * (from bit 7 to bit 0), or not, depending on the
@@ -182,44 +167,25 @@ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen,
 		 */
 		box_addr = calculate_box_addr(box, pbuflen, quirks);
 
-		if (op == UNPACK) {
-			u64 pval;
-
-			/* Read from pbuf, write to uval */
-			pval = ((u8 *)pbuf)[box_addr] & box_mask;
-			if (quirks & QUIRK_MSB_ON_THE_RIGHT)
-				adjust_for_msb_right_quirk(&pval,
-							   &box_start_bit,
-							   &box_end_bit,
-							   &box_mask);
-
-			pval >>= box_end_bit;
-			pval <<= proj_end_bit;
-			*uval &= ~proj_mask;
-			*uval |= pval;
-		} else {
-			u64 pval;
-
-			/* Write to pbuf, read from uval */
-			pval = (*uval) & proj_mask;
-			pval >>= proj_end_bit;
-			if (quirks & QUIRK_MSB_ON_THE_RIGHT)
-				adjust_for_msb_right_quirk(&pval,
-							   &box_start_bit,
-							   &box_end_bit,
-							   &box_mask);
-
-			pval <<= box_end_bit;
-			((u8 *)pbuf)[box_addr] &= ~box_mask;
-			((u8 *)pbuf)[box_addr] |= pval;
-		}
+		/* Write to pbuf, read from uval */
+		pval = uval & proj_mask;
+		pval >>= proj_end_bit;
+		if (quirks & QUIRK_MSB_ON_THE_RIGHT)
+			adjust_for_msb_right_quirk(&pval,
+						   &box_start_bit,
+						   &box_end_bit,
+						   &box_mask);
+
+		pval <<= box_end_bit;
+		((u8 *)pbuf)[box_addr] &= ~box_mask;
+		((u8 *)pbuf)[box_addr] |= pval;
 	}
 	return 0;
 }
-EXPORT_SYMBOL(packing);
+EXPORT_SYMBOL(pack);
 
 /**
- * pack - Pack u64 number into bitfield of buffer.
+ * unpack - Unpack u64 number from packed buffer.
  *
  * @pbuf: Pointer to a buffer holding the packed value.
  * @uval: Pointer to an u64 holding the unpacked value.
@@ -234,18 +200,103 @@ EXPORT_SYMBOL(packing);
  *	    QUIRK_MSB_ON_THE_RIGHT.
  *
  * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming
- *	   correct usage, return code may be discarded. The @pbuf memory will
- *	   be modified on success.
+ *	   correct usage, return code may be discarded. The @uval will be
+ *	   modified on success.
  */
-int pack(void *pbuf, const u64 *uval, size_t startbit, size_t endbit,
-	 size_t pbuflen, u8 quirks)
+int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit,
+	   size_t pbuflen, u8 quirks)
 {
-	return packing(pbuf, (u64 *)uval, startbit, endbit, pbuflen, PACK, quirks);
+	/* Logical byte indices corresponding to the
+	 * start and end of the field.
+	 */
+	int plogical_first_u8, plogical_last_u8, box;
+	/* width of the field to access in the pbuf */
+	u64 value_width;
+
+	/* startbit is expected to be larger than endbit, and both are
+	 * expected to be within the logically addressable range of the buffer.
+	 */
+	if (unlikely(startbit < endbit || startbit >= 8 * pbuflen))
+		/* Invalid function call */
+		return -EINVAL;
+
+	value_width = startbit - endbit + 1;
+	if (unlikely(value_width > 64))
+		return -ERANGE;
+
+	/* Initialize parameter */
+	*uval = 0;
+
+	/* Iterate through an idealistic view of the pbuf as an u64 with
+	 * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low
+	 * logical bit significance. "box" denotes the current logical u8.
+	 */
+	plogical_first_u8 = startbit / 8;
+	plogical_last_u8  = endbit / 8;
+
+	for (box = plogical_first_u8; box >= plogical_last_u8; box--) {
+		/* Bit indices into the currently accessed 8-bit box */
+		size_t box_start_bit, box_end_bit, box_addr;
+		u8  box_mask;
+		/* Corresponding bits from the unpacked u64 parameter */
+		size_t proj_start_bit, proj_end_bit;
+		u64 proj_mask;
+		u64 pval;
+
+		/* This u8 may need to be accessed in its entirety
+		 * (from bit 7 to bit 0), or not, depending on the
+		 * input arguments startbit and endbit.
+		 */
+		if (box == plogical_first_u8)
+			box_start_bit = startbit % 8;
+		else
+			box_start_bit = 7;
+		if (box == plogical_last_u8)
+			box_end_bit = endbit % 8;
+		else
+			box_end_bit = 0;
+
+		/* We have determined the box bit start and end.
+		 * Now we calculate where this (masked) u8 box would fit
+		 * in the unpacked (CPU-readable) u64 - the u8 box's
+		 * projection onto the unpacked u64. Though the
+		 * box is u8, the projection is u64 because it may fall
+		 * anywhere within the unpacked u64.
+		 */
+		proj_start_bit = ((box * 8) + box_start_bit) - endbit;
+		proj_end_bit   = ((box * 8) + box_end_bit) - endbit;
+		proj_mask = GENMASK_ULL(proj_start_bit, proj_end_bit);
+		box_mask  = GENMASK_ULL(box_start_bit, box_end_bit);
+
+		/* Determine the offset of the u8 box inside the pbuf,
+		 * adjusted for quirks. The adjusted box_addr will be used for
+		 * effective addressing inside the pbuf (so it's not
+		 * logical any longer).
+		 */
+		box_addr = calculate_box_addr(box, pbuflen, quirks);
+
+		/* Read from pbuf, write to uval */
+		pval = ((u8 *)pbuf)[box_addr] & box_mask;
+		if (quirks & QUIRK_MSB_ON_THE_RIGHT)
+			adjust_for_msb_right_quirk(&pval,
+						   &box_start_bit,
+						   &box_end_bit,
+						   &box_mask);
+
+		pval >>= box_end_bit;
+		pval <<= proj_end_bit;
+		*uval &= ~proj_mask;
+		*uval |= pval;
+	}
+	return 0;
 }
-EXPORT_SYMBOL(pack);
+EXPORT_SYMBOL(unpack);
 
 /**
- * unpack - Unpack u64 number from packed buffer.
+ * packing - Convert numbers (currently u64) between a packed and an unpacked
+ *	     format. Unpacked means laid out in memory in the CPU's native
+ *	     understanding of integers, while packed means anything else that
+ *	     requires translation.
  *
  * @pbuf: Pointer to a buffer holding the packed value.
  * @uval: Pointer to an u64 holding the unpacked value.
@@ -256,18 +307,28 @@ EXPORT_SYMBOL(pack);
  *	    the packed value ends within pbuf. Must be smaller than, or equal
  *	    to, startbit.
  * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf.
+ * @op: If PACK, then uval will be treated as const pointer and copied (packed)
+ *	into pbuf, between startbit and endbit.
+ *	If UNPACK, then pbuf will be treated as const pointer and the logical
+ *	value between startbit and endbit will be copied (unpacked) to uval.
  * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and
  *	    QUIRK_MSB_ON_THE_RIGHT.
  *
+ * Note: this is deprecated, prefer to use pack() or unpack() in new code.
+ *
  * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming
- *	   correct usage, return code may be discarded. The @uval will be
- *	   modified on success.
+ *	   correct usage, return code may be discarded.
+ *	   If op is PACK, pbuf is modified.
+ *	   If op is UNPACK, uval is modified.
  */
-int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit,
-	   size_t pbuflen, u8 quirks)
+int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen,
+	    enum packing_op op, u8 quirks)
 {
-	return packing((void *)pbuf, uval, startbit, endbit, pbuflen, UNPACK, quirks);
+	if (op == PACK)
+		return pack(pbuf, *uval, startbit, endbit, pbuflen, quirks);
+
+	return unpack(pbuf, uval, startbit, endbit, pbuflen, quirks);
 }
-EXPORT_SYMBOL(unpack);
+EXPORT_SYMBOL(packing);
 
 MODULE_DESCRIPTION("Generic bitfield packing and unpacking");
-- 
cgit v1.2.3


From da7d71bcb0637b7aa18934628fdb5a55f2db49a6 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Mon, 16 Sep 2024 02:17:11 -0700
Subject: bpf: Use KF_FASTCALL to mark kfuncs supporting fastcall contract

In order to allow pahole add btf_decl_tag("bpf_fastcall") for kfuncs
supporting bpf_fastcall, mark such functions with KF_FASTCALL in
id_set8 objects.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240916091712.2929279-4-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h   | 1 +
 kernel/bpf/helpers.c  | 4 ++--
 kernel/bpf/verifier.c | 5 +----
 3 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index b8a583194c4a..631060e3ad14 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -75,6 +75,7 @@
 #define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
 #define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */
 #define KF_RCU_PROTECTED (1 << 11) /* kfunc should be protected by rcu cs when they are invoked */
+#define KF_FASTCALL     (1 << 12) /* kfunc supports bpf_fastcall protocol */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1a43d06eab28..4053f279ed4c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3052,8 +3052,8 @@ BTF_ID(func, bpf_cgroup_release_dtor)
 #endif
 
 BTF_KFUNCS_START(common_btf_ids)
-BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
-BTF_ID_FLAGS(func, bpf_rdonly_cast)
+BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL)
+BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL)
 BTF_ID_FLAGS(func, bpf_rcu_read_lock)
 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
 BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9a7ed527e47e..7d9b38ffd220 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -16176,10 +16176,7 @@ static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta)
 /* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */
 static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta)
 {
-	if (meta->btf == btf_vmlinux)
-		return meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
-		       meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast];
-	return false;
+	return meta->kfunc_flags & KF_FASTCALL;
 }
 
 /* LLVM define a bpf_fastcall function attribute.
-- 
cgit v1.2.3


From a04d5f82fa3893aa06386db93883b151b93b11ed Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 2 Oct 2024 05:01:05 +0100
Subject: mm: Remove PageMappedToDisk

All callers have now been converted to the folio APIs, so remove
the page API for this flag.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://lore.kernel.org/r/20241002040111.1023018-4-willy@infradead.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/page-flags.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1b3a76710487..35d08c30d4a6 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -554,7 +554,7 @@ FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE)
  */
 TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
 	TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
-PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
+FOLIO_FLAG(mappedtodisk, FOLIO_HEAD_PAGE)
 
 /* PG_readahead is only used for reads; PG_reclaim is only for writes */
 PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
-- 
cgit v1.2.3


From fd15ba4cb00a43fb4df42e1f95f94857ad122eea Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 2 Oct 2024 05:01:07 +0100
Subject: ceph: Remove call to PagePrivate2()

Use the folio that we already have to call folio_test_private_2()
instead.  This is the last call to PagePrivate2(), so replace its
PAGEFLAG() definition with FOLIO_FLAG().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://lore.kernel.org/r/20241002040111.1023018-6-willy@infradead.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ceph/addr.c             | 20 ++++++++++----------
 include/linux/page-flags.h |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 53fef258c2bc..a8788e300dc7 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1051,7 +1051,9 @@ get_more_pages:
 		if (!nr_folios && !locked_pages)
 			break;
 		for (i = 0; i < nr_folios && locked_pages < max_pages; i++) {
-			page = &fbatch.folios[i]->page;
+			struct folio *folio = fbatch.folios[i];
+
+			page = &folio->page;
 			doutc(cl, "? %p idx %lu\n", page, page->index);
 			if (locked_pages == 0)
 				lock_page(page);  /* first page */
@@ -1078,8 +1080,6 @@ get_more_pages:
 				continue;
 			}
 			if (page_offset(page) >= ceph_wbc.i_size) {
-				struct folio *folio = page_folio(page);
-
 				doutc(cl, "folio at %lu beyond eof %llu\n",
 				      folio->index, ceph_wbc.i_size);
 				if ((ceph_wbc.size_stable ||
@@ -1095,16 +1095,16 @@ get_more_pages:
 				unlock_page(page);
 				break;
 			}
-			if (PageWriteback(page) ||
-			    PagePrivate2(page) /* [DEPRECATED] */) {
+			if (folio_test_writeback(folio) ||
+			    folio_test_private_2(folio) /* [DEPRECATED] */) {
 				if (wbc->sync_mode == WB_SYNC_NONE) {
-					doutc(cl, "%p under writeback\n", page);
-					unlock_page(page);
+					doutc(cl, "%p under writeback\n", folio);
+					folio_unlock(folio);
 					continue;
 				}
-				doutc(cl, "waiting on writeback %p\n", page);
-				wait_on_page_writeback(page);
-				folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */
+				doutc(cl, "waiting on writeback %p\n", folio);
+				folio_wait_writeback(folio);
+				folio_wait_private_2(folio); /* [DEPRECATED] */
 			}
 
 			if (!clear_page_dirty_for_io(page)) {
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 35d08c30d4a6..4c2dfe289046 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -543,7 +543,7 @@ FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE)
  * - PG_private and PG_private_2 cause release_folio() and co to be invoked
  */
 PAGEFLAG(Private, private, PF_ANY)
-PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
+FOLIO_FLAG(private_2, FOLIO_HEAD_PAGE)
 
 /* owner_2 can be set on tail pages for anon memory */
 FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE)
-- 
cgit v1.2.3


From 0e45a09a1da0872786885c505467aab8fb29b5b4 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Wed, 4 Sep 2024 21:17:06 -0700
Subject: Input: serio - define serio_pause_rx guard to pause and resume serio
 ports

serio_pause_rx() and serio_continue_rx() are usually used together to
temporarily stop receiving interrupts/data for a given serio port.
Define "serio_pause_rx" guard for this so that the port is always
resumed once critical section is over.

Example:

	scoped_guard(serio_pause_rx, elo->serio) {
		elo->expected_packet = toupper(packet[0]);
		init_completion(&elo->cmd_done);
	}

Link: https://lore.kernel.org/r/20240905041732.2034348-2-dmitry.torokhov@gmail.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/serio.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/serio.h b/include/linux/serio.h
index bf2191f25350..69a47674af65 100644
--- a/include/linux/serio.h
+++ b/include/linux/serio.h
@@ -6,6 +6,7 @@
 #define _SERIO_H
 
 
+#include <linux/cleanup.h>
 #include <linux/types.h>
 #include <linux/interrupt.h>
 #include <linux/list.h>
@@ -161,4 +162,6 @@ static inline void serio_continue_rx(struct serio *serio)
 	spin_unlock_irq(&serio->lock);
 }
 
+DEFINE_GUARD(serio_pause_rx, struct serio *, serio_pause_rx(_T), serio_continue_rx(_T))
+
 #endif
-- 
cgit v1.2.3


From c653ffc283404a6c1c0e65143a833180c7ff799b Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 3 Oct 2024 07:46:51 -0700
Subject: HID: stop exporting hid_snto32()

The only user of hid_snto32() is Logitech HID++ driver, which always
calls hid_snto32() with valid size (constant, either 12 or 8) and
therefore can simply use sign_extend32().

Make the switch and remove hid_snto32(). Move snto32() and s32ton() to
avoid introducing forward declaration.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Link: https://patch.msgid.link/20241003144656.3786064-2-dmitry.torokhov@gmail.com
[bentiss: fix checkpatch warning]
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 drivers/hid/hid-core.c           | 63 ++++++++++++++++++----------------------
 drivers/hid/hid-logitech-hidpp.c |  6 ++--
 include/linux/hid.h              |  1 -
 3 files changed, 32 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index a6fe04e1c726..87b2cdd7f0c4 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -45,6 +45,34 @@ static int hid_ignore_special_drivers = 0;
 module_param_named(ignore_special_drivers, hid_ignore_special_drivers, int, 0600);
 MODULE_PARM_DESC(ignore_special_drivers, "Ignore any special drivers and handle all devices by generic driver");
 
+/*
+ * Convert a signed n-bit integer to signed 32-bit integer.
+ */
+
+static s32 snto32(__u32 value, unsigned int n)
+{
+	if (!value || !n)
+		return 0;
+
+	if (n > 32)
+		n = 32;
+
+	return sign_extend32(value, n - 1);
+}
+
+/*
+ * Convert a signed 32-bit integer to a signed n-bit integer.
+ */
+
+static u32 s32ton(__s32 value, unsigned int n)
+{
+	s32 a = value >> (n - 1);
+
+	if (a && a != -1)
+		return value < 0 ? 1 << (n - 1) : (1 << (n - 1)) - 1;
+	return value & ((1 << n) - 1);
+}
+
 /*
  * Register a new report for a device.
  */
@@ -425,7 +453,7 @@ static int hid_parser_global(struct hid_parser *parser, struct hid_item *item)
 		 * both this and the standard encoding. */
 		raw_value = item_sdata(item);
 		if (!(raw_value & 0xfffffff0))
-			parser->global.unit_exponent = hid_snto32(raw_value, 4);
+			parser->global.unit_exponent = snto32(raw_value, 4);
 		else
 			parser->global.unit_exponent = raw_value;
 		return 0;
@@ -1312,39 +1340,6 @@ alloc_err:
 }
 EXPORT_SYMBOL_GPL(hid_open_report);
 
-/*
- * Convert a signed n-bit integer to signed 32-bit integer.
- */
-
-static s32 snto32(__u32 value, unsigned n)
-{
-	if (!value || !n)
-		return 0;
-
-	if (n > 32)
-		n = 32;
-
-	return sign_extend32(value, n - 1);
-}
-
-s32 hid_snto32(__u32 value, unsigned n)
-{
-	return snto32(value, n);
-}
-EXPORT_SYMBOL_GPL(hid_snto32);
-
-/*
- * Convert a signed 32-bit integer to a signed n-bit integer.
- */
-
-static u32 s32ton(__s32 value, unsigned n)
-{
-	s32 a = value >> (n - 1);
-	if (a && a != -1)
-		return value < 0 ? 1 << (n - 1) : (1 << (n - 1)) - 1;
-	return value & ((1 << n) - 1);
-}
-
 /*
  * Extract/implement a data field from/to a little endian report (bit array).
  *
diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c
index 0e33fa0eb8db..0d114268ade8 100644
--- a/drivers/hid/hid-logitech-hidpp.c
+++ b/drivers/hid/hid-logitech-hidpp.c
@@ -3296,13 +3296,13 @@ static int m560_raw_event(struct hid_device *hdev, u8 *data, int size)
 					 120);
 		}
 
-		v = hid_snto32(hid_field_extract(hdev, data+3, 0, 12), 12);
+		v = sign_extend32(hid_field_extract(hdev, data + 3, 0, 12), 11);
 		input_report_rel(hidpp->input, REL_X, v);
 
-		v = hid_snto32(hid_field_extract(hdev, data+3, 12, 12), 12);
+		v = sign_extend32(hid_field_extract(hdev, data + 3, 12, 12), 11);
 		input_report_rel(hidpp->input, REL_Y, v);
 
-		v = hid_snto32(data[6], 8);
+		v = sign_extend32(data[6], 7);
 		if (v != 0)
 			hidpp_scroll_counter_handle_scroll(hidpp->input,
 					&hidpp->vertical_wheel_counter, v);
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 121d5b8bc867..f7c3a66647fd 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -974,7 +974,6 @@ const struct hid_device_id *hid_match_device(struct hid_device *hdev,
 					     struct hid_driver *hdrv);
 bool hid_compare_device_paths(struct hid_device *hdev_a,
 			      struct hid_device *hdev_b, char separator);
-s32 hid_snto32(__u32 value, unsigned n);
 __u32 hid_field_extract(const struct hid_device *hid, __u8 *report,
 		     unsigned offset, unsigned n);
 
-- 
cgit v1.2.3


From 9ab515b18f8463fbb340fece47cd461809e42a9d Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:58:41 +0100
Subject: mm: Define VM_HIGH_ARCH_6

The addition of protection keys means that on arm64 we now use all of the
currently defined VM_HIGH_ARCH_x bits. In order to allow us to allocate a
new flag for GCS pages define VM_HIGH_ARCH_6.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-2-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/mm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecf63d2b0582..182bad0c55df 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -329,12 +329,14 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_4	36	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_5	37	/* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_6	38	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
 #define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
 #define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
 #define VM_HIGH_ARCH_5	BIT(VM_HIGH_ARCH_BIT_5)
+#define VM_HIGH_ARCH_6	BIT(VM_HIGH_ARCH_BIT_6)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
-- 
cgit v1.2.3


From 91e102e79740ae43ded050ccac71aa3371db4f33 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:58:43 +0100
Subject: prctl: arch-agnostic prctl for shadow stack

Three architectures (x86, aarch64, riscv) have announced support for
shadow stacks with fairly similar functionality.  While x86 is using
arch_prctl() to control the functionality neither arm64 nor riscv uses
that interface so this patch adds arch-agnostic prctl() support to
get and set status of shadow stacks and lock the current configuation to
prevent further changes, with support for turning on and off individual
subfeatures so applications can limit their exposure to features that
they do not need.  The features are:

  - PR_SHADOW_STACK_ENABLE: Tracking and enforcement of shadow stacks,
    including allocation of a shadow stack if one is not already
    allocated.
  - PR_SHADOW_STACK_WRITE: Writes to specific addresses in the shadow
    stack.
  - PR_SHADOW_STACK_PUSH: Push additional values onto the shadow stack.

These features are expected to be inherited by new threads and cleared
on exec(), unknown features should be rejected for enable but accepted
for locking (in order to allow for future proofing).

This is based on a patch originally written by Deepak Gupta but modified
fairly heavily, support for indirect landing pads is removed, additional
modes added and the locking interface reworked.  The set status prctl()
is also reworked to just set flags, if setting/reading the shadow stack
pointer is required this could be a separate prctl.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Yury Khrustalev <yury.khrustalev@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Deepak Gupta <debug@rivosinc.com>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-4-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/mm.h         |  4 ++++
 include/uapi/linux/prctl.h | 22 ++++++++++++++++++++++
 kernel/sys.c               | 30 ++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 182bad0c55df..56654306a832 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4221,4 +4221,8 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
 }
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
+int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status);
+int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
+int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
+
 #endif /* _LINUX_MM_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 35791791a879..557a3d2ac1d4 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -328,4 +328,26 @@ struct prctl_mm_map {
 # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC	0x10 /* Clear the aspect on exec */
 # define PR_PPC_DEXCR_CTRL_MASK		0x1f
 
+/*
+ * Get the current shadow stack configuration for the current thread,
+ * this will be the value configured via PR_SET_SHADOW_STACK_STATUS.
+ */
+#define PR_GET_SHADOW_STACK_STATUS      74
+
+/*
+ * Set the current shadow stack configuration.  Enabling the shadow
+ * stack will cause a shadow stack to be allocated for the thread.
+ */
+#define PR_SET_SHADOW_STACK_STATUS      75
+# define PR_SHADOW_STACK_ENABLE         (1UL << 0)
+# define PR_SHADOW_STACK_WRITE		(1UL << 1)
+# define PR_SHADOW_STACK_PUSH		(1UL << 2)
+
+/*
+ * Prevent further changes to the specified shadow stack
+ * configuration.  All bits may be locked via this call, including
+ * undefined bits.
+ */
+#define PR_LOCK_SHADOW_STACK_STATUS      76
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 4da31f28fda8..3d38a9c7c5c9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2324,6 +2324,21 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
 	return -EINVAL;
 }
 
+int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
 #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
 
 #ifdef CONFIG_ANON_VMA_NAME
@@ -2784,6 +2799,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
 		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
 		break;
+	case PR_GET_SHADOW_STACK_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2);
+		break;
+	case PR_SET_SHADOW_STACK_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_set_shadow_stack_status(me, arg2);
+		break;
+	case PR_LOCK_SHADOW_STACK_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_lock_shadow_stack_status(me, arg2);
+		break;
 	default:
 		error = -EINVAL;
 		break;
-- 
cgit v1.2.3


From ae80e1629aeaf5be726a9ea94eb7345b1a44b00d Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:58:53 +0100
Subject: mm: Define VM_SHADOW_STACK for arm64 when we support GCS

Use VM_HIGH_ARCH_5 for guarded control stack pages.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-14-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/filesystems/proc.rst |  2 +-
 include/linux/mm.h                 | 12 +++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index e834779d9611..6a882c57a7e7 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -579,7 +579,7 @@ encoded manner. The codes are the following:
     mt    arm64 MTE allocation tags are enabled
     um    userfaultfd missing tracking
     uw    userfaultfd wr-protect tracking
-    ss    shadow stack page
+    ss    shadow/guarded control stack page
     sl    sealed
     ==    =======================================
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 56654306a832..8852c39c7695 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -367,7 +367,17 @@ extern unsigned int kobjsize(const void *objp);
  * for more details on the guard size.
  */
 # define VM_SHADOW_STACK	VM_HIGH_ARCH_5
-#else
+#endif
+
+#if defined(CONFIG_ARM64_GCS)
+/*
+ * arm64's Guarded Control Stack implements similar functionality and
+ * has similar constraints to shadow stacks.
+ */
+# define VM_SHADOW_STACK	VM_HIGH_ARCH_6
+#endif
+
+#ifndef VM_SHADOW_STACK
 # define VM_SHADOW_STACK	VM_NONE
 #endif
 
-- 
cgit v1.2.3


From b63c755cb65d43c8aba987c4f6b57c77c6f123f2 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Mon, 30 Sep 2024 14:29:53 +0100
Subject: appletalk: Remove deadcode

alloc_ltalkdev in net/appletalk/dev.c is dead since
commit 00f3696f7555 ("net: appletalk: remove cops support")

Removing it (and it's helper) leaves dev.c and if_ltalk.h empty;
remove them and the Makefile entry.

tun.c was including that if_ltalk.h but actually wanted
the uapi version for LTALK_ALEN, fix up the path.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c        |  2 +-
 include/linux/if_ltalk.h |  8 --------
 net/appletalk/Makefile   |  2 +-
 net/appletalk/dev.c      | 46 ----------------------------------------------
 4 files changed, 2 insertions(+), 56 deletions(-)
 delete mode 100644 include/linux/if_ltalk.h
 delete mode 100644 net/appletalk/dev.c

(limited to 'include/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9a0f6eb32016..d7a865ef370b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -71,7 +71,7 @@
 #include <linux/bpf_trace.h>
 #include <linux/mutex.h>
 #include <linux/ieee802154.h>
-#include <linux/if_ltalk.h>
+#include <uapi/linux/if_ltalk.h>
 #include <uapi/linux/if_fddi.h>
 #include <uapi/linux/if_hippi.h>
 #include <uapi/linux/if_fc.h>
diff --git a/include/linux/if_ltalk.h b/include/linux/if_ltalk.h
deleted file mode 100644
index 4cc1c0b77870..000000000000
--- a/include/linux/if_ltalk.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __LINUX_LTALK_H
-#define __LINUX_LTALK_H
-
-#include <uapi/linux/if_ltalk.h>
-
-extern struct net_device *alloc_ltalkdev(int sizeof_priv);
-#endif
diff --git a/net/appletalk/Makefile b/net/appletalk/Makefile
index 33164d972d37..152312a15180 100644
--- a/net/appletalk/Makefile
+++ b/net/appletalk/Makefile
@@ -5,6 +5,6 @@
 
 obj-$(CONFIG_ATALK) += appletalk.o
 
-appletalk-y			:= aarp.o ddp.o dev.o
+appletalk-y			:= aarp.o ddp.o
 appletalk-$(CONFIG_PROC_FS)	+= atalk_proc.o
 appletalk-$(CONFIG_SYSCTL)	+= sysctl_net_atalk.o
diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c
deleted file mode 100644
index 284c8e585533..000000000000
--- a/net/appletalk/dev.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Moved here from drivers/net/net_init.c, which is:
- *	Written 1993,1994,1995 by Donald Becker.
- */
-
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/if_ltalk.h>
-
-static void ltalk_setup(struct net_device *dev)
-{
-	/* Fill in the fields of the device structure with localtalk-generic values. */
-
-	dev->type		= ARPHRD_LOCALTLK;
-	dev->hard_header_len 	= LTALK_HLEN;
-	dev->mtu		= LTALK_MTU;
-	dev->addr_len		= LTALK_ALEN;
-	dev->tx_queue_len	= 10;
-
-	dev->broadcast[0]	= 0xFF;
-
-	dev->flags		= IFF_BROADCAST|IFF_MULTICAST|IFF_NOARP;
-}
-
-/**
- * alloc_ltalkdev - Allocates and sets up an localtalk device
- * @sizeof_priv: Size of additional driver-private structure to be allocated
- *	for this localtalk device
- *
- * Fill in the fields of the device structure with localtalk-generic
- * values. Basically does everything except registering the device.
- *
- * Constructs a new net device, complete with a private data area of
- * size @sizeof_priv.  A 32-byte (not bit) alignment is enforced for
- * this private data area.
- */
-
-struct net_device *alloc_ltalkdev(int sizeof_priv)
-{
-	return alloc_netdev(sizeof_priv, "lt%d", NET_NAME_UNKNOWN,
-			    ltalk_setup);
-}
-EXPORT_SYMBOL(alloc_ltalkdev);
-- 
cgit v1.2.3


From ec841b8d73cff37f8960e209017efe1eb2fb21f2 Mon Sep 17 00:00:00 2001
From: Xu Yang <xu.yang_2@nxp.com>
Date: Mon, 23 Sep 2024 16:12:01 +0800
Subject: usb: chipidea: add CI_HDRC_HAS_SHORT_PKT_LIMIT flag

Currently, the imx deivice controller has below limitations:

1. can't generate short packet interrupt if IOC not set in dTD. So if one
   request span more than one dTDs and only the last dTD set IOC, the usb
   request will pending there if no more data comes.
2. the controller can't accurately deliver data to differtent usb requests
   in some cases due to short packet. For example: one usb request span 3
   dTDs, then if the controller received a short packet the next packet
   will go to 2nd dTD of current request rather than the first dTD of next
   request.
3. can't build a bus packet use multiple dTDs. For example: controller
   needs to send one packet of 512 bytes use dTD1 (200 bytes) + dTD2
   (312 bytes), actually the host side will see 200 bytes short packet.

Based on these limits, add CI_HDRC_HAS_SHORT_PKT_LIMIT flag and use it on
imx platforms.

Signed-off-by: Xu Yang <xu.yang_2@nxp.com>
Acked-by: Peter Chen <peter.chen@kernel.org>
Link: https://lore.kernel.org/r/20240923081203.2851768-1-xu.yang_2@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/chipidea/ci.h          | 1 +
 drivers/usb/chipidea/ci_hdrc_imx.c | 1 +
 drivers/usb/chipidea/core.c        | 2 ++
 include/linux/usb/chipidea.h       | 1 +
 4 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/ci.h b/drivers/usb/chipidea/ci.h
index 2a38e1eb6546..e4b003d060c2 100644
--- a/drivers/usb/chipidea/ci.h
+++ b/drivers/usb/chipidea/ci.h
@@ -260,6 +260,7 @@ struct ci_hdrc {
 	bool				b_sess_valid_event;
 	bool				imx28_write_fix;
 	bool				has_portsc_pec_bug;
+	bool				has_short_pkt_limit;
 	bool				supports_runtime_pm;
 	bool				in_lpm;
 	bool				wakeup_int;
diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c
index d655db733797..f2801700be8e 100644
--- a/drivers/usb/chipidea/ci_hdrc_imx.c
+++ b/drivers/usb/chipidea/ci_hdrc_imx.c
@@ -342,6 +342,7 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev)
 	struct ci_hdrc_platform_data pdata = {
 		.name		= dev_name(&pdev->dev),
 		.capoffset	= DEF_CAPOFFSET,
+		.flags		= CI_HDRC_HAS_SHORT_PKT_LIMIT,
 		.notify_event	= ci_hdrc_imx_notify_event,
 	};
 	int ret;
diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index 36143b2ae482..2d01af746ff8 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c
@@ -1076,6 +1076,8 @@ static int ci_hdrc_probe(struct platform_device *pdev)
 		CI_HDRC_SUPPORTS_RUNTIME_PM);
 	ci->has_portsc_pec_bug = !!(ci->platdata->flags &
 		CI_HDRC_HAS_PORTSC_PEC_MISSED);
+	ci->has_short_pkt_limit = !!(ci->platdata->flags &
+		CI_HDRC_HAS_SHORT_PKT_LIMIT);
 	platform_set_drvdata(pdev, ci);
 
 	ret = hw_device_init(ci, base);
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index 5a7f96684ea2..ebdfef124b2b 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -65,6 +65,7 @@ struct ci_hdrc_platform_data {
 #define CI_HDRC_PHY_VBUS_CONTROL	BIT(16)
 #define CI_HDRC_HAS_PORTSC_PEC_MISSED	BIT(17)
 #define CI_HDRC_FORCE_VBUS_ACTIVE_ALWAYS	BIT(18)
+#define	CI_HDRC_HAS_SHORT_PKT_LIMIT	BIT(19)
 	enum usb_dr_mode	dr_mode;
 #define CI_HDRC_CONTROLLER_RESET_EVENT		0
 #define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
-- 
cgit v1.2.3


From 8b7fd6a15f8c32760c2026a62dcf55219b4da15b Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Tue, 1 Oct 2024 16:30:05 +0200
Subject: HID: bpf: move HID-BPF report descriptor fixup earlier

Currently, hid_bpf_rdesc_fixup() is called once the match between the
HID device and the driver is done. This can be problematic in case
the driver selected by the kernel would change the report descriptor
after the fact.

To give a chance for hid_bpf_rdesc_fixup() to provide hints on to how
to select a dedicated driver or not, move the call to that BPF hook
earlier in the .probe() process, when we get the first match.

However, this means that we might get called more than once (typically
once for hid-generic, and once for hid-vendor-specific). So we store the
result of HID-BPF fixup in struct hid_device. Basically, this means that
->bpf_rdesc can replace ->dev_rdesc when it was used in the code.

In order to not grow struct hid_device, some fields are re-ordered. This
was the output of pahole for the first 128 bytes:
struct hid_device {
	__u8 *                     dev_rdesc;            /*     0     8 */
	unsigned int               dev_rsize;            /*     8     4 */

	/* XXX 4 bytes hole, try to pack */

	__u8 *                     rdesc;                /*    16     8 */
	unsigned int               rsize;                /*    24     4 */

	/* XXX 4 bytes hole, try to pack */

	struct hid_collection *    collection;           /*    32     8 */
	unsigned int               collection_size;      /*    40     4 */
	unsigned int               maxcollection;        /*    44     4 */
	unsigned int               maxapplication;       /*    48     4 */
	__u16                      bus;                  /*    52     2 */
	__u16                      group;                /*    54     2 */
	__u32                      vendor;               /*    56     4 */
	__u32                      product;              /*    60     4 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	__u32                      version;              /*    64     4 */
	enum hid_type              type;                 /*    68     4 */
	unsigned int               country;              /*    72     4 */

	/* XXX 4 bytes hole, try to pack */

	struct hid_report_enum     report_enum[3];       /*    80  6216 */

Basically, we got three holes of 4 bytes. We can reorder things a little
and makes those 3 holes a continuous 12 bytes hole, which can be replaced
by the new pointer and the new unsigned int we need.

In terms of code allocation, when not using HID-BPF, we are back to kernel
v6.2 in hid_open_report(). These multiple kmemdup() calls will be fixed
in a later commit.

Link: https://patch.msgid.link/20241001-hid-bpf-hid-generic-v3-1-2ef1019468df@kernel.org
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 drivers/hid/bpf/hid_bpf_dispatch.c |  9 ++++++---
 drivers/hid/hid-core.c             | 30 ++++++++++++++++++++++++++----
 include/linux/hid.h                | 18 ++++++++++--------
 include/linux/hid_bpf.h            | 11 +++--------
 4 files changed, 45 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c
index 8420c227e21b..961b7f35aa67 100644
--- a/drivers/hid/bpf/hid_bpf_dispatch.c
+++ b/drivers/hid/bpf/hid_bpf_dispatch.c
@@ -148,7 +148,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(dispatch_hid_bpf_output_report);
 
-u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size)
+const u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size)
 {
 	int ret;
 	struct hid_bpf_ctx_kern ctx_kern = {
@@ -183,7 +183,7 @@ u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned
 
  ignore_bpf:
 	kfree(ctx_kern.data);
-	return kmemdup(rdesc, *size, GFP_KERNEL);
+	return rdesc;
 }
 EXPORT_SYMBOL_GPL(call_hid_bpf_rdesc_fixup);
 
@@ -260,8 +260,11 @@ int hid_bpf_allocate_event_data(struct hid_device *hdev)
 
 int hid_bpf_reconnect(struct hid_device *hdev)
 {
-	if (!test_and_set_bit(ffs(HID_STAT_REPROBED), &hdev->status))
+	if (!test_and_set_bit(ffs(HID_STAT_REPROBED), &hdev->status)) {
+		/* trigger call to call_hid_bpf_rdesc_fixup() during the next probe */
+		hdev->bpf_rsize = 0;
 		return device_reprobe(&hdev->dev);
+	}
 
 	return 0;
 }
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 30de92d0bf0f..d6bf933623e8 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -698,6 +698,14 @@ static void hid_close_report(struct hid_device *device)
 	device->status &= ~HID_STAT_PARSED;
 }
 
+static inline void hid_free_bpf_rdesc(struct hid_device *hdev)
+{
+	/* bpf_rdesc is either equal to dev_rdesc or allocated by call_hid_bpf_rdesc_fixup() */
+	if (hdev->bpf_rdesc != hdev->dev_rdesc)
+		kfree(hdev->bpf_rdesc);
+	hdev->bpf_rdesc = NULL;
+}
+
 /*
  * Free a device structure, all reports, and all fields.
  */
@@ -707,6 +715,7 @@ void hiddev_free(struct kref *ref)
 	struct hid_device *hid = container_of(ref, struct hid_device, ref);
 
 	hid_close_report(hid);
+	hid_free_bpf_rdesc(hid);
 	kfree(hid->dev_rdesc);
 	kfree(hid);
 }
@@ -1221,13 +1230,12 @@ int hid_open_report(struct hid_device *device)
 	if (WARN_ON(device->status & HID_STAT_PARSED))
 		return -EBUSY;
 
-	start = device->dev_rdesc;
+	start = device->bpf_rdesc;
 	if (WARN_ON(!start))
 		return -ENODEV;
-	size = device->dev_rsize;
+	size = device->bpf_rsize;
 
-	/* call_hid_bpf_rdesc_fixup() ensures we work on a copy of rdesc */
-	buf = call_hid_bpf_rdesc_fixup(device, start, &size);
+	buf = kmemdup(start, size, GFP_KERNEL);
 	if (buf == NULL)
 		return -ENOMEM;
 
@@ -2684,6 +2692,18 @@ static int __hid_device_probe(struct hid_device *hdev, struct hid_driver *hdrv)
 	const struct hid_device_id *id;
 	int ret;
 
+	if (!hdev->bpf_rsize) {
+		/* in case a bpf program gets detached, we need to free the old one */
+		hid_free_bpf_rdesc(hdev);
+
+		/* keep this around so we know we called it once */
+		hdev->bpf_rsize = hdev->dev_rsize;
+
+		/* call_hid_bpf_rdesc_fixup will always return a valid pointer */
+		hdev->bpf_rdesc = call_hid_bpf_rdesc_fixup(hdev, hdev->dev_rdesc,
+							   &hdev->bpf_rsize);
+	}
+
 	if (!hid_check_device_match(hdev, hdrv, &id))
 		return -ENODEV;
 
@@ -2940,9 +2960,11 @@ static void hid_remove_device(struct hid_device *hdev)
 		hid_debug_unregister(hdev);
 		hdev->status &= ~HID_STAT_ADDED;
 	}
+	hid_free_bpf_rdesc(hdev);
 	kfree(hdev->dev_rdesc);
 	hdev->dev_rdesc = NULL;
 	hdev->dev_rsize = 0;
+	hdev->bpf_rsize = 0;
 }
 
 /**
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 121d5b8bc867..ff58b5ceb62e 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -599,15 +599,17 @@ enum hid_battery_status {
 struct hid_driver;
 struct hid_ll_driver;
 
-struct hid_device {							/* device report descriptor */
-	const __u8 *dev_rdesc;
-	unsigned dev_rsize;
-	const __u8 *rdesc;
-	unsigned rsize;
+struct hid_device {
+	const __u8 *dev_rdesc;						/* device report descriptor */
+	const __u8 *bpf_rdesc;						/* bpf modified report descriptor, if any */
+	const __u8 *rdesc;						/* currently used report descriptor */
+	unsigned int dev_rsize;
+	unsigned int bpf_rsize;
+	unsigned int rsize;
+	unsigned int collection_size;					/* Number of allocated hid_collections */
 	struct hid_collection *collection;				/* List of HID collections */
-	unsigned collection_size;					/* Number of allocated hid_collections */
-	unsigned maxcollection;						/* Number of parsed collections */
-	unsigned maxapplication;					/* Number of applications */
+	unsigned int maxcollection;						/* Number of parsed collections */
+	unsigned int maxapplication;					/* Number of applications */
 	__u16 bus;							/* BUS ID */
 	__u16 group;							/* Report group */
 	__u32 vendor;							/* Vendor ID */
diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h
index 6a47223e6460..a6876ab29004 100644
--- a/include/linux/hid_bpf.h
+++ b/include/linux/hid_bpf.h
@@ -212,7 +212,7 @@ int hid_bpf_connect_device(struct hid_device *hdev);
 void hid_bpf_disconnect_device(struct hid_device *hdev);
 void hid_bpf_destroy_device(struct hid_device *hid);
 int hid_bpf_device_init(struct hid_device *hid);
-u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size);
+const u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size);
 #else /* CONFIG_HID_BPF */
 static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type,
 						u8 *data, u32 *size, int interrupt,
@@ -228,13 +228,8 @@ static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; }
 static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {}
 static inline void hid_bpf_destroy_device(struct hid_device *hid) {}
 static inline int hid_bpf_device_init(struct hid_device *hid) { return 0; }
-/*
- * This specialized allocator has to be a macro for its allocations to be
- * accounted separately (to have a separate alloc_tag). The typecast is
- * intentional to enforce typesafety.
- */
-#define call_hid_bpf_rdesc_fixup(_hdev, _rdesc, _size)	\
-		((u8 *)kmemdup(_rdesc, *(_size), GFP_KERNEL))
+static inline const u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc,
+						 unsigned int *size) { return rdesc; }
 
 #endif /* CONFIG_HID_BPF */
 
-- 
cgit v1.2.3


From 645c224ac5f6e0013931c342ea707b398d24d410 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Tue, 1 Oct 2024 16:30:12 +0200
Subject: HID: add per device quirk to force bind to hid-generic

We already have the possibility to force not binding to hid-generic and
rely on a dedicated driver, but we couldn't do the other way around.

This is useful for BPF programs where we are fixing the report descriptor
and the events, but want to avoid a specialized driver to come after BPF
which would unwind everything that is done there.

Reviewed-by: Peter Hutterer <peter.hutterer@who-t.net>
Link: https://patch.msgid.link/20241001-hid-bpf-hid-generic-v3-8-2ef1019468df@kernel.org
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 drivers/hid/hid-core.c    | 5 +++--
 drivers/hid/hid-generic.c | 3 +++
 include/linux/hid.h       | 2 ++
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 8e879937e956..f1c23b21e96a 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -2698,9 +2698,10 @@ static bool hid_check_device_match(struct hid_device *hdev,
 	/*
 	 * hid-generic implements .match(), so we must be dealing with a
 	 * different HID driver here, and can simply check if
-	 * hid_ignore_special_drivers is set or not.
+	 * hid_ignore_special_drivers or HID_QUIRK_IGNORE_SPECIAL_DRIVER
+	 * are set or not.
 	 */
-	return !hid_ignore_special_drivers;
+	return !hid_ignore_special_drivers && !(hdev->quirks & HID_QUIRK_IGNORE_SPECIAL_DRIVER);
 }
 
 static int __hid_device_probe(struct hid_device *hdev, struct hid_driver *hdrv)
diff --git a/drivers/hid/hid-generic.c b/drivers/hid/hid-generic.c
index f9db991d3c5a..88882c1bfffe 100644
--- a/drivers/hid/hid-generic.c
+++ b/drivers/hid/hid-generic.c
@@ -40,6 +40,9 @@ static bool hid_generic_match(struct hid_device *hdev,
 	if (ignore_special_driver)
 		return true;
 
+	if (hdev->quirks & HID_QUIRK_IGNORE_SPECIAL_DRIVER)
+		return true;
+
 	if (hdev->quirks & HID_QUIRK_HAVE_SPECIAL_DRIVER)
 		return false;
 
diff --git a/include/linux/hid.h b/include/linux/hid.h
index ff58b5ceb62e..63330d623335 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -359,6 +359,7 @@ struct hid_item {
  * | @HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP:
  * | @HID_QUIRK_HAVE_SPECIAL_DRIVER:
  * | @HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE:
+ * | @HID_QUIRK_IGNORE_SPECIAL_DRIVER
  * | @HID_QUIRK_FULLSPEED_INTERVAL:
  * | @HID_QUIRK_NO_INIT_REPORTS:
  * | @HID_QUIRK_NO_IGNORE:
@@ -384,6 +385,7 @@ struct hid_item {
 #define HID_QUIRK_HAVE_SPECIAL_DRIVER		BIT(19)
 #define HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE	BIT(20)
 #define HID_QUIRK_NOINVERT			BIT(21)
+#define HID_QUIRK_IGNORE_SPECIAL_DRIVER		BIT(22)
 #define HID_QUIRK_FULLSPEED_INTERVAL		BIT(28)
 #define HID_QUIRK_NO_INIT_REPORTS		BIT(29)
 #define HID_QUIRK_NO_IGNORE			BIT(30)
-- 
cgit v1.2.3


From 5acd957a986c167d357e617a887630203be29ea4 Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Tue, 1 Oct 2024 13:37:04 +0300
Subject: net/mlx5: hw counters: Make fc_stats & fc_pool private

The mlx5_fc_stats and mlx5_fc_pool structs are only used from
fs_counters.c. As such, make them private there.

mlx5_fc_pool is not used or referenced at all outside fs_counters.

mlx5_fc_stats is referenced from mlx5_core_dev, so instead of having it
as a direct member (which requires exporting it from fs_counters), store
a pointer to it, allocate it on init and clear it on destroy.
One caveat is that a simple container_of to get from a 'work' struct to
the outermost mlx5_core_dev struct directly no longer works, so an extra
pointer had to be added to mlx5_fc_stats back to the parent dev.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241001103709.58127-2-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  | 79 ++++++++++++++++------
 include/linux/mlx5/driver.h                        | 33 +--------
 2 files changed, 60 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 0c26d707eed2..7d6174d0f260 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -69,6 +69,36 @@ struct mlx5_fc {
 	struct mlx5_fc_cache cache ____cacheline_aligned_in_smp;
 };
 
+struct mlx5_fc_pool {
+	struct mlx5_core_dev *dev;
+	struct mutex pool_lock; /* protects pool lists */
+	struct list_head fully_used;
+	struct list_head partially_used;
+	struct list_head unused;
+	int available_fcs;
+	int used_fcs;
+	int threshold;
+};
+
+struct mlx5_fc_stats {
+	spinlock_t counters_idr_lock; /* protects counters_idr */
+	struct idr counters_idr;
+	struct list_head counters;
+	struct llist_head addlist;
+	struct llist_head dellist;
+
+	struct workqueue_struct *wq;
+	struct delayed_work work;
+	unsigned long next_query;
+	unsigned long sampling_interval; /* jiffies */
+	u32 *bulk_query_out;
+	int bulk_query_len;
+	size_t num_counters;
+	bool bulk_query_alloc_failed;
+	unsigned long next_bulk_query_alloc;
+	struct mlx5_fc_pool fc_pool;
+};
+
 static void mlx5_fc_pool_init(struct mlx5_fc_pool *fc_pool, struct mlx5_core_dev *dev);
 static void mlx5_fc_pool_cleanup(struct mlx5_fc_pool *fc_pool);
 static struct mlx5_fc *mlx5_fc_pool_acquire_counter(struct mlx5_fc_pool *fc_pool);
@@ -109,7 +139,7 @@ static void mlx5_fc_pool_release_counter(struct mlx5_fc_pool *fc_pool, struct ml
 static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev,
 						      u32 id)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	unsigned long next_id = (unsigned long)id + 1;
 	struct mlx5_fc *counter;
 	unsigned long tmp;
@@ -137,7 +167,7 @@ static void mlx5_fc_stats_insert(struct mlx5_core_dev *dev,
 static void mlx5_fc_stats_remove(struct mlx5_core_dev *dev,
 				 struct mlx5_fc *counter)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	list_del(&counter->list);
 
@@ -178,7 +208,7 @@ static void mlx5_fc_stats_query_counter_range(struct mlx5_core_dev *dev,
 					      struct mlx5_fc *first,
 					      u32 last_id)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	bool query_more_counters = (first->id <= last_id);
 	int cur_bulk_len = fc_stats->bulk_query_len;
 	u32 *data = fc_stats->bulk_query_out;
@@ -225,7 +255,7 @@ static void mlx5_fc_free(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 
 static void mlx5_fc_release(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	if (counter->bulk)
 		mlx5_fc_pool_release_counter(&fc_stats->fc_pool, counter);
@@ -235,7 +265,7 @@ static void mlx5_fc_release(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 
 static void mlx5_fc_stats_bulk_query_size_increase(struct mlx5_core_dev *dev)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	int max_bulk_len = get_max_bulk_query_len(dev);
 	unsigned long now = jiffies;
 	u32 *bulk_query_out_tmp;
@@ -270,9 +300,9 @@ static void mlx5_fc_stats_bulk_query_size_increase(struct mlx5_core_dev *dev)
 
 static void mlx5_fc_stats_work(struct work_struct *work)
 {
-	struct mlx5_core_dev *dev = container_of(work, struct mlx5_core_dev,
-						 priv.fc_stats.work.work);
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = container_of(work, struct mlx5_fc_stats,
+						      work.work);
+	struct mlx5_core_dev *dev = fc_stats->fc_pool.dev;
 	/* Take dellist first to ensure that counters cannot be deleted before
 	 * they are inserted.
 	 */
@@ -334,7 +364,7 @@ static struct mlx5_fc *mlx5_fc_single_alloc(struct mlx5_core_dev *dev)
 
 static struct mlx5_fc *mlx5_fc_acquire(struct mlx5_core_dev *dev, bool aging)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	struct mlx5_fc *counter;
 
 	if (aging && MLX5_CAP_GEN(dev, flow_counter_bulk_alloc) != 0) {
@@ -349,7 +379,7 @@ static struct mlx5_fc *mlx5_fc_acquire(struct mlx5_core_dev *dev, bool aging)
 struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging)
 {
 	struct mlx5_fc *counter = mlx5_fc_acquire(dev, aging);
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	int err;
 
 	if (IS_ERR(counter))
@@ -389,7 +419,7 @@ err_out_alloc:
 struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
 {
 	struct mlx5_fc *counter = mlx5_fc_create_ex(dev, aging);
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	if (aging)
 		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
@@ -405,7 +435,7 @@ EXPORT_SYMBOL(mlx5_fc_id);
 
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	if (!counter)
 		return;
@@ -422,10 +452,14 @@ EXPORT_SYMBOL(mlx5_fc_destroy);
 
 int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats;
 	int init_bulk_len;
 	int init_out_len;
 
+	fc_stats = kzalloc(sizeof(*fc_stats), GFP_KERNEL);
+	if (!fc_stats)
+		return -ENOMEM;
+
 	spin_lock_init(&fc_stats->counters_idr_lock);
 	idr_init(&fc_stats->counters_idr);
 	INIT_LIST_HEAD(&fc_stats->counters);
@@ -436,7 +470,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 	init_out_len = mlx5_cmd_fc_get_bulk_query_out_len(init_bulk_len);
 	fc_stats->bulk_query_out = kzalloc(init_out_len, GFP_KERNEL);
 	if (!fc_stats->bulk_query_out)
-		return -ENOMEM;
+		goto err_bulk;
 	fc_stats->bulk_query_len = init_bulk_len;
 
 	fc_stats->wq = create_singlethread_workqueue("mlx5_fc");
@@ -447,23 +481,27 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 	INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work);
 
 	mlx5_fc_pool_init(&fc_stats->fc_pool, dev);
+	dev->priv.fc_stats = fc_stats;
+
 	return 0;
 
 err_wq_create:
 	kfree(fc_stats->bulk_query_out);
+err_bulk:
+	kfree(fc_stats);
 	return -ENOMEM;
 }
 
 void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	struct llist_node *tmplist;
 	struct mlx5_fc *counter;
 	struct mlx5_fc *tmp;
 
-	cancel_delayed_work_sync(&dev->priv.fc_stats.work);
-	destroy_workqueue(dev->priv.fc_stats.wq);
-	dev->priv.fc_stats.wq = NULL;
+	cancel_delayed_work_sync(&fc_stats->work);
+	destroy_workqueue(fc_stats->wq);
+	fc_stats->wq = NULL;
 
 	tmplist = llist_del_all(&fc_stats->addlist);
 	llist_for_each_entry_safe(counter, tmp, tmplist, addlist)
@@ -475,6 +513,7 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 	mlx5_fc_pool_cleanup(&fc_stats->fc_pool);
 	idr_destroy(&fc_stats->counters_idr);
 	kfree(fc_stats->bulk_query_out);
+	kfree(fc_stats);
 }
 
 int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
@@ -518,7 +557,7 @@ void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
 			      struct delayed_work *dwork,
 			      unsigned long delay)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	queue_delayed_work(fc_stats->wq, dwork, delay);
 }
@@ -526,7 +565,7 @@ void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
 void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
 				      unsigned long interval)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	fc_stats->sampling_interval = min_t(unsigned long, interval,
 					    fc_stats->sampling_interval);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e23c692a34c7..fc7e6153b73d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -45,7 +45,6 @@
 #include <linux/workqueue.h>
 #include <linux/mempool.h>
 #include <linux/interrupt.h>
-#include <linux/idr.h>
 #include <linux/notifier.h>
 #include <linux/refcount.h>
 #include <linux/auxiliary_bus.h>
@@ -474,36 +473,6 @@ struct mlx5_core_sriov {
 	u16			max_ec_vfs;
 };
 
-struct mlx5_fc_pool {
-	struct mlx5_core_dev *dev;
-	struct mutex pool_lock; /* protects pool lists */
-	struct list_head fully_used;
-	struct list_head partially_used;
-	struct list_head unused;
-	int available_fcs;
-	int used_fcs;
-	int threshold;
-};
-
-struct mlx5_fc_stats {
-	spinlock_t counters_idr_lock; /* protects counters_idr */
-	struct idr counters_idr;
-	struct list_head counters;
-	struct llist_head addlist;
-	struct llist_head dellist;
-
-	struct workqueue_struct *wq;
-	struct delayed_work work;
-	unsigned long next_query;
-	unsigned long sampling_interval; /* jiffies */
-	u32 *bulk_query_out;
-	int bulk_query_len;
-	size_t num_counters;
-	bool bulk_query_alloc_failed;
-	unsigned long next_bulk_query_alloc;
-	struct mlx5_fc_pool fc_pool;
-};
-
 struct mlx5_events;
 struct mlx5_mpfs;
 struct mlx5_eswitch;
@@ -630,7 +599,7 @@ struct mlx5_priv {
 	struct mlx5_devcom_comp_dev *hca_devcom_comp;
 	struct mlx5_fw_reset	*fw_reset;
 	struct mlx5_core_roce	roce;
-	struct mlx5_fc_stats		fc_stats;
+	struct mlx5_fc_stats		*fc_stats;
 	struct mlx5_rl_table            rl_table;
 	struct mlx5_ft_pool		*ft_pool;
 
-- 
cgit v1.2.3


From d1c9cffe4b01f4d8bc52169139a5fedd48908abc Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Tue, 1 Oct 2024 13:37:09 +0300
Subject: net/mlx5: hw counters: Remove mlx5_fc_create_ex

It no longer serves any purpose and is identical to mlx5_fc_create upon
which it was originally based of.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241001103709.58127-7-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c    | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 7 +------
 include/linux/mlx5/fs.h                               | 3 ---
 3 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
index dcfccaaa8d91..4877a9d86807 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
@@ -1026,7 +1026,7 @@ mlx5_tc_ct_counter_create(struct mlx5_tc_ct_priv *ct_priv)
 		return ERR_PTR(-ENOMEM);
 
 	counter->is_shared = false;
-	counter->counter = mlx5_fc_create_ex(ct_priv->dev, true);
+	counter->counter = mlx5_fc_create(ct_priv->dev, true);
 	if (IS_ERR(counter->counter)) {
 		ct_dbg("Failed to create counter for ct entry");
 		ret = PTR_ERR(counter->counter);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 0b80c33cba5f..62d0c689796b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -275,7 +275,7 @@ static struct mlx5_fc *mlx5_fc_acquire(struct mlx5_core_dev *dev, bool aging)
 	return mlx5_fc_single_alloc(dev);
 }
 
-struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging)
+struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
 {
 	struct mlx5_fc *counter = mlx5_fc_acquire(dev, aging);
 	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
@@ -304,11 +304,6 @@ err_out_alloc:
 	mlx5_fc_release(dev, counter);
 	return ERR_PTR(err);
 }
-
-struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
-{
-	return mlx5_fc_create_ex(dev, aging);
-}
 EXPORT_SYMBOL(mlx5_fc_create);
 
 u32 mlx5_fc_id(struct mlx5_fc *counter)
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index b744e554f014..438db888bde0 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -298,9 +298,6 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler,
 
 struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
 
-/* As mlx5_fc_create() but doesn't queue stats refresh thread. */
-struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging);
-
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
 u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter);
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
-- 
cgit v1.2.3


From f858cc9eed5b05cbe38d7ffd2787c21e3718eb7d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 3 Oct 2024 12:12:18 +0000
Subject: net: add IFLA_MAX_PACING_OFFLOAD_HORIZON device attribute

Some network devices have the ability to offload EDT (Earliest
Departure Time) which is the model used for TCP pacing and FQ
packet scheduler.

Some of them implement the timing wheel mechanism described in
https://saeed.github.io/files/carousel-sigcomm17.pdf
with an associated 'timing wheel horizon'.

This patch adds dev->max_pacing_offload_horizon expressing
this timing wheel horizon in nsec units.

This is a read-only attribute.

Unless a driver sets it, dev->max_pacing_offload_horizon
is zero.

v2: addressed Jakub feedback ( https://lore.kernel.org/netdev/20240930152304.472767-2-edumazet@google.com/T/#mf6294d714c41cc459962154cc2580ce3c9693663 )
v3: added yaml doc (also per Jakub feedback)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20241003121219.2396589-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/rt_link.yaml               | 4 ++++
 Documentation/networking/net_cachelines/net_device.rst | 1 +
 include/linux/netdevice.h                              | 4 ++++
 include/uapi/linux/if_link.h                           | 1 +
 net/core/rtnetlink.c                                   | 4 ++++
 tools/include/uapi/linux/if_link.h                     | 1 +
 6 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml
index 0c4d5d40cae9..d7131a1afadf 100644
--- a/Documentation/netlink/specs/rt_link.yaml
+++ b/Documentation/netlink/specs/rt_link.yaml
@@ -1137,6 +1137,10 @@ attribute-sets:
         name: dpll-pin
         type: nest
         nested-attributes: link-dpll-pin-attrs
+      -
+        name: max-pacing-offload-horizon
+        type: uint
+        doc: EDT offload horizon supported by the device (in nsec).
   -
     name: af-spec-attrs
     attributes:
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 22b07c814f4a..49f03cb78c6e 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -183,3 +183,4 @@ struct_devlink_port*                devlink_port
 struct_dpll_pin*                    dpll_pin                                                        
 struct hlist_head                   page_pools
 struct dim_irq_moder*               irq_moder
+u64                                 max_pacing_offload_horizon
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4d20c776a4ff..49a7e7db0883 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2009,6 +2009,8 @@ enum netdev_reg_state {
  *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
  *		   where the clock is recovered.
  *
+ *	@max_pacing_offload_horizon: max EDT offload horizon in nsec.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -2399,6 +2401,8 @@ struct net_device {
 	/** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */
 	struct dim_irq_moder	*irq_moder;
 
+	u64			max_pacing_offload_horizon;
+
 	u8			priv[] ____cacheline_aligned
 				       __counted_by(priv_len);
 } ____cacheline_aligned;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 6dc258993b17..506ba9c80e83 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -377,6 +377,7 @@ enum {
 	IFLA_GSO_IPV4_MAX_SIZE,
 	IFLA_GRO_IPV4_MAX_SIZE,
 	IFLA_DPLL_PIN,
+	IFLA_MAX_PACING_OFFLOAD_HORIZON,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f0a520987085..682d8d3127db 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1118,6 +1118,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
 	       + rtnl_devlink_port_size(dev)
 	       + rtnl_dpll_pin_size(dev)
+	       + nla_total_size(8)  /* IFLA_MAX_PACING_OFFLOAD_HORIZON */
 	       + 0;
 }
 
@@ -1867,6 +1868,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 			READ_ONCE(dev->tso_max_size)) ||
 	    nla_put_u32(skb, IFLA_TSO_MAX_SEGS,
 			READ_ONCE(dev->tso_max_segs)) ||
+	    nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON,
+			 READ_ONCE(dev->max_pacing_offload_horizon)) ||
 #ifdef CONFIG_RPS
 	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES,
 			READ_ONCE(dev->num_rx_queues)) ||
@@ -1975,6 +1978,7 @@ nla_put_failure:
 }
 
 static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+	[IFLA_UNSPEC]		= { .strict_start_type = IFLA_DPLL_PIN },
 	[IFLA_IFNAME]		= { .type = NLA_STRING, .len = IFNAMSIZ-1 },
 	[IFLA_ADDRESS]		= { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
 	[IFLA_BROADCAST]	= { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index f0d71b2a3f1e..96ec2b01e725 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -377,6 +377,7 @@ enum {
 	IFLA_GSO_IPV4_MAX_SIZE,
 	IFLA_GRO_IPV4_MAX_SIZE,
 	IFLA_DPLL_PIN,
+	IFLA_MAX_PACING_OFFLOAD_HORIZON,
 	__IFLA_MAX
 };
 
-- 
cgit v1.2.3


From 1e562deacecca1f1bec7d23da526904a1e87525e Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Tue, 10 Sep 2024 16:30:16 +0200
Subject: crypto: rsassa-pkcs1 - Migrate to sig_alg backend

A sig_alg backend has just been introduced with the intent of moving all
asymmetric sign/verify algorithms to it one by one.

Migrate the sign/verify operations from rsa-pkcs1pad.c to a separate
rsassa-pkcs1.c which uses the new backend.

Consequently there are now two templates which build on the "rsa"
akcipher_alg:

* The existing "pkcs1pad" template, which is instantiated as an
  akcipher_instance and retains the encrypt/decrypt operations of
  RSAES-PKCS1-v1_5 (RFC 8017 sec 7.2).

* The new "pkcs1" template, which is instantiated as a sig_instance
  and contains the sign/verify operations of RSASSA-PKCS1-v1_5
  (RFC 8017 sec 8.2).

In a separate step, rsa-pkcs1pad.c could optionally be renamed to
rsaes-pkcs1.c for clarity.  Additional "oaep" and "pss" templates
could be added for RSAES-OAEP and RSASSA-PSS.

Note that it's currently allowed to allocate a "pkcs1pad(rsa)" transform
without specifying a hash algorithm.  That makes sense if the transform
is only used for encrypt/decrypt and continues to be supported.  But for
sign/verify, such transforms previously did not insert the Full Hash
Prefix into the padding.  The resulting message encoding was incompliant
with EMSA-PKCS1-v1_5 (RFC 8017 sec 9.2) and therefore nonsensical.

From here on in, it is no longer allowed to allocate a transform without
specifying a hash algorithm if the transform is used for sign/verify
operations.  This simplifies the code because the insertion of the Full
Hash Prefix is no longer optional, so various "if (digest_info)" clauses
can be removed.

There has been a previous attempt to forbid transform allocation without
specifying a hash algorithm, namely by commit c0d20d22e0ad ("crypto:
rsa-pkcs1pad - Require hash to be present").  It had to be rolled back
with commit b3a8c8a5ebb5 ("crypto: rsa-pkcs1pad: Allow hash to be
optional [ver #2]"), presumably because it broke allocation of a
transform which was solely used for encrypt/decrypt, not sign/verify.
Avoid such breakage by allowing transform allocation for encrypt/decrypt
with and without specifying a hash algorithm (and simply ignoring the
hash algorithm in the former case).

So again, specifying a hash algorithm is now mandatory for sign/verify,
but optional and ignored for encrypt/decrypt.

The new sig_alg API uses kernel buffers instead of sglists, which
avoids the overhead of copying signature and digest from sglists back
into kernel buffers.  rsassa-pkcs1.c is thus simplified quite a bit.

sig_alg is always synchronous, whereas the underlying "rsa" akcipher_alg
may be asynchronous.  So await the result of the akcipher_alg, similar
to crypto_akcipher_sync_{en,de}crypt().

As part of the migration, rename "rsa_digest_info" to "hash_prefix" to
adhere to the spec language in RFC 9580.  Otherwise keep the code
unmodified wherever possible to ease reviewing and bisecting.  Leave
several simplification and hardening opportunities to separate commits.

rsassa-pkcs1.c uses modern __free() syntax for allocation of buffers
which need to be freed by kfree_sensitive(), hence a DEFINE_FREE()
clause for kfree_sensitive() is introduced herein as a byproduct.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig                      |   1 +
 crypto/Makefile                     |   1 +
 crypto/asymmetric_keys/public_key.c |  10 +-
 crypto/rsa-pkcs1pad.c               | 341 ++---------------------------
 crypto/rsa.c                        |  17 +-
 crypto/rsassa-pkcs1.c               | 422 ++++++++++++++++++++++++++++++++++++
 crypto/testmgr.c                    |  22 +-
 crypto/testmgr.h                    |   3 +-
 include/crypto/internal/rsa.h       |   1 +
 include/linux/slab.h                |   1 +
 security/integrity/ima/ima_main.c   |   6 +-
 11 files changed, 480 insertions(+), 345 deletions(-)
 create mode 100644 crypto/rsassa-pkcs1.c

(limited to 'include/linux')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 989418fe75eb..b3fb3b2ae12f 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -250,6 +250,7 @@ config CRYPTO_RSA
 	tristate "RSA (Rivest-Shamir-Adleman)"
 	select CRYPTO_AKCIPHER
 	select CRYPTO_MANAGER
+	select CRYPTO_SIG
 	select MPILIB
 	select ASN1
 	help
diff --git a/crypto/Makefile b/crypto/Makefile
index 4c99e5d376f6..7de29bf843e9 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -48,6 +48,7 @@ rsa_generic-y += rsaprivkey.asn1.o
 rsa_generic-y += rsa.o
 rsa_generic-y += rsa_helper.o
 rsa_generic-y += rsa-pkcs1pad.o
+rsa_generic-y += rsassa-pkcs1.o
 obj-$(CONFIG_CRYPTO_RSA) += rsa_generic.o
 
 $(obj)/ecdsasignature.asn1.o: $(obj)/ecdsasignature.asn1.c $(obj)/ecdsasignature.asn1.h
diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c
index 422940a6706a..3fb27ecd65f6 100644
--- a/crypto/asymmetric_keys/public_key.c
+++ b/crypto/asymmetric_keys/public_key.c
@@ -83,13 +83,19 @@ software_key_determine_akcipher(const struct public_key *pkey,
 		if (strcmp(encoding, "pkcs1") == 0) {
 			*sig = op == kernel_pkey_sign ||
 			       op == kernel_pkey_verify;
-			if (!hash_algo) {
+			if (!*sig) {
+				/*
+				 * For encrypt/decrypt, hash_algo is not used
+				 * but allowed to be set for historic reasons.
+				 */
 				n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME,
 					     "pkcs1pad(%s)",
 					     pkey->pkey_algo);
 			} else {
+				if (!hash_algo)
+					return -EINVAL;
 				n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME,
-					     "pkcs1pad(%s,%s)",
+					     "pkcs1(%s,%s)",
 					     pkey->pkey_algo, hash_algo);
 			}
 			return n >= CRYPTO_MAX_ALG_NAME ? -EINVAL : 0;
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 3c5fe8c93938..50bdb18e7b48 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -16,101 +16,6 @@
 #include <linux/random.h>
 #include <linux/scatterlist.h>
 
-/*
- * Hash algorithm OIDs plus ASN.1 DER wrappings [RFC4880 sec 5.2.2].
- */
-static const u8 rsa_digest_info_md5[] = {
-	0x30, 0x20, 0x30, 0x0c, 0x06, 0x08,
-	0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x05, /* OID */
-	0x05, 0x00, 0x04, 0x10
-};
-
-static const u8 rsa_digest_info_sha1[] = {
-	0x30, 0x21, 0x30, 0x09, 0x06, 0x05,
-	0x2b, 0x0e, 0x03, 0x02, 0x1a,
-	0x05, 0x00, 0x04, 0x14
-};
-
-static const u8 rsa_digest_info_rmd160[] = {
-	0x30, 0x21, 0x30, 0x09, 0x06, 0x05,
-	0x2b, 0x24, 0x03, 0x02, 0x01,
-	0x05, 0x00, 0x04, 0x14
-};
-
-static const u8 rsa_digest_info_sha224[] = {
-	0x30, 0x2d, 0x30, 0x0d, 0x06, 0x09,
-	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04,
-	0x05, 0x00, 0x04, 0x1c
-};
-
-static const u8 rsa_digest_info_sha256[] = {
-	0x30, 0x31, 0x30, 0x0d, 0x06, 0x09,
-	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01,
-	0x05, 0x00, 0x04, 0x20
-};
-
-static const u8 rsa_digest_info_sha384[] = {
-	0x30, 0x41, 0x30, 0x0d, 0x06, 0x09,
-	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02,
-	0x05, 0x00, 0x04, 0x30
-};
-
-static const u8 rsa_digest_info_sha512[] = {
-	0x30, 0x51, 0x30, 0x0d, 0x06, 0x09,
-	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03,
-	0x05, 0x00, 0x04, 0x40
-};
-
-static const u8 rsa_digest_info_sha3_256[] = {
-	0x30, 0x31, 0x30, 0x0d, 0x06, 0x09,
-	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x08,
-	0x05, 0x00, 0x04, 0x20
-};
-
-static const u8 rsa_digest_info_sha3_384[] = {
-	0x30, 0x41, 0x30, 0x0d, 0x06, 0x09,
-	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x09,
-	0x05, 0x00, 0x04, 0x30
-};
-
-static const u8 rsa_digest_info_sha3_512[] = {
-	0x30, 0x51, 0x30, 0x0d, 0x06, 0x09,
-	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x0A,
-	0x05, 0x00, 0x04, 0x40
-};
-
-static const struct rsa_asn1_template {
-	const char	*name;
-	const u8	*data;
-	size_t		size;
-} rsa_asn1_templates[] = {
-#define _(X) { #X, rsa_digest_info_##X, sizeof(rsa_digest_info_##X) }
-	_(md5),
-	_(sha1),
-	_(rmd160),
-	_(sha256),
-	_(sha384),
-	_(sha512),
-	_(sha224),
-#undef _
-#define _(X) { "sha3-" #X, rsa_digest_info_sha3_##X, sizeof(rsa_digest_info_sha3_##X) }
-	_(256),
-	_(384),
-	_(512),
-#undef _
-	{ NULL }
-};
-
-static const struct rsa_asn1_template *rsa_lookup_asn1(const char *name)
-{
-	const struct rsa_asn1_template *p;
-
-	for (p = rsa_asn1_templates; p->name; p++)
-		if (strcmp(name, p->name) == 0)
-			return p;
-	return NULL;
-}
-
 struct pkcs1pad_ctx {
 	struct crypto_akcipher *child;
 	unsigned int key_size;
@@ -118,7 +23,6 @@ struct pkcs1pad_ctx {
 
 struct pkcs1pad_inst_ctx {
 	struct crypto_akcipher_spawn spawn;
-	const struct rsa_asn1_template *digest_info;
 };
 
 struct pkcs1pad_request {
@@ -148,9 +52,9 @@ static unsigned int pkcs1pad_get_max_size(struct crypto_akcipher *tfm)
 	struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
 
 	/*
-	 * The maximum destination buffer size for the encrypt/sign operations
+	 * The maximum destination buffer size for the encrypt operation
 	 * will be the same as for RSA, even though it's smaller for
-	 * decrypt/verify.
+	 * decrypt.
 	 */
 
 	return ctx->key_size;
@@ -168,7 +72,7 @@ static void pkcs1pad_sg_set_buf(struct scatterlist *sg, void *buf, size_t len,
 		sg_chain(sg, nsegs, next);
 }
 
-static int pkcs1pad_encrypt_sign_complete(struct akcipher_request *req, int err)
+static int pkcs1pad_encrypt_complete(struct akcipher_request *req, int err)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
@@ -207,14 +111,14 @@ out:
 	return err;
 }
 
-static void pkcs1pad_encrypt_sign_complete_cb(void *data, int err)
+static void pkcs1pad_encrypt_complete_cb(void *data, int err)
 {
 	struct akcipher_request *req = data;
 
 	if (err == -EINPROGRESS)
 		goto out;
 
-	err = pkcs1pad_encrypt_sign_complete(req, err);
+	err = pkcs1pad_encrypt_complete(req, err);
 
 out:
 	akcipher_request_complete(req, err);
@@ -255,7 +159,7 @@ static int pkcs1pad_encrypt(struct akcipher_request *req)
 
 	akcipher_request_set_tfm(&req_ctx->child_req, ctx->child);
 	akcipher_request_set_callback(&req_ctx->child_req, req->base.flags,
-			pkcs1pad_encrypt_sign_complete_cb, req);
+			pkcs1pad_encrypt_complete_cb, req);
 
 	/* Reuse output buffer */
 	akcipher_request_set_crypt(&req_ctx->child_req, req_ctx->in_sg,
@@ -263,7 +167,7 @@ static int pkcs1pad_encrypt(struct akcipher_request *req)
 
 	err = crypto_akcipher_encrypt(&req_ctx->child_req);
 	if (err != -EINPROGRESS && err != -EBUSY)
-		return pkcs1pad_encrypt_sign_complete(req, err);
+		return pkcs1pad_encrypt_complete(req, err);
 
 	return err;
 }
@@ -368,195 +272,6 @@ static int pkcs1pad_decrypt(struct akcipher_request *req)
 	return err;
 }
 
-static int pkcs1pad_sign(struct akcipher_request *req)
-{
-	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-	struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
-	struct pkcs1pad_request *req_ctx = akcipher_request_ctx(req);
-	struct akcipher_instance *inst = akcipher_alg_instance(tfm);
-	struct pkcs1pad_inst_ctx *ictx = akcipher_instance_ctx(inst);
-	const struct rsa_asn1_template *digest_info = ictx->digest_info;
-	int err;
-	unsigned int ps_end, digest_info_size = 0;
-
-	if (!ctx->key_size)
-		return -EINVAL;
-
-	if (digest_info)
-		digest_info_size = digest_info->size;
-
-	if (req->src_len + digest_info_size > ctx->key_size - 11)
-		return -EOVERFLOW;
-
-	if (req->dst_len < ctx->key_size) {
-		req->dst_len = ctx->key_size;
-		return -EOVERFLOW;
-	}
-
-	req_ctx->in_buf = kmalloc(ctx->key_size - 1 - req->src_len,
-				  GFP_KERNEL);
-	if (!req_ctx->in_buf)
-		return -ENOMEM;
-
-	ps_end = ctx->key_size - digest_info_size - req->src_len - 2;
-	req_ctx->in_buf[0] = 0x01;
-	memset(req_ctx->in_buf + 1, 0xff, ps_end - 1);
-	req_ctx->in_buf[ps_end] = 0x00;
-
-	if (digest_info)
-		memcpy(req_ctx->in_buf + ps_end + 1, digest_info->data,
-		       digest_info->size);
-
-	pkcs1pad_sg_set_buf(req_ctx->in_sg, req_ctx->in_buf,
-			ctx->key_size - 1 - req->src_len, req->src);
-
-	akcipher_request_set_tfm(&req_ctx->child_req, ctx->child);
-	akcipher_request_set_callback(&req_ctx->child_req, req->base.flags,
-			pkcs1pad_encrypt_sign_complete_cb, req);
-
-	/* Reuse output buffer */
-	akcipher_request_set_crypt(&req_ctx->child_req, req_ctx->in_sg,
-				   req->dst, ctx->key_size - 1, req->dst_len);
-
-	err = crypto_akcipher_decrypt(&req_ctx->child_req);
-	if (err != -EINPROGRESS && err != -EBUSY)
-		return pkcs1pad_encrypt_sign_complete(req, err);
-
-	return err;
-}
-
-static int pkcs1pad_verify_complete(struct akcipher_request *req, int err)
-{
-	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-	struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
-	struct pkcs1pad_request *req_ctx = akcipher_request_ctx(req);
-	struct akcipher_instance *inst = akcipher_alg_instance(tfm);
-	struct pkcs1pad_inst_ctx *ictx = akcipher_instance_ctx(inst);
-	const struct rsa_asn1_template *digest_info = ictx->digest_info;
-	const unsigned int sig_size = req->src_len;
-	const unsigned int digest_size = req->dst_len;
-	unsigned int dst_len;
-	unsigned int pos;
-	u8 *out_buf;
-
-	if (err)
-		goto done;
-
-	err = -EINVAL;
-	dst_len = req_ctx->child_req.dst_len;
-	if (dst_len < ctx->key_size - 1)
-		goto done;
-
-	out_buf = req_ctx->out_buf;
-	if (dst_len == ctx->key_size) {
-		if (out_buf[0] != 0x00)
-			/* Decrypted value had no leading 0 byte */
-			goto done;
-
-		dst_len--;
-		out_buf++;
-	}
-
-	err = -EBADMSG;
-	if (out_buf[0] != 0x01)
-		goto done;
-
-	for (pos = 1; pos < dst_len; pos++)
-		if (out_buf[pos] != 0xff)
-			break;
-
-	if (pos < 9 || pos == dst_len || out_buf[pos] != 0x00)
-		goto done;
-	pos++;
-
-	if (digest_info) {
-		if (digest_info->size > dst_len - pos)
-			goto done;
-		if (crypto_memneq(out_buf + pos, digest_info->data,
-				  digest_info->size))
-			goto done;
-
-		pos += digest_info->size;
-	}
-
-	err = 0;
-
-	if (digest_size != dst_len - pos) {
-		err = -EKEYREJECTED;
-		req->dst_len = dst_len - pos;
-		goto done;
-	}
-	/* Extract appended digest. */
-	sg_pcopy_to_buffer(req->src,
-			   sg_nents_for_len(req->src, sig_size + digest_size),
-			   req_ctx->out_buf + ctx->key_size,
-			   digest_size, sig_size);
-	/* Do the actual verification step. */
-	if (memcmp(req_ctx->out_buf + ctx->key_size, out_buf + pos,
-		   digest_size) != 0)
-		err = -EKEYREJECTED;
-done:
-	kfree_sensitive(req_ctx->out_buf);
-
-	return err;
-}
-
-static void pkcs1pad_verify_complete_cb(void *data, int err)
-{
-	struct akcipher_request *req = data;
-
-	if (err == -EINPROGRESS)
-		goto out;
-
-	err = pkcs1pad_verify_complete(req, err);
-
-out:
-	akcipher_request_complete(req, err);
-}
-
-/*
- * The verify operation is here for completeness similar to the verification
- * defined in RFC2313 section 10.2 except that block type 0 is not accepted,
- * as in RFC2437.  RFC2437 section 9.2 doesn't define any operation to
- * retrieve the DigestInfo from a signature, instead the user is expected
- * to call the sign operation to generate the expected signature and compare
- * signatures instead of the message-digests.
- */
-static int pkcs1pad_verify(struct akcipher_request *req)
-{
-	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-	struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
-	struct pkcs1pad_request *req_ctx = akcipher_request_ctx(req);
-	const unsigned int sig_size = req->src_len;
-	const unsigned int digest_size = req->dst_len;
-	int err;
-
-	if (WARN_ON(req->dst) || WARN_ON(!digest_size) ||
-	    !ctx->key_size || sig_size != ctx->key_size)
-		return -EINVAL;
-
-	req_ctx->out_buf = kmalloc(ctx->key_size + digest_size, GFP_KERNEL);
-	if (!req_ctx->out_buf)
-		return -ENOMEM;
-
-	pkcs1pad_sg_set_buf(req_ctx->out_sg, req_ctx->out_buf,
-			    ctx->key_size, NULL);
-
-	akcipher_request_set_tfm(&req_ctx->child_req, ctx->child);
-	akcipher_request_set_callback(&req_ctx->child_req, req->base.flags,
-			pkcs1pad_verify_complete_cb, req);
-
-	/* Reuse input buffer, output to a new buffer */
-	akcipher_request_set_crypt(&req_ctx->child_req, req->src,
-				   req_ctx->out_sg, sig_size, ctx->key_size);
-
-	err = crypto_akcipher_encrypt(&req_ctx->child_req);
-	if (err != -EINPROGRESS && err != -EBUSY)
-		return pkcs1pad_verify_complete(req, err);
-
-	return err;
-}
-
 static int pkcs1pad_init_tfm(struct crypto_akcipher *tfm)
 {
 	struct akcipher_instance *inst = akcipher_alg_instance(tfm);
@@ -598,7 +313,6 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb)
 	struct akcipher_instance *inst;
 	struct pkcs1pad_inst_ctx *ctx;
 	struct akcipher_alg *rsa_alg;
-	const char *hash_name;
 	int err;
 
 	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AKCIPHER, &mask);
@@ -624,36 +338,15 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb)
 	}
 
 	err = -ENAMETOOLONG;
-	hash_name = crypto_attr_alg_name(tb[2]);
-	if (IS_ERR(hash_name)) {
-		if (snprintf(inst->alg.base.cra_name,
-			     CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)",
-			     rsa_alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME)
-			goto err_free_inst;
-
-		if (snprintf(inst->alg.base.cra_driver_name,
-			     CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)",
-			     rsa_alg->base.cra_driver_name) >=
-			     CRYPTO_MAX_ALG_NAME)
-			goto err_free_inst;
-	} else {
-		ctx->digest_info = rsa_lookup_asn1(hash_name);
-		if (!ctx->digest_info) {
-			err = -EINVAL;
-			goto err_free_inst;
-		}
-
-		if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
-			     "pkcs1pad(%s,%s)", rsa_alg->base.cra_name,
-			     hash_name) >= CRYPTO_MAX_ALG_NAME)
-			goto err_free_inst;
-
-		if (snprintf(inst->alg.base.cra_driver_name,
-			     CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s,%s)",
-			     rsa_alg->base.cra_driver_name,
-			     hash_name) >= CRYPTO_MAX_ALG_NAME)
-			goto err_free_inst;
-	}
+	if (snprintf(inst->alg.base.cra_name,
+		     CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)",
+		     rsa_alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME)
+		goto err_free_inst;
+
+	if (snprintf(inst->alg.base.cra_driver_name,
+		     CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)",
+		     rsa_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
+		goto err_free_inst;
 
 	inst->alg.base.cra_priority = rsa_alg->base.cra_priority;
 	inst->alg.base.cra_ctxsize = sizeof(struct pkcs1pad_ctx);
@@ -663,8 +356,6 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb)
 
 	inst->alg.encrypt = pkcs1pad_encrypt;
 	inst->alg.decrypt = pkcs1pad_decrypt;
-	inst->alg.sign = pkcs1pad_sign;
-	inst->alg.verify = pkcs1pad_verify;
 	inst->alg.set_pub_key = pkcs1pad_set_pub_key;
 	inst->alg.set_priv_key = pkcs1pad_set_priv_key;
 	inst->alg.max_size = pkcs1pad_get_max_size;
diff --git a/crypto/rsa.c b/crypto/rsa.c
index 78b28d14ced3..b7d21529c552 100644
--- a/crypto/rsa.c
+++ b/crypto/rsa.c
@@ -407,16 +407,25 @@ static int __init rsa_init(void)
 		return err;
 
 	err = crypto_register_template(&rsa_pkcs1pad_tmpl);
-	if (err) {
-		crypto_unregister_akcipher(&rsa);
-		return err;
-	}
+	if (err)
+		goto err_unregister_rsa;
+
+	err = crypto_register_template(&rsassa_pkcs1_tmpl);
+	if (err)
+		goto err_unregister_rsa_pkcs1pad;
 
 	return 0;
+
+err_unregister_rsa_pkcs1pad:
+	crypto_unregister_template(&rsa_pkcs1pad_tmpl);
+err_unregister_rsa:
+	crypto_unregister_akcipher(&rsa);
+	return err;
 }
 
 static void __exit rsa_exit(void)
 {
+	crypto_unregister_template(&rsassa_pkcs1_tmpl);
 	crypto_unregister_template(&rsa_pkcs1pad_tmpl);
 	crypto_unregister_akcipher(&rsa);
 }
diff --git a/crypto/rsassa-pkcs1.c b/crypto/rsassa-pkcs1.c
new file mode 100644
index 000000000000..779c080fc013
--- /dev/null
+++ b/crypto/rsassa-pkcs1.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RSA Signature Scheme with Appendix - PKCS #1 v1.5 (RFC 8017 sec 8.2)
+ *
+ * https://www.rfc-editor.org/rfc/rfc8017#section-8.2
+ *
+ * Copyright (c) 2015 - 2024 Intel Corporation
+ */
+
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <crypto/akcipher.h>
+#include <crypto/algapi.h>
+#include <crypto/sig.h>
+#include <crypto/internal/akcipher.h>
+#include <crypto/internal/rsa.h>
+#include <crypto/internal/sig.h>
+
+/*
+ * Full Hash Prefix for EMSA-PKCS1-v1_5 encoding method (RFC 9580 table 24)
+ *
+ * RSA keys are usually much larger than the hash of the message to be signed.
+ * The hash is therefore prepended by the Full Hash Prefix and a 0xff padding.
+ * The Full Hash Prefix is an ASN.1 SEQUENCE containing the hash algorithm OID.
+ *
+ * https://www.rfc-editor.org/rfc/rfc9580#table-24
+ */
+
+static const u8 hash_prefix_md5[] = {
+	0x30, 0x20, 0x30, 0x0c, 0x06, 0x08,	  /* SEQUENCE (SEQUENCE (OID */
+	0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x05,	/*	<algorithm>, */
+	0x05, 0x00, 0x04, 0x10		      /* NULL), OCTET STRING <hash>) */
+};
+
+static const u8 hash_prefix_sha1[] = {
+	0x30, 0x21, 0x30, 0x09, 0x06, 0x05,
+	0x2b, 0x0e, 0x03, 0x02, 0x1a,
+	0x05, 0x00, 0x04, 0x14
+};
+
+static const u8 hash_prefix_rmd160[] = {
+	0x30, 0x21, 0x30, 0x09, 0x06, 0x05,
+	0x2b, 0x24, 0x03, 0x02, 0x01,
+	0x05, 0x00, 0x04, 0x14
+};
+
+static const u8 hash_prefix_sha224[] = {
+	0x30, 0x2d, 0x30, 0x0d, 0x06, 0x09,
+	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04,
+	0x05, 0x00, 0x04, 0x1c
+};
+
+static const u8 hash_prefix_sha256[] = {
+	0x30, 0x31, 0x30, 0x0d, 0x06, 0x09,
+	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01,
+	0x05, 0x00, 0x04, 0x20
+};
+
+static const u8 hash_prefix_sha384[] = {
+	0x30, 0x41, 0x30, 0x0d, 0x06, 0x09,
+	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02,
+	0x05, 0x00, 0x04, 0x30
+};
+
+static const u8 hash_prefix_sha512[] = {
+	0x30, 0x51, 0x30, 0x0d, 0x06, 0x09,
+	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03,
+	0x05, 0x00, 0x04, 0x40
+};
+
+static const u8 hash_prefix_sha3_256[] = {
+	0x30, 0x31, 0x30, 0x0d, 0x06, 0x09,
+	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x08,
+	0x05, 0x00, 0x04, 0x20
+};
+
+static const u8 hash_prefix_sha3_384[] = {
+	0x30, 0x41, 0x30, 0x0d, 0x06, 0x09,
+	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x09,
+	0x05, 0x00, 0x04, 0x30
+};
+
+static const u8 hash_prefix_sha3_512[] = {
+	0x30, 0x51, 0x30, 0x0d, 0x06, 0x09,
+	0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x0a,
+	0x05, 0x00, 0x04, 0x40
+};
+
+static const struct hash_prefix {
+	const char	*name;
+	const u8	*data;
+	size_t		size;
+} hash_prefixes[] = {
+#define _(X) { #X, hash_prefix_##X, sizeof(hash_prefix_##X) }
+	_(md5),
+	_(sha1),
+	_(rmd160),
+	_(sha256),
+	_(sha384),
+	_(sha512),
+	_(sha224),
+#undef _
+#define _(X) { "sha3-" #X, hash_prefix_sha3_##X, sizeof(hash_prefix_sha3_##X) }
+	_(256),
+	_(384),
+	_(512),
+#undef _
+	{ NULL }
+};
+
+static const struct hash_prefix *rsassa_pkcs1_find_hash_prefix(const char *name)
+{
+	const struct hash_prefix *p;
+
+	for (p = hash_prefixes; p->name; p++)
+		if (strcmp(name, p->name) == 0)
+			return p;
+	return NULL;
+}
+
+struct rsassa_pkcs1_ctx {
+	struct crypto_akcipher *child;
+	unsigned int key_size;
+};
+
+struct rsassa_pkcs1_inst_ctx {
+	struct crypto_akcipher_spawn spawn;
+	const struct hash_prefix *hash_prefix;
+};
+
+static int rsassa_pkcs1_sign(struct crypto_sig *tfm,
+			     const void *src, unsigned int slen,
+			     void *dst, unsigned int dlen)
+{
+	struct sig_instance *inst = sig_alg_instance(tfm);
+	struct rsassa_pkcs1_inst_ctx *ictx = sig_instance_ctx(inst);
+	const struct hash_prefix *hash_prefix = ictx->hash_prefix;
+	struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+	unsigned int child_reqsize = crypto_akcipher_reqsize(ctx->child);
+	struct akcipher_request *child_req __free(kfree_sensitive) = NULL;
+	struct scatterlist in_sg[2], out_sg;
+	struct crypto_wait cwait;
+	unsigned int pad_len;
+	unsigned int ps_end;
+	unsigned int len;
+	u8 *in_buf;
+	int err;
+
+	if (!ctx->key_size)
+		return -EINVAL;
+
+	if (dlen < ctx->key_size)
+		return -EOVERFLOW;
+
+	if (slen + hash_prefix->size > ctx->key_size - 11)
+		return -EOVERFLOW;
+
+	child_req = kmalloc(sizeof(*child_req) + child_reqsize +
+			    ctx->key_size - 1 - slen, GFP_KERNEL);
+	if (!child_req)
+		return -ENOMEM;
+
+	/* RFC 8017 sec 8.2.1 step 1 - EMSA-PKCS1-v1_5 encoding generation */
+	in_buf = (u8 *)(child_req + 1) + child_reqsize;
+	ps_end = ctx->key_size - hash_prefix->size - slen - 2;
+	in_buf[0] = 0x01;
+	memset(in_buf + 1, 0xff, ps_end - 1);
+	in_buf[ps_end] = 0x00;
+	memcpy(in_buf + ps_end + 1, hash_prefix->data, hash_prefix->size);
+
+	/* RFC 8017 sec 8.2.1 step 2 - RSA signature */
+	crypto_init_wait(&cwait);
+	sg_init_table(in_sg, 2);
+	sg_set_buf(&in_sg[0], in_buf, ctx->key_size - 1 - slen);
+	sg_set_buf(&in_sg[1], src, slen);
+	sg_init_one(&out_sg, dst, dlen);
+	akcipher_request_set_tfm(child_req, ctx->child);
+	akcipher_request_set_crypt(child_req, in_sg, &out_sg,
+				   ctx->key_size - 1, dlen);
+	akcipher_request_set_callback(child_req, CRYPTO_TFM_REQ_MAY_SLEEP,
+				      crypto_req_done, &cwait);
+
+	err = crypto_akcipher_decrypt(child_req);
+	err = crypto_wait_req(err, &cwait);
+	if (err)
+		return err;
+
+	len = child_req->dst_len;
+	pad_len = ctx->key_size - len;
+
+	/* Four billion to one */
+	if (unlikely(pad_len)) {
+		memmove(dst + pad_len, dst, len);
+		memset(dst, 0, pad_len);
+	}
+
+	return 0;
+}
+
+static int rsassa_pkcs1_verify(struct crypto_sig *tfm,
+			       const void *src, unsigned int slen,
+			       const void *digest, unsigned int dlen)
+{
+	struct sig_instance *inst = sig_alg_instance(tfm);
+	struct rsassa_pkcs1_inst_ctx *ictx = sig_instance_ctx(inst);
+	const struct hash_prefix *hash_prefix = ictx->hash_prefix;
+	struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+	unsigned int child_reqsize = crypto_akcipher_reqsize(ctx->child);
+	struct akcipher_request *child_req __free(kfree_sensitive) = NULL;
+	struct scatterlist in_sg, out_sg;
+	struct crypto_wait cwait;
+	unsigned int dst_len;
+	unsigned int pos;
+	u8 *out_buf;
+	int err;
+
+	/* RFC 8017 sec 8.2.2 step 1 - length checking */
+	if (!ctx->key_size ||
+	    slen != ctx->key_size ||
+	    !dlen)
+		return -EINVAL;
+
+	/* RFC 8017 sec 8.2.2 step 2 - RSA verification */
+	child_req = kmalloc(sizeof(*child_req) + child_reqsize + ctx->key_size,
+			    GFP_KERNEL);
+	if (!child_req)
+		return -ENOMEM;
+
+	out_buf = (u8 *)(child_req + 1) + child_reqsize;
+
+	crypto_init_wait(&cwait);
+	sg_init_one(&in_sg, src, slen);
+	sg_init_one(&out_sg, out_buf, ctx->key_size);
+	akcipher_request_set_tfm(child_req, ctx->child);
+	akcipher_request_set_crypt(child_req, &in_sg, &out_sg,
+				   slen, ctx->key_size);
+	akcipher_request_set_callback(child_req, CRYPTO_TFM_REQ_MAY_SLEEP,
+				      crypto_req_done, &cwait);
+
+	err = crypto_akcipher_encrypt(child_req);
+	err = crypto_wait_req(err, &cwait);
+	if (err)
+		return err;
+
+	/* RFC 8017 sec 8.2.2 step 3 - EMSA-PKCS1-v1_5 encoding verification */
+	dst_len = child_req->dst_len;
+	if (dst_len < ctx->key_size - 1)
+		return -EINVAL;
+
+	if (dst_len == ctx->key_size) {
+		if (out_buf[0] != 0x00)
+			/* Encrypted value had no leading 0 byte */
+			return -EINVAL;
+
+		dst_len--;
+		out_buf++;
+	}
+
+	if (out_buf[0] != 0x01)
+		return -EBADMSG;
+
+	for (pos = 1; pos < dst_len; pos++)
+		if (out_buf[pos] != 0xff)
+			break;
+
+	if (pos < 9 || pos == dst_len || out_buf[pos] != 0x00)
+		return -EBADMSG;
+	pos++;
+
+	if (hash_prefix->size > dst_len - pos)
+		return -EBADMSG;
+	if (crypto_memneq(out_buf + pos, hash_prefix->data, hash_prefix->size))
+		return -EBADMSG;
+	pos += hash_prefix->size;
+
+	/* RFC 8017 sec 8.2.2 step 4 - comparison of digest with out_buf */
+	if (dlen != dst_len - pos)
+		return -EKEYREJECTED;
+	if (memcmp(digest, out_buf + pos, dlen) != 0)
+		return -EKEYREJECTED;
+
+	return 0;
+}
+
+static unsigned int rsassa_pkcs1_max_size(struct crypto_sig *tfm)
+{
+	struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+
+	return ctx->key_size;
+}
+
+static int rsassa_pkcs1_set_pub_key(struct crypto_sig *tfm,
+				    const void *key, unsigned int keylen)
+{
+	struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+
+	return rsa_set_key(ctx->child, &ctx->key_size, RSA_PUB, key, keylen);
+}
+
+static int rsassa_pkcs1_set_priv_key(struct crypto_sig *tfm,
+				     const void *key, unsigned int keylen)
+{
+	struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+
+	return rsa_set_key(ctx->child, &ctx->key_size, RSA_PRIV, key, keylen);
+}
+
+static int rsassa_pkcs1_init_tfm(struct crypto_sig *tfm)
+{
+	struct sig_instance *inst = sig_alg_instance(tfm);
+	struct rsassa_pkcs1_inst_ctx *ictx = sig_instance_ctx(inst);
+	struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+	struct crypto_akcipher *child_tfm;
+
+	child_tfm = crypto_spawn_akcipher(&ictx->spawn);
+	if (IS_ERR(child_tfm))
+		return PTR_ERR(child_tfm);
+
+	ctx->child = child_tfm;
+
+	return 0;
+}
+
+static void rsassa_pkcs1_exit_tfm(struct crypto_sig *tfm)
+{
+	struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+
+	crypto_free_akcipher(ctx->child);
+}
+
+static void rsassa_pkcs1_free(struct sig_instance *inst)
+{
+	struct rsassa_pkcs1_inst_ctx *ctx = sig_instance_ctx(inst);
+	struct crypto_akcipher_spawn *spawn = &ctx->spawn;
+
+	crypto_drop_akcipher(spawn);
+	kfree(inst);
+}
+
+static int rsassa_pkcs1_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+	struct rsassa_pkcs1_inst_ctx *ctx;
+	struct akcipher_alg *rsa_alg;
+	struct sig_instance *inst;
+	const char *hash_name;
+	u32 mask;
+	int err;
+
+	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SIG, &mask);
+	if (err)
+		return err;
+
+	inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
+	if (!inst)
+		return -ENOMEM;
+
+	ctx = sig_instance_ctx(inst);
+
+	err = crypto_grab_akcipher(&ctx->spawn, sig_crypto_instance(inst),
+				   crypto_attr_alg_name(tb[1]), 0, mask);
+	if (err)
+		goto err_free_inst;
+
+	rsa_alg = crypto_spawn_akcipher_alg(&ctx->spawn);
+
+	if (strcmp(rsa_alg->base.cra_name, "rsa") != 0) {
+		err = -EINVAL;
+		goto err_free_inst;
+	}
+
+	hash_name = crypto_attr_alg_name(tb[2]);
+	if (IS_ERR(hash_name)) {
+		err = PTR_ERR(hash_name);
+		goto err_free_inst;
+	}
+
+	ctx->hash_prefix = rsassa_pkcs1_find_hash_prefix(hash_name);
+	if (!ctx->hash_prefix) {
+		err = -EINVAL;
+		goto err_free_inst;
+	}
+
+	err = -ENAMETOOLONG;
+	if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
+		     "pkcs1(%s,%s)", rsa_alg->base.cra_name,
+		     hash_name) >= CRYPTO_MAX_ALG_NAME)
+		goto err_free_inst;
+
+	if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
+		     "pkcs1(%s,%s)", rsa_alg->base.cra_driver_name,
+		     hash_name) >= CRYPTO_MAX_ALG_NAME)
+		goto err_free_inst;
+
+	inst->alg.base.cra_priority = rsa_alg->base.cra_priority;
+	inst->alg.base.cra_ctxsize = sizeof(struct rsassa_pkcs1_ctx);
+
+	inst->alg.init = rsassa_pkcs1_init_tfm;
+	inst->alg.exit = rsassa_pkcs1_exit_tfm;
+
+	inst->alg.sign = rsassa_pkcs1_sign;
+	inst->alg.verify = rsassa_pkcs1_verify;
+	inst->alg.max_size = rsassa_pkcs1_max_size;
+	inst->alg.set_pub_key = rsassa_pkcs1_set_pub_key;
+	inst->alg.set_priv_key = rsassa_pkcs1_set_priv_key;
+
+	inst->free = rsassa_pkcs1_free;
+
+	err = sig_register_instance(tmpl, inst);
+	if (err) {
+err_free_inst:
+		rsassa_pkcs1_free(inst);
+	}
+	return err;
+}
+
+struct crypto_template rsassa_pkcs1_tmpl = {
+	.name = "pkcs1",
+	.create = rsassa_pkcs1_create,
+	.module = THIS_MODULE,
+};
+
+MODULE_ALIAS_CRYPTO("pkcs1");
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index ed971d857057..76401b3b634f 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -5569,34 +5569,38 @@ static const struct alg_test_desc alg_test_descs[] = {
 			.cipher = __VECS(fcrypt_pcbc_tv_template)
 		}
 	}, {
-		.alg = "pkcs1pad(rsa,sha224)",
+		.alg = "pkcs1(rsa,sha224)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
 	}, {
-		.alg = "pkcs1pad(rsa,sha256)",
-		.test = alg_test_akcipher,
+		.alg = "pkcs1(rsa,sha256)",
+		.test = alg_test_sig,
 		.fips_allowed = 1,
 		.suite = {
-			.akcipher = __VECS(pkcs1pad_rsa_tv_template)
+			.sig = __VECS(pkcs1_rsa_tv_template)
 		}
 	}, {
-		.alg = "pkcs1pad(rsa,sha3-256)",
+		.alg = "pkcs1(rsa,sha3-256)",
+		.test = alg_test_null,
+		.fips_allowed = 1,
+	}, {
+		.alg = "pkcs1(rsa,sha3-384)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
 	}, {
-		.alg = "pkcs1pad(rsa,sha3-384)",
+		.alg = "pkcs1(rsa,sha3-512)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
 	}, {
-		.alg = "pkcs1pad(rsa,sha3-512)",
+		.alg = "pkcs1(rsa,sha384)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
 	}, {
-		.alg = "pkcs1pad(rsa,sha384)",
+		.alg = "pkcs1(rsa,sha512)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
 	}, {
-		.alg = "pkcs1pad(rsa,sha512)",
+		.alg = "pkcs1pad(rsa)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
 	}, {
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index fd4823c26d93..d29d03fec852 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -1268,7 +1268,7 @@ static const struct sig_testvec ecrdsa_tv_template[] = {
 /*
  * PKCS#1 RSA test vectors. Obtained from CAVS testing.
  */
-static const struct akcipher_testvec pkcs1pad_rsa_tv_template[] = {
+static const struct sig_testvec pkcs1_rsa_tv_template[] = {
 	{
 	.key =
 	"\x30\x82\x04\xa5\x02\x01\x00\x02\x82\x01\x01\x00\xd7\x1e\x77\x82"
@@ -1380,7 +1380,6 @@ static const struct akcipher_testvec pkcs1pad_rsa_tv_template[] = {
 	"\xda\x62\x8d\xe1\x2a\x71\x91\x43\x40\x61\x3c\x5a\xbe\x86\xfc\x5b"
 	"\xe6\xf9\xa9\x16\x31\x1f\xaf\x25\x6d\xc2\x4a\x23\x6e\x63\x02\xa2",
 	.c_size = 256,
-	.siggen_sigver_test = true,
 	}
 };
 
diff --git a/include/crypto/internal/rsa.h b/include/crypto/internal/rsa.h
index 754f687134df..071a1951b992 100644
--- a/include/crypto/internal/rsa.h
+++ b/include/crypto/internal/rsa.h
@@ -82,4 +82,5 @@ static inline int rsa_set_key(struct crypto_akcipher *child,
 }
 
 extern struct crypto_template rsa_pkcs1pad_tmpl;
+extern struct crypto_template rsassa_pkcs1_tmpl;
 #endif
diff --git a/include/linux/slab.h b/include/linux/slab.h
index b35e2db7eb0e..0268ea7abf8b 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -448,6 +448,7 @@ void kfree_sensitive(const void *objp);
 size_t __ksize(const void *objp);
 
 DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))
+DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T))
 
 /**
  * ksize - Report actual allocation size of associated object
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 06132cf47016..34ea02addb70 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -1114,7 +1114,7 @@ EXPORT_SYMBOL_GPL(ima_measure_critical_data);
 #ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS
 
 /**
- * ima_kernel_module_request - Prevent crypto-pkcs1pad(rsa,*) requests
+ * ima_kernel_module_request - Prevent crypto-pkcs1(rsa,*) requests
  * @kmod_name: kernel module name
  *
  * Avoid a verification loop where verifying the signature of the modprobe
@@ -1128,7 +1128,7 @@ EXPORT_SYMBOL_GPL(ima_measure_critical_data);
  * algorithm on the fly, but crypto_larval_lookup() will try to use alg_name
  * in order to load a kernel module with same name.
  *
- * Since we don't have any real "crypto-pkcs1pad(rsa,*)" kernel modules,
+ * Since we don't have any real "crypto-pkcs1(rsa,*)" kernel modules,
  * we are safe to fail such module request from crypto_larval_lookup(), and
  * avoid the verification loop.
  *
@@ -1136,7 +1136,7 @@ EXPORT_SYMBOL_GPL(ima_measure_critical_data);
  */
 static int ima_kernel_module_request(char *kmod_name)
 {
-	if (strncmp(kmod_name, "crypto-pkcs1pad(rsa,", 20) == 0)
+	if (strncmp(kmod_name, "crypto-pkcs1(rsa,", 17) == 0)
 		return -EINVAL;
 
 	return 0;
-- 
cgit v1.2.3


From 4df86c6ea5c37fe0452638f39a1e4b189da75c54 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Tue, 10 Sep 2024 16:30:23 +0200
Subject: ASN.1: Clean up include statements in public headers

If <linux/asn1_decoder.h> is the first header included from a .c file
(due to headers being sorted alphabetically), the compiler complains:

  include/linux/asn1_decoder.h:18:29: error: unknown type name 'size_t'

Avoid by including <linux/types.h>.

Jonathan notes that the counterpart <linux/asn1_encoder.h> already
includes <linux/types.h>, but additionally includes the unnecessary
<linux/bug.h>.  Drop it.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/asn1_decoder.h | 1 +
 include/linux/asn1_encoder.h | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/asn1_decoder.h b/include/linux/asn1_decoder.h
index 83f9c6e1e5e9..b41bce82a191 100644
--- a/include/linux/asn1_decoder.h
+++ b/include/linux/asn1_decoder.h
@@ -9,6 +9,7 @@
 #define _LINUX_ASN1_DECODER_H
 
 #include <linux/asn1.h>
+#include <linux/types.h>
 
 struct asn1_decoder;
 
diff --git a/include/linux/asn1_encoder.h b/include/linux/asn1_encoder.h
index 08cd0c2ad34f..d17484dffb74 100644
--- a/include/linux/asn1_encoder.h
+++ b/include/linux/asn1_encoder.h
@@ -6,7 +6,6 @@
 #include <linux/types.h>
 #include <linux/asn1.h>
 #include <linux/asn1_ber_bytecode.h>
-#include <linux/bug.h>
 
 #define asn1_oid_len(oid) (sizeof(oid)/sizeof(u32))
 unsigned char *
-- 
cgit v1.2.3


From 452c55dcefa958f7e87798d764497485687e4bc3 Mon Sep 17 00:00:00 2001
From: Chenghai Huang <huangchenghai2@huawei.com>
Date: Sun, 29 Sep 2024 19:26:57 +0800
Subject: crypto: hisilicon/qm - fix the coding specifications issue

Ensure that the inline function contains no more than 10 lines.
move q_num_set() from hisi_acc_qm.h to qm.c.

Signed-off-by: Chenghai Huang <huangchenghai2@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/hisilicon/hpre/hpre_main.c |  2 +-
 drivers/crypto/hisilicon/qm.c             | 31 +++++++++++++++++++++++++++++
 drivers/crypto/hisilicon/sec2/sec_main.c  |  2 +-
 drivers/crypto/hisilicon/zip/zip_main.c   |  2 +-
 include/linux/hisi_acc_qm.h               | 33 ++-----------------------------
 5 files changed, 36 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index 23e8fb9414af..f129878559c8 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -415,7 +415,7 @@ static int pf_q_num_set(const char *val, const struct kernel_param *kp)
 {
 	pf_q_num_flag = true;
 
-	return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_HPRE_PF);
+	return hisi_qm_q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_HPRE_PF);
 }
 
 static const struct kernel_param_ops hpre_pf_q_num_ops = {
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 07983af9e3e2..f7e8237e3a93 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -451,6 +451,37 @@ static struct qm_typical_qos_table shaper_cbs_s[] = {
 
 static void qm_irqs_unregister(struct hisi_qm *qm);
 static int qm_reset_device(struct hisi_qm *qm);
+int hisi_qm_q_num_set(const char *val, const struct kernel_param *kp,
+		      unsigned int device)
+{
+	struct pci_dev *pdev;
+	u32 n, q_num;
+	int ret;
+
+	if (!val)
+		return -EINVAL;
+
+	pdev = pci_get_device(PCI_VENDOR_ID_HUAWEI, device, NULL);
+	if (!pdev) {
+		q_num = min_t(u32, QM_QNUM_V1, QM_QNUM_V2);
+		pr_info("No device found currently, suppose queue number is %u\n",
+			q_num);
+	} else {
+		if (pdev->revision == QM_HW_V1)
+			q_num = QM_QNUM_V1;
+		else
+			q_num = QM_QNUM_V2;
+
+		pci_dev_put(pdev);
+	}
+
+	ret = kstrtou32(val, 10, &n);
+	if (ret || n < QM_MIN_QNUM || n > q_num)
+		return -EINVAL;
+
+	return param_set_int(val, kp);
+}
+EXPORT_SYMBOL_GPL(hisi_qm_q_num_set);
 
 static u32 qm_get_hw_error_status(struct hisi_qm *qm)
 {
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index c35533d8930b..3abd12017250 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -322,7 +322,7 @@ static int sec_pf_q_num_set(const char *val, const struct kernel_param *kp)
 {
 	pf_q_num_flag = true;
 
-	return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_SEC_PF);
+	return hisi_qm_q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_SEC_PF);
 }
 
 static const struct kernel_param_ops sec_pf_q_num_ops = {
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index d07e47b48be0..f547e6732bf5 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -402,7 +402,7 @@ static int pf_q_num_set(const char *val, const struct kernel_param *kp)
 {
 	pf_q_num_flag = true;
 
-	return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_ZIP_PF);
+	return hisi_qm_q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_ZIP_PF);
 }
 
 static const struct kernel_param_ops pf_q_num_ops = {
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 9d7754ad5e9b..389e95754776 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -436,37 +436,6 @@ struct hisi_qp {
 	struct uacce_queue *uacce_q;
 };
 
-static inline int q_num_set(const char *val, const struct kernel_param *kp,
-			    unsigned int device)
-{
-	struct pci_dev *pdev;
-	u32 n, q_num;
-	int ret;
-
-	if (!val)
-		return -EINVAL;
-
-	pdev = pci_get_device(PCI_VENDOR_ID_HUAWEI, device, NULL);
-	if (!pdev) {
-		q_num = min_t(u32, QM_QNUM_V1, QM_QNUM_V2);
-		pr_info("No device found currently, suppose queue number is %u\n",
-			q_num);
-	} else {
-		if (pdev->revision == QM_HW_V1)
-			q_num = QM_QNUM_V1;
-		else
-			q_num = QM_QNUM_V2;
-
-		pci_dev_put(pdev);
-	}
-
-	ret = kstrtou32(val, 10, &n);
-	if (ret || n < QM_MIN_QNUM || n > q_num)
-		return -EINVAL;
-
-	return param_set_int(val, kp);
-}
-
 static inline int vfs_num_set(const char *val, const struct kernel_param *kp)
 {
 	u32 n;
@@ -526,6 +495,8 @@ static inline void hisi_qm_del_list(struct hisi_qm *qm, struct hisi_qm_list *qm_
 	mutex_unlock(&qm_list->lock);
 }
 
+int hisi_qm_q_num_set(const char *val, const struct kernel_param *kp,
+		      unsigned int device);
 int hisi_qm_init(struct hisi_qm *qm);
 void hisi_qm_uninit(struct hisi_qm *qm);
 int hisi_qm_start(struct hisi_qm *qm);
-- 
cgit v1.2.3


From 21e92806d39c68af2accd1fb238c2daecfcf9fbd Mon Sep 17 00:00:00 2001
From: Donglin Peng <pengdonglin@xiaomi.com>
Date: Sat, 14 Sep 2024 20:29:12 -0700
Subject: function_graph: Support recording and printing the function return
 address

When using function_graph tracer to analyze the flow of kernel function
execution, it is often necessary to quickly locate the exact line of code
where the call occurs. While this may be easy at times, it can be more
time-consuming when some functions are inlined or the flow is too long.

This feature aims to simplify the process by recording the return address
of traced funcions and printing it when outputing trace logs.

To enhance human readability, the prefix 'ret=' is used for the kernel return
value, while '<-' serves as the prefix for the return address in trace logs to
make it look more like the function tracer.

A new trace option named 'funcgraph-retaddr' has been introduced, and the
existing option 'sym-addr' can be used to control the format of the return
address.

See below logs with both funcgraph-retval and funcgraph-retaddr enabled.

0)             | load_elf_binary() { /* <-bprm_execve+0x249/0x600 */
0)             |   load_elf_phdrs() { /* <-load_elf_binary+0x84/0x1730 */
0)             |     __kmalloc_noprof() { /* <-load_elf_phdrs+0x4a/0xb0 */
0)   3.657 us  |       __cond_resched(); /* <-__kmalloc_noprof+0x28c/0x390 ret=0x0 */
0) + 24.335 us |     } /* __kmalloc_noprof ret=0xffff8882007f3000 */
0)             |     kernel_read() { /* <-load_elf_phdrs+0x6c/0xb0 */
0)             |       rw_verify_area() { /* <-kernel_read+0x2b/0x50 */
0)             |         security_file_permission() { /* <-kernel_read+0x2b/0x50 */
0)             |           selinux_file_permission() { /* <-security_file_permission+0x26/0x40 */
0)             |             __inode_security_revalidate() { /* <-selinux_file_permission+0x6d/0x140 */
0)   2.034 us  |               __cond_resched(); /* <-__inode_security_revalidate+0x5f/0x80 ret=0x0 */
0)   6.602 us  |             } /* __inode_security_revalidate ret=0x0 */
0)   2.214 us  |             avc_policy_seqno(); /* <-selinux_file_permission+0x107/0x140 ret=0x0 */
0) + 16.670 us |           } /* selinux_file_permission ret=0x0 */
0) + 20.809 us |         } /* security_file_permission ret=0x0 */
0) + 25.217 us |       } /* rw_verify_area ret=0x0 */
0)             |       __kernel_read() { /* <-load_elf_phdrs+0x6c/0xb0 */
0)             |         ext4_file_read_iter() { /* <-__kernel_read+0x160/0x2e0 */

Then, we can use the faddr2line to locate the source code, for example:

$ ./scripts/faddr2line ./vmlinux load_elf_phdrs+0x6c/0xb0
load_elf_phdrs+0x6c/0xb0:
elf_read at fs/binfmt_elf.c:471
(inlined by) load_elf_phdrs at fs/binfmt_elf.c:531

Link: https://lore.kernel.org/20240915032912.1118397-1-dolinux.peng@gmail.com
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202409150605.HgUmU8ea-lkp@intel.com/
Signed-off-by: Donglin Peng <dolinux.peng@gmail.com>
[ Rebased to handle text_delta offsets ]
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h                             |  27 ++-
 kernel/trace/Kconfig                               |  10 +
 kernel/trace/fgraph.c                              |  22 ++-
 kernel/trace/ftrace.c                              |   3 +-
 kernel/trace/trace.h                               |  11 +-
 kernel/trace/trace_entries.h                       |  29 ++-
 kernel/trace/trace_functions_graph.c               | 216 ++++++++++++++++-----
 kernel/trace/trace_irqsoff.c                       |   3 +-
 kernel/trace/trace_sched_wakeup.c                  |   3 +-
 kernel/trace/trace_selftest.c                      |   9 +-
 .../ftrace/test.d/ftrace/fgraph-retval.tc          |   2 +-
 11 files changed, 274 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e684addf6508..2ac3b3b53cd0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1040,6 +1040,17 @@ struct ftrace_graph_ent {
 	int depth;
 } __packed;
 
+/*
+ * Structure that defines an entry function trace with retaddr.
+ * It's already packed but the attribute "packed" is needed
+ * to remove extra padding at the end.
+ */
+struct fgraph_retaddr_ent {
+	unsigned long func; /* Current function */
+	int depth;
+	unsigned long retaddr;  /* Return address */
+} __packed;
+
 /*
  * Structure that defines a return function trace.
  * It's already packed but the attribute "packed" is needed
@@ -1057,19 +1068,29 @@ struct ftrace_graph_ret {
 	unsigned long long rettime;
 } __packed;
 
+struct fgraph_extras;
 struct fgraph_ops;
 
 /* Type of the callback handlers for tracing function graph*/
 typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *,
 				       struct fgraph_ops *); /* return */
 typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *,
-				      struct fgraph_ops *); /* entry */
+				      struct fgraph_ops *,
+				      struct fgraph_extras *); /* entry */
 
-extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
+extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
+				   struct fgraph_ops *gops,
+				   struct fgraph_extras *extras);
 bool ftrace_pids_enabled(struct ftrace_ops *ops);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
+/* Used to convey some extra datas when creating a graph entry */
+struct fgraph_extras {
+	u32 flags;
+	unsigned long retaddr;
+};
+
 struct fgraph_ops {
 	trace_func_graph_ent_t		entryfunc;
 	trace_func_graph_ret_t		retfunc;
@@ -1115,6 +1136,8 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 				    unsigned long ret, unsigned long *retp);
 unsigned long *fgraph_get_task_var(struct fgraph_ops *gops);
 
+u32 graph_tracer_flags_get(u32 flags);
+
 /*
  * Sometimes we don't want to trace a function with the function
  * graph tracer but we want them to keep traced by the usual function
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 721c3b221048..74c2b1d43bb9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -242,6 +242,16 @@ config FUNCTION_GRAPH_RETVAL
 	  enable it via the trace option funcgraph-retval.
 	  See Documentation/trace/ftrace.rst
 
+config FUNCTION_GRAPH_RETADDR
+	bool "Kernel Function Graph Return Address"
+	depends on FUNCTION_GRAPH_TRACER
+	default n
+	help
+	  Support recording and printing the function return address when
+	  using function graph tracer. It can be helpful to locate code line that
+	  the function is called. This feature is off by default, and you can
+	  enable it via the trace option funcgraph-retaddr.
+
 config DYNAMIC_FTRACE
 	bool "enable/disable function tracing dynamically"
 	depends on FUNCTION_TRACER
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 58a28ec35dab..875aefe60a13 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -290,7 +290,8 @@ static inline unsigned long make_data_type_val(int idx, int size, int offset)
 }
 
 /* ftrace_graph_entry set to this to tell some archs to run function graph */
-static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops)
+static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops,
+		     struct fgraph_extras *extras)
 {
 	return 0;
 }
@@ -518,7 +519,8 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
 #endif
 
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
-			    struct fgraph_ops *gops)
+			    struct fgraph_ops *gops,
+			    struct fgraph_extras *extras)
 {
 	return 0;
 }
@@ -646,13 +648,20 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 			 unsigned long frame_pointer, unsigned long *retp)
 {
 	struct ftrace_graph_ent trace;
+	struct fgraph_extras extras;
 	unsigned long bitmap = 0;
 	int offset;
 	int i;
+	int idx = 0;
 
 	trace.func = func;
 	trace.depth = ++current->curr_ret_depth;
 
+	extras.flags = graph_tracer_flags_get(TRACE_GRAPH_PRINT_RETADDR);
+	if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR)
+		&& extras.flags & TRACE_GRAPH_PRINT_RETADDR)
+		extras.retaddr = ftrace_graph_ret_addr(current, &idx, ret, retp);
+
 	offset = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0);
 	if (offset < 0)
 		goto out;
@@ -661,7 +670,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 	if (static_branch_likely(&fgraph_do_direct)) {
 		int save_curr_ret_stack = current->curr_ret_stack;
 
-		if (static_call(fgraph_func)(&trace, fgraph_direct_gops))
+		if (static_call(fgraph_func)(&trace, fgraph_direct_gops, &extras))
 			bitmap |= BIT(fgraph_direct_gops->idx);
 		else
 			/* Clear out any saved storage */
@@ -679,7 +688,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 
 			save_curr_ret_stack = current->curr_ret_stack;
 			if (ftrace_ops_test(&gops->ops, func, NULL) &&
-			    gops->entryfunc(&trace, gops))
+			    gops->entryfunc(&trace, gops, &extras))
 				bitmap |= BIT(i);
 			else
 				/* Clear out any saved storage */
@@ -1136,7 +1145,8 @@ void ftrace_graph_exit_task(struct task_struct *t)
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 static int fgraph_pid_func(struct ftrace_graph_ent *trace,
-			   struct fgraph_ops *gops)
+			   struct fgraph_ops *gops,
+			   struct fgraph_extras *extras)
 {
 	struct trace_array *tr = gops->ops.private;
 	int pid;
@@ -1150,7 +1160,7 @@ static int fgraph_pid_func(struct ftrace_graph_ent *trace,
 			return 0;
 	}
 
-	return gops->saved_func(trace, gops);
+	return gops->saved_func(trace, gops, NULL);
 }
 
 void fgraph_update_pid_func(void)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cae388122ca8..5d87dac83b80 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -827,7 +827,8 @@ struct profile_fgraph_data {
 };
 
 static int profile_graph_entry(struct ftrace_graph_ent *trace,
-			       struct fgraph_ops *gops)
+			       struct fgraph_ops *gops,
+			       struct fgraph_extras *extras)
 {
 	struct profile_fgraph_data *profile_data;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2f8017f8d34d..13f08f257c0b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -46,6 +46,7 @@ enum trace_type {
 	TRACE_BRANCH,
 	TRACE_GRAPH_RET,
 	TRACE_GRAPH_ENT,
+	TRACE_GRAPH_RETADDR_ENT,
 	TRACE_USER_STACK,
 	TRACE_BLK,
 	TRACE_BPUTS,
@@ -512,6 +513,8 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
 		IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry,	\
 			  TRACE_GRAPH_ENT);		\
+		IF_ASSIGN(var, ent, struct fgraph_retaddr_ent_entry,\
+			  TRACE_GRAPH_RETADDR_ENT);		\
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct func_repeats_entry,		\
@@ -692,7 +695,8 @@ void trace_default_header(struct seq_file *m);
 void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
 
 void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
-int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
+int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+		      struct fgraph_extras *extras);
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
@@ -879,6 +883,7 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
 #define TRACE_GRAPH_GRAPH_TIME          0x400
 #define TRACE_GRAPH_PRINT_RETVAL        0x800
 #define TRACE_GRAPH_PRINT_RETVAL_HEX    0x1000
+#define TRACE_GRAPH_PRINT_RETADDR       0x2000
 #define TRACE_GRAPH_PRINT_FILL_SHIFT	28
 #define TRACE_GRAPH_PRINT_FILL_MASK	(0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
 
@@ -900,6 +905,10 @@ extern void graph_trace_close(struct trace_iterator *iter);
 extern int __trace_graph_entry(struct trace_array *tr,
 			       struct ftrace_graph_ent *trace,
 			       unsigned int trace_ctx);
+extern int __trace_graph_retaddr_entry(struct trace_array *tr,
+				struct ftrace_graph_ent *trace,
+				unsigned int trace_ctx,
+				unsigned long retaddr);
 extern void __trace_graph_return(struct trace_array *tr,
 				 struct ftrace_graph_ret *trace,
 				 unsigned int trace_ctx);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c47422b20908..82fd174ebbe0 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -85,9 +85,35 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
 	F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
 );
 
-/* Function return entry */
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+
+/* Function call entry with a return address */
+FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry,
+
+	TRACE_GRAPH_RETADDR_ENT,
+
+	F_STRUCT(
+		__field_struct(	struct fgraph_retaddr_ent,	graph_ent	)
+		__field_packed(	unsigned long,	graph_ent,	func		)
+		__field_packed(	int,		graph_ent,	depth		)
+		__field_packed(	unsigned long,	graph_ent,	retaddr		)
+	),
+
+	F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth,
+		(void *)__entry->retaddr)
+);
+
+#else
+
+#ifndef fgraph_retaddr_ent_entry
+#define fgraph_retaddr_ent_entry ftrace_graph_ent_entry
+#endif
+
+#endif
+
 #ifdef CONFIG_FUNCTION_GRAPH_RETVAL
 
+/* Function return entry */
 FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
 
 	TRACE_GRAPH_RET,
@@ -110,6 +136,7 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
 
 #else
 
+/* Function return entry */
 FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
 
 	TRACE_GRAPH_RET,
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 5c1b150fbba3..3dd63ae2afe8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -31,7 +31,10 @@ struct fgraph_data {
 	struct fgraph_cpu_data __percpu *cpu_data;
 
 	/* Place to preserve last processed entry. */
-	struct ftrace_graph_ent_entry	ent;
+	union {
+		struct ftrace_graph_ent_entry	ent;
+		struct fgraph_retaddr_ent_entry	rent;
+	} ent;
 	struct ftrace_graph_ret_entry	ret;
 	int				failed;
 	int				cpu;
@@ -63,6 +66,10 @@ static struct tracer_opt trace_opts[] = {
 	{ TRACER_OPT(funcgraph-retval, TRACE_GRAPH_PRINT_RETVAL) },
 	/* Display function return value in hexadecimal format ? */
 	{ TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) },
+#endif
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+	/* Display function return address ? */
+	{ TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) },
 #endif
 	/* Include sleep time (scheduled out) between entry and return */
 	{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
@@ -83,6 +90,11 @@ static struct tracer_flags tracer_flags = {
 	.opts = trace_opts
 };
 
+u32 graph_tracer_flags_get(u32 flags)
+{
+	return tracer_flags.val & flags;
+}
+
 /*
  * DURATION column is being also used to display IRQ signs,
  * following values are used by print_graph_irq and others
@@ -119,6 +131,40 @@ int __trace_graph_entry(struct trace_array *tr,
 	return 1;
 }
 
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+int __trace_graph_retaddr_entry(struct trace_array *tr,
+				struct ftrace_graph_ent *trace,
+				unsigned int trace_ctx,
+				unsigned long retaddr)
+{
+	struct trace_event_call *call = &event_fgraph_retaddr_entry;
+	struct ring_buffer_event *event;
+	struct trace_buffer *buffer = tr->array_buffer.buffer;
+	struct fgraph_retaddr_ent_entry *entry;
+
+	event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT,
+					  sizeof(*entry), trace_ctx);
+	if (!event)
+		return 0;
+	entry	= ring_buffer_event_data(event);
+	entry->graph_ent.func = trace->func;
+	entry->graph_ent.depth = trace->depth;
+	entry->graph_ent.retaddr = retaddr;
+	if (!call_filter_check_discard(call, entry, buffer, event))
+		trace_buffer_unlock_commit_nostack(buffer, event);
+
+	return 1;
+}
+#else
+int __trace_graph_retaddr_entry(struct trace_array *tr,
+				struct ftrace_graph_ent *trace,
+				unsigned int trace_ctx,
+				unsigned long retaddr)
+{
+	return 1;
+}
+#endif
+
 static inline int ftrace_graph_ignore_irqs(void)
 {
 	if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
@@ -133,7 +179,8 @@ struct fgraph_times {
 };
 
 int trace_graph_entry(struct ftrace_graph_ent *trace,
-		      struct fgraph_ops *gops)
+		      struct fgraph_ops *gops,
+		      struct fgraph_extras *extras)
 {
 	unsigned long *task_var = fgraph_get_task_var(gops);
 	struct trace_array *tr = gops->private;
@@ -199,7 +246,12 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		trace_ctx = tracing_gen_ctx_flags(flags);
-		ret = __trace_graph_entry(tr, trace, trace_ctx);
+		if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && extras
+				&& (extras->flags & TRACE_GRAPH_PRINT_RETADDR)))
+			ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx,
+							  extras->retaddr);
+		else
+			ret = __trace_graph_entry(tr, trace, trace_ctx);
 	} else {
 		ret = 0;
 	}
@@ -507,7 +559,7 @@ get_return_for_leaf(struct trace_iterator *iter,
 	 * then we just reuse the data from before.
 	 */
 	if (data && data->failed) {
-		curr = &data->ent;
+		curr = &data->ent.ent;
 		next = &data->ret;
 	} else {
 
@@ -537,7 +589,10 @@ get_return_for_leaf(struct trace_iterator *iter,
 			 * Save current and next entries for later reference
 			 * if the output fails.
 			 */
-			data->ent = *curr;
+			if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT))
+				data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr;
+			else
+				data->ent.ent = *curr;
 			/*
 			 * If the next event is not a return type, then
 			 * we only care about what type it is. Otherwise we can
@@ -701,52 +756,96 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_RETVAL
-
 #define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL
+#else
+#define __TRACE_GRAPH_PRINT_RETVAL 0
+#endif
 
-static void print_graph_retval(struct trace_seq *s, unsigned long retval,
-				bool leaf, void *func, bool hex_format)
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+#define __TRACE_GRAPH_PRINT_RETADDR TRACE_GRAPH_PRINT_RETADDR
+static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_entry *entry,
+				u32 trace_flags, bool comment)
+{
+	if (comment)
+		trace_seq_puts(s, " /*");
+
+	trace_seq_puts(s, " <-");
+	seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET);
+
+	if (comment)
+		trace_seq_puts(s, " */");
+}
+#else
+#define __TRACE_GRAPH_PRINT_RETADDR 0
+#define print_graph_retaddr(_seq, _entry, _tflags, _comment)		do { } while (0)
+#endif
+
+#if defined(CONFIG_FUNCTION_GRAPH_RETVAL) || defined(CONFIG_FUNCTION_GRAPH_RETADDR)
+
+static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry,
+				struct ftrace_graph_ret *graph_ret, void *func,
+				u32 opt_flags, u32 trace_flags)
 {
 	unsigned long err_code = 0;
+	unsigned long retval = 0;
+	bool print_retaddr = false;
+	bool print_retval = false;
+	bool hex_format = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL_HEX);
 
-	if (retval == 0 || hex_format)
-		goto done;
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+	retval = graph_ret->retval;
+	print_retval = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL);
+#endif
 
-	/* Check if the return value matches the negative format */
-	if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
-		(((u64)retval) >> 32) == 0) {
-		/* sign extension */
-		err_code = (unsigned long)(s32)retval;
-	} else {
-		err_code = retval;
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+	print_retaddr = !!(opt_flags & TRACE_GRAPH_PRINT_RETADDR);
+#endif
+
+	if (print_retval && retval && !hex_format) {
+		/* Check if the return value matches the negative format */
+		if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
+			(((u64)retval) >> 32) == 0) {
+			err_code = sign_extend64(retval, 31);
+		} else {
+			err_code = retval;
+		}
+
+		if (!IS_ERR_VALUE(err_code))
+			err_code = 0;
 	}
 
-	if (!IS_ERR_VALUE(err_code))
-		err_code = 0;
+	if (entry) {
+		if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT)
+			print_retaddr = false;
 
-done:
-	if (leaf) {
-		if (hex_format || (err_code == 0))
-			trace_seq_printf(s, "%ps(); /* = 0x%lx */\n",
-					func, retval);
+		trace_seq_printf(s, "%ps();", func);
+		if (print_retval || print_retaddr)
+			trace_seq_puts(s, " /*");
 		else
-			trace_seq_printf(s, "%ps(); /* = %ld */\n",
-					func, err_code);
+			trace_seq_putc(s, '\n');
 	} else {
+		print_retaddr = false;
+		trace_seq_printf(s, "} /* %ps", func);
+	}
+
+	if (print_retaddr)
+		print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
+				    trace_flags, false);
+
+	if (print_retval) {
 		if (hex_format || (err_code == 0))
-			trace_seq_printf(s, "} /* %ps = 0x%lx */\n",
-					func, retval);
+			trace_seq_printf(s, " ret=0x%lx", retval);
 		else
-			trace_seq_printf(s, "} /* %ps = %ld */\n",
-					func, err_code);
+			trace_seq_printf(s, " ret=%ld", err_code);
 	}
+
+	if (!entry || print_retval || print_retaddr)
+		trace_seq_puts(s, " */\n");
 }
 
 #else
 
-#define __TRACE_GRAPH_PRINT_RETVAL 0
-
-#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0)
+#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0)
 
 #endif
 
@@ -798,14 +897,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 		trace_seq_putc(s, ' ');
 
 	/*
-	 * Write out the function return value if the option function-retval is
-	 * enabled.
+	 * Write out the function return value or return address
 	 */
-	if (flags & __TRACE_GRAPH_PRINT_RETVAL)
-		print_graph_retval(s, graph_ret->retval, true, (void *)func,
-				!!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
-	else
+	if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) {
+		print_graph_retval(s, entry, graph_ret,
+				   (void *)graph_ret->func + iter->tr->text_delta,
+				   flags, tr->trace_flags);
+	} else {
 		trace_seq_printf(s, "%ps();\n", (void *)func);
+	}
 
 	print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
 			cpu, iter->ent->pid, flags);
@@ -846,7 +946,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
 
 	func = call->func + iter->tr->text_delta;
 
-	trace_seq_printf(s, "%ps() {\n", (void *)func);
+	trace_seq_printf(s, "%ps() {", (void *)func);
+	if (flags & __TRACE_GRAPH_PRINT_RETADDR  &&
+		entry->ent.type == TRACE_GRAPH_RETADDR_ENT)
+		print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
+			tr->trace_flags, true);
+	trace_seq_putc(s, '\n');
 
 	if (trace_seq_has_overflowed(s))
 		return TRACE_TYPE_PARTIAL_LINE;
@@ -1093,11 +1198,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 
 	/*
 	 * Always write out the function name and its return value if the
-	 * function-retval option is enabled.
+	 * funcgraph-retval option is enabled.
 	 */
 	if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
-		print_graph_retval(s, trace->retval, false, (void *)func,
-			!!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
+		print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags);
 	} else {
 		/*
 		 * If the return function does not have a matching entry,
@@ -1212,7 +1316,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 	 * to print out the missing entry which would never go out.
 	 */
 	if (data && data->failed) {
-		field = &data->ent;
+		field = &data->ent.ent;
 		iter->cpu = data->cpu;
 		ret = print_graph_entry(field, s, iter, flags);
 		if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
@@ -1236,6 +1340,16 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 		saved = *field;
 		return print_graph_entry(&saved, s, iter, flags);
 	}
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+	case TRACE_GRAPH_RETADDR_ENT: {
+		struct fgraph_retaddr_ent_entry saved;
+		struct fgraph_retaddr_ent_entry *rfield;
+
+		trace_assign_type(rfield, entry);
+		saved = *rfield;
+		return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags);
+	}
+#endif
 	case TRACE_GRAPH_RET: {
 		struct ftrace_graph_ret_entry *field;
 		trace_assign_type(field, entry);
@@ -1430,6 +1544,13 @@ static struct trace_event graph_trace_entry_event = {
 	.funcs		= &graph_functions,
 };
 
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+static struct trace_event graph_trace_retaddr_entry_event = {
+	.type		= TRACE_GRAPH_RETADDR_ENT,
+	.funcs		= &graph_functions,
+};
+#endif
+
 static struct trace_event graph_trace_ret_event = {
 	.type		= TRACE_GRAPH_RET,
 	.funcs		= &graph_functions
@@ -1516,6 +1637,13 @@ static __init int init_graph_trace(void)
 		return 1;
 	}
 
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+	if (!register_trace_event(&graph_trace_retaddr_entry_event)) {
+		pr_warn("Warning: could not register graph trace retaddr events\n");
+		return 1;
+	}
+#endif
+
 	if (!register_trace_event(&graph_trace_ret_event)) {
 		pr_warn("Warning: could not register graph trace events\n");
 		return 1;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index fce064e20570..eb3aa36cf10f 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -176,7 +176,8 @@ static int irqsoff_display_graph(struct trace_array *tr, int set)
 }
 
 static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
-			       struct fgraph_ops *gops)
+			       struct fgraph_ops *gops,
+			       struct fgraph_extras *extras)
 {
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ae2ace5e515a..155de2551507 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -113,7 +113,8 @@ static int wakeup_display_graph(struct trace_array *tr, int set)
 }
 
 static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
-			      struct fgraph_ops *gops)
+			      struct fgraph_ops *gops,
+			      struct fgraph_extras *extras)
 {
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index c4ad7cd7e778..fbb99f8c8062 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_PRINT:
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
+	case TRACE_GRAPH_RETADDR_ENT:
 	case TRACE_GRAPH_RET:
 		return 1;
 	}
@@ -773,7 +774,8 @@ struct fgraph_fixture {
 };
 
 static __init int store_entry(struct ftrace_graph_ent *trace,
-			      struct fgraph_ops *gops)
+			      struct fgraph_ops *gops,
+			      struct fgraph_extras *extras)
 {
 	struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
 	const char *type = fixture->store_type_name;
@@ -1024,7 +1026,8 @@ static unsigned int graph_hang_thresh;
 
 /* Wrap the real function entry probe to avoid possible hanging */
 static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
-				      struct fgraph_ops *gops)
+				      struct fgraph_ops *gops,
+				      struct fgraph_extras *extras)
 {
 	/* This is harmlessly racy, we want to approximately detect a hang */
 	if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
@@ -1038,7 +1041,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
 		return 0;
 	}
 
-	return trace_graph_entry(trace, gops);
+	return trace_graph_entry(trace, gops, NULL);
 }
 
 static struct fgraph_ops fgraph_ops __initdata  = {
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc
index e34c0bdef3ed..e8e46378b88d 100644
--- a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc
@@ -29,7 +29,7 @@ set -e
 
 : "Test printing the error code in signed decimal format"
 echo 0 > options/funcgraph-retval-hex
-count=`cat trace | grep 'proc_reg_write' | grep '= -5' | wc -l`
+count=`cat trace | grep 'proc_reg_write' | grep '=-5' | wc -l`
 if [ $count -eq 0 ]; then
     fail "Return value can not be printed in signed decimal format"
 fi
-- 
cgit v1.2.3


From 0a97195d2181caced187acd7454464b8e37021d7 Mon Sep 17 00:00:00 2001
From: Rajendra Nayak <quic_rjendra@quicinc.com>
Date: Tue, 3 Sep 2024 15:45:10 +0530
Subject: EDAC/qcom: Make irq configuration optional

On most modern qualcomm SoCs, the configuration necessary to enable the
Tag/Data RAM related irqs being propagated to the SoC irq controller is
already done in firmware (in DSF or 'DDR System Firmware')

On some like the x1e80100, these registers aren't even accesible to the
kernel causing a crash when edac device is probed.

Hence, make the irq configuration optional in the driver and mark x1e80100
as the SoC on which this should be avoided.

Fixes: af16b00578a7 ("arm64: dts: qcom: Add base X1E80100 dtsi and the QCP dts")
Reported-by: Bjorn Andersson <andersson@kernel.org>
Signed-off-by: Rajendra Nayak <quic_rjendra@quicinc.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Reviewed-by: Abel Vesa <abel.vesa@linaro.org>
Link: https://lore.kernel.org/r/20240903101510.3452734-1-quic_rjendra@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/edac/qcom_edac.c           | 8 +++++---
 drivers/soc/qcom/llcc-qcom.c       | 3 +++
 include/linux/soc/qcom/llcc-qcom.h | 2 ++
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c
index d3cd4cc54ace..a9a8ba067007 100644
--- a/drivers/edac/qcom_edac.c
+++ b/drivers/edac/qcom_edac.c
@@ -342,9 +342,11 @@ static int qcom_llcc_edac_probe(struct platform_device *pdev)
 	int ecc_irq;
 	int rc;
 
-	rc = qcom_llcc_core_setup(llcc_driv_data, llcc_driv_data->bcast_regmap);
-	if (rc)
-		return rc;
+	if (!llcc_driv_data->ecc_irq_configured) {
+		rc = qcom_llcc_core_setup(llcc_driv_data, llcc_driv_data->bcast_regmap);
+		if (rc)
+			return rc;
+	}
 
 	/* Allocate edac control info */
 	edev_ctl = edac_device_alloc_ctl_info(0, "qcom-llcc", 1, "bank",
diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c
index 8fa4ffd3a9b5..28bcc65e91be 100644
--- a/drivers/soc/qcom/llcc-qcom.c
+++ b/drivers/soc/qcom/llcc-qcom.c
@@ -139,6 +139,7 @@ struct qcom_llcc_config {
 	int size;
 	bool need_llcc_cfg;
 	bool no_edac;
+	bool irq_configured;
 };
 
 struct qcom_sct_config {
@@ -718,6 +719,7 @@ static const struct qcom_llcc_config x1e80100_cfg[] = {
 		.need_llcc_cfg	= true,
 		.reg_offset	= llcc_v2_1_reg_offset,
 		.edac_reg_offset = &llcc_v2_1_edac_reg_offset,
+		.irq_configured = true,
 	},
 };
 
@@ -1345,6 +1347,7 @@ static int qcom_llcc_probe(struct platform_device *pdev)
 	drv_data->cfg = llcc_cfg;
 	drv_data->cfg_size = sz;
 	drv_data->edac_reg_offset = cfg->edac_reg_offset;
+	drv_data->ecc_irq_configured = cfg->irq_configured;
 	mutex_init(&drv_data->lock);
 	platform_set_drvdata(pdev, drv_data);
 
diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
index 9e9f528b1370..2f20281d4ad4 100644
--- a/include/linux/soc/qcom/llcc-qcom.h
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -125,6 +125,7 @@ struct llcc_edac_reg_offset {
  * @num_banks: Number of llcc banks
  * @bitmap: Bit map to track the active slice ids
  * @ecc_irq: interrupt for llcc cache error detection and reporting
+ * @ecc_irq_configured: 'True' if firmware has already configured the irq propagation
  * @version: Indicates the LLCC version
  */
 struct llcc_drv_data {
@@ -139,6 +140,7 @@ struct llcc_drv_data {
 	u32 num_banks;
 	unsigned long *bitmap;
 	int ecc_irq;
+	bool ecc_irq_configured;
 	u32 version;
 };
 
-- 
cgit v1.2.3


From 70c8fd00a9bd0509bbf7bccd9baea8bbd5ddc756 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 2 Oct 2024 17:27:16 -0400
Subject: timekeeping: Add interfaces for handling timestamps with a floor
 value

Multigrain timestamps allow the kernel to use fine-grained timestamps when
an inode's attributes is being actively observed via ->getattr().  With
this support, it's possible for a file to get a fine-grained timestamp, and
another modified after it to get a coarse-grained stamp that is earlier
than the fine-grained time.  If this happens then the files can appear to
have been modified in reverse order, which breaks VFS ordering guarantees
[1].

To prevent this, maintain a floor value for multigrain timestamps.
Whenever a fine-grained timestamp is handed out, record it, and when later
coarse-grained stamps are handed out, ensure they are not earlier than that
value. If the coarse-grained timestamp is earlier than the fine-grained
floor, return the floor value instead.

Add a static singleton atomic64_t into timekeeper.c that is used to keep
track of the latest fine-grained time ever handed out. This is tracked as a
monotonic ktime_t value to ensure that it isn't affected by clock
jumps. Because it is updated at different times than the rest of the
timekeeper object, the floor value is managed independently of the
timekeeper via a cmpxchg() operation, and sits on its own cacheline.

Add two new public interfaces:

- ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the
  coarse-grained clock and the floor time

- ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries
  to swap it into the floor. A timespec64 is filled with the result.

The floor value is global and updated via a single try_cmpxchg(). If
that fails then the operation raced with a concurrent update. Any
concurrent update must be later than the existing floor value, so any
racing tasks can accept any resulting floor value without retrying.

[1]: POSIX requires that files be stamped with realtime clock values, and
     makes no provision for dealing with backward clock jumps. If a backward
     realtime clock jump occurs, then files can appear to have been modified
     in reverse order.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Randy Dunlap <rdunlap@infradead.org> # documentation bits
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20241002-mgtime-v10-1-d1c4717f5284@kernel.org
---
 include/linux/timekeeping.h |   4 ++
 kernel/time/timekeeping.c   | 104 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index fc12a9ba2c88..7aa85246c183 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -45,6 +45,10 @@ extern void ktime_get_real_ts64(struct timespec64 *tv);
 extern void ktime_get_coarse_ts64(struct timespec64 *ts);
 extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
 
+/* Multigrain timestamp interfaces */
+extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
+extern void ktime_get_real_ts64_mg(struct timespec64 *ts);
+
 void getboottime64(struct timespec64 *ts);
 
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e6f409bf311..441792c907fa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -114,6 +114,23 @@ static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
 	.base[1] = FAST_TK_INIT,
 };
 
+/*
+ * Multigrain timestamps require tracking the latest fine-grained timestamp
+ * that has been issued, and never returning a coarse-grained timestamp that is
+ * earlier than that value.
+ *
+ * mg_floor represents the latest fine-grained time that has been handed out as
+ * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
+ * converted to a realtime clock value on an as-needed basis.
+ *
+ * Maintaining mg_floor ensures the multigrain interfaces never issue a
+ * timestamp earlier than one that has been previously issued.
+ *
+ * The exception to this rule is when there is a backward realtime clock jump. If
+ * such an event occurs, a timestamp can appear to be earlier than a previous one.
+ */
+static __cacheline_aligned_in_smp atomic64_t mg_floor;
+
 static inline void tk_normalize_xtime(struct timekeeper *tk)
 {
 	while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -2394,6 +2411,93 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
 }
 EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
 
+/**
+ * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
+ * @ts:		timespec64 to be filled
+ *
+ * Fetch the global mg_floor value, convert it to realtime and compare it
+ * to the current coarse-grained time. Fill @ts with whichever is
+ * latest. Note that this is a filesystem-specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	u64 floor = atomic64_read(&mg_floor);
+	ktime_t f_real, offset, coarse;
+	unsigned int seq;
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+		*ts = tk_xtime(tk);
+		offset = tk_core.timekeeper.offs_real;
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	coarse = timespec64_to_ktime(*ts);
+	f_real = ktime_add(floor, offset);
+	if (ktime_after(f_real, coarse))
+		*ts = ktime_to_timespec64(f_real);
+}
+
+/**
+ * ktime_get_real_ts64_mg - attempt to update floor value and return result
+ * @ts:		pointer to the timespec to be set
+ *
+ * Get a monotonic fine-grained time value and attempt to swap it into
+ * mg_floor. If that succeeds then accept the new floor value. If it fails
+ * then another task raced in during the interim time and updated the
+ * floor.  Since any update to the floor must be later than the previous
+ * floor, either outcome is acceptable.
+ *
+ * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
+ * and determining that the resulting coarse-grained timestamp did not effect
+ * a change in ctime. Any more recent floor value would effect a change to
+ * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
+ *
+ * @ts will be filled with the latest floor value, regardless of the outcome of
+ * the cmpxchg. Note that this is a filesystem specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_real_ts64_mg(struct timespec64 *ts)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	ktime_t old = atomic64_read(&mg_floor);
+	ktime_t offset, mono;
+	unsigned int seq;
+	u64 nsecs;
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+
+		ts->tv_sec = tk->xtime_sec;
+		mono = tk->tkr_mono.base;
+		nsecs = timekeeping_get_ns(&tk->tkr_mono);
+		offset = tk_core.timekeeper.offs_real;
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	mono = ktime_add_ns(mono, nsecs);
+
+	/*
+	 * Attempt to update the floor with the new time value. As any
+	 * update must be later then the existing floor, and would effect
+	 * a change to ctime from the perspective of the current task,
+	 * accept the resulting floor value regardless of the outcome of
+	 * the swap.
+	 */
+	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
+		ts->tv_nsec = 0;
+		timespec64_add_ns(ts, nsecs);
+	} else {
+		/*
+		 * Another task changed mg_floor since "old" was fetched.
+		 * "old" has been updated with the latest value of "mg_floor".
+		 * That value is newer than the previous floor value, which
+		 * is enough to effect a change to ctime. Accept it.
+		 */
+		*ts = ktime_to_timespec64(ktime_add(old, offset));
+	}
+}
+
 void ktime_get_coarse_ts64(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
-- 
cgit v1.2.3


From 96f9a366ec8abe027326d7aab84d64370019f0f1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 2 Oct 2024 17:27:17 -0400
Subject: timekeeping: Add percpu counter for tracking floor swap events

The mgtime_floor value is a global variable for tracking the latest
fine-grained timestamp handed out. Because it's a global, track the
number of times that a new floor value is assigned.

Add a new percpu counter to the timekeeping code to track the number of
floor swap events that have occurred. A later patch will add a debugfs
file to display this counter alongside other stats involving multigrain
timestamps.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Randy Dunlap <rdunlap@infradead.org> # documentation bits
Link: https://lore.kernel.org/all/20241002-mgtime-v10-2-d1c4717f5284@kernel.org
---
 include/linux/timekeeping.h        |  1 +
 kernel/time/timekeeping.c          |  1 +
 kernel/time/timekeeping_debug.c    | 13 +++++++++++++
 kernel/time/timekeeping_internal.h | 15 +++++++++++++++
 4 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 7aa85246c183..84a035e86ac8 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -48,6 +48,7 @@ extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
 /* Multigrain timestamp interfaces */
 extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
 extern void ktime_get_real_ts64_mg(struct timespec64 *ts);
+extern unsigned long timekeeping_get_mg_floor_swaps(void);
 
 void getboottime64(struct timespec64 *ts);
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 441792c907fa..962b2a31f015 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2487,6 +2487,7 @@ void ktime_get_real_ts64_mg(struct timespec64 *ts)
 	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
 		ts->tv_nsec = 0;
 		timespec64_add_ns(ts, nsecs);
+		timekeeping_inc_mg_floor_swaps();
 	} else {
 		/*
 		 * Another task changed mg_floor since "old" was fetched.
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index b73e8850e58d..badeb222eab9 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -17,6 +17,9 @@
 
 #define NUM_BINS 32
 
+/* Incremented every time mg_floor is updated */
+DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
 static unsigned int sleep_time_bin[NUM_BINS] = {0};
 
 static int tk_debug_sleep_time_show(struct seq_file *s, void *data)
@@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t)
 			   (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
 }
 
+unsigned long timekeeping_get_mg_floor_swaps(void)
+{
+	unsigned long sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		sum += data_race(per_cpu(timekeeping_mg_floor_swaps, cpu));
+
+	return sum;
+}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ca2787d1642..0bbae825bc02 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -10,9 +10,24 @@
  * timekeeping debug functions
  */
 #ifdef CONFIG_DEBUG_FS
+
+DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+	this_cpu_inc(timekeeping_mg_floor_swaps);
+}
+
 extern void tk_debug_account_sleep_time(const struct timespec64 *t);
+
 #else
+
 #define tk_debug_account_sleep_time(x)
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+}
+
 #endif
 
 #ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
-- 
cgit v1.2.3


From 2382d68d7d43873ba856baf567cab0d5c523f23b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Sep 2024 15:31:38 +1000
Subject: sched: change wake_up_bit() and related function to expect unsigned
 long *

wake_up_bit() currently allows a "void *".  While this isn't strictly a
problem as the address is never dereferenced, it is inconsistent with
the corresponding wait_on_bit() which requires "unsigned long *" and
does dereference the pointer.

Any code that needs to wait for a change in something other than an
unsigned long would be better served by wake_up_var()/wait_var_event().

This patch changes all related "void *" to "unsigned long *".

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240925053405.3960701-2-neilb@suse.de
---
 include/linux/wait_bit.h | 16 ++++++++--------
 kernel/sched/wait_bit.c  | 12 ++++++------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 7725b7579b78..48e123839892 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -8,7 +8,7 @@
 #include <linux/wait.h>
 
 struct wait_bit_key {
-	void			*flags;
+	unsigned long		*flags;
 	int			bit_nr;
 	unsigned long		timeout;
 };
@@ -23,14 +23,14 @@ struct wait_bit_queue_entry {
 
 typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);
 
-void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit);
+void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit);
 int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
 int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
-void wake_up_bit(void *word, int bit);
-int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
-int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
-int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
-struct wait_queue_head *bit_waitqueue(void *word, int bit);
+void wake_up_bit(unsigned long *word, int bit);
+int out_of_line_wait_on_bit(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode);
+int out_of_line_wait_on_bit_timeout(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
+int out_of_line_wait_on_bit_lock(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode);
+struct wait_queue_head *bit_waitqueue(unsigned long *word, int bit);
 extern void __init wait_bit_init(void);
 
 int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
@@ -327,7 +327,7 @@ do {									\
  * You can use this helper if bitflags are manipulated atomically rather than
  * non-atomically under a lock.
  */
-static inline void clear_and_wake_up_bit(int bit, void *word)
+static inline void clear_and_wake_up_bit(int bit, unsigned long *word)
 {
 	clear_bit_unlock(bit, word);
 	/* See wake_up_bit() for which memory barrier you need to use. */
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 134d7112ef71..058b0e18727e 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -9,7 +9,7 @@
 
 static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
 
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
+wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit)
 {
 	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
 	unsigned long val = (unsigned long)word << shift | bit;
@@ -55,7 +55,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
 }
 EXPORT_SYMBOL(__wait_on_bit);
 
-int __sched out_of_line_wait_on_bit(void *word, int bit,
+int __sched out_of_line_wait_on_bit(unsigned long *word, int bit,
 				    wait_bit_action_f *action, unsigned mode)
 {
 	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -66,7 +66,7 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
 EXPORT_SYMBOL(out_of_line_wait_on_bit);
 
 int __sched out_of_line_wait_on_bit_timeout(
-	void *word, int bit, wait_bit_action_f *action,
+	unsigned long *word, int bit, wait_bit_action_f *action,
 	unsigned mode, unsigned long timeout)
 {
 	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -108,7 +108,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry
 }
 EXPORT_SYMBOL(__wait_on_bit_lock);
 
-int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit,
 					 wait_bit_action_f *action, unsigned mode)
 {
 	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -118,7 +118,7 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
 }
 EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
 
-void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
+void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit)
 {
 	struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
 
@@ -144,7 +144,7 @@ EXPORT_SYMBOL(__wake_up_bit);
  * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
  * because spin_unlock() does not guarantee a memory barrier.
  */
-void wake_up_bit(void *word, int bit)
+void wake_up_bit(unsigned long *word, int bit)
 {
 	__wake_up_bit(bit_waitqueue(word, bit), word, bit);
 }
-- 
cgit v1.2.3


From 3cdee6b359f134da22f7fd4606e0338413cfd79e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Sep 2024 15:31:39 +1000
Subject: sched: Improve documentation for wake_up_bit/wait_on_bit family of
 functions

This patch revises the documention for wake_up_bit(),
clear_and_wake_up_bit(), and all the wait_on_bit() family of functions.

The new documentation places less emphasis on the pool of waitqueues
used (an implementation detail) and focuses instead on details of how
the functions behave.

The barriers included in the wait functions and clear_and_wake_up_bit()
and those required for wake_up_bit() are spelled out more clearly.

The error statuses returned are given explicitly.

The fact that the wait_on_bit_lock() function sets the bit is made more
obvious.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240925053405.3960701-3-neilb@suse.de
---
 include/linux/wait_bit.h | 159 +++++++++++++++++++++++++----------------------
 kernel/sched/wait_bit.c  |  34 ++++++----
 2 files changed, 107 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 48e123839892..723e7bf35747 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -53,19 +53,21 @@ extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode);
 
 /**
  * wait_on_bit - wait for a bit to be cleared
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
+ * @word: the address containing the bit being waited on
+ * @bit: the bit at that address being waited on
  * @mode: the task state to sleep in
  *
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that waits on a bit.
- * For instance, if one were to have waiters on a bitflag, one would
- * call wait_on_bit() in threads waiting for the bit to clear.
- * One uses wait_on_bit() where one is waiting for the bit to clear,
- * but has no intention of setting it.
- * Returned value will be zero if the bit was cleared, or non-zero
- * if the process received a signal and the mode permitted wakeup
- * on that signal.
+ * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP())
+ * to be cleared.  The clearing of the bit must be signalled with
+ * wake_up_bit(), often as clear_and_wake_up_bit().
+ *
+ * The process will wait on a waitqueue selected by hash from a shared
+ * pool.  It will only be woken on a wake_up for the target bit, even
+ * if other processes on the same queue are waiting for other bits.
+ *
+ * Returned value will be zero if the bit was cleared in which case the
+ * call has ACQUIRE semantics, or %-EINTR if the process received a
+ * signal and the mode permitted wake up on that signal.
  */
 static inline int
 wait_on_bit(unsigned long *word, int bit, unsigned mode)
@@ -80,17 +82,20 @@ wait_on_bit(unsigned long *word, int bit, unsigned mode)
 
 /**
  * wait_on_bit_io - wait for a bit to be cleared
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
+ * @word: the address containing the bit being waited on
+ * @bit: the bit at that address being waited on
  * @mode: the task state to sleep in
  *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared.  This is similar to wait_on_bit(), but calls
- * io_schedule() instead of schedule() for the actual waiting.
+ * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP())
+ * to be cleared.  The clearing of the bit must be signalled with
+ * wake_up_bit(), often as clear_and_wake_up_bit().
+ *
+ * This is similar to wait_on_bit(), but calls io_schedule() instead of
+ * schedule() for the actual waiting.
  *
- * Returned value will be zero if the bit was cleared, or non-zero
- * if the process received a signal and the mode permitted wakeup
- * on that signal.
+ * Returned value will be zero if the bit was cleared in which case the
+ * call has ACQUIRE semantics, or %-EINTR if the process received a
+ * signal and the mode permitted wake up on that signal.
  */
 static inline int
 wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
@@ -104,19 +109,24 @@ wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
 }
 
 /**
- * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
+ * wait_on_bit_timeout - wait for a bit to be cleared or a timeout to elapse
+ * @word: the address containing the bit being waited on
+ * @bit: the bit at that address being waited on
  * @mode: the task state to sleep in
  * @timeout: timeout, in jiffies
  *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared. This is similar to wait_on_bit(), except also takes a
- * timeout parameter.
+ * Wait for the given bit in an unsigned long or bitmap (see
+ * DECLARE_BITMAP()) to be cleared, or for a timeout to expire.  The
+ * clearing of the bit must be signalled with wake_up_bit(), often as
+ * clear_and_wake_up_bit().
  *
- * Returned value will be zero if the bit was cleared before the
- * @timeout elapsed, or non-zero if the @timeout elapsed or process
- * received a signal and the mode permitted wakeup on that signal.
+ * This is similar to wait_on_bit(), except it also takes a timeout
+ * parameter.
+ *
+ * Returned value will be zero if the bit was cleared in which case the
+ * call has ACQUIRE semantics, or %-EINTR if the process received a
+ * signal and the mode permitted wake up on that signal, or %-EAGAIN if the
+ * timeout elapsed.
  */
 static inline int
 wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
@@ -132,19 +142,21 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
 
 /**
  * wait_on_bit_action - wait for a bit to be cleared
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
+ * @word: the address containing the bit waited on
+ * @bit: the bit at that address being waited on
  * @action: the function used to sleep, which may take special actions
  * @mode: the task state to sleep in
  *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared, and allow the waiting action to be specified.
- * This is like wait_on_bit() but allows fine control of how the waiting
- * is done.
+ * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP())
+ * to be cleared.  The clearing of the bit must be signalled with
+ * wake_up_bit(), often as clear_and_wake_up_bit().
+ *
+ * This is similar to wait_on_bit(), but calls @action() instead of
+ * schedule() for the actual waiting.
  *
- * Returned value will be zero if the bit was cleared, or non-zero
- * if the process received a signal and the mode permitted wakeup
- * on that signal.
+ * Returned value will be zero if the bit was cleared in which case the
+ * call has ACQUIRE semantics, or the error code returned by @action if
+ * that call returned non-zero.
  */
 static inline int
 wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
@@ -157,23 +169,22 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
 }
 
 /**
- * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
+ * wait_on_bit_lock - wait for a bit to be cleared, then set it
+ * @word: the address containing the bit being waited on
+ * @bit: the bit of the word being waited on and set
  * @mode: the task state to sleep in
  *
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that waits on a bit
- * when one intends to set it, for instance, trying to lock bitflags.
- * For instance, if one were to have waiters trying to set bitflag
- * and waiting for it to clear before setting it, one would call
- * wait_on_bit() in threads waiting to be able to set the bit.
- * One uses wait_on_bit_lock() where one is waiting for the bit to
- * clear with the intention of setting it, and when done, clearing it.
+ * Wait for the given bit in an unsigned long or bitmap (see
+ * DECLARE_BITMAP()) to be cleared.  The clearing of the bit must be
+ * signalled with wake_up_bit(), often as clear_and_wake_up_bit().  As
+ * soon as it is clear, atomically set it and return.
  *
- * Returns zero if the bit was (eventually) found to be clear and was
- * set.  Returns non-zero if a signal was delivered to the process and
- * the @mode allows that signal to wake the process.
+ * This is similar to wait_on_bit(), but sets the bit before returning.
+ *
+ * Returned value will be zero if the bit was successfully set in which
+ * case the call has the same memory sequencing semantics as
+ * test_and_clear_bit(), or %-EINTR if the process received a signal and
+ * the mode permitted wake up on that signal.
  */
 static inline int
 wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
@@ -185,15 +196,18 @@ wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
 }
 
 /**
- * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
+ * wait_on_bit_lock_io - wait for a bit to be cleared, then set it
+ * @word: the address containing the bit being waited on
+ * @bit: the bit of the word being waited on and set
  * @mode: the task state to sleep in
  *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared and then to atomically set it.  This is similar
- * to wait_on_bit(), but calls io_schedule() instead of schedule()
- * for the actual waiting.
+ * Wait for the given bit in an unsigned long or bitmap (see
+ * DECLARE_BITMAP()) to be cleared.  The clearing of the bit must be
+ * signalled with wake_up_bit(), often as clear_and_wake_up_bit().  As
+ * soon as it is clear, atomically set it and return.
+ *
+ * This is similar to wait_on_bit_lock(), but calls io_schedule() instead
+ * of schedule().
  *
  * Returns zero if the bit was (eventually) found to be clear and was
  * set.  Returns non-zero if a signal was delivered to the process and
@@ -209,21 +223,19 @@ wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
 }
 
 /**
- * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
+ * wait_on_bit_lock_action - wait for a bit to be cleared, then set it
+ * @word: the address containing the bit being waited on
+ * @bit: the bit of the word being waited on and set
  * @action: the function used to sleep, which may take special actions
  * @mode: the task state to sleep in
  *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared and then to set it, and allow the waiting action
- * to be specified.
- * This is like wait_on_bit() but allows fine control of how the waiting
- * is done.
+ * This is similar to wait_on_bit_lock(), but calls @action() instead of
+ * schedule() for the actual waiting.
  *
- * Returns zero if the bit was (eventually) found to be clear and was
- * set.  Returns non-zero if a signal was delivered to the process and
- * the @mode allows that signal to wake the process.
+ * Returned value will be zero if the bit was successfully set in which
+ * case the call has the same memory sequencing semantics as
+ * test_and_clear_bit(), or the error code returned by @action if that
+ * call returned non-zero.
  */
 static inline int
 wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
@@ -320,12 +332,13 @@ do {									\
 
 /**
  * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit
- *
  * @bit: the bit of the word being waited on
- * @word: the word being waited on, a kernel virtual address
+ * @word: the address containing the bit being waited on
  *
- * You can use this helper if bitflags are manipulated atomically rather than
- * non-atomically under a lock.
+ * The designated bit is cleared and any tasks waiting in wait_on_bit()
+ * or similar will be woken.  This call has RELEASE semantics so that
+ * any changes to memory made before this call are guaranteed to be visible
+ * after the corresponding wait_on_bit() completes.
  */
 static inline void clear_and_wake_up_bit(int bit, unsigned long *word)
 {
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 058b0e18727e..bd2fc750fb1f 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -128,21 +128,29 @@ void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit
 EXPORT_SYMBOL(__wake_up_bit);
 
 /**
- * wake_up_bit - wake up a waiter on a bit
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
+ * wake_up_bit - wake up waiters on a bit
+ * @word: the address containing the bit being waited on
+ * @bit: the bit at that address being waited on
  *
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hash-table's accessor API that wakes up waiters
- * on a bit. For instance, if one were to have waiters on a bitflag,
- * one would call wake_up_bit() after clearing the bit.
+ * Wake up any process waiting in wait_on_bit() or similar for the
+ * given bit to be cleared.
  *
- * In order for this to function properly, as it uses waitqueue_active()
- * internally, some kind of memory barrier must be done prior to calling
- * this. Typically, this will be smp_mb__after_atomic(), but in some
- * cases where bitflags are manipulated non-atomically under a lock, one
- * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
- * because spin_unlock() does not guarantee a memory barrier.
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool.  Only those tasks on that queue which have requested
+ * wake_up on this specific address and bit will be woken, and only if the
+ * bit is clear.
+ *
+ * In order for this to function properly there must be a full memory
+ * barrier after the bit is cleared and before this function is called.
+ * If the bit was cleared atomically, such as a by clear_bit() then
+ * smb_mb__after_atomic() can be used, othwewise smb_mb() is needed.
+ * If the bit was cleared with a fully-ordered operation, no further
+ * barrier is required.
+ *
+ * Normally the bit should be cleared by an operation with RELEASE
+ * semantics so that any changes to memory made before the bit is
+ * cleared are guaranteed to be visible after the matching wait_on_bit()
+ * completes.
  */
 void wake_up_bit(unsigned long *word, int bit)
 {
-- 
cgit v1.2.3


From bf39882edc798279765ca31751f6e679b50b97ef Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Sep 2024 15:31:40 +1000
Subject: sched: Document wait_var_event() family of functions and
 wake_up_var()

wake_up_var(), wait_var_event() and related interfaces are not
documented but have important ordering requirements.  This patch adds
documentation and makes these requirements explicit.

The return values for those wait_var_event_* functions which return a
value are documented.  Note that these are, perhaps surprisingly,
sometimes different from comparable wait_on_bit() functions.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240925053405.3960701-4-neilb@suse.de
---
 include/linux/wait_bit.h | 71 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/wait_bit.c  | 30 ++++++++++++++++++++
 2 files changed, 101 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 723e7bf35747..06ec99b90bf3 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -282,6 +282,22 @@ __out:	__ret;								\
 	___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
 			  schedule())
 
+/**
+ * wait_var_event - wait for a variable to be updated and notified
+ * @var: the address of variable being waited on
+ * @condition: the condition to wait for
+ *
+ * Wait for a @condition to be true, only re-checking when a wake up is
+ * received for the given @var (an arbitrary kernel address which need
+ * not be directly related to the given condition, but usually is).
+ *
+ * The process will wait on a waitqueue selected by hash from a shared
+ * pool.  It will only be woken on a wake_up for the given address.
+ *
+ * The condition should normally use smp_load_acquire() or a similarly
+ * ordered access to ensure that any changes to memory made before the
+ * condition became true will be visible after the wait completes.
+ */
 #define wait_var_event(var, condition)					\
 do {									\
 	might_sleep();							\
@@ -294,6 +310,24 @@ do {									\
 	___wait_var_event(var, condition, TASK_KILLABLE, 0, 0,		\
 			  schedule())
 
+/**
+ * wait_var_event_killable - wait for a variable to be updated and notified
+ * @var: the address of variable being waited on
+ * @condition: the condition to wait for
+ *
+ * Wait for a @condition to be true or a fatal signal to be received,
+ * only re-checking the condition when a wake up is received for the given
+ * @var (an arbitrary kernel address which need not be directly related
+ * to the given condition, but usually is).
+ *
+ * This is similar to wait_var_event() but returns a value which is
+ * 0 if the condition became true, or %-ERESTARTSYS if a fatal signal
+ * was received.
+ *
+ * The condition should normally use smp_load_acquire() or a similarly
+ * ordered access to ensure that any changes to memory made before the
+ * condition became true will be visible after the wait completes.
+ */
 #define wait_var_event_killable(var, condition)				\
 ({									\
 	int __ret = 0;							\
@@ -308,6 +342,26 @@ do {									\
 			  TASK_UNINTERRUPTIBLE, 0, timeout,		\
 			  __ret = schedule_timeout(__ret))
 
+/**
+ * wait_var_event_timeout - wait for a variable to be updated or a timeout to expire
+ * @var: the address of variable being waited on
+ * @condition: the condition to wait for
+ * @timeout: maximum time to wait in jiffies
+ *
+ * Wait for a @condition to be true or a timeout to expire, only
+ * re-checking the condition when a wake up is received for the given
+ * @var (an arbitrary kernel address which need not be directly related
+ * to the given condition, but usually is).
+ *
+ * This is similar to wait_var_event() but returns a value which is 0 if
+ * the timeout expired and the condition was still false, or the
+ * remaining time left in the timeout (but at least 1) if the condition
+ * was found to be true.
+ *
+ * The condition should normally use smp_load_acquire() or a similarly
+ * ordered access to ensure that any changes to memory made before the
+ * condition became true will be visible after the wait completes.
+ */
 #define wait_var_event_timeout(var, condition, timeout)			\
 ({									\
 	long __ret = timeout;						\
@@ -321,6 +375,23 @@ do {									\
 	___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0,	\
 			  schedule())
 
+/**
+ * wait_var_event_killable - wait for a variable to be updated and notified
+ * @var: the address of variable being waited on
+ * @condition: the condition to wait for
+ *
+ * Wait for a @condition to be true or a signal to be received, only
+ * re-checking the condition when a wake up is received for the given
+ * @var (an arbitrary kernel address which need not be directly related
+ * to the given condition, but usually is).
+ *
+ * This is similar to wait_var_event() but returns a value which is 0 if
+ * the condition became true, or %-ERESTARTSYS if a signal was received.
+ *
+ * The condition should normally use smp_load_acquire() or a similarly
+ * ordered access to ensure that any changes to memory made before the
+ * condition became true will be visible after the wait completes.
+ */
 #define wait_var_event_interruptible(var, condition)			\
 ({									\
 	int __ret = 0;							\
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index bd2fc750fb1f..22ec270f5ab5 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -196,6 +196,36 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int
 }
 EXPORT_SYMBOL(init_wait_var_entry);
 
+/**
+ * wake_up_var - wake up waiters on a variable (kernel address)
+ * @var: the address of the variable being waited on
+ *
+ * Wake up any process waiting in wait_var_event() or similar for the
+ * given variable to change.  wait_var_event() can be waiting for an
+ * arbitrary condition to be true and associates that condition with an
+ * address.  Calling wake_up_var() suggests that the condition has been
+ * made true, but does not strictly require the condtion to use the
+ * address given.
+ *
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool.  Only those tasks on that queue which have requested
+ * wake_up on this specific address will be woken.
+ *
+ * In order for this to function properly there must be a full memory
+ * barrier after the variable is updated (or more accurately, after the
+ * condition waited on has been made to be true) and before this function
+ * is called.  If the variable was updated atomically, such as a by
+ * atomic_dec() then smb_mb__after_atomic() can be used.  If the
+ * variable was updated by a fully ordered operation such as
+ * atomic_dec_and_test() then no extra barrier is required.  Otherwise
+ * smb_mb() is needed.
+ *
+ * Normally the variable should be updated (the condition should be made
+ * to be true) by an operation with RELEASE semantics such as
+ * smp_store_release() so that any changes to memory made before the
+ * variable was updated are guaranteed to be visible after the matching
+ * wait_var_event() completes.
+ */
 void wake_up_var(void *var)
 {
 	__wake_up_bit(__var_waitqueue(var), var, -1);
-- 
cgit v1.2.3


From 52d633def56c10fe3e82a2c5d88c3ecb3f4e4852 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Sep 2024 15:31:41 +1000
Subject: sched: Add test_and_clear_wake_up_bit() and atomic_dec_and_wake_up()

There are common patterns in the kernel of using test_and_clear_bit()
before wake_up_bit(), and atomic_dec_and_test() before wake_up_var().

These combinations don't need extra barriers but sometimes include them
unnecessarily.

To help avoid the unnecessary barriers and to help discourage the
general use of wake_up_bit/var (which is a fragile interface) introduce
two combined functions which implement these patterns.

Also add store_release_wake_up() which supports the task of simply
setting a non-atomic variable and sending a wakeup.  This pattern
requires barriers which are often omitted.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240925053405.3960701-5-neilb@suse.de
---
 include/linux/wait_bit.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 06ec99b90bf3..0272629b590a 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -419,4 +419,64 @@ static inline void clear_and_wake_up_bit(int bit, unsigned long *word)
 	wake_up_bit(word, bit);
 }
 
+/**
+ * test_and_clear_wake_up_bit - clear a bit if it was set: wake up anyone waiting on that bit
+ * @bit: the bit of the word being waited on
+ * @word: the address of memory containing that bit
+ *
+ * If the bit is set and can be atomically cleared, any tasks waiting in
+ * wait_on_bit() or similar will be woken.  This call has the same
+ * complete ordering semantics as test_and_clear_bit().  Any changes to
+ * memory made before this call are guaranteed to be visible after the
+ * corresponding wait_on_bit() completes.
+ *
+ * Returns %true if the bit was successfully set and the wake up was sent.
+ */
+static inline bool test_and_clear_wake_up_bit(int bit, unsigned long *word)
+{
+	if (!test_and_clear_bit(bit, word))
+		return false;
+	/* no extra barrier required */
+	wake_up_bit(word, bit);
+	return true;
+}
+
+/**
+ * atomic_dec_and_wake_up - decrement an atomic_t and if zero, wake up waiters
+ * @var: the variable to dec and test
+ *
+ * Decrements the atomic variable and if it reaches zero, send a wake_up to any
+ * processes waiting on the variable.
+ *
+ * This function has the same complete ordering semantics as atomic_dec_and_test.
+ *
+ * Returns %true is the variable reaches zero and the wake up was sent.
+ */
+
+static inline bool atomic_dec_and_wake_up(atomic_t *var)
+{
+	if (!atomic_dec_and_test(var))
+		return false;
+	/* No extra barrier required */
+	wake_up_var(var);
+	return true;
+}
+
+/**
+ * store_release_wake_up - update a variable and send a wake_up
+ * @var: the address of the variable to be updated and woken
+ * @val: the value to store in the variable.
+ *
+ * Store the given value in the variable send a wake up to any tasks
+ * waiting on the variable.  All necessary barriers are included to ensure
+ * the task calling wait_var_event() sees the new value and all values
+ * written to memory before this call.
+ */
+#define store_release_wake_up(var, val)					\
+do {									\
+	smp_store_release(var, val);					\
+	smp_mb();							\
+	wake_up_var(var);						\
+} while (0)
+
 #endif /* _LINUX_WAIT_BIT_H */
-- 
cgit v1.2.3


From cc2e1c82d7e474753681a38b07b63034e107e369 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Sep 2024 15:31:42 +1000
Subject: sched: Add wait/wake interface for variable updated under a lock.

Sometimes we need to wait for a condition to be true which must be
testing while holding a lock.  Correspondingly the condition is made
true while holding the lock and the wake up is sent under the lock.

This patch provides wake and wait interfaces which can be used for this
situation when the lock is a mutex or a spinlock, or any other lock for
which there are foo_lock() and foo_unlock() functions.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240925053405.3960701-6-neilb@suse.de
---
 include/linux/wait_bit.h | 106 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 0272629b590a..6aea10efca3d 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -401,6 +401,112 @@ do {									\
 	__ret;								\
 })
 
+/**
+ * wait_var_event_any_lock - wait for a variable to be updated under a lock
+ * @var: the address of the variable being waited on
+ * @condition: condition to wait for
+ * @lock: the object that is locked to protect updates to the variable
+ * @type: prefix on lock and unlock operations
+ * @state: waiting state, %TASK_UNINTERRUPTIBLE etc.
+ *
+ * Wait for a condition which can only be reliably tested while holding
+ * a lock.  The variables assessed in the condition will normal be updated
+ * under the same lock, and the wake up should be signalled with
+ * wake_up_var_locked() under the same lock.
+ *
+ * This is similar to wait_var_event(), but assumes a lock is held
+ * while calling this function and while updating the variable.
+ *
+ * This must be called while the given lock is held and the lock will be
+ * dropped when schedule() is called to wait for a wake up, and will be
+ * reclaimed before testing the condition again.  The functions used to
+ * unlock and lock the object are constructed by appending _unlock and _lock
+ * to @type.
+ *
+ * Return %-ERESTARTSYS if a signal arrives which is allowed to interrupt
+ * the wait according to @state.
+ */
+#define wait_var_event_any_lock(var, condition, lock, type, state)	\
+({									\
+	int __ret = 0;							\
+	if (!(condition))						\
+		__ret = ___wait_var_event(var, condition, state, 0, 0,	\
+					  type ## _unlock(lock);	\
+					  schedule();			\
+					  type ## _lock(lock));		\
+	__ret;								\
+})
+
+/**
+ * wait_var_event_spinlock - wait for a variable to be updated under a spinlock
+ * @var: the address of the variable being waited on
+ * @condition: condition to wait for
+ * @lock: the spinlock which protects updates to the variable
+ *
+ * Wait for a condition which can only be reliably tested while holding
+ * a spinlock.  The variables assessed in the condition will normal be updated
+ * under the same spinlock, and the wake up should be signalled with
+ * wake_up_var_locked() under the same spinlock.
+ *
+ * This is similar to wait_var_event(), but assumes a spinlock is held
+ * while calling this function and while updating the variable.
+ *
+ * This must be called while the given lock is held and the lock will be
+ * dropped when schedule() is called to wait for a wake up, and will be
+ * reclaimed before testing the condition again.
+ */
+#define wait_var_event_spinlock(var, condition, lock)			\
+	wait_var_event_any_lock(var, condition, lock, spin, TASK_UNINTERRUPTIBLE)
+
+/**
+ * wait_var_event_mutex - wait for a variable to be updated under a mutex
+ * @var: the address of the variable being waited on
+ * @condition: condition to wait for
+ * @mutex: the mutex which protects updates to the variable
+ *
+ * Wait for a condition which can only be reliably tested while holding
+ * a mutex.  The variables assessed in the condition will normal be
+ * updated under the same mutex, and the wake up should be signalled
+ * with wake_up_var_locked() under the same mutex.
+ *
+ * This is similar to wait_var_event(), but assumes a mutex is held
+ * while calling this function and while updating the variable.
+ *
+ * This must be called while the given mutex is held and the mutex will be
+ * dropped when schedule() is called to wait for a wake up, and will be
+ * reclaimed before testing the condition again.
+ */
+#define wait_var_event_mutex(var, condition, lock)			\
+	wait_var_event_any_lock(var, condition, lock, mutex, TASK_UNINTERRUPTIBLE)
+
+/**
+ * wake_up_var_protected - wake up waiters for a variable asserting that it is safe
+ * @var: the address of the variable being waited on
+ * @cond: the condition which afirms this is safe
+ *
+ * When waking waiters which use wait_var_event_any_lock() the waker must be
+ * holding the reelvant lock to avoid races.  This version of wake_up_var()
+ * asserts that the relevant lock is held and so no barrier is needed.
+ * The @cond is only tested when CONFIG_LOCKDEP is enabled.
+ */
+#define wake_up_var_protected(var, cond)				\
+do {									\
+	lockdep_assert(cond);						\
+	wake_up_var(var);						\
+} while (0)
+
+/**
+ * wake_up_var_locked - wake up waiters for a variable while holding a spinlock or mutex
+ * @var: the address of the variable being waited on
+ * @lock: The spinlock or mutex what protects the variable
+ *
+ * Send a wake up for the given variable which should be waited for with
+ * wait_var_event_spinlock() or wait_var_event_mutex().  Unlike wake_up_var(),
+ * no extra barriers are needed as the locking provides sufficient sequencing.
+ */
+#define wake_up_var_locked(var, lock)					\
+	wake_up_var_protected(var, lockdep_is_held(lock))
+
 /**
  * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit
  * @bit: the bit of the word being waited on
-- 
cgit v1.2.3


From 80681c04c5e8e4297b9ebf201ca3ce6242aa16c3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Sep 2024 15:31:43 +1000
Subject: sched: add wait_var_event_io()

It is not currently possible to wait wait_var_event for an io_schedule()
style wait.  This patch adds wait_var_event_io() for that purpose.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240925053405.3960701-7-neilb@suse.de
---
 include/linux/wait_bit.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 6aea10efca3d..6346e26fbfd1 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -281,6 +281,9 @@ __out:	__ret;								\
 #define __wait_var_event(var, condition)				\
 	___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
 			  schedule())
+#define __wait_var_event_io(var, condition)				\
+	___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
+			  io_schedule())
 
 /**
  * wait_var_event - wait for a variable to be updated and notified
@@ -306,6 +309,34 @@ do {									\
 	__wait_var_event(var, condition);				\
 } while (0)
 
+/**
+ * wait_var_event_io - wait for a variable to be updated and notified
+ * @var: the address of variable being waited on
+ * @condition: the condition to wait for
+ *
+ * Wait for an IO related @condition to be true, only re-checking when a
+ * wake up is received for the given @var (an arbitrary kernel address
+ * which need not be directly related to the given condition, but
+ * usually is).
+ *
+ * The process will wait on a waitqueue selected by hash from a shared
+ * pool.  It will only be woken on a wake_up for the given address.
+ *
+ * This is similar to wait_var_event(), but calls io_schedule() instead
+ * of schedule().
+ *
+ * The condition should normally use smp_load_acquire() or a similarly
+ * ordered access to ensure that any changes to memory made before the
+ * condition became true will be visible after the wait completes.
+ */
+#define wait_var_event_io(var, condition)				\
+do {									\
+	might_sleep();							\
+	if (condition)							\
+		break;							\
+	__wait_var_event_io(var, condition);				\
+} while (0)
+
 #define __wait_var_event_killable(var, condition)			\
 	___wait_var_event(var, condition, TASK_KILLABLE, 0, 0,		\
 			  schedule())
-- 
cgit v1.2.3


From 5e9f0c4819deb9459f32f12c4fd2b47993b8c395 Mon Sep 17 00:00:00 2001
From: David Disseldorp <ddiss@suse.de>
Date: Mon, 30 Sep 2024 05:09:46 +0000
Subject: sched: remove unused __HAVE_THREAD_FUNCTIONS hook support

__HAVE_THREAD_FUNCTIONS could be defined by architectures wishing to
provide their own task_thread_info(), task_stack_page(),
setup_thread_stack() and end_of_stack() hooks.

Commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture")
removed the last upstream consumer of __HAVE_THREAD_FUNCTIONS, so change
the remaining !CONFIG_THREAD_INFO_IN_TASK && !__HAVE_THREAD_FUNCTIONS
conditionals to only check for the former case.

Signed-off-by: David Disseldorp <ddiss@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lkml.kernel.org/r/20240930050945.30304-2-ddiss@suse.de
---
 include/linux/sched.h            | 2 +-
 include/linux/sched/task_stack.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e6ee4258169a..abf26f1e1447 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1898,7 +1898,7 @@ extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 # define task_thread_info(task)	(&(task)->thread_info)
-#elif !defined(__HAVE_THREAD_FUNCTIONS)
+#else
 # define task_thread_info(task)	((struct thread_info *)(task)->stack)
 #endif
 
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index bf10bdb487dd..2e52cc421bce 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -33,7 +33,7 @@ static __always_inline unsigned long *end_of_stack(const struct task_struct *tas
 #endif
 }
 
-#elif !defined(__HAVE_THREAD_FUNCTIONS)
+#else
 
 #define task_stack_page(task)	((void *)(task)->stack)
 
-- 
cgit v1.2.3


From 0ac8f14ef22a1592b44dc90272aab35e43b0106a Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Wed, 2 Oct 2024 00:40:16 +0100
Subject: sched/wait: Remove unused bit_wait_io_timeout

bit_wait_io_timeout has been unused since 2016's
commit 62906027091f ("mm: add PageWaiters indicating tasks are waiting for a page bit")

Remove it.

Signed-off-by: "Dr. David Alan Gilbert" <linux@treblig.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Link: https://lore.kernel.org/r/20241001234016.231696-1-linux@treblig.org
---
 include/linux/wait_bit.h |  1 -
 kernel/sched/wait_bit.c  | 14 --------------
 2 files changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 6346e26fbfd1..9e29d79fc790 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -49,7 +49,6 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 extern int bit_wait(struct wait_bit_key *key, int mode);
 extern int bit_wait_io(struct wait_bit_key *key, int mode);
 extern int bit_wait_timeout(struct wait_bit_key *key, int mode);
-extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode);
 
 /**
  * wait_on_bit - wait for a bit to be cleared
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 22ec270f5ab5..b410b61cec95 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -266,20 +266,6 @@ __sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
 }
 EXPORT_SYMBOL_GPL(bit_wait_timeout);
 
-__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
-{
-	unsigned long now = READ_ONCE(jiffies);
-
-	if (time_after_eq(now, word->timeout))
-		return -EAGAIN;
-	io_schedule_timeout(word->timeout - now);
-	if (signal_pending_state(mode, current))
-		return -EINTR;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
-
 void __init wait_bit_init(void)
 {
 	int i;
-- 
cgit v1.2.3


From 066c779b094b63754e0742ad8675d72d6c0a46f6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 3 Oct 2024 18:48:19 +0300
Subject: platform/x86: intel_scu_ipc: Don't use "proxy" headers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update header inclusions to follow IWYU (Include What You Use)
principle.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Link: https://lore.kernel.org/r/20241003154819.1075141-1-andriy.shevchenko@linux.intel.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/platform_data/x86/intel_scu_ipc.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/intel_scu_ipc.h b/include/linux/platform_data/x86/intel_scu_ipc.h
index 0ca9962e97f2..b287627759f7 100644
--- a/include/linux/platform_data/x86/intel_scu_ipc.h
+++ b/include/linux/platform_data/x86/intel_scu_ipc.h
@@ -2,9 +2,13 @@
 #ifndef __PLATFORM_X86_INTEL_SCU_IPC_H_
 #define __PLATFORM_X86_INTEL_SCU_IPC_H_
 
+#include <linux/init.h>
 #include <linux/ioport.h>
+#include <linux/types.h>
 
 struct device;
+struct module;
+
 struct intel_scu_ipc_dev;
 
 /**
-- 
cgit v1.2.3


From 4e40eff0b5737c0de39e1ae5812509efbc0b986e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 2 Oct 2024 17:27:18 -0400
Subject: fs: add infrastructure for multigrain timestamps

The VFS has always used coarse-grained timestamps when updating the
ctime and mtime after a change. This has the benefit of allowing
filesystems to optimize away a lot metadata updates, down to around 1
per jiffy, even when a file is under heavy writes.

Unfortunately, this has always been an issue when we're exporting via
NFSv3, which relies on timestamps to validate caches. A lot of changes
can happen in a jiffy, so timestamps aren't sufficient to help the
client decide when to invalidate the cache. Even with NFSv4, a lot of
exported filesystems don't properly support a change attribute and are
subject to the same problems with timestamp granularity. Other
applications have similar issues with timestamps (e.g backup
applications).

If fine-grained timestamps were always used, that would improve the
situation, but that becomes rather expensive, as the underlying
filesystem would have to log a lot more metadata updates.

What is needed is a way to only use fine-grained timestamps when they
are being actively queried. Use the (unused) top bit in
inode->i_ctime_nsec as a flag that indicates whether the current
timestamps have been queried via stat() or the like. When it's set,
allow the update to use a fine-grained timestamp iff it's necessary to
make the ctime show a different value.

If it has been queried, then first see whether the current coarse time
is later than the existing ctime. If it is, accept that value.  If it
isn't, then get a fine-grained timestamp and attempt to stamp the inode
ctime with that value. If that races with another concurrent stamp, then
abandon the update and take the new value without retrying.

Filesystems can opt into this by setting the FS_MGTIME fstype flag.
Others should be unaffected (other than being subject to the same floor
value as multigrain filesystems).

Tested-by: Randy Dunlap <rdunlap@infradead.org> # documentation bits
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20241002-mgtime-v10-3-d1c4717f5284@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c         | 139 +++++++++++++++++++++++++++++++++++++++++++----------
 fs/stat.c          |  43 ++++++++++++++++-
 include/linux/fs.h |  34 ++++++++++---
 3 files changed, 181 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 10c4619faeef..53f56f6e1ff2 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2172,19 +2172,58 @@ int file_remove_privs(struct file *file)
 }
 EXPORT_SYMBOL(file_remove_privs);
 
+/**
+ * current_time - Return FS time (possibly fine-grained)
+ * @inode: inode.
+ *
+ * Return the current time truncated to the time granularity supported by
+ * the fs, as suitable for a ctime/mtime change. If the ctime is flagged
+ * as having been QUERIED, get a fine-grained timestamp, but don't update
+ * the floor.
+ *
+ * For a multigrain inode, this is effectively an estimate of the timestamp
+ * that a file would receive. An actual update must go through
+ * inode_set_ctime_current().
+ */
+struct timespec64 current_time(struct inode *inode)
+{
+	struct timespec64 now;
+	u32 cns;
+
+	ktime_get_coarse_real_ts64_mg(&now);
+
+	if (!is_mgtime(inode))
+		goto out;
+
+	/* If nothing has queried it, then coarse time is fine */
+	cns = smp_load_acquire(&inode->i_ctime_nsec);
+	if (cns & I_CTIME_QUERIED) {
+		/*
+		 * If there is no apparent change, then get a fine-grained
+		 * timestamp.
+		 */
+		if (now.tv_nsec == (cns & ~I_CTIME_QUERIED))
+			ktime_get_real_ts64(&now);
+	}
+out:
+	return timestamp_truncate(now, inode);
+}
+EXPORT_SYMBOL(current_time);
+
 static int inode_needs_update_time(struct inode *inode)
 {
+	struct timespec64 now, ts;
 	int sync_it = 0;
-	struct timespec64 now = current_time(inode);
-	struct timespec64 ts;
 
 	/* First try to exhaust all avenues to not sync */
 	if (IS_NOCMTIME(inode))
 		return 0;
 
+	now = current_time(inode);
+
 	ts = inode_get_mtime(inode);
 	if (!timespec64_equal(&ts, &now))
-		sync_it = S_MTIME;
+		sync_it |= S_MTIME;
 
 	ts = inode_get_ctime(inode);
 	if (!timespec64_equal(&ts, &now))
@@ -2562,6 +2601,15 @@ void inode_nohighmem(struct inode *inode)
 }
 EXPORT_SYMBOL(inode_nohighmem);
 
+struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts)
+{
+	set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec);
+	inode->i_ctime_sec = ts.tv_sec;
+	inode->i_ctime_nsec = ts.tv_nsec;
+	return ts;
+}
+EXPORT_SYMBOL(inode_set_ctime_to_ts);
+
 /**
  * timestamp_truncate - Truncate timespec to a granularity
  * @t: Timespec
@@ -2594,36 +2642,77 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
 EXPORT_SYMBOL(timestamp_truncate);
 
 /**
- * current_time - Return FS time
- * @inode: inode.
+ * inode_set_ctime_current - set the ctime to current_time
+ * @inode: inode
  *
- * Return the current time truncated to the time granularity supported by
- * the fs.
+ * Set the inode's ctime to the current value for the inode. Returns the
+ * current value that was assigned. If this is not a multigrain inode, then we
+ * set it to the later of the coarse time and floor value.
  *
- * Note that inode and inode->sb cannot be NULL.
- * Otherwise, the function warns and returns time without truncation.
+ * If it is multigrain, then we first see if the coarse-grained timestamp is
+ * distinct from what is already there. If so, then use that. Otherwise, get a
+ * fine-grained timestamp.
+ *
+ * After that, try to swap the new value into i_ctime_nsec. Accept the
+ * resulting ctime, regardless of the outcome of the swap. If it has
+ * already been replaced, then that timestamp is later than the earlier
+ * unacceptable one, and is thus acceptable.
  */
-struct timespec64 current_time(struct inode *inode)
+struct timespec64 inode_set_ctime_current(struct inode *inode)
 {
 	struct timespec64 now;
+	u32 cns, cur;
 
-	ktime_get_coarse_real_ts64(&now);
-	return timestamp_truncate(now, inode);
-}
-EXPORT_SYMBOL(current_time);
+	ktime_get_coarse_real_ts64_mg(&now);
+	now = timestamp_truncate(now, inode);
 
-/**
- * inode_set_ctime_current - set the ctime to current_time
- * @inode: inode
- *
- * Set the inode->i_ctime to the current value for the inode. Returns
- * the current value that was assigned to i_ctime.
- */
-struct timespec64 inode_set_ctime_current(struct inode *inode)
-{
-	struct timespec64 now = current_time(inode);
+	/* Just return that if this is not a multigrain fs */
+	if (!is_mgtime(inode)) {
+		inode_set_ctime_to_ts(inode, now);
+		goto out;
+	}
 
-	inode_set_ctime_to_ts(inode, now);
+	/*
+	 * A fine-grained time is only needed if someone has queried
+	 * for timestamps, and the current coarse grained time isn't
+	 * later than what's already there.
+	 */
+	cns = smp_load_acquire(&inode->i_ctime_nsec);
+	if (cns & I_CTIME_QUERIED) {
+		struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec,
+					    .tv_nsec = cns & ~I_CTIME_QUERIED };
+
+		if (timespec64_compare(&now, &ctime) <= 0) {
+			ktime_get_real_ts64_mg(&now);
+			now = timestamp_truncate(now, inode);
+		}
+	}
+
+	/* No need to cmpxchg if it's exactly the same */
+	if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec)
+		goto out;
+	cur = cns;
+retry:
+	/* Try to swap the nsec value into place. */
+	if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) {
+		/* If swap occurred, then we're (mostly) done */
+		inode->i_ctime_sec = now.tv_sec;
+	} else {
+		/*
+		 * Was the change due to someone marking the old ctime QUERIED?
+		 * If so then retry the swap. This can only happen once since
+		 * the only way to clear I_CTIME_QUERIED is to stamp the inode
+		 * with a new ctime.
+		 */
+		if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) {
+			cns = cur;
+			goto retry;
+		}
+		/* Otherwise, keep the existing ctime */
+		now.tv_sec = inode->i_ctime_sec;
+		now.tv_nsec = cur & ~I_CTIME_QUERIED;
+	}
+out:
 	return now;
 }
 EXPORT_SYMBOL(inode_set_ctime_current);
diff --git a/fs/stat.c b/fs/stat.c
index 89ce1be56310..dd480bf51a2a 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -26,6 +26,39 @@
 #include "internal.h"
 #include "mount.h"
 
+/**
+ * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
+ * @stat: where to store the resulting values
+ * @request_mask: STATX_* values requested
+ * @inode: inode from which to grab the c/mtime
+ *
+ * Given @inode, grab the ctime and mtime out if it and store the result
+ * in @stat. When fetching the value, flag it as QUERIED (if not already)
+ * so the next write will record a distinct timestamp.
+ *
+ * NB: The QUERIED flag is tracked in the ctime, but we set it there even
+ * if only the mtime was requested, as that ensures that the next mtime
+ * change will be distinct.
+ */
+void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
+{
+	atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec;
+
+	/* If neither time was requested, then don't report them */
+	if (!(request_mask & (STATX_CTIME|STATX_MTIME))) {
+		stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
+		return;
+	}
+
+	stat->mtime = inode_get_mtime(inode);
+	stat->ctime.tv_sec = inode->i_ctime_sec;
+	stat->ctime.tv_nsec = (u32)atomic_read(pcn);
+	if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED))
+		stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn));
+	stat->ctime.tv_nsec &= ~I_CTIME_QUERIED;
+}
+EXPORT_SYMBOL(fill_mg_cmtime);
+
 /**
  * generic_fillattr - Fill in the basic attributes from the inode struct
  * @idmap:		idmap of the mount the inode was found from
@@ -58,8 +91,14 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
 	stat->rdev = inode->i_rdev;
 	stat->size = i_size_read(inode);
 	stat->atime = inode_get_atime(inode);
-	stat->mtime = inode_get_mtime(inode);
-	stat->ctime = inode_get_ctime(inode);
+
+	if (is_mgtime(inode)) {
+		fill_mg_cmtime(stat, request_mask, inode);
+	} else {
+		stat->ctime = inode_get_ctime(inode);
+		stat->mtime = inode_get_mtime(inode);
+	}
+
 	stat->blksize = i_blocksize(inode);
 	stat->blocks = inode->i_blocks;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6ca11e241a24..eff688e75f2f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1613,6 +1613,17 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode,
 	return inode_set_mtime_to_ts(inode, ts);
 }
 
+/*
+ * Multigrain timestamps
+ *
+ * Conditionally use fine-grained ctime and mtime timestamps when there
+ * are users actively observing them via getattr. The primary use-case
+ * for this is NFS clients that use the ctime to distinguish between
+ * different states of the file, and that are often fooled by multiple
+ * operations that occur in the same coarse-grained timer tick.
+ */
+#define I_CTIME_QUERIED		((u32)BIT(31))
+
 static inline time64_t inode_get_ctime_sec(const struct inode *inode)
 {
 	return inode->i_ctime_sec;
@@ -1620,7 +1631,7 @@ static inline time64_t inode_get_ctime_sec(const struct inode *inode)
 
 static inline long inode_get_ctime_nsec(const struct inode *inode)
 {
-	return inode->i_ctime_nsec;
+	return inode->i_ctime_nsec & ~I_CTIME_QUERIED;
 }
 
 static inline struct timespec64 inode_get_ctime(const struct inode *inode)
@@ -1631,13 +1642,7 @@ static inline struct timespec64 inode_get_ctime(const struct inode *inode)
 	return ts;
 }
 
-static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
-						      struct timespec64 ts)
-{
-	inode->i_ctime_sec = ts.tv_sec;
-	inode->i_ctime_nsec = ts.tv_nsec;
-	return ts;
-}
+struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts);
 
 /**
  * inode_set_ctime - set the ctime in the inode
@@ -2500,6 +2505,7 @@ struct file_system_type {
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_DISALLOW_NOTIFY_PERM	16	/* Disable fanotify permission events */
 #define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
+#define FS_MGTIME		64	/* FS uses multigrain timestamps */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
 	const struct fs_parameter_spec *parameters;
@@ -2523,6 +2529,17 @@ struct file_system_type {
 
 #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
 
+/**
+ * is_mgtime: is this inode using multigrain timestamps
+ * @inode: inode to test for multigrain timestamps
+ *
+ * Return true if the inode uses multigrain timestamps, false otherwise.
+ */
+static inline bool is_mgtime(const struct inode *inode)
+{
+	return inode->i_sb->s_type->fs_flags & FS_MGTIME;
+}
+
 extern struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
 	int (*fill_super)(struct super_block *, void *, int));
@@ -3262,6 +3279,7 @@ extern void page_put_link(void *);
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
 extern void kfree_link(void *);
+void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode);
 void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
 void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
 void generic_fill_statx_atomic_writes(struct kstat *stat,
-- 
cgit v1.2.3


From 8fd3395ec9051a52828fcca2328cb50a69dea8ef Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 31 Jul 2024 11:49:04 -0400
Subject: get rid of ...lookup...fdget_rcu() family

Once upon a time, predecessors of those used to do file lookup
without bumping a refcount, provided that caller held rcu_read_lock()
across the lookup and whatever it wanted to read from the struct
file found.  When struct file allocation switched to SLAB_TYPESAFE_BY_RCU,
that stopped being feasible and these primitives started to bump the
file refcount for lookup result, requiring the caller to call fput()
afterwards.

But that turned them pointless - e.g.
	rcu_read_lock();
	file = lookup_fdget_rcu(fd);
	rcu_read_unlock();
is equivalent to
	file = fget_raw(fd);
and all callers of lookup_fdget_rcu() are of that form.  Similarly,
task_lookup_fdget_rcu() calls can be replaced with calling fget_task().
task_lookup_next_fdget_rcu() doesn't have direct counterparts, but
its callers would be happier if we replaced it with an analogue that
deals with RCU internally.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/platforms/cell/spufs/coredump.c |  4 +---
 fs/file.c                                    | 28 ++++------------------------
 fs/gfs2/glock.c                              | 12 ++----------
 fs/notify/dnotify/dnotify.c                  |  5 +----
 fs/proc/fd.c                                 | 12 +++---------
 include/linux/fdtable.h                      |  4 ----
 include/linux/file.h                         |  1 +
 kernel/bpf/task_iter.c                       |  6 +-----
 kernel/kcmp.c                                |  4 +---
 9 files changed, 14 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 18daafbe2e65..301ee7d8b7df 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -73,9 +73,7 @@ static struct spu_context *coredump_next_context(int *fd)
 		return NULL;
 	*fd = n - 1;
 
-	rcu_read_lock();
-	file = lookup_fdget_rcu(*fd);
-	rcu_read_unlock();
+	file = fget_raw(*fd);
 	if (file) {
 		ctx = SPUFS_I(file_inode(file))->i_ctx;
 		get_spu_context(ctx);
diff --git a/fs/file.c b/fs/file.c
index eb093e736972..991860ee7848 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1037,29 +1037,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
 	return file;
 }
 
-struct file *lookup_fdget_rcu(unsigned int fd)
-{
-	return __fget_files_rcu(current->files, fd, 0);
-
-}
-EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
-
-struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
-{
-	/* Must be called with rcu_read_lock held */
-	struct files_struct *files;
-	struct file *file = NULL;
-
-	task_lock(task);
-	files = task->files;
-	if (files)
-		file = __fget_files_rcu(files, fd, 0);
-	task_unlock(task);
-
-	return file;
-}
-
-struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)
 {
 	/* Must be called with rcu_read_lock held */
 	struct files_struct *files;
@@ -1069,17 +1047,19 @@ struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *
 	task_lock(task);
 	files = task->files;
 	if (files) {
+		rcu_read_lock();
 		for (; fd < files_fdtable(files)->max_fds; fd++) {
 			file = __fget_files_rcu(files, fd, 0);
 			if (file)
 				break;
 		}
+		rcu_read_unlock();
 	}
 	task_unlock(task);
 	*ret_fd = fd;
 	return file;
 }
-EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
+EXPORT_SYMBOL(fget_task_next);
 
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 269c3bc7fced..4701c4aafbf4 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -34,7 +34,6 @@
 #include <linux/lockref.h>
 #include <linux/rhashtable.h>
 #include <linux/pid_namespace.h>
-#include <linux/fdtable.h>
 #include <linux/file.h>
 
 #include "gfs2.h"
@@ -2768,25 +2767,18 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
 		i->file = NULL;
 	}
 
-	rcu_read_lock();
 	for(;; i->fd++) {
-		struct inode *inode;
-
-		i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
+		i->file = fget_task_next(i->task, &i->fd);
 		if (!i->file) {
 			i->fd = 0;
 			break;
 		}
 
-		inode = file_inode(i->file);
-		if (inode->i_sb == i->sb)
+		if (file_inode(i->file)->i_sb == i->sb)
 			break;
 
-		rcu_read_unlock();
 		fput(i->file);
-		rcu_read_lock();
 	}
-	rcu_read_unlock();
 	return i->file;
 }
 
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index d5dbef7f5c95..6004dfdfdf0f 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -16,7 +16,6 @@
 #include <linux/security.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
-#include <linux/fdtable.h>
 #include <linux/fsnotify_backend.h>
 
 static int dir_notify_enable __read_mostly = 1;
@@ -347,9 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
 		new_fsn_mark = NULL;
 	}
 
-	rcu_read_lock();
-	f = lookup_fdget_rcu(fd);
-	rcu_read_unlock();
+	f = fget_raw(fd);
 
 	/* if (f != filp) means that we lost a race and another task/thread
 	 * actually closed the fd we are still playing with before we grabbed
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 1f54a54bfb91..18d0dddc8e2f 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -116,9 +116,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
 {
 	struct file *file;
 
-	rcu_read_lock();
-	file = task_lookup_fdget_rcu(task, fd);
-	rcu_read_unlock();
+	file = fget_task(task, fd);
 	if (file) {
 		*mode = file->f_mode;
 		fput(file);
@@ -258,19 +256,17 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 	if (!dir_emit_dots(file, ctx))
 		goto out;
 
-	rcu_read_lock();
 	for (fd = ctx->pos - 2;; fd++) {
 		struct file *f;
 		struct fd_data data;
 		char name[10 + 1];
 		unsigned int len;
 
-		f = task_lookup_next_fdget_rcu(p, &fd);
+		f = fget_task_next(p, &fd);
 		ctx->pos = fd + 2LL;
 		if (!f)
 			break;
 		data.mode = f->f_mode;
-		rcu_read_unlock();
 		fput(f);
 		data.fd = fd;
 
@@ -278,11 +274,9 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 		if (!proc_fill_cache(file, ctx,
 				     name, len, instantiate, p,
 				     &data))
-			goto out;
+			break;
 		cond_resched();
-		rcu_read_lock();
 	}
-	rcu_read_unlock();
 out:
 	put_task_struct(p);
 	return 0;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index b1c5722f2b3c..e25e2cb65d30 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -92,10 +92,6 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un
 	return files_lookup_fd_raw(files, fd);
 }
 
-struct file *lookup_fdget_rcu(unsigned int fd);
-struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
-struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);
-
 static inline bool close_on_exec(unsigned int fd, const struct files_struct *files)
 {
 	return test_bit(fd, files_fdtable(files)->close_on_exec);
diff --git a/include/linux/file.h b/include/linux/file.h
index f98de143245a..ec4ad5e6a061 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -72,6 +72,7 @@ static inline void fdput(struct fd fd)
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
 extern struct file *fget_task(struct task_struct *task, unsigned int fd);
+extern struct file *fget_task_next(struct task_struct *task, unsigned int *fd);
 extern void __f_unlock_pos(struct file *);
 
 struct fd fdget(unsigned int fd);
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 02aa9db8d796..7fe602ca74a0 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -5,7 +5,6 @@
 #include <linux/namei.h>
 #include <linux/pid_namespace.h>
 #include <linux/fs.h>
-#include <linux/fdtable.h>
 #include <linux/filter.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/btf_ids.h>
@@ -286,17 +285,14 @@ again:
 			curr_fd = 0;
 	}
 
-	rcu_read_lock();
-	f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
+	f = fget_task_next(curr_task, &curr_fd);
 	if (f) {
 		/* set info->fd */
 		info->fd = curr_fd;
-		rcu_read_unlock();
 		return f;
 	}
 
 	/* the current task is done, go to the next task */
-	rcu_read_unlock();
 	put_task_struct(curr_task);
 
 	if (info->common.type == BPF_TASK_ITER_TID) {
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index b0639f21041f..2c596851f8a9 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -63,9 +63,7 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
 {
 	struct file *file;
 
-	rcu_read_lock();
-	file = task_lookup_fdget_rcu(task, idx);
-	rcu_read_unlock();
+	file = fget_task(task, idx);
 	if (file)
 		fput(file);
 
-- 
cgit v1.2.3


From cab0515211f483e392d6862021ed008f49058561 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Jun 2024 17:48:36 -0400
Subject: move close_range(2) into fs/file.c, fold __close_range() into it

	We never had callers for __close_range() except for close_range(2)
itself.  Nothing of that sort has appeared in four years and if any users
do show up, we can always separate those suckers again.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c               |  6 ++++--
 fs/open.c               | 17 -----------------
 include/linux/fdtable.h |  1 -
 3 files changed, 4 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file.c b/fs/file.c
index 8770010170c5..8e8f504782bf 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -713,7 +713,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
 }
 
 /**
- * __close_range() - Close all file descriptors in a given range.
+ * sys_close_range() - Close all file descriptors in a given range.
  *
  * @fd:     starting file descriptor to close
  * @max_fd: last file descriptor to close
@@ -721,8 +721,10 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
  *
  * This closes a range of file descriptors. All file descriptors
  * from @fd up to and including @max_fd are closed.
+ * Currently, errors to close a given file descriptor are ignored.
  */
-int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
+SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
+		unsigned int, flags)
 {
 	struct task_struct *me = current;
 	struct files_struct *cur_fds = me->files, *fds = NULL;
diff --git a/fs/open.c b/fs/open.c
index acaeb3e25c88..62dd1383d6f9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1574,23 +1574,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
 	return retval;
 }
 
-/**
- * sys_close_range() - Close all file descriptors in a given range.
- *
- * @fd:     starting file descriptor to close
- * @max_fd: last file descriptor to close
- * @flags:  reserved for future extensions
- *
- * This closes a range of file descriptors. All file descriptors
- * from @fd up to and including @max_fd are closed.
- * Currently, errors to close a given file descriptor are ignored.
- */
-SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
-		unsigned int, flags)
-{
-	return __close_range(fd, max_fd, flags);
-}
-
 /*
  * This routine simulates a hangup on the tty, to arrange that users
  * are given clean terminals at login time.
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e25e2cb65d30..c45306a9f007 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -111,7 +111,6 @@ int iterate_fd(struct files_struct *, unsigned,
 		const void *);
 
 extern int close_fd(unsigned int fd);
-extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
 extern struct file *file_close_fd(unsigned int fd);
 
 extern struct kmem_cache *files_cachep;
-- 
cgit v1.2.3


From a3f5f4c2f9b6bc2aa6f5a3e8e23b7519e4f2e3e3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 4 Oct 2024 13:47:20 +0000
Subject: ipv4: remove fib_info_devhash[]

Upcoming per-netns RTNL conversion needs to get rid
of shared hash tables.

fib_info_devhash[] is one of them.

It is unclear why we used a hash table, because
a single hlist_head per net device was cheaper and scalable.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20241004134720.579244-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../networking/net_cachelines/net_device.rst       |  1 +
 include/linux/netdevice.h                          |  3 ++
 net/ipv4/fib_semantics.c                           | 35 ++++++++++------------
 3 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 49f03cb78c6e..556711c4d3cf 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -83,6 +83,7 @@ unsigned_int                        allmulti
 bool                                uc_promisc                                                      
 unsigned_char                       nested_level                                                    
 struct_in_device*                   ip_ptr                  read_mostly         read_mostly         __in_dev_get
+struct hlist_head                   fib_nh_head
 struct_inet6_dev*                   ip6_ptr                 read_mostly         read_mostly         __in6_dev_get
 struct_vlan_info*                   vlan_info                                                       
 struct_dsa_port*                    dsa_ptr                                                         
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 49a7e7db0883..3baf8e539b6f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2211,6 +2211,9 @@ struct net_device {
 
 	/* Protocol-specific pointers */
 	struct in_device __rcu	*ip_ptr;
+	/** @fib_nh_head: nexthops associated with this netdev */
+	struct hlist_head	fib_nh_head;
+
 #if IS_ENABLED(CONFIG_VLAN_8021Q)
 	struct vlan_info __rcu	*vlan_info;
 #endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index ece779bfb8f6..d2cee5c314f5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -56,10 +56,6 @@ static unsigned int fib_info_hash_size;
 static unsigned int fib_info_hash_bits;
 static unsigned int fib_info_cnt;
 
-#define DEVINDEX_HASHBITS 8
-#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
-static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
-
 /* for_nexthops and change_nexthops only used when nexthop object
  * is not set in a fib_info. The logic within can reference fib_nh.
  */
@@ -319,12 +315,9 @@ static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
 	return 0;
 }
 
-static struct hlist_head *
-fib_info_devhash_bucket(const struct net_device *dev)
+static struct hlist_head *fib_nh_head(struct net_device *dev)
 {
-	u32 val = net_hash_mix(dev_net(dev)) ^ dev->ifindex;
-
-	return &fib_info_devhash[hash_32(val, DEVINDEX_HASHBITS)];
+	return &dev->fib_nh_head;
 }
 
 static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
@@ -435,11 +428,11 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
 	struct hlist_head *head;
 	struct fib_nh *nh;
 
-	head = fib_info_devhash_bucket(dev);
+	head = fib_nh_head(dev);
 
 	hlist_for_each_entry_rcu(nh, head, nh_hash) {
-		if (nh->fib_nh_dev == dev &&
-		    nh->fib_nh_gw4 == gw &&
+		DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+		if (nh->fib_nh_gw4 == gw &&
 		    !(nh->fib_nh_flags & RTNH_F_DEAD)) {
 			return 0;
 		}
@@ -1595,7 +1588,7 @@ link_it:
 
 			if (!nexthop_nh->fib_nh_dev)
 				continue;
-			head = fib_info_devhash_bucket(nexthop_nh->fib_nh_dev);
+			head = fib_nh_head(nexthop_nh->fib_nh_dev);
 			hlist_add_head_rcu(&nexthop_nh->nh_hash, head);
 		} endfor_nexthops(fi)
 	}
@@ -1948,12 +1941,12 @@ void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
 
 void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
 {
-	struct hlist_head *head = fib_info_devhash_bucket(dev);
+	struct hlist_head *head = fib_nh_head(dev);
 	struct fib_nh *nh;
 
 	hlist_for_each_entry(nh, head, nh_hash) {
-		if (nh->fib_nh_dev == dev)
-			fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
+		DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+		fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
 	}
 }
 
@@ -1967,7 +1960,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
  */
 int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
 {
-	struct hlist_head *head = fib_info_devhash_bucket(dev);
+	struct hlist_head *head = fib_nh_head(dev);
 	struct fib_info *prev_fi = NULL;
 	int scope = RT_SCOPE_NOWHERE;
 	struct fib_nh *nh;
@@ -1981,7 +1974,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
 		int dead;
 
 		BUG_ON(!fi->fib_nhs);
-		if (nh->fib_nh_dev != dev || fi == prev_fi)
+		DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+		if (fi == prev_fi)
 			continue;
 		prev_fi = fi;
 		dead = 0;
@@ -2131,7 +2125,7 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
 	}
 
 	prev_fi = NULL;
-	head = fib_info_devhash_bucket(dev);
+	head = fib_nh_head(dev);
 	ret = 0;
 
 	hlist_for_each_entry(nh, head, nh_hash) {
@@ -2139,7 +2133,8 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
 		int alive;
 
 		BUG_ON(!fi->fib_nhs);
-		if (nh->fib_nh_dev != dev || fi == prev_fi)
+		DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+		if (fi == prev_fi)
 			continue;
 
 		prev_fi = fi;
-- 
cgit v1.2.3


From 20a4da20e0bd0297bf1ce083362e78b6702f991e Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Fri, 4 Oct 2024 11:01:00 +0200
Subject: net: phy: Add support for PHY timing-role configuration via device
 tree

Introduce support for configuring the master/slave role of PHYs based on
the `timing-role` property in the device tree. While this functionality
is necessary for Single Pair Ethernet (SPE) PHYs (1000/100/10Base-T1)
where hardware strap pins may be unavailable or incorrectly set, it
works for any PHY type.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Divya Koppera <divya.koppera@microchip.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/phy/phy-core.c   | 33 +++++++++++++++++++++++++++++++++
 drivers/net/phy/phy_device.c |  3 +++
 include/linux/phy.h          |  1 +
 3 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index 1f98b6a96c15..4e8db12d6092 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -412,6 +412,39 @@ void of_set_phy_eee_broken(struct phy_device *phydev)
 	phydev->eee_broken_modes = broken;
 }
 
+/**
+ * of_set_phy_timing_role - Set the master/slave mode of the PHY
+ *
+ * @phydev: The phy_device struct
+ *
+ * Set master/slave configuration of the PHY based on the device tree.
+ */
+void of_set_phy_timing_role(struct phy_device *phydev)
+{
+	struct device_node *node = phydev->mdio.dev.of_node;
+	const char *master;
+
+	if (!IS_ENABLED(CONFIG_OF_MDIO))
+		return;
+
+	if (!node)
+		return;
+
+	if (of_property_read_string(node, "timing-role", &master))
+		return;
+
+	if (strcmp(master, "forced-master") == 0)
+		phydev->master_slave_set = MASTER_SLAVE_CFG_MASTER_FORCE;
+	else if (strcmp(master, "forced-slave") == 0)
+		phydev->master_slave_set = MASTER_SLAVE_CFG_SLAVE_FORCE;
+	else if (strcmp(master, "preferred-master") == 0)
+		phydev->master_slave_set = MASTER_SLAVE_CFG_MASTER_PREFERRED;
+	else if (strcmp(master, "preferred-slave") == 0)
+		phydev->master_slave_set = MASTER_SLAVE_CFG_SLAVE_PREFERRED;
+	else
+		phydev_warn(phydev, "Unknown master-slave mode %s\n", master);
+}
+
 /**
  * phy_resolve_aneg_pause - Determine pause autoneg results
  *
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 560e338b307a..4ccf504a8b2c 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -3608,6 +3608,9 @@ static int phy_probe(struct device *dev)
 	 */
 	of_set_phy_eee_broken(phydev);
 
+	/* Get master/slave strap overrides */
+	of_set_phy_timing_role(phydev);
+
 	/* The Pause Frame bits indicate that the PHY can support passing
 	 * pause frames. During autonegotiation, the PHYs will determine if
 	 * they should allow pause frames to pass.  The MAC driver should then
diff --git a/include/linux/phy.h b/include/linux/phy.h
index a98bc91a0cde..ff762a3d8270 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1260,6 +1260,7 @@ size_t phy_speeds(unsigned int *speeds, size_t size,
 		  unsigned long *mask);
 void of_set_phy_supported(struct phy_device *phydev);
 void of_set_phy_eee_broken(struct phy_device *phydev);
+void of_set_phy_timing_role(struct phy_device *phydev);
 int phy_speed_down_core(struct phy_device *phydev);
 
 /**
-- 
cgit v1.2.3


From 95397784be23f66c5d4280b0a4c4e0d1ee74f2ef Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Mon, 2 Sep 2024 10:42:31 +0200
Subject: media: staging: drop omap4iss

The omap4 camera driver has seen no progress since forever, and
now OMAP4 support has also been dropped from u-boot (1). So it is
time to retire this driver.

(1): https://lists.denx.de/pipermail/u-boot/2024-July/558846.html

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
---
 Documentation/admin-guide/media/omap4_camera.rst |   62 -
 Documentation/admin-guide/media/v4l-drivers.rst  |    1 -
 MAINTAINERS                                      |    8 -
 drivers/staging/media/Kconfig                    |    2 -
 drivers/staging/media/Makefile                   |    1 -
 drivers/staging/media/omap4iss/Kconfig           |   12 -
 drivers/staging/media/omap4iss/Makefile          |    9 -
 drivers/staging/media/omap4iss/TODO              |    3 -
 drivers/staging/media/omap4iss/iss.c             | 1354 ---------------------
 drivers/staging/media/omap4iss/iss.h             |  247 ----
 drivers/staging/media/omap4iss/iss_csi2.c        | 1379 ----------------------
 drivers/staging/media/omap4iss/iss_csi2.h        |  155 ---
 drivers/staging/media/omap4iss/iss_csiphy.c      |  277 -----
 drivers/staging/media/omap4iss/iss_csiphy.h      |   47 -
 drivers/staging/media/omap4iss/iss_ipipe.c       |  579 ---------
 drivers/staging/media/omap4iss/iss_ipipe.h       |   63 -
 drivers/staging/media/omap4iss/iss_ipipeif.c     |  844 -------------
 drivers/staging/media/omap4iss/iss_ipipeif.h     |   89 --
 drivers/staging/media/omap4iss/iss_regs.h        |  899 --------------
 drivers/staging/media/omap4iss/iss_resizer.c     |  884 --------------
 drivers/staging/media/omap4iss/iss_resizer.h     |   72 --
 drivers/staging/media/omap4iss/iss_video.c       | 1274 --------------------
 drivers/staging/media/omap4iss/iss_video.h       |  203 ----
 include/linux/platform_data/media/omap4iss.h     |   66 --
 24 files changed, 8530 deletions(-)
 delete mode 100644 Documentation/admin-guide/media/omap4_camera.rst
 delete mode 100644 drivers/staging/media/omap4iss/Kconfig
 delete mode 100644 drivers/staging/media/omap4iss/Makefile
 delete mode 100644 drivers/staging/media/omap4iss/TODO
 delete mode 100644 drivers/staging/media/omap4iss/iss.c
 delete mode 100644 drivers/staging/media/omap4iss/iss.h
 delete mode 100644 drivers/staging/media/omap4iss/iss_csi2.c
 delete mode 100644 drivers/staging/media/omap4iss/iss_csi2.h
 delete mode 100644 drivers/staging/media/omap4iss/iss_csiphy.c
 delete mode 100644 drivers/staging/media/omap4iss/iss_csiphy.h
 delete mode 100644 drivers/staging/media/omap4iss/iss_ipipe.c
 delete mode 100644 drivers/staging/media/omap4iss/iss_ipipe.h
 delete mode 100644 drivers/staging/media/omap4iss/iss_ipipeif.c
 delete mode 100644 drivers/staging/media/omap4iss/iss_ipipeif.h
 delete mode 100644 drivers/staging/media/omap4iss/iss_regs.h
 delete mode 100644 drivers/staging/media/omap4iss/iss_resizer.c
 delete mode 100644 drivers/staging/media/omap4iss/iss_resizer.h
 delete mode 100644 drivers/staging/media/omap4iss/iss_video.c
 delete mode 100644 drivers/staging/media/omap4iss/iss_video.h
 delete mode 100644 include/linux/platform_data/media/omap4iss.h

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/media/omap4_camera.rst b/Documentation/admin-guide/media/omap4_camera.rst
deleted file mode 100644
index 2ada9b1e6897..000000000000
--- a/Documentation/admin-guide/media/omap4_camera.rst
+++ /dev/null
@@ -1,62 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-OMAP4 ISS Driver
-================
-
-Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
-
-Copyright (C) 2012, Texas Instruments
-
-Introduction
-------------
-
-The OMAP44XX family of chips contains the Imaging SubSystem (a.k.a. ISS),
-Which contains several components that can be categorized in 3 big groups:
-
-- Interfaces (2 Interfaces: CSI2-A & CSI2-B/CCP2)
-- ISP (Image Signal Processor)
-- SIMCOP (Still Image Coprocessor)
-
-For more information, please look in [#f1]_ for latest version of:
-"OMAP4430 Multimedia Device Silicon Revision 2.x"
-
-As of Revision AB, the ISS is described in detail in section 8.
-
-This driver is supporting **only** the CSI2-A/B interfaces for now.
-
-It makes use of the Media Controller framework [#f2]_, and inherited most of the
-code from OMAP3 ISP driver (found under drivers/media/platform/ti/omap3isp/\*),
-except that it doesn't need an IOMMU now for ISS buffers memory mapping.
-
-Supports usage of MMAP buffers only (for now).
-
-Tested platforms
-----------------
-
-- OMAP4430SDP, w/ ES2.1 GP & SEVM4430-CAM-V1-0 (Contains IMX060 & OV5640, in
-  which only the last one is supported, outputting YUV422 frames).
-
-- TI Blaze MDP, w/ OMAP4430 ES2.2 EMU (Contains 1 IMX060 & 2 OV5650 sensors, in
-  which only the OV5650 are supported, outputting RAW10 frames).
-
-- PandaBoard, Rev. A2, w/ OMAP4430 ES2.1 GP & OV adapter board, tested with
-  following sensors:
-  * OV5640
-  * OV5650
-
-- Tested on mainline kernel:
-
-	http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=summary
-
-  Tag: v3.3 (commit c16fa4f2ad19908a47c63d8fa436a1178438c7e7)
-
-File list
----------
-drivers/staging/media/omap4iss/
-include/linux/platform_data/media/omap4iss.h
-
-References
-----------
-
-.. [#f1] http://focus.ti.com/general/docs/wtbu/wtbudocumentcenter.tsp?navigationId=12037&templateId=6123#62
-.. [#f2] http://lwn.net/Articles/420485/
diff --git a/Documentation/admin-guide/media/v4l-drivers.rst b/Documentation/admin-guide/media/v4l-drivers.rst
index b6af448b9fe9..7d9029bbf109 100644
--- a/Documentation/admin-guide/media/v4l-drivers.rst
+++ b/Documentation/admin-guide/media/v4l-drivers.rst
@@ -20,7 +20,6 @@ Video4Linux (V4L) driver-specific documentation
 	ivtv
 	mgb4
 	omap3isp
-	omap4_camera
 	philips
 	qcom_camss
 	raspberrypi-pisp-be
diff --git a/MAINTAINERS b/MAINTAINERS
index c27f3190737f..251ae8a52066 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16932,14 +16932,6 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/i2c/ti,omap4-i2c.yaml
 F:	drivers/i2c/busses/i2c-omap.c
 
-OMAP IMAGING SUBSYSTEM (OMAP3 ISP and OMAP4 ISS)
-M:	Laurent Pinchart <laurent.pinchart@ideasonboard.com>
-L:	linux-media@vger.kernel.org
-S:	Maintained
-F:	Documentation/devicetree/bindings/media/ti,omap3isp.txt
-F:	drivers/media/platform/ti/omap3isp/
-F:	drivers/staging/media/omap4iss/
-
 OMAP MMC SUPPORT
 M:	Aaro Koskinen <aaro.koskinen@iki.fi>
 L:	linux-omap@vger.kernel.org
diff --git a/drivers/staging/media/Kconfig b/drivers/staging/media/Kconfig
index 554c2e475ce3..b44214854399 100644
--- a/drivers/staging/media/Kconfig
+++ b/drivers/staging/media/Kconfig
@@ -32,8 +32,6 @@ source "drivers/staging/media/max96712/Kconfig"
 
 source "drivers/staging/media/meson/vdec/Kconfig"
 
-source "drivers/staging/media/omap4iss/Kconfig"
-
 source "drivers/staging/media/rkvdec/Kconfig"
 
 source "drivers/staging/media/starfive/Kconfig"
diff --git a/drivers/staging/media/Makefile b/drivers/staging/media/Makefile
index dcaeeca0ee6d..ad4e9619a9e0 100644
--- a/drivers/staging/media/Makefile
+++ b/drivers/staging/media/Makefile
@@ -4,7 +4,6 @@ obj-$(CONFIG_INTEL_ATOMISP)     += atomisp/
 obj-$(CONFIG_VIDEO_IMX_MEDIA)	+= imx/
 obj-$(CONFIG_VIDEO_MAX96712)	+= max96712/
 obj-$(CONFIG_VIDEO_MESON_VDEC)	+= meson/vdec/
-obj-$(CONFIG_VIDEO_OMAP4)	+= omap4iss/
 obj-$(CONFIG_VIDEO_ROCKCHIP_VDEC)	+= rkvdec/
 obj-$(CONFIG_VIDEO_STARFIVE_CAMSS)	+= starfive/
 obj-$(CONFIG_VIDEO_SUNXI)	+= sunxi/
diff --git a/drivers/staging/media/omap4iss/Kconfig b/drivers/staging/media/omap4iss/Kconfig
deleted file mode 100644
index 6d1f55b09132..000000000000
--- a/drivers/staging/media/omap4iss/Kconfig
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-config VIDEO_OMAP4
-	tristate "OMAP 4 Camera support"
-	depends on VIDEO_DEV  && I2C
-	depends on ARCH_OMAP4 || COMPILE_TEST
-	select MEDIA_CONTROLLER
-	select VIDEO_V4L2_SUBDEV_API
-	select MFD_SYSCON
-	select VIDEOBUF2_DMA_CONTIG
-	help
-	  Driver for an OMAP 4 ISS controller.
diff --git a/drivers/staging/media/omap4iss/Makefile b/drivers/staging/media/omap4iss/Makefile
deleted file mode 100644
index e64d489a4a76..000000000000
--- a/drivers/staging/media/omap4iss/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for OMAP4 ISS driver
-#
-
-omap4-iss-objs += \
-	iss.o iss_csi2.o iss_csiphy.o iss_ipipeif.o iss_ipipe.o iss_resizer.o iss_video.o
-
-obj-$(CONFIG_VIDEO_OMAP4) += omap4-iss.o
diff --git a/drivers/staging/media/omap4iss/TODO b/drivers/staging/media/omap4iss/TODO
deleted file mode 100644
index 4d220ef82653..000000000000
--- a/drivers/staging/media/omap4iss/TODO
+++ /dev/null
@@ -1,3 +0,0 @@
-* Fix FIFO/buffer overflows and underflows
-* Replace dummy resizer code with a real implementation
-* Fix checkpatch errors and warnings
diff --git a/drivers/staging/media/omap4iss/iss.c b/drivers/staging/media/omap4iss/iss.c
deleted file mode 100644
index 0c4283bb48ad..000000000000
--- a/drivers/staging/media/omap4iss/iss.c
+++ /dev/null
@@ -1,1354 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * TI OMAP4 ISS V4L2 Driver
- *
- * Copyright (C) 2012, Texas Instruments
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#include <linux/clk.h>
-#include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/dma-mapping.h>
-#include <linux/i2c.h>
-#include <linux/interrupt.h>
-#include <linux/mfd/syscon.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/vmalloc.h>
-
-#include <media/v4l2-common.h>
-#include <media/v4l2-device.h>
-#include <media/v4l2-ctrls.h>
-
-#include "iss.h"
-#include "iss_regs.h"
-
-#define ISS_PRINT_REGISTER(iss, name)\
-	dev_dbg(iss->dev, "###ISS " #name "=0x%08x\n", \
-		iss_reg_read(iss, OMAP4_ISS_MEM_TOP, ISS_##name))
-
-static void iss_print_status(struct iss_device *iss)
-{
-	dev_dbg(iss->dev, "-------------ISS HL Register dump-------------\n");
-
-	ISS_PRINT_REGISTER(iss, HL_REVISION);
-	ISS_PRINT_REGISTER(iss, HL_SYSCONFIG);
-	ISS_PRINT_REGISTER(iss, HL_IRQSTATUS(5));
-	ISS_PRINT_REGISTER(iss, HL_IRQENABLE_SET(5));
-	ISS_PRINT_REGISTER(iss, HL_IRQENABLE_CLR(5));
-	ISS_PRINT_REGISTER(iss, CTRL);
-	ISS_PRINT_REGISTER(iss, CLKCTRL);
-	ISS_PRINT_REGISTER(iss, CLKSTAT);
-
-	dev_dbg(iss->dev, "-----------------------------------------------\n");
-}
-
-/*
- * omap4iss_flush - Post pending L3 bus writes by doing a register readback
- * @iss: OMAP4 ISS device
- *
- * In order to force posting of pending writes, we need to write and
- * readback the same register, in this case the revision register.
- *
- * See this link for reference:
- *   https://www.mail-archive.com/linux-omap@vger.kernel.org/msg08149.html
- */
-static void omap4iss_flush(struct iss_device *iss)
-{
-	iss_reg_write(iss, OMAP4_ISS_MEM_TOP, ISS_HL_REVISION, 0);
-	iss_reg_read(iss, OMAP4_ISS_MEM_TOP, ISS_HL_REVISION);
-}
-
-/*
- * iss_isp_enable_interrupts - Enable ISS ISP interrupts.
- * @iss: OMAP4 ISS device
- */
-static void omap4iss_isp_enable_interrupts(struct iss_device *iss)
-{
-	static const u32 isp_irq = ISP5_IRQ_OCP_ERR |
-				   ISP5_IRQ_RSZ_FIFO_IN_BLK_ERR |
-				   ISP5_IRQ_RSZ_FIFO_OVF |
-				   ISP5_IRQ_RSZ_INT_DMA |
-				   ISP5_IRQ_ISIF_INT(0);
-
-	/* Enable ISP interrupts */
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_IRQSTATUS(0), isp_irq);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_IRQENABLE_SET(0),
-		      isp_irq);
-}
-
-/*
- * iss_isp_disable_interrupts - Disable ISS interrupts.
- * @iss: OMAP4 ISS device
- */
-static void omap4iss_isp_disable_interrupts(struct iss_device *iss)
-{
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_IRQENABLE_CLR(0), ~0);
-}
-
-/*
- * iss_enable_interrupts - Enable ISS interrupts.
- * @iss: OMAP4 ISS device
- */
-static void iss_enable_interrupts(struct iss_device *iss)
-{
-	static const u32 hl_irq = ISS_HL_IRQ_CSIA | ISS_HL_IRQ_CSIB
-				| ISS_HL_IRQ_ISP(0);
-
-	/* Enable HL interrupts */
-	iss_reg_write(iss, OMAP4_ISS_MEM_TOP, ISS_HL_IRQSTATUS(5), hl_irq);
-	iss_reg_write(iss, OMAP4_ISS_MEM_TOP, ISS_HL_IRQENABLE_SET(5), hl_irq);
-
-	if (iss->regs[OMAP4_ISS_MEM_ISP_SYS1])
-		omap4iss_isp_enable_interrupts(iss);
-}
-
-/*
- * iss_disable_interrupts - Disable ISS interrupts.
- * @iss: OMAP4 ISS device
- */
-static void iss_disable_interrupts(struct iss_device *iss)
-{
-	if (iss->regs[OMAP4_ISS_MEM_ISP_SYS1])
-		omap4iss_isp_disable_interrupts(iss);
-
-	iss_reg_write(iss, OMAP4_ISS_MEM_TOP, ISS_HL_IRQENABLE_CLR(5), ~0);
-}
-
-int omap4iss_get_external_info(struct iss_pipeline *pipe,
-			       struct media_link *link)
-{
-	struct iss_device *iss =
-		container_of(pipe, struct iss_video, pipe)->iss;
-	struct v4l2_subdev_format fmt;
-	struct v4l2_ctrl *ctrl;
-	int ret;
-
-	if (!pipe->external)
-		return 0;
-
-	if (pipe->external_rate)
-		return 0;
-
-	memset(&fmt, 0, sizeof(fmt));
-
-	fmt.pad = link->source->index;
-	fmt.which = V4L2_SUBDEV_FORMAT_ACTIVE;
-	ret = v4l2_subdev_call(media_entity_to_v4l2_subdev(link->sink->entity),
-			       pad, get_fmt, NULL, &fmt);
-	if (ret < 0)
-		return -EPIPE;
-
-	pipe->external_bpp = omap4iss_video_format_info(fmt.format.code)->bpp;
-
-	ctrl = v4l2_ctrl_find(pipe->external->ctrl_handler,
-			      V4L2_CID_PIXEL_RATE);
-	if (!ctrl) {
-		dev_warn(iss->dev, "no pixel rate control in subdev %s\n",
-			 pipe->external->name);
-		return -EPIPE;
-	}
-
-	pipe->external_rate = v4l2_ctrl_g_ctrl_int64(ctrl);
-
-	return 0;
-}
-
-/*
- * Configure the bridge. Valid inputs are
- *
- * IPIPEIF_INPUT_CSI2A: CSI2a receiver
- * IPIPEIF_INPUT_CSI2B: CSI2b receiver
- *
- * The bridge and lane shifter are configured according to the selected input
- * and the ISP platform data.
- */
-void omap4iss_configure_bridge(struct iss_device *iss,
-			       enum ipipeif_input_entity input)
-{
-	u32 issctrl_val;
-	u32 isp5ctrl_val;
-
-	issctrl_val = iss_reg_read(iss, OMAP4_ISS_MEM_TOP, ISS_CTRL);
-	issctrl_val &= ~ISS_CTRL_INPUT_SEL_MASK;
-	issctrl_val &= ~ISS_CTRL_CLK_DIV_MASK;
-
-	isp5ctrl_val = iss_reg_read(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_CTRL);
-
-	switch (input) {
-	case IPIPEIF_INPUT_CSI2A:
-		issctrl_val |= ISS_CTRL_INPUT_SEL_CSI2A;
-		break;
-
-	case IPIPEIF_INPUT_CSI2B:
-		issctrl_val |= ISS_CTRL_INPUT_SEL_CSI2B;
-		break;
-
-	default:
-		return;
-	}
-
-	issctrl_val |= ISS_CTRL_SYNC_DETECT_VS_RAISING;
-
-	isp5ctrl_val |= ISP5_CTRL_VD_PULSE_EXT | ISP5_CTRL_PSYNC_CLK_SEL |
-			ISP5_CTRL_SYNC_ENABLE;
-
-	iss_reg_write(iss, OMAP4_ISS_MEM_TOP, ISS_CTRL, issctrl_val);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_CTRL, isp5ctrl_val);
-}
-
-#ifdef ISS_ISR_DEBUG
-static void iss_isr_dbg(struct iss_device *iss, u32 irqstatus)
-{
-	static const char * const name[] = {
-		"ISP_0",
-		"ISP_1",
-		"ISP_2",
-		"ISP_3",
-		"CSIA",
-		"CSIB",
-		"CCP2_0",
-		"CCP2_1",
-		"CCP2_2",
-		"CCP2_3",
-		"CBUFF",
-		"BTE",
-		"SIMCOP_0",
-		"SIMCOP_1",
-		"SIMCOP_2",
-		"SIMCOP_3",
-		"CCP2_8",
-		"HS_VS",
-		"18",
-		"19",
-		"20",
-		"21",
-		"22",
-		"23",
-		"24",
-		"25",
-		"26",
-		"27",
-		"28",
-		"29",
-		"30",
-		"31",
-	};
-	unsigned int i;
-
-	dev_dbg(iss->dev, "ISS IRQ: ");
-
-	for (i = 0; i < ARRAY_SIZE(name); i++) {
-		if ((1 << i) & irqstatus)
-			pr_cont("%s ", name[i]);
-	}
-	pr_cont("\n");
-}
-
-static void iss_isp_isr_dbg(struct iss_device *iss, u32 irqstatus)
-{
-	static const char * const name[] = {
-		"ISIF_0",
-		"ISIF_1",
-		"ISIF_2",
-		"ISIF_3",
-		"IPIPEREQ",
-		"IPIPELAST_PIX",
-		"IPIPEDMA",
-		"IPIPEBSC",
-		"IPIPEHST",
-		"IPIPEIF",
-		"AEW",
-		"AF",
-		"H3A",
-		"RSZ_REG",
-		"RSZ_LAST_PIX",
-		"RSZ_DMA",
-		"RSZ_CYC_RZA",
-		"RSZ_CYC_RZB",
-		"RSZ_FIFO_OVF",
-		"RSZ_FIFO_IN_BLK_ERR",
-		"20",
-		"21",
-		"RSZ_EOF0",
-		"RSZ_EOF1",
-		"H3A_EOF",
-		"IPIPE_EOF",
-		"26",
-		"IPIPE_DPC_INI",
-		"IPIPE_DPC_RNEW0",
-		"IPIPE_DPC_RNEW1",
-		"30",
-		"OCP_ERR",
-	};
-	unsigned int i;
-
-	dev_dbg(iss->dev, "ISP IRQ: ");
-
-	for (i = 0; i < ARRAY_SIZE(name); i++) {
-		if ((1 << i) & irqstatus)
-			pr_cont("%s ", name[i]);
-	}
-	pr_cont("\n");
-}
-#endif
-
-/*
- * iss_isr - Interrupt Service Routine for ISS module.
- * @irq: Not used currently.
- * @_iss: Pointer to the OMAP4 ISS device
- *
- * Handles the corresponding callback if plugged in.
- *
- * Returns IRQ_HANDLED when IRQ was correctly handled, or IRQ_NONE when the
- * IRQ wasn't handled.
- */
-static irqreturn_t iss_isr(int irq, void *_iss)
-{
-	static const u32 ipipeif_events = ISP5_IRQ_IPIPEIF_IRQ |
-					  ISP5_IRQ_ISIF_INT(0);
-	static const u32 resizer_events = ISP5_IRQ_RSZ_FIFO_IN_BLK_ERR |
-					  ISP5_IRQ_RSZ_FIFO_OVF |
-					  ISP5_IRQ_RSZ_INT_DMA;
-	struct iss_device *iss = _iss;
-	u32 irqstatus;
-
-	irqstatus = iss_reg_read(iss, OMAP4_ISS_MEM_TOP, ISS_HL_IRQSTATUS(5));
-	iss_reg_write(iss, OMAP4_ISS_MEM_TOP, ISS_HL_IRQSTATUS(5), irqstatus);
-
-	if (irqstatus & ISS_HL_IRQ_CSIA)
-		omap4iss_csi2_isr(&iss->csi2a);
-
-	if (irqstatus & ISS_HL_IRQ_CSIB)
-		omap4iss_csi2_isr(&iss->csi2b);
-
-	if (irqstatus & ISS_HL_IRQ_ISP(0)) {
-		u32 isp_irqstatus = iss_reg_read(iss, OMAP4_ISS_MEM_ISP_SYS1,
-						 ISP5_IRQSTATUS(0));
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_IRQSTATUS(0),
-			      isp_irqstatus);
-
-		if (isp_irqstatus & ISP5_IRQ_OCP_ERR)
-			dev_dbg(iss->dev, "ISP5 OCP Error!\n");
-
-		if (isp_irqstatus & ipipeif_events) {
-			omap4iss_ipipeif_isr(&iss->ipipeif,
-					     isp_irqstatus & ipipeif_events);
-		}
-
-		if (isp_irqstatus & resizer_events)
-			omap4iss_resizer_isr(&iss->resizer,
-					     isp_irqstatus & resizer_events);
-
-#ifdef ISS_ISR_DEBUG
-		iss_isp_isr_dbg(iss, isp_irqstatus);
-#endif
-	}
-
-	omap4iss_flush(iss);
-
-#ifdef ISS_ISR_DEBUG
-	iss_isr_dbg(iss, irqstatus);
-#endif
-
-	return IRQ_HANDLED;
-}
-
-static const struct media_device_ops iss_media_ops = {
-	.link_notify = v4l2_pipeline_link_notify,
-};
-
-/* -----------------------------------------------------------------------------
- * Pipeline stream management
- */
-
-/*
- * iss_pipeline_disable - Disable streaming on a pipeline
- * @pipe: ISS pipeline
- * @until: entity at which to stop pipeline walk
- *
- * Walk the entities chain starting at the pipeline output video node and stop
- * all modules in the chain. Wait synchronously for the modules to be stopped if
- * necessary.
- *
- * If the until argument isn't NULL, stop the pipeline walk when reaching the
- * until entity. This is used to disable a partially started pipeline due to a
- * subdev start error.
- */
-static int iss_pipeline_disable(struct iss_pipeline *pipe,
-				struct media_entity *until)
-{
-	struct iss_device *iss = pipe->output->iss;
-	struct media_entity *entity;
-	struct media_pad *pad;
-	struct v4l2_subdev *subdev;
-	int failure = 0;
-	int ret;
-
-	entity = &pipe->output->video.entity;
-	while (1) {
-		pad = &entity->pads[0];
-		if (!(pad->flags & MEDIA_PAD_FL_SINK))
-			break;
-
-		pad = media_pad_remote_pad_first(pad);
-		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
-			break;
-
-		entity = pad->entity;
-		if (entity == until)
-			break;
-
-		subdev = media_entity_to_v4l2_subdev(entity);
-		ret = v4l2_subdev_call(subdev, video, s_stream, 0);
-		if (ret < 0) {
-			dev_warn(iss->dev, "%s: module stop timeout.\n",
-				 subdev->name);
-			/* If the entity failed to stopped, assume it has
-			 * crashed. Mark it as such, the ISS will be reset when
-			 * applications will release it.
-			 */
-			media_entity_enum_set(&iss->crashed, &subdev->entity);
-			failure = -ETIMEDOUT;
-		}
-	}
-
-	return failure;
-}
-
-/*
- * iss_pipeline_enable - Enable streaming on a pipeline
- * @pipe: ISS pipeline
- * @mode: Stream mode (single shot or continuous)
- *
- * Walk the entities chain starting at the pipeline output video node and start
- * all modules in the chain in the given mode.
- *
- * Return 0 if successful, or the return value of the failed video::s_stream
- * operation otherwise.
- */
-static int iss_pipeline_enable(struct iss_pipeline *pipe,
-			       enum iss_pipeline_stream_state mode)
-{
-	struct iss_device *iss = pipe->output->iss;
-	struct media_entity *entity;
-	struct media_pad *pad;
-	struct v4l2_subdev *subdev;
-	unsigned long flags;
-	int ret;
-
-	/* If one of the entities in the pipeline has crashed it will not work
-	 * properly. Refuse to start streaming in that case. This check must be
-	 * performed before the loop below to avoid starting entities if the
-	 * pipeline won't start anyway (those entities would then likely fail to
-	 * stop, making the problem worse).
-	 */
-	if (media_entity_enum_intersects(&pipe->ent_enum, &iss->crashed))
-		return -EIO;
-
-	spin_lock_irqsave(&pipe->lock, flags);
-	pipe->state &= ~(ISS_PIPELINE_IDLE_INPUT | ISS_PIPELINE_IDLE_OUTPUT);
-	spin_unlock_irqrestore(&pipe->lock, flags);
-
-	pipe->do_propagation = false;
-
-	mutex_lock(&iss->media_dev.graph_mutex);
-
-	entity = &pipe->output->video.entity;
-	while (1) {
-		pad = &entity->pads[0];
-		if (!(pad->flags & MEDIA_PAD_FL_SINK))
-			break;
-
-		pad = media_pad_remote_pad_first(pad);
-		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
-			break;
-
-		entity = pad->entity;
-		subdev = media_entity_to_v4l2_subdev(entity);
-
-		ret = v4l2_subdev_call(subdev, video, s_stream, mode);
-		if (ret < 0 && ret != -ENOIOCTLCMD) {
-			iss_pipeline_disable(pipe, entity);
-			mutex_unlock(&iss->media_dev.graph_mutex);
-			return ret;
-		}
-
-		if (subdev == &iss->csi2a.subdev ||
-		    subdev == &iss->csi2b.subdev)
-			pipe->do_propagation = true;
-	}
-
-	mutex_unlock(&iss->media_dev.graph_mutex);
-	iss_print_status(pipe->output->iss);
-
-	return 0;
-}
-
-/*
- * omap4iss_pipeline_set_stream - Enable/disable streaming on a pipeline
- * @pipe: ISS pipeline
- * @state: Stream state (stopped, single shot or continuous)
- *
- * Set the pipeline to the given stream state. Pipelines can be started in
- * single-shot or continuous mode.
- *
- * Return 0 if successful, or the return value of the failed video::s_stream
- * operation otherwise. The pipeline state is not updated when the operation
- * fails, except when stopping the pipeline.
- */
-int omap4iss_pipeline_set_stream(struct iss_pipeline *pipe,
-				 enum iss_pipeline_stream_state state)
-{
-	int ret;
-
-	if (state == ISS_PIPELINE_STREAM_STOPPED)
-		ret = iss_pipeline_disable(pipe, NULL);
-	else
-		ret = iss_pipeline_enable(pipe, state);
-
-	if (ret == 0 || state == ISS_PIPELINE_STREAM_STOPPED)
-		pipe->stream_state = state;
-
-	return ret;
-}
-
-/*
- * omap4iss_pipeline_cancel_stream - Cancel stream on a pipeline
- * @pipe: ISS pipeline
- *
- * Cancelling a stream mark all buffers on all video nodes in the pipeline as
- * erroneous and makes sure no new buffer can be queued. This function is called
- * when a fatal error that prevents any further operation on the pipeline
- * occurs.
- */
-void omap4iss_pipeline_cancel_stream(struct iss_pipeline *pipe)
-{
-	if (pipe->input)
-		omap4iss_video_cancel_stream(pipe->input);
-	if (pipe->output)
-		omap4iss_video_cancel_stream(pipe->output);
-}
-
-/*
- * iss_pipeline_is_last - Verify if entity has an enabled link to the output
- *			  video node
- * @me: ISS module's media entity
- *
- * Returns 1 if the entity has an enabled link to the output video node or 0
- * otherwise. It's true only while pipeline can have no more than one output
- * node.
- */
-static int iss_pipeline_is_last(struct media_entity *me)
-{
-	struct iss_pipeline *pipe;
-	struct media_pad *pad;
-
-	pipe = to_iss_pipeline(me);
-	if (!pipe || pipe->stream_state == ISS_PIPELINE_STREAM_STOPPED)
-		return 0;
-	pad = media_pad_remote_pad_first(&pipe->output->pad);
-	return pad->entity == me;
-}
-
-static int iss_reset(struct iss_device *iss)
-{
-	unsigned int timeout;
-
-	iss_reg_set(iss, OMAP4_ISS_MEM_TOP, ISS_HL_SYSCONFIG,
-		    ISS_HL_SYSCONFIG_SOFTRESET);
-
-	timeout = iss_poll_condition_timeout(
-		!(iss_reg_read(iss, OMAP4_ISS_MEM_TOP, ISS_HL_SYSCONFIG) &
-		ISS_HL_SYSCONFIG_SOFTRESET), 1000, 10, 100);
-	if (timeout) {
-		dev_err(iss->dev, "ISS reset timeout\n");
-		return -ETIMEDOUT;
-	}
-
-	media_entity_enum_zero(&iss->crashed);
-
-	return 0;
-}
-
-static int iss_isp_reset(struct iss_device *iss)
-{
-	unsigned int timeout;
-
-	/* Fist, ensure that the ISP is IDLE (no transactions happening) */
-	iss_reg_update(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_SYSCONFIG,
-		       ISP5_SYSCONFIG_STANDBYMODE_MASK,
-		       ISP5_SYSCONFIG_STANDBYMODE_SMART);
-
-	iss_reg_set(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_CTRL, ISP5_CTRL_MSTANDBY);
-
-	timeout = iss_poll_condition_timeout(
-		iss_reg_read(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_CTRL) &
-		ISP5_CTRL_MSTANDBY_WAIT, 1000000, 1000, 1500);
-	if (timeout) {
-		dev_err(iss->dev, "ISP5 standby timeout\n");
-		return -ETIMEDOUT;
-	}
-
-	/* Now finally, do the reset */
-	iss_reg_set(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_SYSCONFIG,
-		    ISP5_SYSCONFIG_SOFTRESET);
-
-	timeout = iss_poll_condition_timeout(
-		!(iss_reg_read(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_SYSCONFIG) &
-		ISP5_SYSCONFIG_SOFTRESET), 1000000, 1000, 1500);
-	if (timeout) {
-		dev_err(iss->dev, "ISP5 reset timeout\n");
-		return -ETIMEDOUT;
-	}
-
-	return 0;
-}
-
-/*
- * iss_module_sync_idle - Helper to sync module with its idle state
- * @me: ISS submodule's media entity
- * @wait: ISS submodule's wait queue for streamoff/interrupt synchronization
- * @stopping: flag which tells module wants to stop
- *
- * This function checks if ISS submodule needs to wait for next interrupt. If
- * yes, makes the caller to sleep while waiting for such event.
- */
-int omap4iss_module_sync_idle(struct media_entity *me, wait_queue_head_t *wait,
-			      atomic_t *stopping)
-{
-	struct iss_pipeline *pipe = to_iss_pipeline(me);
-	struct iss_video *video = pipe->output;
-	unsigned long flags;
-
-	if (pipe->stream_state == ISS_PIPELINE_STREAM_STOPPED ||
-	    (pipe->stream_state == ISS_PIPELINE_STREAM_SINGLESHOT &&
-	     !iss_pipeline_ready(pipe)))
-		return 0;
-
-	/*
-	 * atomic_set() doesn't include memory barrier on ARM platform for SMP
-	 * scenario. We'll call it here to avoid race conditions.
-	 */
-	atomic_set(stopping, 1);
-	smp_wmb();
-
-	/*
-	 * If module is the last one, it's writing to memory. In this case,
-	 * it's necessary to check if the module is already paused due to
-	 * DMA queue underrun or if it has to wait for next interrupt to be
-	 * idle.
-	 * If it isn't the last one, the function won't sleep but *stopping
-	 * will still be set to warn next submodule caller's interrupt the
-	 * module wants to be idle.
-	 */
-	if (!iss_pipeline_is_last(me))
-		return 0;
-
-	spin_lock_irqsave(&video->qlock, flags);
-	if (video->dmaqueue_flags & ISS_VIDEO_DMAQUEUE_UNDERRUN) {
-		spin_unlock_irqrestore(&video->qlock, flags);
-		atomic_set(stopping, 0);
-		smp_wmb();
-		return 0;
-	}
-	spin_unlock_irqrestore(&video->qlock, flags);
-	if (!wait_event_timeout(*wait, !atomic_read(stopping),
-				msecs_to_jiffies(1000))) {
-		atomic_set(stopping, 0);
-		smp_wmb();
-		return -ETIMEDOUT;
-	}
-
-	return 0;
-}
-
-/*
- * omap4iss_module_sync_is_stopped - Helper to verify if module was stopping
- * @wait: ISS submodule's wait queue for streamoff/interrupt synchronization
- * @stopping: flag which tells module wants to stop
- *
- * This function checks if ISS submodule was stopping. In case of yes, it
- * notices the caller by setting stopping to 0 and waking up the wait queue.
- * Returns 1 if it was stopping or 0 otherwise.
- */
-int omap4iss_module_sync_is_stopping(wait_queue_head_t *wait,
-				     atomic_t *stopping)
-{
-	if (atomic_cmpxchg(stopping, 1, 0)) {
-		wake_up(wait);
-		return 1;
-	}
-
-	return 0;
-}
-
-/* --------------------------------------------------------------------------
- * Clock management
- */
-
-#define ISS_CLKCTRL_MASK	(ISS_CLKCTRL_CSI2_A |\
-				 ISS_CLKCTRL_CSI2_B |\
-				 ISS_CLKCTRL_ISP)
-
-static int __iss_subclk_update(struct iss_device *iss)
-{
-	u32 clk = 0;
-	int ret = 0, timeout = 1000;
-
-	if (iss->subclk_resources & OMAP4_ISS_SUBCLK_CSI2_A)
-		clk |= ISS_CLKCTRL_CSI2_A;
-
-	if (iss->subclk_resources & OMAP4_ISS_SUBCLK_CSI2_B)
-		clk |= ISS_CLKCTRL_CSI2_B;
-
-	if (iss->subclk_resources & OMAP4_ISS_SUBCLK_ISP)
-		clk |= ISS_CLKCTRL_ISP;
-
-	iss_reg_update(iss, OMAP4_ISS_MEM_TOP, ISS_CLKCTRL,
-		       ISS_CLKCTRL_MASK, clk);
-
-	/* Wait for HW assertion */
-	while (--timeout > 0) {
-		udelay(1);
-		if ((iss_reg_read(iss, OMAP4_ISS_MEM_TOP, ISS_CLKSTAT) &
-		    ISS_CLKCTRL_MASK) == clk)
-			break;
-	}
-
-	if (!timeout)
-		ret = -EBUSY;
-
-	return ret;
-}
-
-int omap4iss_subclk_enable(struct iss_device *iss,
-			   enum iss_subclk_resource res)
-{
-	iss->subclk_resources |= res;
-
-	return __iss_subclk_update(iss);
-}
-
-int omap4iss_subclk_disable(struct iss_device *iss,
-			    enum iss_subclk_resource res)
-{
-	iss->subclk_resources &= ~res;
-
-	return __iss_subclk_update(iss);
-}
-
-#define ISS_ISP5_CLKCTRL_MASK	(ISP5_CTRL_BL_CLK_ENABLE |\
-				 ISP5_CTRL_ISIF_CLK_ENABLE |\
-				 ISP5_CTRL_H3A_CLK_ENABLE |\
-				 ISP5_CTRL_RSZ_CLK_ENABLE |\
-				 ISP5_CTRL_IPIPE_CLK_ENABLE |\
-				 ISP5_CTRL_IPIPEIF_CLK_ENABLE)
-
-static void __iss_isp_subclk_update(struct iss_device *iss)
-{
-	u32 clk = 0;
-
-	if (iss->isp_subclk_resources & OMAP4_ISS_ISP_SUBCLK_ISIF)
-		clk |= ISP5_CTRL_ISIF_CLK_ENABLE;
-
-	if (iss->isp_subclk_resources & OMAP4_ISS_ISP_SUBCLK_H3A)
-		clk |= ISP5_CTRL_H3A_CLK_ENABLE;
-
-	if (iss->isp_subclk_resources & OMAP4_ISS_ISP_SUBCLK_RSZ)
-		clk |= ISP5_CTRL_RSZ_CLK_ENABLE;
-
-	if (iss->isp_subclk_resources & OMAP4_ISS_ISP_SUBCLK_IPIPE)
-		clk |= ISP5_CTRL_IPIPE_CLK_ENABLE;
-
-	if (iss->isp_subclk_resources & OMAP4_ISS_ISP_SUBCLK_IPIPEIF)
-		clk |= ISP5_CTRL_IPIPEIF_CLK_ENABLE;
-
-	if (clk)
-		clk |= ISP5_CTRL_BL_CLK_ENABLE;
-
-	iss_reg_update(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_CTRL,
-		       ISS_ISP5_CLKCTRL_MASK, clk);
-}
-
-void omap4iss_isp_subclk_enable(struct iss_device *iss,
-				enum iss_isp_subclk_resource res)
-{
-	iss->isp_subclk_resources |= res;
-
-	__iss_isp_subclk_update(iss);
-}
-
-void omap4iss_isp_subclk_disable(struct iss_device *iss,
-				 enum iss_isp_subclk_resource res)
-{
-	iss->isp_subclk_resources &= ~res;
-
-	__iss_isp_subclk_update(iss);
-}
-
-/*
- * iss_enable_clocks - Enable ISS clocks
- * @iss: OMAP4 ISS device
- *
- * Return 0 if successful, or clk_enable return value if any of tthem fails.
- */
-static int iss_enable_clocks(struct iss_device *iss)
-{
-	int ret;
-
-	ret = clk_enable(iss->iss_fck);
-	if (ret) {
-		dev_err(iss->dev, "clk_enable iss_fck failed\n");
-		return ret;
-	}
-
-	ret = clk_enable(iss->iss_ctrlclk);
-	if (ret) {
-		dev_err(iss->dev, "clk_enable iss_ctrlclk failed\n");
-		clk_disable(iss->iss_fck);
-		return ret;
-	}
-
-	return 0;
-}
-
-/*
- * iss_disable_clocks - Disable ISS clocks
- * @iss: OMAP4 ISS device
- */
-static void iss_disable_clocks(struct iss_device *iss)
-{
-	clk_disable(iss->iss_ctrlclk);
-	clk_disable(iss->iss_fck);
-}
-
-static int iss_get_clocks(struct iss_device *iss)
-{
-	iss->iss_fck = devm_clk_get(iss->dev, "iss_fck");
-	if (IS_ERR(iss->iss_fck)) {
-		dev_err(iss->dev, "Unable to get iss_fck clock info\n");
-		return PTR_ERR(iss->iss_fck);
-	}
-
-	iss->iss_ctrlclk = devm_clk_get(iss->dev, "iss_ctrlclk");
-	if (IS_ERR(iss->iss_ctrlclk)) {
-		dev_err(iss->dev, "Unable to get iss_ctrlclk clock info\n");
-		return PTR_ERR(iss->iss_ctrlclk);
-	}
-
-	return 0;
-}
-
-/*
- * omap4iss_get - Acquire the ISS resource.
- *
- * Initializes the clocks for the first acquire.
- *
- * Increment the reference count on the ISS. If the first reference is taken,
- * enable clocks and power-up all submodules.
- *
- * Return a pointer to the ISS device structure, or NULL if an error occurred.
- */
-struct iss_device *omap4iss_get(struct iss_device *iss)
-{
-	struct iss_device *__iss = iss;
-
-	if (!iss)
-		return NULL;
-
-	mutex_lock(&iss->iss_mutex);
-	if (iss->ref_count > 0)
-		goto out;
-
-	if (iss_enable_clocks(iss) < 0) {
-		__iss = NULL;
-		goto out;
-	}
-
-	iss_enable_interrupts(iss);
-
-out:
-	if (__iss)
-		iss->ref_count++;
-	mutex_unlock(&iss->iss_mutex);
-
-	return __iss;
-}
-
-/*
- * omap4iss_put - Release the ISS
- *
- * Decrement the reference count on the ISS. If the last reference is released,
- * power-down all submodules, disable clocks and free temporary buffers.
- */
-void omap4iss_put(struct iss_device *iss)
-{
-	if (!iss)
-		return;
-
-	mutex_lock(&iss->iss_mutex);
-	WARN_ON(iss->ref_count == 0);
-	if (--iss->ref_count == 0) {
-		iss_disable_interrupts(iss);
-		/* Reset the ISS if an entity has failed to stop. This is the
-		 * only way to recover from such conditions, although it would
-		 * be worth investigating whether resetting the ISP only can't
-		 * fix the problem in some cases.
-		 */
-		if (!media_entity_enum_empty(&iss->crashed))
-			iss_reset(iss);
-		iss_disable_clocks(iss);
-	}
-	mutex_unlock(&iss->iss_mutex);
-}
-
-static int iss_map_mem_resource(struct platform_device *pdev,
-				struct iss_device *iss,
-				enum iss_mem_resources res)
-{
-	iss->regs[res] = devm_platform_ioremap_resource(pdev, res);
-
-	return PTR_ERR_OR_ZERO(iss->regs[res]);
-}
-
-static void iss_unregister_entities(struct iss_device *iss)
-{
-	omap4iss_resizer_unregister_entities(&iss->resizer);
-	omap4iss_ipipe_unregister_entities(&iss->ipipe);
-	omap4iss_ipipeif_unregister_entities(&iss->ipipeif);
-	omap4iss_csi2_unregister_entities(&iss->csi2a);
-	omap4iss_csi2_unregister_entities(&iss->csi2b);
-
-	v4l2_device_unregister(&iss->v4l2_dev);
-	media_device_unregister(&iss->media_dev);
-}
-
-/*
- * iss_register_subdev_group - Register a group of subdevices
- * @iss: OMAP4 ISS device
- * @board_info: I2C subdevs board information array
- *
- * Register all I2C subdevices in the board_info array. The array must be
- * terminated by a NULL entry, and the first entry must be the sensor.
- *
- * Return a pointer to the sensor media entity if it has been successfully
- * registered, or NULL otherwise.
- */
-static struct v4l2_subdev *
-iss_register_subdev_group(struct iss_device *iss,
-			  struct iss_subdev_i2c_board_info *board_info)
-{
-	struct v4l2_subdev *sensor = NULL;
-	unsigned int first;
-
-	if (!board_info->board_info)
-		return NULL;
-
-	for (first = 1; board_info->board_info; ++board_info, first = 0) {
-		struct v4l2_subdev *subdev;
-		struct i2c_adapter *adapter;
-
-		adapter = i2c_get_adapter(board_info->i2c_adapter_id);
-		if (!adapter) {
-			dev_err(iss->dev,
-				"%s: Unable to get I2C adapter %d for device %s\n",
-				__func__, board_info->i2c_adapter_id,
-				board_info->board_info->type);
-			continue;
-		}
-
-		subdev = v4l2_i2c_new_subdev_board(&iss->v4l2_dev, adapter,
-						   board_info->board_info, NULL);
-		if (!subdev) {
-			dev_err(iss->dev, "Unable to register subdev %s\n",
-				board_info->board_info->type);
-			continue;
-		}
-
-		if (first)
-			sensor = subdev;
-	}
-
-	return sensor;
-}
-
-static int iss_register_entities(struct iss_device *iss)
-{
-	struct iss_platform_data *pdata = iss->pdata;
-	struct iss_v4l2_subdevs_group *subdevs;
-	int ret;
-
-	iss->media_dev.dev = iss->dev;
-	strscpy(iss->media_dev.model, "TI OMAP4 ISS",
-		sizeof(iss->media_dev.model));
-	iss->media_dev.hw_revision = iss->revision;
-	iss->media_dev.ops = &iss_media_ops;
-	ret = media_device_register(&iss->media_dev);
-	if (ret < 0) {
-		dev_err(iss->dev, "Media device registration failed (%d)\n",
-			ret);
-		return ret;
-	}
-
-	iss->v4l2_dev.mdev = &iss->media_dev;
-	ret = v4l2_device_register(iss->dev, &iss->v4l2_dev);
-	if (ret < 0) {
-		dev_err(iss->dev, "V4L2 device registration failed (%d)\n",
-			ret);
-		goto done;
-	}
-
-	/* Register internal entities */
-	ret = omap4iss_csi2_register_entities(&iss->csi2a, &iss->v4l2_dev);
-	if (ret < 0)
-		goto done;
-
-	ret = omap4iss_csi2_register_entities(&iss->csi2b, &iss->v4l2_dev);
-	if (ret < 0)
-		goto done;
-
-	ret = omap4iss_ipipeif_register_entities(&iss->ipipeif, &iss->v4l2_dev);
-	if (ret < 0)
-		goto done;
-
-	ret = omap4iss_ipipe_register_entities(&iss->ipipe, &iss->v4l2_dev);
-	if (ret < 0)
-		goto done;
-
-	ret = omap4iss_resizer_register_entities(&iss->resizer, &iss->v4l2_dev);
-	if (ret < 0)
-		goto done;
-
-	/* Register external entities */
-	for (subdevs = pdata->subdevs; subdevs && subdevs->subdevs; ++subdevs) {
-		struct v4l2_subdev *sensor;
-		struct media_entity *input;
-		unsigned int flags;
-		unsigned int pad;
-
-		sensor = iss_register_subdev_group(iss, subdevs->subdevs);
-		if (!sensor)
-			continue;
-
-		sensor->host_priv = subdevs;
-
-		/* Connect the sensor to the correct interface module.
-		 * CSI2a receiver through CSIPHY1, or
-		 * CSI2b receiver through CSIPHY2
-		 */
-		switch (subdevs->interface) {
-		case ISS_INTERFACE_CSI2A_PHY1:
-			input = &iss->csi2a.subdev.entity;
-			pad = CSI2_PAD_SINK;
-			flags = MEDIA_LNK_FL_IMMUTABLE
-			      | MEDIA_LNK_FL_ENABLED;
-			break;
-
-		case ISS_INTERFACE_CSI2B_PHY2:
-			input = &iss->csi2b.subdev.entity;
-			pad = CSI2_PAD_SINK;
-			flags = MEDIA_LNK_FL_IMMUTABLE
-			      | MEDIA_LNK_FL_ENABLED;
-			break;
-
-		default:
-			dev_err(iss->dev, "invalid interface type %u\n",
-				subdevs->interface);
-			ret = -EINVAL;
-			goto done;
-		}
-
-		ret = media_create_pad_link(&sensor->entity, 0, input, pad,
-					    flags);
-		if (ret < 0)
-			goto done;
-	}
-
-	ret = v4l2_device_register_subdev_nodes(&iss->v4l2_dev);
-
-done:
-	if (ret < 0)
-		iss_unregister_entities(iss);
-
-	return ret;
-}
-
-/*
- * iss_create_links() - Pads links creation for the subdevices
- * @iss : Pointer to ISS device
- *
- * return negative error code or zero on success
- */
-static int iss_create_links(struct iss_device *iss)
-{
-	int ret;
-
-	ret = omap4iss_csi2_create_links(iss);
-	if (ret < 0) {
-		dev_err(iss->dev, "CSI2 pads links creation failed\n");
-		return ret;
-	}
-
-	ret = omap4iss_ipipeif_create_links(iss);
-	if (ret < 0) {
-		dev_err(iss->dev, "ISP IPIPEIF pads links creation failed\n");
-		return ret;
-	}
-
-	ret = omap4iss_resizer_create_links(iss);
-	if (ret < 0) {
-		dev_err(iss->dev, "ISP RESIZER pads links creation failed\n");
-		return ret;
-	}
-
-	/* Connect the submodules. */
-	ret = media_create_pad_link(
-			&iss->csi2a.subdev.entity, CSI2_PAD_SOURCE,
-			&iss->ipipeif.subdev.entity, IPIPEIF_PAD_SINK, 0);
-	if (ret < 0)
-		return ret;
-
-	ret = media_create_pad_link(
-			&iss->csi2b.subdev.entity, CSI2_PAD_SOURCE,
-			&iss->ipipeif.subdev.entity, IPIPEIF_PAD_SINK, 0);
-	if (ret < 0)
-		return ret;
-
-	ret = media_create_pad_link(
-			&iss->ipipeif.subdev.entity, IPIPEIF_PAD_SOURCE_VP,
-			&iss->resizer.subdev.entity, RESIZER_PAD_SINK, 0);
-	if (ret < 0)
-		return ret;
-
-	ret = media_create_pad_link(
-			&iss->ipipeif.subdev.entity, IPIPEIF_PAD_SOURCE_VP,
-			&iss->ipipe.subdev.entity, IPIPE_PAD_SINK, 0);
-	if (ret < 0)
-		return ret;
-
-	ret = media_create_pad_link(
-			&iss->ipipe.subdev.entity, IPIPE_PAD_SOURCE_VP,
-			&iss->resizer.subdev.entity, RESIZER_PAD_SINK, 0);
-	if (ret < 0)
-		return ret;
-
-	return 0;
-};
-
-static void iss_cleanup_modules(struct iss_device *iss)
-{
-	omap4iss_csi2_cleanup(iss);
-	omap4iss_ipipeif_cleanup(iss);
-	omap4iss_ipipe_cleanup(iss);
-	omap4iss_resizer_cleanup(iss);
-}
-
-static int iss_initialize_modules(struct iss_device *iss)
-{
-	int ret;
-
-	ret = omap4iss_csiphy_init(iss);
-	if (ret < 0) {
-		dev_err(iss->dev, "CSI PHY initialization failed\n");
-		goto error_csiphy;
-	}
-
-	ret = omap4iss_csi2_init(iss);
-	if (ret < 0) {
-		dev_err(iss->dev, "CSI2 initialization failed\n");
-		goto error_csi2;
-	}
-
-	ret = omap4iss_ipipeif_init(iss);
-	if (ret < 0) {
-		dev_err(iss->dev, "ISP IPIPEIF initialization failed\n");
-		goto error_ipipeif;
-	}
-
-	ret = omap4iss_ipipe_init(iss);
-	if (ret < 0) {
-		dev_err(iss->dev, "ISP IPIPE initialization failed\n");
-		goto error_ipipe;
-	}
-
-	ret = omap4iss_resizer_init(iss);
-	if (ret < 0) {
-		dev_err(iss->dev, "ISP RESIZER initialization failed\n");
-		goto error_resizer;
-	}
-
-	return 0;
-
-error_resizer:
-	omap4iss_ipipe_cleanup(iss);
-error_ipipe:
-	omap4iss_ipipeif_cleanup(iss);
-error_ipipeif:
-	omap4iss_csi2_cleanup(iss);
-error_csi2:
-error_csiphy:
-	return ret;
-}
-
-static int iss_probe(struct platform_device *pdev)
-{
-	struct iss_platform_data *pdata = pdev->dev.platform_data;
-	struct iss_device *iss;
-	unsigned int i;
-	int ret;
-
-	if (!pdata)
-		return -EINVAL;
-
-	iss = devm_kzalloc(&pdev->dev, sizeof(*iss), GFP_KERNEL);
-	if (!iss)
-		return -ENOMEM;
-
-	mutex_init(&iss->iss_mutex);
-
-	iss->dev = &pdev->dev;
-	iss->pdata = pdata;
-
-	iss->raw_dmamask = DMA_BIT_MASK(32);
-	iss->dev->dma_mask = &iss->raw_dmamask;
-	iss->dev->coherent_dma_mask = DMA_BIT_MASK(32);
-
-	platform_set_drvdata(pdev, iss);
-
-	/*
-	 * TODO: When implementing DT support switch to syscon regmap lookup by
-	 * phandle.
-	 */
-	iss->syscon = syscon_regmap_lookup_by_compatible("syscon");
-	if (IS_ERR(iss->syscon)) {
-		ret = PTR_ERR(iss->syscon);
-		goto error;
-	}
-
-	/* Clocks */
-	ret = iss_map_mem_resource(pdev, iss, OMAP4_ISS_MEM_TOP);
-	if (ret < 0)
-		goto error;
-
-	ret = iss_get_clocks(iss);
-	if (ret < 0)
-		goto error;
-
-	if (!omap4iss_get(iss)) {
-		ret = -EINVAL;
-		goto error;
-	}
-
-	ret = iss_reset(iss);
-	if (ret < 0)
-		goto error_iss;
-
-	iss->revision = iss_reg_read(iss, OMAP4_ISS_MEM_TOP, ISS_HL_REVISION);
-	dev_info(iss->dev, "Revision %08x found\n", iss->revision);
-
-	for (i = 1; i < OMAP4_ISS_MEM_LAST; i++) {
-		ret = iss_map_mem_resource(pdev, iss, i);
-		if (ret)
-			goto error_iss;
-	}
-
-	/* Configure BTE BW_LIMITER field to max recommended value (1 GB) */
-	iss_reg_update(iss, OMAP4_ISS_MEM_BTE, BTE_CTRL,
-		       BTE_CTRL_BW_LIMITER_MASK,
-		       18 << BTE_CTRL_BW_LIMITER_SHIFT);
-
-	/* Perform ISP reset */
-	ret = omap4iss_subclk_enable(iss, OMAP4_ISS_SUBCLK_ISP);
-	if (ret < 0)
-		goto error_iss;
-
-	ret = iss_isp_reset(iss);
-	if (ret < 0)
-		goto error_iss;
-
-	dev_info(iss->dev, "ISP Revision %08x found\n",
-		 iss_reg_read(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_REVISION));
-
-	/* Interrupt */
-	ret = platform_get_irq(pdev, 0);
-	if (ret <= 0) {
-		ret = -ENODEV;
-		goto error_iss;
-	}
-	iss->irq_num = ret;
-
-	if (devm_request_irq(iss->dev, iss->irq_num, iss_isr, IRQF_SHARED,
-			     "OMAP4 ISS", iss)) {
-		dev_err(iss->dev, "Unable to request IRQ\n");
-		ret = -EINVAL;
-		goto error_iss;
-	}
-
-	/* Entities */
-	ret = iss_initialize_modules(iss);
-	if (ret < 0)
-		goto error_iss;
-
-	ret = iss_register_entities(iss);
-	if (ret < 0)
-		goto error_modules;
-
-	ret = media_entity_enum_init(&iss->crashed, &iss->media_dev);
-	if (ret)
-		goto error_entities;
-
-	ret = iss_create_links(iss);
-	if (ret < 0)
-		goto error_entities;
-
-	omap4iss_put(iss);
-
-	return 0;
-
-error_entities:
-	iss_unregister_entities(iss);
-	media_entity_enum_cleanup(&iss->crashed);
-error_modules:
-	iss_cleanup_modules(iss);
-error_iss:
-	omap4iss_put(iss);
-error:
-	mutex_destroy(&iss->iss_mutex);
-
-	return ret;
-}
-
-static void iss_remove(struct platform_device *pdev)
-{
-	struct iss_device *iss = platform_get_drvdata(pdev);
-
-	iss_unregister_entities(iss);
-	media_entity_enum_cleanup(&iss->crashed);
-	iss_cleanup_modules(iss);
-}
-
-static const struct platform_device_id omap4iss_id_table[] = {
-	{ "omap4iss", 0 },
-	{ },
-};
-MODULE_DEVICE_TABLE(platform, omap4iss_id_table);
-
-static struct platform_driver iss_driver = {
-	.probe		= iss_probe,
-	.remove_new	= iss_remove,
-	.id_table	= omap4iss_id_table,
-	.driver = {
-		.name	= "omap4iss",
-	},
-};
-
-module_platform_driver(iss_driver);
-
-MODULE_DESCRIPTION("TI OMAP4 ISS driver");
-MODULE_AUTHOR("Sergio Aguirre <sergio.a.aguirre@gmail.com>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/staging/media/omap4iss/iss.h b/drivers/staging/media/omap4iss/iss.h
deleted file mode 100644
index 3f587e000729..000000000000
--- a/drivers/staging/media/omap4iss/iss.h
+++ /dev/null
@@ -1,247 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * TI OMAP4 ISS V4L2 Driver
- *
- * Copyright (C) 2012 Texas Instruments.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#ifndef _OMAP4_ISS_H_
-#define _OMAP4_ISS_H_
-
-#include <media/v4l2-device.h>
-#include <media/v4l2-mc.h>
-
-#include <linux/device.h>
-#include <linux/io.h>
-#include <linux/platform_device.h>
-#include <linux/wait.h>
-
-#include <linux/platform_data/media/omap4iss.h>
-
-#include "iss_regs.h"
-#include "iss_csiphy.h"
-#include "iss_csi2.h"
-#include "iss_ipipeif.h"
-#include "iss_ipipe.h"
-#include "iss_resizer.h"
-
-struct regmap;
-
-#define to_iss_device(ptr_module)				\
-	container_of(ptr_module, struct iss_device, ptr_module)
-#define to_device(ptr_module)						\
-	(to_iss_device(ptr_module)->dev)
-
-enum iss_mem_resources {
-	OMAP4_ISS_MEM_TOP,
-	OMAP4_ISS_MEM_CSI2_A_REGS1,
-	OMAP4_ISS_MEM_CAMERARX_CORE1,
-	OMAP4_ISS_MEM_CSI2_B_REGS1,
-	OMAP4_ISS_MEM_CAMERARX_CORE2,
-	OMAP4_ISS_MEM_BTE,
-	OMAP4_ISS_MEM_ISP_SYS1,
-	OMAP4_ISS_MEM_ISP_RESIZER,
-	OMAP4_ISS_MEM_ISP_IPIPE,
-	OMAP4_ISS_MEM_ISP_ISIF,
-	OMAP4_ISS_MEM_ISP_IPIPEIF,
-	OMAP4_ISS_MEM_LAST,
-};
-
-enum iss_subclk_resource {
-	OMAP4_ISS_SUBCLK_SIMCOP		= (1 << 0),
-	OMAP4_ISS_SUBCLK_ISP		= (1 << 1),
-	OMAP4_ISS_SUBCLK_CSI2_A		= (1 << 2),
-	OMAP4_ISS_SUBCLK_CSI2_B		= (1 << 3),
-	OMAP4_ISS_SUBCLK_CCP2		= (1 << 4),
-};
-
-enum iss_isp_subclk_resource {
-	OMAP4_ISS_ISP_SUBCLK_BL		= (1 << 0),
-	OMAP4_ISS_ISP_SUBCLK_ISIF	= (1 << 1),
-	OMAP4_ISS_ISP_SUBCLK_H3A	= (1 << 2),
-	OMAP4_ISS_ISP_SUBCLK_RSZ	= (1 << 3),
-	OMAP4_ISS_ISP_SUBCLK_IPIPE	= (1 << 4),
-	OMAP4_ISS_ISP_SUBCLK_IPIPEIF	= (1 << 5),
-};
-
-/*
- * struct iss_reg - Structure for ISS register values.
- * @reg: 32-bit Register address.
- * @val: 32-bit Register value.
- */
-struct iss_reg {
-	enum iss_mem_resources mmio_range;
-	u32 reg;
-	u32 val;
-};
-
-/*
- * struct iss_device - ISS device structure.
- * @syscon: Regmap for the syscon register space
- * @crashed: Crashed entities
- */
-struct iss_device {
-	struct v4l2_device v4l2_dev;
-	struct media_device media_dev;
-	struct device *dev;
-	u32 revision;
-
-	/* platform HW resources */
-	struct iss_platform_data *pdata;
-	unsigned int irq_num;
-
-	struct resource *res[OMAP4_ISS_MEM_LAST];
-	void __iomem *regs[OMAP4_ISS_MEM_LAST];
-	struct regmap *syscon;
-
-	u64 raw_dmamask;
-
-	struct mutex iss_mutex;	/* For handling ref_count field */
-	struct media_entity_enum crashed;
-	int has_context;
-	int ref_count;
-
-	struct clk *iss_fck;
-	struct clk *iss_ctrlclk;
-
-	/* ISS modules */
-	struct iss_csi2_device csi2a;
-	struct iss_csi2_device csi2b;
-	struct iss_csiphy csiphy1;
-	struct iss_csiphy csiphy2;
-	struct iss_ipipeif_device ipipeif;
-	struct iss_ipipe_device ipipe;
-	struct iss_resizer_device resizer;
-
-	unsigned int subclk_resources;
-	unsigned int isp_subclk_resources;
-};
-
-int omap4iss_get_external_info(struct iss_pipeline *pipe,
-			       struct media_link *link);
-
-int omap4iss_module_sync_idle(struct media_entity *me, wait_queue_head_t *wait,
-			      atomic_t *stopping);
-
-int omap4iss_module_sync_is_stopping(wait_queue_head_t *wait,
-				     atomic_t *stopping);
-
-int omap4iss_pipeline_set_stream(struct iss_pipeline *pipe,
-				 enum iss_pipeline_stream_state state);
-void omap4iss_pipeline_cancel_stream(struct iss_pipeline *pipe);
-
-void omap4iss_configure_bridge(struct iss_device *iss,
-			       enum ipipeif_input_entity input);
-
-struct iss_device *omap4iss_get(struct iss_device *iss);
-void omap4iss_put(struct iss_device *iss);
-int omap4iss_subclk_enable(struct iss_device *iss,
-			   enum iss_subclk_resource res);
-int omap4iss_subclk_disable(struct iss_device *iss,
-			    enum iss_subclk_resource res);
-void omap4iss_isp_subclk_enable(struct iss_device *iss,
-				enum iss_isp_subclk_resource res);
-void omap4iss_isp_subclk_disable(struct iss_device *iss,
-				 enum iss_isp_subclk_resource res);
-
-int omap4iss_register_entities(struct platform_device *pdev,
-			       struct v4l2_device *v4l2_dev);
-void omap4iss_unregister_entities(struct platform_device *pdev);
-
-/*
- * iss_reg_read - Read the value of an OMAP4 ISS register
- * @iss: the ISS device
- * @res: memory resource in which the register is located
- * @offset: register offset in the memory resource
- *
- * Return the register value.
- */
-static inline
-u32 iss_reg_read(struct iss_device *iss, enum iss_mem_resources res,
-		 u32 offset)
-{
-	return readl(iss->regs[res] + offset);
-}
-
-/*
- * iss_reg_write - Write a value to an OMAP4 ISS register
- * @iss: the ISS device
- * @res: memory resource in which the register is located
- * @offset: register offset in the memory resource
- * @value: value to be written
- */
-static inline
-void iss_reg_write(struct iss_device *iss, enum iss_mem_resources res,
-		   u32 offset, u32 value)
-{
-	writel(value, iss->regs[res] + offset);
-}
-
-/*
- * iss_reg_clr - Clear bits in an OMAP4 ISS register
- * @iss: the ISS device
- * @res: memory resource in which the register is located
- * @offset: register offset in the memory resource
- * @clr: bit mask to be cleared
- */
-static inline
-void iss_reg_clr(struct iss_device *iss, enum iss_mem_resources res,
-		 u32 offset, u32 clr)
-{
-	u32 v = iss_reg_read(iss, res, offset);
-
-	iss_reg_write(iss, res, offset, v & ~clr);
-}
-
-/*
- * iss_reg_set - Set bits in an OMAP4 ISS register
- * @iss: the ISS device
- * @res: memory resource in which the register is located
- * @offset: register offset in the memory resource
- * @set: bit mask to be set
- */
-static inline
-void iss_reg_set(struct iss_device *iss, enum iss_mem_resources res,
-		 u32 offset, u32 set)
-{
-	u32 v = iss_reg_read(iss, res, offset);
-
-	iss_reg_write(iss, res, offset, v | set);
-}
-
-/*
- * iss_reg_update - Clear and set bits in an OMAP4 ISS register
- * @iss: the ISS device
- * @res: memory resource in which the register is located
- * @offset: register offset in the memory resource
- * @clr: bit mask to be cleared
- * @set: bit mask to be set
- *
- * Clear the clr mask first and then set the set mask.
- */
-static inline
-void iss_reg_update(struct iss_device *iss, enum iss_mem_resources res,
-		    u32 offset, u32 clr, u32 set)
-{
-	u32 v = iss_reg_read(iss, res, offset);
-
-	iss_reg_write(iss, res, offset, (v & ~clr) | set);
-}
-
-#define iss_poll_condition_timeout(cond, timeout, min_ival, max_ival)	\
-({									\
-	unsigned long __timeout = jiffies + usecs_to_jiffies(timeout);	\
-	unsigned int __min_ival = (min_ival);				\
-	unsigned int __max_ival = (max_ival);				\
-	bool __cond;							\
-	while (!(__cond = (cond))) {					\
-		if (time_after(jiffies, __timeout))			\
-			break;						\
-		usleep_range(__min_ival, __max_ival);			\
-	}								\
-	!__cond;							\
-})
-
-#endif /* _OMAP4_ISS_H_ */
diff --git a/drivers/staging/media/omap4iss/iss_csi2.c b/drivers/staging/media/omap4iss/iss_csi2.c
deleted file mode 100644
index 0e6c5bd81930..000000000000
--- a/drivers/staging/media/omap4iss/iss_csi2.c
+++ /dev/null
@@ -1,1379 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * TI OMAP4 ISS V4L2 Driver - CSI PHY module
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#include <linux/delay.h>
-#include <media/v4l2-common.h>
-#include <linux/v4l2-mediabus.h>
-#include <linux/mm.h>
-
-#include "iss.h"
-#include "iss_regs.h"
-#include "iss_csi2.h"
-
-/*
- * csi2_if_enable - Enable CSI2 Receiver interface.
- * @enable: enable flag
- *
- */
-static void csi2_if_enable(struct iss_csi2_device *csi2, u8 enable)
-{
-	struct iss_csi2_ctrl_cfg *currctrl = &csi2->ctrl;
-
-	iss_reg_update(csi2->iss, csi2->regs1, CSI2_CTRL, CSI2_CTRL_IF_EN,
-		       enable ? CSI2_CTRL_IF_EN : 0);
-
-	currctrl->if_enable = enable;
-}
-
-/*
- * csi2_recv_config - CSI2 receiver module configuration.
- * @currctrl: iss_csi2_ctrl_cfg structure
- *
- */
-static void csi2_recv_config(struct iss_csi2_device *csi2,
-			     struct iss_csi2_ctrl_cfg *currctrl)
-{
-	u32 reg = 0;
-
-	if (currctrl->frame_mode)
-		reg |= CSI2_CTRL_FRAME;
-	else
-		reg &= ~CSI2_CTRL_FRAME;
-
-	if (currctrl->vp_clk_enable)
-		reg |= CSI2_CTRL_VP_CLK_EN;
-	else
-		reg &= ~CSI2_CTRL_VP_CLK_EN;
-
-	if (currctrl->vp_only_enable)
-		reg |= CSI2_CTRL_VP_ONLY_EN;
-	else
-		reg &= ~CSI2_CTRL_VP_ONLY_EN;
-
-	reg &= ~CSI2_CTRL_VP_OUT_CTRL_MASK;
-	reg |= currctrl->vp_out_ctrl << CSI2_CTRL_VP_OUT_CTRL_SHIFT;
-
-	if (currctrl->ecc_enable)
-		reg |= CSI2_CTRL_ECC_EN;
-	else
-		reg &= ~CSI2_CTRL_ECC_EN;
-
-	/*
-	 * Set MFlag assertion boundaries to:
-	 * Low: 4/8 of FIFO size
-	 * High: 6/8 of FIFO size
-	 */
-	reg &= ~(CSI2_CTRL_MFLAG_LEVH_MASK | CSI2_CTRL_MFLAG_LEVL_MASK);
-	reg |= (2 << CSI2_CTRL_MFLAG_LEVH_SHIFT) |
-	       (4 << CSI2_CTRL_MFLAG_LEVL_SHIFT);
-
-	/* Generation of 16x64-bit bursts (Recommended) */
-	reg |= CSI2_CTRL_BURST_SIZE_EXPAND;
-
-	/* Do Non-Posted writes (Recommended) */
-	reg |= CSI2_CTRL_NON_POSTED_WRITE;
-
-	/*
-	 * Enforce Little endian for all formats, including:
-	 * YUV4:2:2 8-bit and YUV4:2:0 Legacy
-	 */
-	reg |= CSI2_CTRL_ENDIANNESS;
-
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_CTRL, reg);
-}
-
-static const unsigned int csi2_input_fmts[] = {
-	MEDIA_BUS_FMT_SGRBG10_1X10,
-	MEDIA_BUS_FMT_SGRBG10_DPCM8_1X8,
-	MEDIA_BUS_FMT_SRGGB10_1X10,
-	MEDIA_BUS_FMT_SRGGB10_DPCM8_1X8,
-	MEDIA_BUS_FMT_SBGGR10_1X10,
-	MEDIA_BUS_FMT_SBGGR10_DPCM8_1X8,
-	MEDIA_BUS_FMT_SGBRG10_1X10,
-	MEDIA_BUS_FMT_SGBRG10_DPCM8_1X8,
-	MEDIA_BUS_FMT_SBGGR8_1X8,
-	MEDIA_BUS_FMT_SGBRG8_1X8,
-	MEDIA_BUS_FMT_SGRBG8_1X8,
-	MEDIA_BUS_FMT_SRGGB8_1X8,
-	MEDIA_BUS_FMT_UYVY8_1X16,
-	MEDIA_BUS_FMT_YUYV8_1X16,
-};
-
-/* To set the format on the CSI2 requires a mapping function that takes
- * the following inputs:
- * - 3 different formats (at this time)
- * - 2 destinations (mem, vp+mem) (vp only handled separately)
- * - 2 decompression options (on, off)
- * Output should be CSI2 frame format code
- * Array indices as follows: [format][dest][decompr]
- * Not all combinations are valid. 0 means invalid.
- */
-static const u16 __csi2_fmt_map[][2][2] = {
-	/* RAW10 formats */
-	{
-		/* Output to memory */
-		{
-			/* No DPCM decompression */
-			CSI2_PIX_FMT_RAW10_EXP16,
-			/* DPCM decompression */
-			0,
-		},
-		/* Output to both */
-		{
-			/* No DPCM decompression */
-			CSI2_PIX_FMT_RAW10_EXP16_VP,
-			/* DPCM decompression */
-			0,
-		},
-	},
-	/* RAW10 DPCM8 formats */
-	{
-		/* Output to memory */
-		{
-			/* No DPCM decompression */
-			CSI2_USERDEF_8BIT_DATA1,
-			/* DPCM decompression */
-			CSI2_USERDEF_8BIT_DATA1_DPCM10,
-		},
-		/* Output to both */
-		{
-			/* No DPCM decompression */
-			CSI2_PIX_FMT_RAW8_VP,
-			/* DPCM decompression */
-			CSI2_USERDEF_8BIT_DATA1_DPCM10_VP,
-		},
-	},
-	/* RAW8 formats */
-	{
-		/* Output to memory */
-		{
-			/* No DPCM decompression */
-			CSI2_PIX_FMT_RAW8,
-			/* DPCM decompression */
-			0,
-		},
-		/* Output to both */
-		{
-			/* No DPCM decompression */
-			CSI2_PIX_FMT_RAW8_VP,
-			/* DPCM decompression */
-			0,
-		},
-	},
-	/* YUV422 formats */
-	{
-		/* Output to memory */
-		{
-			/* No DPCM decompression */
-			CSI2_PIX_FMT_YUV422_8BIT,
-			/* DPCM decompression */
-			0,
-		},
-		/* Output to both */
-		{
-			/* No DPCM decompression */
-			CSI2_PIX_FMT_YUV422_8BIT_VP16,
-			/* DPCM decompression */
-			0,
-		},
-	},
-};
-
-/*
- * csi2_ctx_map_format - Map CSI2 sink media bus format to CSI2 format ID
- * @csi2: ISS CSI2 device
- *
- * Returns CSI2 physical format id
- */
-static u16 csi2_ctx_map_format(struct iss_csi2_device *csi2)
-{
-	const struct v4l2_mbus_framefmt *fmt = &csi2->formats[CSI2_PAD_SINK];
-	int fmtidx, destidx;
-
-	switch (fmt->code) {
-	case MEDIA_BUS_FMT_SGRBG10_1X10:
-	case MEDIA_BUS_FMT_SRGGB10_1X10:
-	case MEDIA_BUS_FMT_SBGGR10_1X10:
-	case MEDIA_BUS_FMT_SGBRG10_1X10:
-		fmtidx = 0;
-		break;
-	case MEDIA_BUS_FMT_SGRBG10_DPCM8_1X8:
-	case MEDIA_BUS_FMT_SRGGB10_DPCM8_1X8:
-	case MEDIA_BUS_FMT_SBGGR10_DPCM8_1X8:
-	case MEDIA_BUS_FMT_SGBRG10_DPCM8_1X8:
-		fmtidx = 1;
-		break;
-	case MEDIA_BUS_FMT_SBGGR8_1X8:
-	case MEDIA_BUS_FMT_SGBRG8_1X8:
-	case MEDIA_BUS_FMT_SGRBG8_1X8:
-	case MEDIA_BUS_FMT_SRGGB8_1X8:
-		fmtidx = 2;
-		break;
-	case MEDIA_BUS_FMT_UYVY8_1X16:
-	case MEDIA_BUS_FMT_YUYV8_1X16:
-		fmtidx = 3;
-		break;
-	default:
-		WARN(1, "CSI2: pixel format %08x unsupported!\n",
-		     fmt->code);
-		return 0;
-	}
-
-	if (!(csi2->output & CSI2_OUTPUT_IPIPEIF) &&
-	    !(csi2->output & CSI2_OUTPUT_MEMORY)) {
-		/* Neither output enabled is a valid combination */
-		return CSI2_PIX_FMT_OTHERS;
-	}
-
-	/* If we need to skip frames at the beginning of the stream disable the
-	 * video port to avoid sending the skipped frames to the IPIPEIF.
-	 */
-	destidx = csi2->frame_skip ? 0 : !!(csi2->output & CSI2_OUTPUT_IPIPEIF);
-
-	return __csi2_fmt_map[fmtidx][destidx][csi2->dpcm_decompress];
-}
-
-/*
- * csi2_set_outaddr - Set memory address to save output image
- * @csi2: Pointer to ISS CSI2a device.
- * @addr: 32-bit memory address aligned on 32 byte boundary.
- *
- * Sets the memory address where the output will be saved.
- *
- * Returns 0 if successful, or -EINVAL if the address is not in the 32 byte
- * boundary.
- */
-static void csi2_set_outaddr(struct iss_csi2_device *csi2, u32 addr)
-{
-	struct iss_csi2_ctx_cfg *ctx = &csi2->contexts[0];
-
-	ctx->ping_addr = addr;
-	ctx->pong_addr = addr;
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_CTX_PING_ADDR(ctx->ctxnum),
-		      ctx->ping_addr);
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_CTX_PONG_ADDR(ctx->ctxnum),
-		      ctx->pong_addr);
-}
-
-/*
- * is_usr_def_mapping - Checks whether USER_DEF_MAPPING should
- *			be enabled by CSI2.
- * @format_id: mapped format id
- *
- */
-static inline int is_usr_def_mapping(u32 format_id)
-{
-	return (format_id & 0xf0) == 0x40 ? 1 : 0;
-}
-
-/*
- * csi2_ctx_enable - Enable specified CSI2 context
- * @ctxnum: Context number, valid between 0 and 7 values.
- * @enable: enable
- *
- */
-static void csi2_ctx_enable(struct iss_csi2_device *csi2, u8 ctxnum, u8 enable)
-{
-	struct iss_csi2_ctx_cfg *ctx = &csi2->contexts[ctxnum];
-	u32 reg;
-
-	reg = iss_reg_read(csi2->iss, csi2->regs1, CSI2_CTX_CTRL1(ctxnum));
-
-	if (enable) {
-		unsigned int skip = 0;
-
-		if (csi2->frame_skip)
-			skip = csi2->frame_skip;
-		else if (csi2->output & CSI2_OUTPUT_MEMORY)
-			skip = 1;
-
-		reg &= ~CSI2_CTX_CTRL1_COUNT_MASK;
-		reg |= CSI2_CTX_CTRL1_COUNT_UNLOCK
-		    |  (skip << CSI2_CTX_CTRL1_COUNT_SHIFT)
-		    |  CSI2_CTX_CTRL1_CTX_EN;
-	} else {
-		reg &= ~CSI2_CTX_CTRL1_CTX_EN;
-	}
-
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_CTX_CTRL1(ctxnum), reg);
-	ctx->enabled = enable;
-}
-
-/*
- * csi2_ctx_config - CSI2 context configuration.
- * @ctx: context configuration
- *
- */
-static void csi2_ctx_config(struct iss_csi2_device *csi2,
-			    struct iss_csi2_ctx_cfg *ctx)
-{
-	u32 reg = 0;
-
-	ctx->frame = 0;
-
-	/* Set up CSI2_CTx_CTRL1 */
-	if (ctx->eof_enabled)
-		reg = CSI2_CTX_CTRL1_EOF_EN;
-
-	if (ctx->eol_enabled)
-		reg |= CSI2_CTX_CTRL1_EOL_EN;
-
-	if (ctx->checksum_enabled)
-		reg |= CSI2_CTX_CTRL1_CS_EN;
-
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_CTX_CTRL1(ctx->ctxnum), reg);
-
-	/* Set up CSI2_CTx_CTRL2 */
-	reg = ctx->virtual_id << CSI2_CTX_CTRL2_VIRTUAL_ID_SHIFT;
-	reg |= ctx->format_id << CSI2_CTX_CTRL2_FORMAT_SHIFT;
-
-	if (ctx->dpcm_decompress && ctx->dpcm_predictor)
-		reg |= CSI2_CTX_CTRL2_DPCM_PRED;
-
-	if (is_usr_def_mapping(ctx->format_id))
-		reg |= 2 << CSI2_CTX_CTRL2_USER_DEF_MAP_SHIFT;
-
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_CTX_CTRL2(ctx->ctxnum), reg);
-
-	/* Set up CSI2_CTx_CTRL3 */
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_CTX_CTRL3(ctx->ctxnum),
-		      ctx->alpha << CSI2_CTX_CTRL3_ALPHA_SHIFT);
-
-	/* Set up CSI2_CTx_DAT_OFST */
-	iss_reg_update(csi2->iss, csi2->regs1, CSI2_CTX_DAT_OFST(ctx->ctxnum),
-		       CSI2_CTX_DAT_OFST_MASK, ctx->data_offset);
-
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_CTX_PING_ADDR(ctx->ctxnum),
-		      ctx->ping_addr);
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_CTX_PONG_ADDR(ctx->ctxnum),
-		      ctx->pong_addr);
-}
-
-/*
- * csi2_timing_config - CSI2 timing configuration.
- * @timing: csi2_timing_cfg structure
- */
-static void csi2_timing_config(struct iss_csi2_device *csi2,
-			       struct iss_csi2_timing_cfg *timing)
-{
-	u32 reg;
-
-	reg = iss_reg_read(csi2->iss, csi2->regs1, CSI2_TIMING);
-
-	if (timing->force_rx_mode)
-		reg |= CSI2_TIMING_FORCE_RX_MODE_IO1;
-	else
-		reg &= ~CSI2_TIMING_FORCE_RX_MODE_IO1;
-
-	if (timing->stop_state_16x)
-		reg |= CSI2_TIMING_STOP_STATE_X16_IO1;
-	else
-		reg &= ~CSI2_TIMING_STOP_STATE_X16_IO1;
-
-	if (timing->stop_state_4x)
-		reg |= CSI2_TIMING_STOP_STATE_X4_IO1;
-	else
-		reg &= ~CSI2_TIMING_STOP_STATE_X4_IO1;
-
-	reg &= ~CSI2_TIMING_STOP_STATE_COUNTER_IO1_MASK;
-	reg |= timing->stop_state_counter <<
-	       CSI2_TIMING_STOP_STATE_COUNTER_IO1_SHIFT;
-
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_TIMING, reg);
-}
-
-/*
- * csi2_irq_ctx_set - Enables CSI2 Context IRQs.
- * @enable: Enable/disable CSI2 Context interrupts
- */
-static void csi2_irq_ctx_set(struct iss_csi2_device *csi2, int enable)
-{
-	const u32 mask = CSI2_CTX_IRQ_FE | CSI2_CTX_IRQ_FS;
-	int i;
-
-	for (i = 0; i < 8; i++) {
-		iss_reg_write(csi2->iss, csi2->regs1, CSI2_CTX_IRQSTATUS(i),
-			      mask);
-		if (enable)
-			iss_reg_set(csi2->iss, csi2->regs1,
-				    CSI2_CTX_IRQENABLE(i), mask);
-		else
-			iss_reg_clr(csi2->iss, csi2->regs1,
-				    CSI2_CTX_IRQENABLE(i), mask);
-	}
-}
-
-/*
- * csi2_irq_complexio1_set - Enables CSI2 ComplexIO IRQs.
- * @enable: Enable/disable CSI2 ComplexIO #1 interrupts
- */
-static void csi2_irq_complexio1_set(struct iss_csi2_device *csi2, int enable)
-{
-	u32 reg;
-
-	reg = CSI2_COMPLEXIO_IRQ_STATEALLULPMEXIT |
-		CSI2_COMPLEXIO_IRQ_STATEALLULPMENTER |
-		CSI2_COMPLEXIO_IRQ_STATEULPM5 |
-		CSI2_COMPLEXIO_IRQ_ERRCONTROL5 |
-		CSI2_COMPLEXIO_IRQ_ERRESC5 |
-		CSI2_COMPLEXIO_IRQ_ERRSOTSYNCHS5 |
-		CSI2_COMPLEXIO_IRQ_ERRSOTHS5 |
-		CSI2_COMPLEXIO_IRQ_STATEULPM4 |
-		CSI2_COMPLEXIO_IRQ_ERRCONTROL4 |
-		CSI2_COMPLEXIO_IRQ_ERRESC4 |
-		CSI2_COMPLEXIO_IRQ_ERRSOTSYNCHS4 |
-		CSI2_COMPLEXIO_IRQ_ERRSOTHS4 |
-		CSI2_COMPLEXIO_IRQ_STATEULPM3 |
-		CSI2_COMPLEXIO_IRQ_ERRCONTROL3 |
-		CSI2_COMPLEXIO_IRQ_ERRESC3 |
-		CSI2_COMPLEXIO_IRQ_ERRSOTSYNCHS3 |
-		CSI2_COMPLEXIO_IRQ_ERRSOTHS3 |
-		CSI2_COMPLEXIO_IRQ_STATEULPM2 |
-		CSI2_COMPLEXIO_IRQ_ERRCONTROL2 |
-		CSI2_COMPLEXIO_IRQ_ERRESC2 |
-		CSI2_COMPLEXIO_IRQ_ERRSOTSYNCHS2 |
-		CSI2_COMPLEXIO_IRQ_ERRSOTHS2 |
-		CSI2_COMPLEXIO_IRQ_STATEULPM1 |
-		CSI2_COMPLEXIO_IRQ_ERRCONTROL1 |
-		CSI2_COMPLEXIO_IRQ_ERRESC1 |
-		CSI2_COMPLEXIO_IRQ_ERRSOTSYNCHS1 |
-		CSI2_COMPLEXIO_IRQ_ERRSOTHS1;
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_COMPLEXIO_IRQSTATUS, reg);
-	if (enable)
-		iss_reg_set(csi2->iss, csi2->regs1, CSI2_COMPLEXIO_IRQENABLE,
-			    reg);
-	else
-		iss_reg_write(csi2->iss, csi2->regs1, CSI2_COMPLEXIO_IRQENABLE,
-			      0);
-}
-
-/*
- * csi2_irq_status_set - Enables CSI2 Status IRQs.
- * @enable: Enable/disable CSI2 Status interrupts
- */
-static void csi2_irq_status_set(struct iss_csi2_device *csi2, int enable)
-{
-	u32 reg;
-
-	reg = CSI2_IRQ_OCP_ERR |
-		CSI2_IRQ_SHORT_PACKET |
-		CSI2_IRQ_ECC_CORRECTION |
-		CSI2_IRQ_ECC_NO_CORRECTION |
-		CSI2_IRQ_COMPLEXIO_ERR |
-		CSI2_IRQ_FIFO_OVF |
-		CSI2_IRQ_CONTEXT0;
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_IRQSTATUS, reg);
-	if (enable)
-		iss_reg_set(csi2->iss, csi2->regs1, CSI2_IRQENABLE, reg);
-	else
-		iss_reg_write(csi2->iss, csi2->regs1, CSI2_IRQENABLE, 0);
-}
-
-/*
- * omap4iss_csi2_reset - Resets the CSI2 module.
- *
- * Must be called with the phy lock held.
- *
- * Returns 0 if successful, or -EBUSY if power command didn't respond.
- */
-int omap4iss_csi2_reset(struct iss_csi2_device *csi2)
-{
-	unsigned int timeout;
-
-	if (!csi2->available)
-		return -ENODEV;
-
-	if (csi2->phy->phy_in_use)
-		return -EBUSY;
-
-	iss_reg_set(csi2->iss, csi2->regs1, CSI2_SYSCONFIG,
-		    CSI2_SYSCONFIG_SOFT_RESET);
-
-	timeout = iss_poll_condition_timeout(
-		iss_reg_read(csi2->iss, csi2->regs1, CSI2_SYSSTATUS) &
-		CSI2_SYSSTATUS_RESET_DONE, 500, 100, 200);
-	if (timeout) {
-		dev_err(csi2->iss->dev, "CSI2: Soft reset timeout!\n");
-		return -EBUSY;
-	}
-
-	iss_reg_set(csi2->iss, csi2->regs1, CSI2_COMPLEXIO_CFG,
-		    CSI2_COMPLEXIO_CFG_RESET_CTRL);
-
-	timeout = iss_poll_condition_timeout(
-		iss_reg_read(csi2->iss, csi2->phy->phy_regs, REGISTER1) &
-		REGISTER1_RESET_DONE_CTRLCLK, 10000, 100, 500);
-	if (timeout) {
-		dev_err(csi2->iss->dev, "CSI2: CSI2_96M_FCLK reset timeout!\n");
-		return -EBUSY;
-	}
-
-	iss_reg_update(csi2->iss, csi2->regs1, CSI2_SYSCONFIG,
-		       CSI2_SYSCONFIG_MSTANDBY_MODE_MASK |
-		       CSI2_SYSCONFIG_AUTO_IDLE,
-		       CSI2_SYSCONFIG_MSTANDBY_MODE_NO);
-
-	return 0;
-}
-
-static int csi2_configure(struct iss_csi2_device *csi2)
-{
-	const struct iss_v4l2_subdevs_group *pdata;
-	struct iss_csi2_timing_cfg *timing = &csi2->timing[0];
-	struct v4l2_subdev *sensor;
-	struct media_pad *pad;
-
-	/*
-	 * CSI2 fields that can be updated while the context has
-	 * been enabled or the interface has been enabled are not
-	 * updated dynamically currently. So we do not allow to
-	 * reconfigure if either has been enabled
-	 */
-	if (csi2->contexts[0].enabled || csi2->ctrl.if_enable)
-		return -EBUSY;
-
-	pad = media_pad_remote_pad_first(&csi2->pads[CSI2_PAD_SINK]);
-	sensor = media_entity_to_v4l2_subdev(pad->entity);
-	pdata = sensor->host_priv;
-
-	csi2->frame_skip = 0;
-	v4l2_subdev_call(sensor, sensor, g_skip_frames, &csi2->frame_skip);
-
-	csi2->ctrl.vp_out_ctrl = pdata->bus.csi2.vpclk_div;
-	csi2->ctrl.frame_mode = ISS_CSI2_FRAME_IMMEDIATE;
-	csi2->ctrl.ecc_enable = pdata->bus.csi2.crc;
-
-	timing->force_rx_mode = 1;
-	timing->stop_state_16x = 1;
-	timing->stop_state_4x = 1;
-	timing->stop_state_counter = 0x1ff;
-
-	/*
-	 * The CSI2 receiver can't do any format conversion except DPCM
-	 * decompression, so every set_format call configures both pads
-	 * and enables DPCM decompression as a special case:
-	 */
-	if (csi2->formats[CSI2_PAD_SINK].code !=
-	    csi2->formats[CSI2_PAD_SOURCE].code)
-		csi2->dpcm_decompress = true;
-	else
-		csi2->dpcm_decompress = false;
-
-	csi2->contexts[0].format_id = csi2_ctx_map_format(csi2);
-
-	if (csi2->video_out.bpl_padding == 0)
-		csi2->contexts[0].data_offset = 0;
-	else
-		csi2->contexts[0].data_offset = csi2->video_out.bpl_value;
-
-	/*
-	 * Enable end of frame and end of line signals generation for
-	 * context 0. These signals are generated from CSI2 receiver to
-	 * qualify the last pixel of a frame and the last pixel of a line.
-	 * Without enabling the signals CSI2 receiver writes data to memory
-	 * beyond buffer size and/or data line offset is not handled correctly.
-	 */
-	csi2->contexts[0].eof_enabled = 1;
-	csi2->contexts[0].eol_enabled = 1;
-
-	csi2_irq_complexio1_set(csi2, 1);
-	csi2_irq_ctx_set(csi2, 1);
-	csi2_irq_status_set(csi2, 1);
-
-	/* Set configuration (timings, format and links) */
-	csi2_timing_config(csi2, timing);
-	csi2_recv_config(csi2, &csi2->ctrl);
-	csi2_ctx_config(csi2, &csi2->contexts[0]);
-
-	return 0;
-}
-
-/*
- * csi2_print_status - Prints CSI2 debug information.
- */
-#define CSI2_PRINT_REGISTER(iss, regs, name)\
-	dev_dbg(iss->dev, "###CSI2 " #name "=0x%08x\n", \
-		iss_reg_read(iss, regs, CSI2_##name))
-
-static void csi2_print_status(struct iss_csi2_device *csi2)
-{
-	struct iss_device *iss = csi2->iss;
-
-	if (!csi2->available)
-		return;
-
-	dev_dbg(iss->dev, "-------------CSI2 Register dump-------------\n");
-
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, SYSCONFIG);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, SYSSTATUS);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, IRQENABLE);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, IRQSTATUS);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, CTRL);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, DBG_H);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, COMPLEXIO_CFG);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, COMPLEXIO_IRQSTATUS);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, SHORT_PACKET);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, COMPLEXIO_IRQENABLE);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, DBG_P);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, TIMING);
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, CTX_CTRL1(0));
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, CTX_CTRL2(0));
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, CTX_DAT_OFST(0));
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, CTX_PING_ADDR(0));
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, CTX_PONG_ADDR(0));
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, CTX_IRQENABLE(0));
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, CTX_IRQSTATUS(0));
-	CSI2_PRINT_REGISTER(iss, csi2->regs1, CTX_CTRL3(0));
-
-	dev_dbg(iss->dev, "--------------------------------------------\n");
-}
-
-/* -----------------------------------------------------------------------------
- * Interrupt handling
- */
-
-/*
- * csi2_isr_buffer - Does buffer handling at end-of-frame
- * when writing to memory.
- */
-static void csi2_isr_buffer(struct iss_csi2_device *csi2)
-{
-	struct iss_buffer *buffer;
-
-	csi2_ctx_enable(csi2, 0, 0);
-
-	buffer = omap4iss_video_buffer_next(&csi2->video_out);
-
-	/*
-	 * Let video queue operation restart engine if there is an underrun
-	 * condition.
-	 */
-	if (!buffer)
-		return;
-
-	csi2_set_outaddr(csi2, buffer->iss_addr);
-	csi2_ctx_enable(csi2, 0, 1);
-}
-
-static void csi2_isr_ctx(struct iss_csi2_device *csi2,
-			 struct iss_csi2_ctx_cfg *ctx)
-{
-	unsigned int n = ctx->ctxnum;
-	u32 status;
-
-	status = iss_reg_read(csi2->iss, csi2->regs1, CSI2_CTX_IRQSTATUS(n));
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_CTX_IRQSTATUS(n), status);
-
-	if (omap4iss_module_sync_is_stopping(&csi2->wait, &csi2->stopping))
-		return;
-
-	/* Propagate frame number */
-	if (status & CSI2_CTX_IRQ_FS) {
-		struct iss_pipeline *pipe =
-				     to_iss_pipeline(&csi2->subdev.entity);
-		u16 frame;
-		u16 delta;
-
-		frame = iss_reg_read(csi2->iss, csi2->regs1,
-				     CSI2_CTX_CTRL2(ctx->ctxnum))
-		      >> CSI2_CTX_CTRL2_FRAME_SHIFT;
-
-		if (frame == 0) {
-			/* A zero value means that the counter isn't implemented
-			 * by the source. Increment the frame number in software
-			 * in that case.
-			 */
-			atomic_inc(&pipe->frame_number);
-		} else {
-			/* Extend the 16 bit frame number to 32 bits by
-			 * computing the delta between two consecutive CSI2
-			 * frame numbers and adding it to the software frame
-			 * number. The hardware counter starts at 1 and wraps
-			 * from 0xffff to 1 without going through 0, so subtract
-			 * 1 when the counter wraps.
-			 */
-			delta = frame - ctx->frame;
-			if (frame < ctx->frame)
-				delta--;
-			ctx->frame = frame;
-
-			atomic_add(delta, &pipe->frame_number);
-		}
-	}
-
-	if (!(status & CSI2_CTX_IRQ_FE))
-		return;
-
-	/* Skip interrupts until we reach the frame skip count. The CSI2 will be
-	 * automatically disabled, as the frame skip count has been programmed
-	 * in the CSI2_CTx_CTRL1::COUNT field, so re-enable it.
-	 *
-	 * It would have been nice to rely on the FRAME_NUMBER interrupt instead
-	 * but it turned out that the interrupt is only generated when the CSI2
-	 * writes to memory (the CSI2_CTx_CTRL1::COUNT field is decreased
-	 * correctly and reaches 0 when data is forwarded to the video port only
-	 * but no interrupt arrives). Maybe a CSI2 hardware bug.
-	 */
-	if (csi2->frame_skip) {
-		csi2->frame_skip--;
-		if (csi2->frame_skip == 0) {
-			ctx->format_id = csi2_ctx_map_format(csi2);
-			csi2_ctx_config(csi2, ctx);
-			csi2_ctx_enable(csi2, n, 1);
-		}
-		return;
-	}
-
-	if (csi2->output & CSI2_OUTPUT_MEMORY)
-		csi2_isr_buffer(csi2);
-}
-
-/*
- * omap4iss_csi2_isr - CSI2 interrupt handling.
- */
-void omap4iss_csi2_isr(struct iss_csi2_device *csi2)
-{
-	struct iss_pipeline *pipe = to_iss_pipeline(&csi2->subdev.entity);
-	u32 csi2_irqstatus, cpxio1_irqstatus;
-	struct iss_device *iss = csi2->iss;
-
-	if (!csi2->available)
-		return;
-
-	csi2_irqstatus = iss_reg_read(csi2->iss, csi2->regs1, CSI2_IRQSTATUS);
-	iss_reg_write(csi2->iss, csi2->regs1, CSI2_IRQSTATUS, csi2_irqstatus);
-
-	/* Failure Cases */
-	if (csi2_irqstatus & CSI2_IRQ_COMPLEXIO_ERR) {
-		cpxio1_irqstatus = iss_reg_read(csi2->iss, csi2->regs1,
-						CSI2_COMPLEXIO_IRQSTATUS);
-		iss_reg_write(csi2->iss, csi2->regs1, CSI2_COMPLEXIO_IRQSTATUS,
-			      cpxio1_irqstatus);
-		dev_dbg(iss->dev, "CSI2: ComplexIO Error IRQ %x\n",
-			cpxio1_irqstatus);
-		pipe->error = true;
-	}
-
-	if (csi2_irqstatus & (CSI2_IRQ_OCP_ERR |
-			      CSI2_IRQ_SHORT_PACKET |
-			      CSI2_IRQ_ECC_NO_CORRECTION |
-			      CSI2_IRQ_COMPLEXIO_ERR |
-			      CSI2_IRQ_FIFO_OVF)) {
-		dev_dbg(iss->dev,
-			"CSI2 Err: OCP:%d SHORT:%d ECC:%d CPXIO:%d OVF:%d\n",
-			csi2_irqstatus & CSI2_IRQ_OCP_ERR ? 1 : 0,
-			csi2_irqstatus & CSI2_IRQ_SHORT_PACKET ? 1 : 0,
-			csi2_irqstatus & CSI2_IRQ_ECC_NO_CORRECTION ? 1 : 0,
-			csi2_irqstatus & CSI2_IRQ_COMPLEXIO_ERR ? 1 : 0,
-			csi2_irqstatus & CSI2_IRQ_FIFO_OVF ? 1 : 0);
-		pipe->error = true;
-	}
-
-	/* Successful cases */
-	if (csi2_irqstatus & CSI2_IRQ_CONTEXT0)
-		csi2_isr_ctx(csi2, &csi2->contexts[0]);
-
-	if (csi2_irqstatus & CSI2_IRQ_ECC_CORRECTION)
-		dev_dbg(iss->dev, "CSI2: ECC correction done\n");
-}
-
-/* -----------------------------------------------------------------------------
- * ISS video operations
- */
-
-/*
- * csi2_queue - Queues the first buffer when using memory output
- * @video: The video node
- * @buffer: buffer to queue
- */
-static int csi2_queue(struct iss_video *video, struct iss_buffer *buffer)
-{
-	struct iss_csi2_device *csi2 = container_of(video,
-				struct iss_csi2_device, video_out);
-
-	csi2_set_outaddr(csi2, buffer->iss_addr);
-
-	/*
-	 * If streaming was enabled before there was a buffer queued
-	 * or underrun happened in the ISR, the hardware was not enabled
-	 * and DMA queue flag ISS_VIDEO_DMAQUEUE_UNDERRUN is still set.
-	 * Enable it now.
-	 */
-	if (csi2->video_out.dmaqueue_flags & ISS_VIDEO_DMAQUEUE_UNDERRUN) {
-		/* Enable / disable context 0 and IRQs */
-		csi2_if_enable(csi2, 1);
-		csi2_ctx_enable(csi2, 0, 1);
-		iss_video_dmaqueue_flags_clr(&csi2->video_out);
-	}
-
-	return 0;
-}
-
-static const struct iss_video_operations csi2_issvideo_ops = {
-	.queue = csi2_queue,
-};
-
-/* -----------------------------------------------------------------------------
- * V4L2 subdev operations
- */
-
-static struct v4l2_mbus_framefmt *
-__csi2_get_format(struct iss_csi2_device *csi2,
-		  struct v4l2_subdev_state *sd_state,
-		  unsigned int pad,
-		  enum v4l2_subdev_format_whence which)
-{
-	if (which == V4L2_SUBDEV_FORMAT_TRY)
-		return v4l2_subdev_state_get_format(sd_state, pad);
-
-	return &csi2->formats[pad];
-}
-
-static void
-csi2_try_format(struct iss_csi2_device *csi2,
-		struct v4l2_subdev_state *sd_state,
-		unsigned int pad,
-		struct v4l2_mbus_framefmt *fmt,
-		enum v4l2_subdev_format_whence which)
-{
-	u32 pixelcode;
-	struct v4l2_mbus_framefmt *format;
-	const struct iss_format_info *info;
-	unsigned int i;
-
-	switch (pad) {
-	case CSI2_PAD_SINK:
-		/* Clamp the width and height to valid range (1-8191). */
-		for (i = 0; i < ARRAY_SIZE(csi2_input_fmts); i++) {
-			if (fmt->code == csi2_input_fmts[i])
-				break;
-		}
-
-		/* If not found, use SGRBG10 as default */
-		if (i >= ARRAY_SIZE(csi2_input_fmts))
-			fmt->code = MEDIA_BUS_FMT_SGRBG10_1X10;
-
-		fmt->width = clamp_t(u32, fmt->width, 1, 8191);
-		fmt->height = clamp_t(u32, fmt->height, 1, 8191);
-		break;
-
-	case CSI2_PAD_SOURCE:
-		/* Source format same as sink format, except for DPCM
-		 * compression.
-		 */
-		pixelcode = fmt->code;
-		format = __csi2_get_format(csi2, sd_state, CSI2_PAD_SINK,
-					   which);
-		memcpy(fmt, format, sizeof(*fmt));
-
-		/*
-		 * Only Allow DPCM decompression, and check that the
-		 * pattern is preserved
-		 */
-		info = omap4iss_video_format_info(fmt->code);
-		if (info->uncompressed == pixelcode)
-			fmt->code = pixelcode;
-		break;
-	}
-
-	/* RGB, non-interlaced */
-	fmt->colorspace = V4L2_COLORSPACE_SRGB;
-	fmt->field = V4L2_FIELD_NONE;
-}
-
-/*
- * csi2_enum_mbus_code - Handle pixel format enumeration
- * @sd     : pointer to v4l2 subdev structure
- * @sd_state: V4L2 subdev state
- * @code   : pointer to v4l2_subdev_mbus_code_enum structure
- * return -EINVAL or zero on success
- */
-static int csi2_enum_mbus_code(struct v4l2_subdev *sd,
-			       struct v4l2_subdev_state *sd_state,
-			       struct v4l2_subdev_mbus_code_enum *code)
-{
-	struct iss_csi2_device *csi2 = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *format;
-	const struct iss_format_info *info;
-
-	if (code->pad == CSI2_PAD_SINK) {
-		if (code->index >= ARRAY_SIZE(csi2_input_fmts))
-			return -EINVAL;
-
-		code->code = csi2_input_fmts[code->index];
-	} else {
-		format = __csi2_get_format(csi2, sd_state, CSI2_PAD_SINK,
-					   code->which);
-		switch (code->index) {
-		case 0:
-			/* Passthrough sink pad code */
-			code->code = format->code;
-			break;
-		case 1:
-			/* Uncompressed code */
-			info = omap4iss_video_format_info(format->code);
-			if (info->uncompressed == format->code)
-				return -EINVAL;
-
-			code->code = info->uncompressed;
-			break;
-		default:
-			return -EINVAL;
-		}
-	}
-
-	return 0;
-}
-
-static int csi2_enum_frame_size(struct v4l2_subdev *sd,
-				struct v4l2_subdev_state *sd_state,
-				struct v4l2_subdev_frame_size_enum *fse)
-{
-	struct iss_csi2_device *csi2 = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt format;
-
-	if (fse->index != 0)
-		return -EINVAL;
-
-	format.code = fse->code;
-	format.width = 1;
-	format.height = 1;
-	csi2_try_format(csi2, sd_state, fse->pad, &format, fse->which);
-	fse->min_width = format.width;
-	fse->min_height = format.height;
-
-	if (format.code != fse->code)
-		return -EINVAL;
-
-	format.code = fse->code;
-	format.width = -1;
-	format.height = -1;
-	csi2_try_format(csi2, sd_state, fse->pad, &format, fse->which);
-	fse->max_width = format.width;
-	fse->max_height = format.height;
-
-	return 0;
-}
-
-/*
- * csi2_get_format - Handle get format by pads subdev method
- * @sd : pointer to v4l2 subdev structure
- * @sd_state: V4L2 subdev state
- * @fmt: pointer to v4l2 subdev format structure
- * return -EINVAL or zero on success
- */
-static int csi2_get_format(struct v4l2_subdev *sd,
-			   struct v4l2_subdev_state *sd_state,
-			   struct v4l2_subdev_format *fmt)
-{
-	struct iss_csi2_device *csi2 = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *format;
-
-	format = __csi2_get_format(csi2, sd_state, fmt->pad, fmt->which);
-	if (!format)
-		return -EINVAL;
-
-	fmt->format = *format;
-	return 0;
-}
-
-/*
- * csi2_set_format - Handle set format by pads subdev method
- * @sd : pointer to v4l2 subdev structure
- * @sd_state: V4L2 subdev state
- * @fmt: pointer to v4l2 subdev format structure
- * return -EINVAL or zero on success
- */
-static int csi2_set_format(struct v4l2_subdev *sd,
-			   struct v4l2_subdev_state *sd_state,
-			   struct v4l2_subdev_format *fmt)
-{
-	struct iss_csi2_device *csi2 = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *format;
-
-	format = __csi2_get_format(csi2, sd_state, fmt->pad, fmt->which);
-	if (!format)
-		return -EINVAL;
-
-	csi2_try_format(csi2, sd_state, fmt->pad, &fmt->format, fmt->which);
-	*format = fmt->format;
-
-	/* Propagate the format from sink to source */
-	if (fmt->pad == CSI2_PAD_SINK) {
-		format = __csi2_get_format(csi2, sd_state, CSI2_PAD_SOURCE,
-					   fmt->which);
-		*format = fmt->format;
-		csi2_try_format(csi2, sd_state, CSI2_PAD_SOURCE, format,
-				fmt->which);
-	}
-
-	return 0;
-}
-
-static int csi2_link_validate(struct v4l2_subdev *sd, struct media_link *link,
-			      struct v4l2_subdev_format *source_fmt,
-			      struct v4l2_subdev_format *sink_fmt)
-{
-	struct iss_csi2_device *csi2 = v4l2_get_subdevdata(sd);
-	struct iss_pipeline *pipe = to_iss_pipeline(&csi2->subdev.entity);
-	int rval;
-
-	pipe->external = media_entity_to_v4l2_subdev(link->source->entity);
-	rval = omap4iss_get_external_info(pipe, link);
-	if (rval < 0)
-		return rval;
-
-	return v4l2_subdev_link_validate_default(sd, link, source_fmt,
-						 sink_fmt);
-}
-
-/*
- * csi2_init_formats - Initialize formats on all pads
- * @sd: ISS CSI2 V4L2 subdevice
- * @fh: V4L2 subdev file handle
- *
- * Initialize all pad formats with default values. If fh is not NULL, try
- * formats are initialized on the file handle. Otherwise active formats are
- * initialized on the device.
- */
-static int csi2_init_formats(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh)
-{
-	struct v4l2_subdev_format format;
-
-	memset(&format, 0, sizeof(format));
-	format.pad = CSI2_PAD_SINK;
-	format.which = fh ? V4L2_SUBDEV_FORMAT_TRY : V4L2_SUBDEV_FORMAT_ACTIVE;
-	format.format.code = MEDIA_BUS_FMT_SGRBG10_1X10;
-	format.format.width = 4096;
-	format.format.height = 4096;
-	csi2_set_format(sd, fh ? fh->state : NULL, &format);
-
-	return 0;
-}
-
-/*
- * csi2_set_stream - Enable/Disable streaming on the CSI2 module
- * @sd: ISS CSI2 V4L2 subdevice
- * @enable: ISS pipeline stream state
- *
- * Return 0 on success or a negative error code otherwise.
- */
-static int csi2_set_stream(struct v4l2_subdev *sd, int enable)
-{
-	struct iss_csi2_device *csi2 = v4l2_get_subdevdata(sd);
-	struct iss_device *iss = csi2->iss;
-	struct iss_video *video_out = &csi2->video_out;
-	int ret = 0;
-
-	if (csi2->state == ISS_PIPELINE_STREAM_STOPPED) {
-		if (enable == ISS_PIPELINE_STREAM_STOPPED)
-			return 0;
-
-		omap4iss_subclk_enable(iss, csi2->subclk);
-	}
-
-	switch (enable) {
-	case ISS_PIPELINE_STREAM_CONTINUOUS: {
-		ret = omap4iss_csiphy_config(iss, sd);
-		if (ret < 0)
-			return ret;
-
-		if (omap4iss_csiphy_acquire(csi2->phy) < 0)
-			return -ENODEV;
-		csi2_configure(csi2);
-		csi2_print_status(csi2);
-
-		/*
-		 * When outputting to memory with no buffer available, let the
-		 * buffer queue handler start the hardware. A DMA queue flag
-		 * ISS_VIDEO_DMAQUEUE_QUEUED will be set as soon as there is
-		 * a buffer available.
-		 */
-		if (csi2->output & CSI2_OUTPUT_MEMORY &&
-		    !(video_out->dmaqueue_flags & ISS_VIDEO_DMAQUEUE_QUEUED))
-			break;
-		/* Enable context 0 and IRQs */
-		atomic_set(&csi2->stopping, 0);
-		csi2_ctx_enable(csi2, 0, 1);
-		csi2_if_enable(csi2, 1);
-		iss_video_dmaqueue_flags_clr(video_out);
-		break;
-	}
-	case ISS_PIPELINE_STREAM_STOPPED:
-		if (csi2->state == ISS_PIPELINE_STREAM_STOPPED)
-			return 0;
-		if (omap4iss_module_sync_idle(&sd->entity, &csi2->wait,
-					      &csi2->stopping))
-			ret = -ETIMEDOUT;
-		csi2_ctx_enable(csi2, 0, 0);
-		csi2_if_enable(csi2, 0);
-		csi2_irq_ctx_set(csi2, 0);
-		omap4iss_csiphy_release(csi2->phy);
-		omap4iss_subclk_disable(iss, csi2->subclk);
-		iss_video_dmaqueue_flags_clr(video_out);
-		break;
-	}
-
-	csi2->state = enable;
-	return ret;
-}
-
-/* subdev video operations */
-static const struct v4l2_subdev_video_ops csi2_video_ops = {
-	.s_stream = csi2_set_stream,
-};
-
-/* subdev pad operations */
-static const struct v4l2_subdev_pad_ops csi2_pad_ops = {
-	.enum_mbus_code = csi2_enum_mbus_code,
-	.enum_frame_size = csi2_enum_frame_size,
-	.get_fmt = csi2_get_format,
-	.set_fmt = csi2_set_format,
-	.link_validate = csi2_link_validate,
-};
-
-/* subdev operations */
-static const struct v4l2_subdev_ops csi2_ops = {
-	.video = &csi2_video_ops,
-	.pad = &csi2_pad_ops,
-};
-
-/* subdev internal operations */
-static const struct v4l2_subdev_internal_ops csi2_internal_ops = {
-	.open = csi2_init_formats,
-};
-
-/* -----------------------------------------------------------------------------
- * Media entity operations
- */
-
-/*
- * csi2_link_setup - Setup CSI2 connections.
- * @entity : Pointer to media entity structure
- * @local  : Pointer to local pad array
- * @remote : Pointer to remote pad array
- * @flags  : Link flags
- * return -EINVAL or zero on success
- */
-static int csi2_link_setup(struct media_entity *entity,
-			   const struct media_pad *local,
-			   const struct media_pad *remote, u32 flags)
-{
-	struct v4l2_subdev *sd = media_entity_to_v4l2_subdev(entity);
-	struct iss_csi2_device *csi2 = v4l2_get_subdevdata(sd);
-	struct iss_csi2_ctrl_cfg *ctrl = &csi2->ctrl;
-	unsigned int index = local->index;
-
-	/* FIXME: this is actually a hack! */
-	if (is_media_entity_v4l2_subdev(remote->entity))
-		index |= 2 << 16;
-
-	/*
-	 * The ISS core doesn't support pipelines with multiple video outputs.
-	 * Revisit this when it will be implemented, and return -EBUSY for now.
-	 */
-
-	switch (index) {
-	case CSI2_PAD_SOURCE:
-		if (flags & MEDIA_LNK_FL_ENABLED) {
-			if (csi2->output & ~CSI2_OUTPUT_MEMORY)
-				return -EBUSY;
-			csi2->output |= CSI2_OUTPUT_MEMORY;
-		} else {
-			csi2->output &= ~CSI2_OUTPUT_MEMORY;
-		}
-		break;
-
-	case CSI2_PAD_SOURCE | 2 << 16:
-		if (flags & MEDIA_LNK_FL_ENABLED) {
-			if (csi2->output & ~CSI2_OUTPUT_IPIPEIF)
-				return -EBUSY;
-			csi2->output |= CSI2_OUTPUT_IPIPEIF;
-		} else {
-			csi2->output &= ~CSI2_OUTPUT_IPIPEIF;
-		}
-		break;
-
-	default:
-		/* Link from camera to CSI2 is fixed... */
-		return -EINVAL;
-	}
-
-	ctrl->vp_only_enable = csi2->output & CSI2_OUTPUT_MEMORY ? false : true;
-	ctrl->vp_clk_enable = !!(csi2->output & CSI2_OUTPUT_IPIPEIF);
-
-	return 0;
-}
-
-/* media operations */
-static const struct media_entity_operations csi2_media_ops = {
-	.link_setup = csi2_link_setup,
-	.link_validate = v4l2_subdev_link_validate,
-};
-
-void omap4iss_csi2_unregister_entities(struct iss_csi2_device *csi2)
-{
-	v4l2_device_unregister_subdev(&csi2->subdev);
-	omap4iss_video_unregister(&csi2->video_out);
-}
-
-int omap4iss_csi2_register_entities(struct iss_csi2_device *csi2,
-				    struct v4l2_device *vdev)
-{
-	int ret;
-
-	/* Register the subdev and video nodes. */
-	ret = v4l2_device_register_subdev(vdev, &csi2->subdev);
-	if (ret < 0)
-		goto error;
-
-	ret = omap4iss_video_register(&csi2->video_out, vdev);
-	if (ret < 0)
-		goto error;
-
-	return 0;
-
-error:
-	omap4iss_csi2_unregister_entities(csi2);
-	return ret;
-}
-
-/* -----------------------------------------------------------------------------
- * ISS CSI2 initialisation and cleanup
- */
-
-/*
- * csi2_init_entities - Initialize subdev and media entity.
- * @csi2: Pointer to csi2 structure.
- * return -ENOMEM or zero on success
- */
-static int csi2_init_entities(struct iss_csi2_device *csi2, const char *subname)
-{
-	struct v4l2_subdev *sd = &csi2->subdev;
-	struct media_pad *pads = csi2->pads;
-	struct media_entity *me = &sd->entity;
-	int ret;
-	char name[32];
-
-	v4l2_subdev_init(sd, &csi2_ops);
-	sd->internal_ops = &csi2_internal_ops;
-	snprintf(name, sizeof(name), "CSI2%s", subname);
-	snprintf(sd->name, sizeof(sd->name), "OMAP4 ISS %s", name);
-
-	sd->grp_id = BIT(16);	/* group ID for iss subdevs */
-	v4l2_set_subdevdata(sd, csi2);
-	sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE;
-
-	pads[CSI2_PAD_SOURCE].flags = MEDIA_PAD_FL_SOURCE;
-	pads[CSI2_PAD_SINK].flags = MEDIA_PAD_FL_SINK;
-
-	me->ops = &csi2_media_ops;
-	ret = media_entity_pads_init(me, CSI2_PADS_NUM, pads);
-	if (ret < 0)
-		return ret;
-
-	csi2_init_formats(sd, NULL);
-
-	/* Video device node */
-	csi2->video_out.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-	csi2->video_out.ops = &csi2_issvideo_ops;
-	csi2->video_out.bpl_alignment = 32;
-	csi2->video_out.bpl_zero_padding = 1;
-	csi2->video_out.bpl_max = 0x1ffe0;
-	csi2->video_out.iss = csi2->iss;
-	csi2->video_out.capture_mem = PAGE_ALIGN(4096 * 4096) * 3;
-
-	ret = omap4iss_video_init(&csi2->video_out, name);
-	if (ret < 0)
-		goto error_video;
-
-	return 0;
-
-error_video:
-	media_entity_cleanup(&csi2->subdev.entity);
-	return ret;
-}
-
-/*
- * omap4iss_csi2_init - Routine for module driver init
- */
-int omap4iss_csi2_init(struct iss_device *iss)
-{
-	struct iss_csi2_device *csi2a = &iss->csi2a;
-	struct iss_csi2_device *csi2b = &iss->csi2b;
-	int ret;
-
-	csi2a->iss = iss;
-	csi2a->available = 1;
-	csi2a->regs1 = OMAP4_ISS_MEM_CSI2_A_REGS1;
-	csi2a->phy = &iss->csiphy1;
-	csi2a->subclk = OMAP4_ISS_SUBCLK_CSI2_A;
-	csi2a->state = ISS_PIPELINE_STREAM_STOPPED;
-	init_waitqueue_head(&csi2a->wait);
-
-	ret = csi2_init_entities(csi2a, "a");
-	if (ret < 0)
-		return ret;
-
-	csi2b->iss = iss;
-	csi2b->available = 1;
-	csi2b->regs1 = OMAP4_ISS_MEM_CSI2_B_REGS1;
-	csi2b->phy = &iss->csiphy2;
-	csi2b->subclk = OMAP4_ISS_SUBCLK_CSI2_B;
-	csi2b->state = ISS_PIPELINE_STREAM_STOPPED;
-	init_waitqueue_head(&csi2b->wait);
-
-	ret = csi2_init_entities(csi2b, "b");
-	if (ret < 0)
-		return ret;
-
-	return 0;
-}
-
-/*
- * omap4iss_csi2_create_links() - CSI2 pads links creation
- * @iss: Pointer to ISS device
- *
- * return negative error code or zero on success
- */
-int omap4iss_csi2_create_links(struct iss_device *iss)
-{
-	struct iss_csi2_device *csi2a = &iss->csi2a;
-	struct iss_csi2_device *csi2b = &iss->csi2b;
-	int ret;
-
-	/* Connect the CSI2a subdev to the video node. */
-	ret = media_create_pad_link(&csi2a->subdev.entity, CSI2_PAD_SOURCE,
-				    &csi2a->video_out.video.entity, 0, 0);
-	if (ret < 0)
-		return ret;
-
-	/* Connect the CSI2b subdev to the video node. */
-	ret = media_create_pad_link(&csi2b->subdev.entity, CSI2_PAD_SOURCE,
-				    &csi2b->video_out.video.entity, 0, 0);
-	if (ret < 0)
-		return ret;
-
-	return 0;
-}
-
-/*
- * omap4iss_csi2_cleanup - Routine for module driver cleanup
- */
-void omap4iss_csi2_cleanup(struct iss_device *iss)
-{
-	struct iss_csi2_device *csi2a = &iss->csi2a;
-	struct iss_csi2_device *csi2b = &iss->csi2b;
-
-	omap4iss_video_cleanup(&csi2a->video_out);
-	media_entity_cleanup(&csi2a->subdev.entity);
-
-	omap4iss_video_cleanup(&csi2b->video_out);
-	media_entity_cleanup(&csi2b->subdev.entity);
-}
diff --git a/drivers/staging/media/omap4iss/iss_csi2.h b/drivers/staging/media/omap4iss/iss_csi2.h
deleted file mode 100644
index 3f7fd9cff41d..000000000000
--- a/drivers/staging/media/omap4iss/iss_csi2.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * TI OMAP4 ISS V4L2 Driver - CSI2 module
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#ifndef OMAP4_ISS_CSI2_H
-#define OMAP4_ISS_CSI2_H
-
-#include <linux/types.h>
-#include <linux/videodev2.h>
-
-#include "iss_video.h"
-
-struct iss_csiphy;
-
-/* This is not an exhaustive list */
-enum iss_csi2_pix_formats {
-	CSI2_PIX_FMT_OTHERS = 0,
-	CSI2_PIX_FMT_YUV422_8BIT = 0x1e,
-	CSI2_PIX_FMT_YUV422_8BIT_VP = 0x9e,
-	CSI2_PIX_FMT_YUV422_8BIT_VP16 = 0xde,
-	CSI2_PIX_FMT_RAW10_EXP16 = 0xab,
-	CSI2_PIX_FMT_RAW10_EXP16_VP = 0x12f,
-	CSI2_PIX_FMT_RAW8 = 0x2a,
-	CSI2_PIX_FMT_RAW8_DPCM10_EXP16 = 0x2aa,
-	CSI2_PIX_FMT_RAW8_DPCM10_VP = 0x32a,
-	CSI2_PIX_FMT_RAW8_VP = 0x12a,
-	CSI2_USERDEF_8BIT_DATA1_DPCM10_VP = 0x340,
-	CSI2_USERDEF_8BIT_DATA1_DPCM10 = 0x2c0,
-	CSI2_USERDEF_8BIT_DATA1 = 0x40,
-};
-
-enum iss_csi2_irqevents {
-	OCP_ERR_IRQ = 0x4000,
-	SHORT_PACKET_IRQ = 0x2000,
-	ECC_CORRECTION_IRQ = 0x1000,
-	ECC_NO_CORRECTION_IRQ = 0x800,
-	COMPLEXIO2_ERR_IRQ = 0x400,
-	COMPLEXIO1_ERR_IRQ = 0x200,
-	FIFO_OVF_IRQ = 0x100,
-	CONTEXT7 = 0x80,
-	CONTEXT6 = 0x40,
-	CONTEXT5 = 0x20,
-	CONTEXT4 = 0x10,
-	CONTEXT3 = 0x8,
-	CONTEXT2 = 0x4,
-	CONTEXT1 = 0x2,
-	CONTEXT0 = 0x1,
-};
-
-enum iss_csi2_ctx_irqevents {
-	CTX_ECC_CORRECTION = 0x100,
-	CTX_LINE_NUMBER = 0x80,
-	CTX_FRAME_NUMBER = 0x40,
-	CTX_CS = 0x20,
-	CTX_LE = 0x8,
-	CTX_LS = 0x4,
-	CTX_FE = 0x2,
-	CTX_FS = 0x1,
-};
-
-enum iss_csi2_frame_mode {
-	ISS_CSI2_FRAME_IMMEDIATE,
-	ISS_CSI2_FRAME_AFTERFEC,
-};
-
-#define ISS_CSI2_MAX_CTX_NUM	7
-
-struct iss_csi2_ctx_cfg {
-	u8 ctxnum;		/* context number 0 - 7 */
-	u8 dpcm_decompress;
-
-	/* Fields in CSI2_CTx_CTRL2 - locked by CSI2_CTx_CTRL1.CTX_EN */
-	u8 virtual_id;
-	u16 format_id;		/* as in CSI2_CTx_CTRL2[9:0] */
-	u8 dpcm_predictor;	/* 1: simple, 0: advanced */
-	u16 frame;
-
-	/* Fields in CSI2_CTx_CTRL1/3 - Shadowed */
-	u16 alpha;
-	u16 data_offset;
-	u32 ping_addr;
-	u32 pong_addr;
-	u8 eof_enabled;
-	u8 eol_enabled;
-	u8 checksum_enabled;
-	u8 enabled;
-};
-
-struct iss_csi2_timing_cfg {
-	u8 ionum;			/* IO1 or IO2 as in CSI2_TIMING */
-	unsigned force_rx_mode:1;
-	unsigned stop_state_16x:1;
-	unsigned stop_state_4x:1;
-	u16 stop_state_counter;
-};
-
-struct iss_csi2_ctrl_cfg {
-	bool vp_clk_enable;
-	bool vp_only_enable;
-	u8 vp_out_ctrl;
-	enum iss_csi2_frame_mode frame_mode;
-	bool ecc_enable;
-	bool if_enable;
-};
-
-#define CSI2_PAD_SINK		0
-#define CSI2_PAD_SOURCE		1
-#define CSI2_PADS_NUM		2
-
-#define CSI2_OUTPUT_IPIPEIF	BIT(0)
-#define CSI2_OUTPUT_MEMORY	BIT(1)
-
-struct iss_csi2_device {
-	struct v4l2_subdev subdev;
-	struct media_pad pads[CSI2_PADS_NUM];
-	struct v4l2_mbus_framefmt formats[CSI2_PADS_NUM];
-
-	struct iss_video video_out;
-	struct iss_device *iss;
-
-	u8 available;		/* Is the IP present on the silicon? */
-
-	/* memory resources, as defined in enum iss_mem_resources */
-	unsigned int regs1;
-	unsigned int regs2;
-	/* ISP subclock, as defined in enum iss_isp_subclk_resource */
-	unsigned int subclk;
-
-	u32 output; /* output to IPIPEIF, memory or both? */
-	bool dpcm_decompress;
-	unsigned int frame_skip;
-
-	struct iss_csiphy *phy;
-	struct iss_csi2_ctx_cfg contexts[ISS_CSI2_MAX_CTX_NUM + 1];
-	struct iss_csi2_timing_cfg timing[2];
-	struct iss_csi2_ctrl_cfg ctrl;
-	enum iss_pipeline_stream_state state;
-	wait_queue_head_t wait;
-	atomic_t stopping;
-};
-
-void omap4iss_csi2_isr(struct iss_csi2_device *csi2);
-int omap4iss_csi2_reset(struct iss_csi2_device *csi2);
-int omap4iss_csi2_init(struct iss_device *iss);
-int omap4iss_csi2_create_links(struct iss_device *iss);
-void omap4iss_csi2_cleanup(struct iss_device *iss);
-void omap4iss_csi2_unregister_entities(struct iss_csi2_device *csi2);
-int omap4iss_csi2_register_entities(struct iss_csi2_device *csi2,
-				    struct v4l2_device *vdev);
-#endif	/* OMAP4_ISS_CSI2_H */
diff --git a/drivers/staging/media/omap4iss/iss_csiphy.c b/drivers/staging/media/omap4iss/iss_csiphy.c
deleted file mode 100644
index 96f2ce045138..000000000000
--- a/drivers/staging/media/omap4iss/iss_csiphy.c
+++ /dev/null
@@ -1,277 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * TI OMAP4 ISS V4L2 Driver - CSI PHY module
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/regmap.h>
-
-#include "../../../../arch/arm/mach-omap2/control.h"
-
-#include "iss.h"
-#include "iss_regs.h"
-#include "iss_csiphy.h"
-
-/*
- * csiphy_lanes_config - Configuration of CSIPHY lanes.
- *
- * Updates HW configuration.
- * Called with phy->mutex taken.
- */
-static void csiphy_lanes_config(struct iss_csiphy *phy)
-{
-	unsigned int i;
-	u32 reg;
-
-	reg = iss_reg_read(phy->iss, phy->cfg_regs, CSI2_COMPLEXIO_CFG);
-
-	for (i = 0; i < phy->max_data_lanes; i++) {
-		reg &= ~(CSI2_COMPLEXIO_CFG_DATA_POL(i + 1) |
-			 CSI2_COMPLEXIO_CFG_DATA_POSITION_MASK(i + 1));
-		reg |= (phy->lanes.data[i].pol ?
-			CSI2_COMPLEXIO_CFG_DATA_POL(i + 1) : 0);
-		reg |= (phy->lanes.data[i].pos <<
-			CSI2_COMPLEXIO_CFG_DATA_POSITION_SHIFT(i + 1));
-	}
-
-	reg &= ~(CSI2_COMPLEXIO_CFG_CLOCK_POL |
-		 CSI2_COMPLEXIO_CFG_CLOCK_POSITION_MASK);
-	reg |= phy->lanes.clk.pol ? CSI2_COMPLEXIO_CFG_CLOCK_POL : 0;
-	reg |= phy->lanes.clk.pos << CSI2_COMPLEXIO_CFG_CLOCK_POSITION_SHIFT;
-
-	iss_reg_write(phy->iss, phy->cfg_regs, CSI2_COMPLEXIO_CFG, reg);
-}
-
-/*
- * csiphy_set_power
- * @power: Power state to be set.
- *
- * Returns 0 if successful, or -EBUSY if the retry count is exceeded.
- */
-static int csiphy_set_power(struct iss_csiphy *phy, u32 power)
-{
-	u32 reg;
-	u8 retry_count;
-
-	iss_reg_update(phy->iss, phy->cfg_regs, CSI2_COMPLEXIO_CFG,
-		       CSI2_COMPLEXIO_CFG_PWD_CMD_MASK,
-		       power | CSI2_COMPLEXIO_CFG_PWR_AUTO);
-
-	retry_count = 0;
-	do {
-		udelay(1);
-		reg = iss_reg_read(phy->iss, phy->cfg_regs, CSI2_COMPLEXIO_CFG)
-		    & CSI2_COMPLEXIO_CFG_PWD_STATUS_MASK;
-
-		if (reg != power >> 2)
-			retry_count++;
-
-	} while ((reg != power >> 2) && (retry_count < 250));
-
-	if (retry_count == 250) {
-		dev_err(phy->iss->dev, "CSI2 CIO set power failed!\n");
-		return -EBUSY;
-	}
-
-	return 0;
-}
-
-/*
- * csiphy_dphy_config - Configure CSI2 D-PHY parameters.
- *
- * Called with phy->mutex taken.
- */
-static void csiphy_dphy_config(struct iss_csiphy *phy)
-{
-	u32 reg;
-
-	/* Set up REGISTER0 */
-	reg = phy->dphy.ths_term << REGISTER0_THS_TERM_SHIFT;
-	reg |= phy->dphy.ths_settle << REGISTER0_THS_SETTLE_SHIFT;
-
-	iss_reg_write(phy->iss, phy->phy_regs, REGISTER0, reg);
-
-	/* Set up REGISTER1 */
-	reg = phy->dphy.tclk_term << REGISTER1_TCLK_TERM_SHIFT;
-	reg |= phy->dphy.tclk_miss << REGISTER1_CTRLCLK_DIV_FACTOR_SHIFT;
-	reg |= phy->dphy.tclk_settle << REGISTER1_TCLK_SETTLE_SHIFT;
-	reg |= 0xb8 << REGISTER1_DPHY_HS_SYNC_PATTERN_SHIFT;
-
-	iss_reg_write(phy->iss, phy->phy_regs, REGISTER1, reg);
-}
-
-/*
- * TCLK values are OK at their reset values
- */
-#define TCLK_TERM	0
-#define TCLK_MISS	1
-#define TCLK_SETTLE	14
-
-int omap4iss_csiphy_config(struct iss_device *iss,
-			   struct v4l2_subdev *csi2_subdev)
-{
-	struct iss_csi2_device *csi2 = v4l2_get_subdevdata(csi2_subdev);
-	struct iss_pipeline *pipe = to_iss_pipeline(&csi2_subdev->entity);
-	struct iss_v4l2_subdevs_group *subdevs = pipe->external->host_priv;
-	struct iss_csiphy_dphy_cfg csi2phy;
-	int csi2_ddrclk_khz;
-	struct iss_csiphy_lanes_cfg *lanes;
-	unsigned int used_lanes = 0;
-	u32 cam_rx_ctrl;
-	unsigned int i;
-
-	lanes = &subdevs->bus.csi2.lanecfg;
-
-	/*
-	 * SCM.CONTROL_CAMERA_RX
-	 * - bit [31] : CSIPHY2 lane 2 enable (4460+ only)
-	 * - bit [30:29] : CSIPHY2 per-lane enable (1 to 0)
-	 * - bit [28:24] : CSIPHY1 per-lane enable (4 to 0)
-	 * - bit [21] : CSIPHY2 CTRLCLK enable
-	 * - bit [20:19] : CSIPHY2 config: 00 d-phy, 01/10 ccp2
-	 * - bit [18] : CSIPHY1 CTRLCLK enable
-	 * - bit [17:16] : CSIPHY1 config: 00 d-phy, 01/10 ccp2
-	 */
-	/*
-	 * TODO: When implementing DT support specify the CONTROL_CAMERA_RX
-	 * register offset in the syscon property instead of hardcoding it.
-	 */
-	regmap_read(iss->syscon, 0x68, &cam_rx_ctrl);
-
-	if (subdevs->interface == ISS_INTERFACE_CSI2A_PHY1) {
-		cam_rx_ctrl &= ~(OMAP4_CAMERARX_CSI21_LANEENABLE_MASK |
-				OMAP4_CAMERARX_CSI21_CAMMODE_MASK);
-		/* NOTE: Leave CSIPHY1 config to 0x0: D-PHY mode */
-		/* Enable all lanes for now */
-		cam_rx_ctrl |=
-			0x1f << OMAP4_CAMERARX_CSI21_LANEENABLE_SHIFT;
-		/* Enable CTRLCLK */
-		cam_rx_ctrl |= OMAP4_CAMERARX_CSI21_CTRLCLKEN_MASK;
-	}
-
-	if (subdevs->interface == ISS_INTERFACE_CSI2B_PHY2) {
-		cam_rx_ctrl &= ~(OMAP4_CAMERARX_CSI22_LANEENABLE_MASK |
-				OMAP4_CAMERARX_CSI22_CAMMODE_MASK);
-		/* NOTE: Leave CSIPHY2 config to 0x0: D-PHY mode */
-		/* Enable all lanes for now */
-		cam_rx_ctrl |=
-			0x3 << OMAP4_CAMERARX_CSI22_LANEENABLE_SHIFT;
-		/* Enable CTRLCLK */
-		cam_rx_ctrl |= OMAP4_CAMERARX_CSI22_CTRLCLKEN_MASK;
-	}
-
-	regmap_write(iss->syscon, 0x68, cam_rx_ctrl);
-
-	/* Reset used lane count */
-	csi2->phy->used_data_lanes = 0;
-
-	/* Clock and data lanes verification */
-	for (i = 0; i < csi2->phy->max_data_lanes; i++) {
-		if (lanes->data[i].pos == 0)
-			continue;
-
-		if (lanes->data[i].pol > 1 ||
-		    lanes->data[i].pos > (csi2->phy->max_data_lanes + 1))
-			return -EINVAL;
-
-		if (used_lanes & (1 << lanes->data[i].pos))
-			return -EINVAL;
-
-		used_lanes |= 1 << lanes->data[i].pos;
-		csi2->phy->used_data_lanes++;
-	}
-
-	if (lanes->clk.pol > 1 ||
-	    lanes->clk.pos > (csi2->phy->max_data_lanes + 1))
-		return -EINVAL;
-
-	if (lanes->clk.pos == 0 || used_lanes & (1 << lanes->clk.pos))
-		return -EINVAL;
-
-	csi2_ddrclk_khz = pipe->external_rate / 1000
-		/ (2 * csi2->phy->used_data_lanes)
-		* pipe->external_bpp;
-
-	/*
-	 * THS_TERM: Programmed value = ceil(12.5 ns/DDRClk period) - 1.
-	 * THS_SETTLE: Programmed value = ceil(90 ns/DDRClk period) + 3.
-	 */
-	csi2phy.ths_term = DIV_ROUND_UP(25 * csi2_ddrclk_khz, 2000000) - 1;
-	csi2phy.ths_settle = DIV_ROUND_UP(90 * csi2_ddrclk_khz, 1000000) + 3;
-	csi2phy.tclk_term = TCLK_TERM;
-	csi2phy.tclk_miss = TCLK_MISS;
-	csi2phy.tclk_settle = TCLK_SETTLE;
-
-	mutex_lock(&csi2->phy->mutex);
-	csi2->phy->dphy = csi2phy;
-	csi2->phy->lanes = *lanes;
-	mutex_unlock(&csi2->phy->mutex);
-
-	return 0;
-}
-
-int omap4iss_csiphy_acquire(struct iss_csiphy *phy)
-{
-	int rval;
-
-	mutex_lock(&phy->mutex);
-
-	rval = omap4iss_csi2_reset(phy->csi2);
-	if (rval)
-		goto done;
-
-	csiphy_dphy_config(phy);
-	csiphy_lanes_config(phy);
-
-	rval = csiphy_set_power(phy, CSI2_COMPLEXIO_CFG_PWD_CMD_ON);
-	if (rval)
-		goto done;
-
-	phy->phy_in_use = 1;
-
-done:
-	mutex_unlock(&phy->mutex);
-	return rval;
-}
-
-void omap4iss_csiphy_release(struct iss_csiphy *phy)
-{
-	mutex_lock(&phy->mutex);
-	if (phy->phy_in_use) {
-		csiphy_set_power(phy, CSI2_COMPLEXIO_CFG_PWD_CMD_OFF);
-		phy->phy_in_use = 0;
-	}
-	mutex_unlock(&phy->mutex);
-}
-
-/*
- * omap4iss_csiphy_init - Initialize the CSI PHY frontends
- */
-int omap4iss_csiphy_init(struct iss_device *iss)
-{
-	struct iss_csiphy *phy1 = &iss->csiphy1;
-	struct iss_csiphy *phy2 = &iss->csiphy2;
-
-	phy1->iss = iss;
-	phy1->csi2 = &iss->csi2a;
-	phy1->max_data_lanes = ISS_CSIPHY1_NUM_DATA_LANES;
-	phy1->used_data_lanes = 0;
-	phy1->cfg_regs = OMAP4_ISS_MEM_CSI2_A_REGS1;
-	phy1->phy_regs = OMAP4_ISS_MEM_CAMERARX_CORE1;
-	mutex_init(&phy1->mutex);
-
-	phy2->iss = iss;
-	phy2->csi2 = &iss->csi2b;
-	phy2->max_data_lanes = ISS_CSIPHY2_NUM_DATA_LANES;
-	phy2->used_data_lanes = 0;
-	phy2->cfg_regs = OMAP4_ISS_MEM_CSI2_B_REGS1;
-	phy2->phy_regs = OMAP4_ISS_MEM_CAMERARX_CORE2;
-	mutex_init(&phy2->mutex);
-
-	return 0;
-}
diff --git a/drivers/staging/media/omap4iss/iss_csiphy.h b/drivers/staging/media/omap4iss/iss_csiphy.h
deleted file mode 100644
index 44408e4fcf3b..000000000000
--- a/drivers/staging/media/omap4iss/iss_csiphy.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * TI OMAP4 ISS V4L2 Driver - CSI PHY module
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#ifndef OMAP4_ISS_CSI_PHY_H
-#define OMAP4_ISS_CSI_PHY_H
-
-#include <linux/platform_data/media/omap4iss.h>
-
-struct iss_csi2_device;
-
-struct iss_csiphy_dphy_cfg {
-	u8 ths_term;
-	u8 ths_settle;
-	u8 tclk_term;
-	unsigned tclk_miss:1;
-	u8 tclk_settle;
-};
-
-struct iss_csiphy {
-	struct iss_device *iss;
-	struct mutex mutex;	/* serialize csiphy configuration */
-	u8 phy_in_use;
-	struct iss_csi2_device *csi2;
-
-	/* memory resources, as defined in enum iss_mem_resources */
-	unsigned int cfg_regs;
-	unsigned int phy_regs;
-
-	u8 max_data_lanes;	/* number of CSI2 Data Lanes supported */
-	u8 used_data_lanes;	/* number of CSI2 Data Lanes used */
-	struct iss_csiphy_lanes_cfg lanes;
-	struct iss_csiphy_dphy_cfg dphy;
-};
-
-int omap4iss_csiphy_config(struct iss_device *iss,
-			   struct v4l2_subdev *csi2_subdev);
-int omap4iss_csiphy_acquire(struct iss_csiphy *phy);
-void omap4iss_csiphy_release(struct iss_csiphy *phy);
-int omap4iss_csiphy_init(struct iss_device *iss);
-
-#endif	/* OMAP4_ISS_CSI_PHY_H */
diff --git a/drivers/staging/media/omap4iss/iss_ipipe.c b/drivers/staging/media/omap4iss/iss_ipipe.c
deleted file mode 100644
index 4a4eae290d65..000000000000
--- a/drivers/staging/media/omap4iss/iss_ipipe.c
+++ /dev/null
@@ -1,579 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * TI OMAP4 ISS V4L2 Driver - ISP IPIPE module
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/dma-mapping.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-
-#include "iss.h"
-#include "iss_regs.h"
-#include "iss_ipipe.h"
-
-static struct v4l2_mbus_framefmt *
-__ipipe_get_format(struct iss_ipipe_device *ipipe,
-		   struct v4l2_subdev_state *sd_state,
-		   unsigned int pad,
-		   enum v4l2_subdev_format_whence which);
-
-static const unsigned int ipipe_fmts[] = {
-	MEDIA_BUS_FMT_SGRBG10_1X10,
-	MEDIA_BUS_FMT_SRGGB10_1X10,
-	MEDIA_BUS_FMT_SBGGR10_1X10,
-	MEDIA_BUS_FMT_SGBRG10_1X10,
-};
-
-/*
- * ipipe_print_status - Print current IPIPE Module register values.
- * @ipipe: Pointer to ISS ISP IPIPE device.
- *
- * Also prints other debug information stored in the IPIPE module.
- */
-#define IPIPE_PRINT_REGISTER(iss, name)\
-	dev_dbg(iss->dev, "###IPIPE " #name "=0x%08x\n", \
-		iss_reg_read(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_##name))
-
-static void ipipe_print_status(struct iss_ipipe_device *ipipe)
-{
-	struct iss_device *iss = to_iss_device(ipipe);
-
-	dev_dbg(iss->dev, "-------------IPIPE Register dump-------------\n");
-
-	IPIPE_PRINT_REGISTER(iss, SRC_EN);
-	IPIPE_PRINT_REGISTER(iss, SRC_MODE);
-	IPIPE_PRINT_REGISTER(iss, SRC_FMT);
-	IPIPE_PRINT_REGISTER(iss, SRC_COL);
-	IPIPE_PRINT_REGISTER(iss, SRC_VPS);
-	IPIPE_PRINT_REGISTER(iss, SRC_VSZ);
-	IPIPE_PRINT_REGISTER(iss, SRC_HPS);
-	IPIPE_PRINT_REGISTER(iss, SRC_HSZ);
-	IPIPE_PRINT_REGISTER(iss, GCK_MMR);
-	IPIPE_PRINT_REGISTER(iss, YUV_PHS);
-
-	dev_dbg(iss->dev, "-----------------------------------------------\n");
-}
-
-/*
- * ipipe_enable - Enable/Disable IPIPE.
- * @enable: enable flag
- *
- */
-static void ipipe_enable(struct iss_ipipe_device *ipipe, u8 enable)
-{
-	struct iss_device *iss = to_iss_device(ipipe);
-
-	iss_reg_update(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_SRC_EN,
-		       IPIPE_SRC_EN_EN, enable ? IPIPE_SRC_EN_EN : 0);
-}
-
-/* -----------------------------------------------------------------------------
- * Format- and pipeline-related configuration helpers
- */
-
-static void ipipe_configure(struct iss_ipipe_device *ipipe)
-{
-	struct iss_device *iss = to_iss_device(ipipe);
-	struct v4l2_mbus_framefmt *format;
-
-	/* IPIPE_PAD_SINK */
-	format = &ipipe->formats[IPIPE_PAD_SINK];
-
-	/* NOTE: Currently just supporting pipeline IN: RGB, OUT: YUV422 */
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_SRC_FMT,
-		      IPIPE_SRC_FMT_RAW2YUV);
-
-	/* Enable YUV444 -> YUV422 conversion */
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_YUV_PHS,
-		      IPIPE_YUV_PHS_LPF);
-
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_SRC_VPS, 0);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_SRC_HPS, 0);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_SRC_VSZ,
-		      (format->height - 2) & IPIPE_SRC_VSZ_MASK);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_SRC_HSZ,
-		      (format->width - 1) & IPIPE_SRC_HSZ_MASK);
-
-	/* Ignore ipipeif_wrt signal, and operate on-the-fly.  */
-	iss_reg_clr(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_SRC_MODE,
-		    IPIPE_SRC_MODE_WRT | IPIPE_SRC_MODE_OST);
-
-	/* HACK: Values tuned for Ducati SW (OV) */
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_SRC_COL,
-		      IPIPE_SRC_COL_EE_B | IPIPE_SRC_COL_EO_GB |
-		      IPIPE_SRC_COL_OE_GR | IPIPE_SRC_COL_OO_R);
-
-	/* IPIPE_PAD_SOURCE_VP */
-	format = &ipipe->formats[IPIPE_PAD_SOURCE_VP];
-	/* Do nothing? */
-}
-
-/* -----------------------------------------------------------------------------
- * V4L2 subdev operations
- */
-
-/*
- * ipipe_set_stream - Enable/Disable streaming on the IPIPE module
- * @sd: ISP IPIPE V4L2 subdevice
- * @enable: Enable/disable stream
- */
-static int ipipe_set_stream(struct v4l2_subdev *sd, int enable)
-{
-	struct iss_ipipe_device *ipipe = v4l2_get_subdevdata(sd);
-	struct iss_device *iss = to_iss_device(ipipe);
-	int ret = 0;
-
-	if (ipipe->state == ISS_PIPELINE_STREAM_STOPPED) {
-		if (enable == ISS_PIPELINE_STREAM_STOPPED)
-			return 0;
-
-		omap4iss_isp_subclk_enable(iss, OMAP4_ISS_ISP_SUBCLK_IPIPE);
-
-		/* Enable clk_arm_g0 */
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_GCK_MMR,
-			      IPIPE_GCK_MMR_REG);
-
-		/* Enable clk_pix_g[3:0] */
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_IPIPE, IPIPE_GCK_PIX,
-			      IPIPE_GCK_PIX_G3 | IPIPE_GCK_PIX_G2 |
-			      IPIPE_GCK_PIX_G1 | IPIPE_GCK_PIX_G0);
-	}
-
-	switch (enable) {
-	case ISS_PIPELINE_STREAM_CONTINUOUS:
-
-		ipipe_configure(ipipe);
-		ipipe_print_status(ipipe);
-
-		atomic_set(&ipipe->stopping, 0);
-		ipipe_enable(ipipe, 1);
-		break;
-
-	case ISS_PIPELINE_STREAM_STOPPED:
-		if (ipipe->state == ISS_PIPELINE_STREAM_STOPPED)
-			return 0;
-		if (omap4iss_module_sync_idle(&sd->entity, &ipipe->wait,
-					      &ipipe->stopping))
-			ret = -ETIMEDOUT;
-
-		ipipe_enable(ipipe, 0);
-		omap4iss_isp_subclk_disable(iss, OMAP4_ISS_ISP_SUBCLK_IPIPE);
-		break;
-	}
-
-	ipipe->state = enable;
-	return ret;
-}
-
-static struct v4l2_mbus_framefmt *
-__ipipe_get_format(struct iss_ipipe_device *ipipe,
-		   struct v4l2_subdev_state *sd_state,
-		   unsigned int pad,
-		   enum v4l2_subdev_format_whence which)
-{
-	if (which == V4L2_SUBDEV_FORMAT_TRY)
-		return v4l2_subdev_state_get_format(sd_state, pad);
-
-	return &ipipe->formats[pad];
-}
-
-/*
- * ipipe_try_format - Try video format on a pad
- * @ipipe: ISS IPIPE device
- * @sd_state: V4L2 subdev state
- * @pad: Pad number
- * @fmt: Format
- */
-static void
-ipipe_try_format(struct iss_ipipe_device *ipipe,
-		 struct v4l2_subdev_state *sd_state,
-		 unsigned int pad,
-		 struct v4l2_mbus_framefmt *fmt,
-		 enum v4l2_subdev_format_whence which)
-{
-	struct v4l2_mbus_framefmt *format;
-	unsigned int width = fmt->width;
-	unsigned int height = fmt->height;
-	unsigned int i;
-
-	switch (pad) {
-	case IPIPE_PAD_SINK:
-		for (i = 0; i < ARRAY_SIZE(ipipe_fmts); i++) {
-			if (fmt->code == ipipe_fmts[i])
-				break;
-		}
-
-		/* If not found, use SGRBG10 as default */
-		if (i >= ARRAY_SIZE(ipipe_fmts))
-			fmt->code = MEDIA_BUS_FMT_SGRBG10_1X10;
-
-		/* Clamp the input size. */
-		fmt->width = clamp_t(u32, width, 1, 8192);
-		fmt->height = clamp_t(u32, height, 1, 8192);
-		fmt->colorspace = V4L2_COLORSPACE_SRGB;
-		break;
-
-	case IPIPE_PAD_SOURCE_VP:
-		format = __ipipe_get_format(ipipe, sd_state, IPIPE_PAD_SINK,
-					    which);
-		memcpy(fmt, format, sizeof(*fmt));
-
-		fmt->code = MEDIA_BUS_FMT_UYVY8_1X16;
-		fmt->width = clamp_t(u32, width, 32, fmt->width);
-		fmt->height = clamp_t(u32, height, 32, fmt->height);
-		fmt->colorspace = V4L2_COLORSPACE_JPEG;
-		break;
-	}
-
-	fmt->field = V4L2_FIELD_NONE;
-}
-
-/*
- * ipipe_enum_mbus_code - Handle pixel format enumeration
- * @sd     : pointer to v4l2 subdev structure
- * @sd_state: V4L2 subdev state
- * @code   : pointer to v4l2_subdev_mbus_code_enum structure
- * return -EINVAL or zero on success
- */
-static int ipipe_enum_mbus_code(struct v4l2_subdev *sd,
-				struct v4l2_subdev_state *sd_state,
-				struct v4l2_subdev_mbus_code_enum *code)
-{
-	switch (code->pad) {
-	case IPIPE_PAD_SINK:
-		if (code->index >= ARRAY_SIZE(ipipe_fmts))
-			return -EINVAL;
-
-		code->code = ipipe_fmts[code->index];
-		break;
-
-	case IPIPE_PAD_SOURCE_VP:
-		/* FIXME: Forced format conversion inside IPIPE ? */
-		if (code->index != 0)
-			return -EINVAL;
-
-		code->code = MEDIA_BUS_FMT_UYVY8_1X16;
-		break;
-
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int ipipe_enum_frame_size(struct v4l2_subdev *sd,
-				 struct v4l2_subdev_state *sd_state,
-				 struct v4l2_subdev_frame_size_enum *fse)
-{
-	struct iss_ipipe_device *ipipe = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt format;
-
-	if (fse->index != 0)
-		return -EINVAL;
-
-	format.code = fse->code;
-	format.width = 1;
-	format.height = 1;
-	ipipe_try_format(ipipe, sd_state, fse->pad, &format, fse->which);
-	fse->min_width = format.width;
-	fse->min_height = format.height;
-
-	if (format.code != fse->code)
-		return -EINVAL;
-
-	format.code = fse->code;
-	format.width = -1;
-	format.height = -1;
-	ipipe_try_format(ipipe, sd_state, fse->pad, &format, fse->which);
-	fse->max_width = format.width;
-	fse->max_height = format.height;
-
-	return 0;
-}
-
-/*
- * ipipe_get_format - Retrieve the video format on a pad
- * @sd : ISP IPIPE V4L2 subdevice
- * @sd_state: V4L2 subdev state
- * @fmt: Format
- *
- * Return 0 on success or -EINVAL if the pad is invalid or doesn't correspond
- * to the format type.
- */
-static int ipipe_get_format(struct v4l2_subdev *sd,
-			    struct v4l2_subdev_state *sd_state,
-			    struct v4l2_subdev_format *fmt)
-{
-	struct iss_ipipe_device *ipipe = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *format;
-
-	format = __ipipe_get_format(ipipe, sd_state, fmt->pad, fmt->which);
-	if (!format)
-		return -EINVAL;
-
-	fmt->format = *format;
-	return 0;
-}
-
-/*
- * ipipe_set_format - Set the video format on a pad
- * @sd : ISP IPIPE V4L2 subdevice
- * @sd_state: V4L2 subdev state
- * @fmt: Format
- *
- * Return 0 on success or -EINVAL if the pad is invalid or doesn't correspond
- * to the format type.
- */
-static int ipipe_set_format(struct v4l2_subdev *sd,
-			    struct v4l2_subdev_state *sd_state,
-			    struct v4l2_subdev_format *fmt)
-{
-	struct iss_ipipe_device *ipipe = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *format;
-
-	format = __ipipe_get_format(ipipe, sd_state, fmt->pad, fmt->which);
-	if (!format)
-		return -EINVAL;
-
-	ipipe_try_format(ipipe, sd_state, fmt->pad, &fmt->format, fmt->which);
-	*format = fmt->format;
-
-	/* Propagate the format from sink to source */
-	if (fmt->pad == IPIPE_PAD_SINK) {
-		format = __ipipe_get_format(ipipe, sd_state,
-					    IPIPE_PAD_SOURCE_VP,
-					    fmt->which);
-		*format = fmt->format;
-		ipipe_try_format(ipipe, sd_state, IPIPE_PAD_SOURCE_VP, format,
-				 fmt->which);
-	}
-
-	return 0;
-}
-
-static int ipipe_link_validate(struct v4l2_subdev *sd, struct media_link *link,
-			       struct v4l2_subdev_format *source_fmt,
-			       struct v4l2_subdev_format *sink_fmt)
-{
-	/* Check if the two ends match */
-	if (source_fmt->format.width != sink_fmt->format.width ||
-	    source_fmt->format.height != sink_fmt->format.height)
-		return -EPIPE;
-
-	if (source_fmt->format.code != sink_fmt->format.code)
-		return -EPIPE;
-
-	return 0;
-}
-
-/*
- * ipipe_init_formats - Initialize formats on all pads
- * @sd: ISP IPIPE V4L2 subdevice
- * @fh: V4L2 subdev file handle
- *
- * Initialize all pad formats with default values. If fh is not NULL, try
- * formats are initialized on the file handle. Otherwise active formats are
- * initialized on the device.
- */
-static int ipipe_init_formats(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh)
-{
-	struct v4l2_subdev_format format;
-
-	memset(&format, 0, sizeof(format));
-	format.pad = IPIPE_PAD_SINK;
-	format.which = fh ? V4L2_SUBDEV_FORMAT_TRY : V4L2_SUBDEV_FORMAT_ACTIVE;
-	format.format.code = MEDIA_BUS_FMT_SGRBG10_1X10;
-	format.format.width = 4096;
-	format.format.height = 4096;
-	ipipe_set_format(sd, fh ? fh->state : NULL, &format);
-
-	return 0;
-}
-
-/* V4L2 subdev video operations */
-static const struct v4l2_subdev_video_ops ipipe_v4l2_video_ops = {
-	.s_stream = ipipe_set_stream,
-};
-
-/* V4L2 subdev pad operations */
-static const struct v4l2_subdev_pad_ops ipipe_v4l2_pad_ops = {
-	.enum_mbus_code = ipipe_enum_mbus_code,
-	.enum_frame_size = ipipe_enum_frame_size,
-	.get_fmt = ipipe_get_format,
-	.set_fmt = ipipe_set_format,
-	.link_validate = ipipe_link_validate,
-};
-
-/* V4L2 subdev operations */
-static const struct v4l2_subdev_ops ipipe_v4l2_ops = {
-	.video = &ipipe_v4l2_video_ops,
-	.pad = &ipipe_v4l2_pad_ops,
-};
-
-/* V4L2 subdev internal operations */
-static const struct v4l2_subdev_internal_ops ipipe_v4l2_internal_ops = {
-	.open = ipipe_init_formats,
-};
-
-/* -----------------------------------------------------------------------------
- * Media entity operations
- */
-
-/*
- * ipipe_link_setup - Setup IPIPE connections
- * @entity: IPIPE media entity
- * @local: Pad at the local end of the link
- * @remote: Pad at the remote end of the link
- * @flags: Link flags
- *
- * return -EINVAL or zero on success
- */
-static int ipipe_link_setup(struct media_entity *entity,
-			    const struct media_pad *local,
-			    const struct media_pad *remote, u32 flags)
-{
-	struct v4l2_subdev *sd = media_entity_to_v4l2_subdev(entity);
-	struct iss_ipipe_device *ipipe = v4l2_get_subdevdata(sd);
-	struct iss_device *iss = to_iss_device(ipipe);
-
-	if (!is_media_entity_v4l2_subdev(remote->entity))
-		return -EINVAL;
-
-	switch (local->index) {
-	case IPIPE_PAD_SINK:
-		/* Read from IPIPEIF. */
-		if (!(flags & MEDIA_LNK_FL_ENABLED)) {
-			ipipe->input = IPIPE_INPUT_NONE;
-			break;
-		}
-
-		if (ipipe->input != IPIPE_INPUT_NONE)
-			return -EBUSY;
-
-		if (remote->entity == &iss->ipipeif.subdev.entity)
-			ipipe->input = IPIPE_INPUT_IPIPEIF;
-
-		break;
-
-	case IPIPE_PAD_SOURCE_VP:
-		/* Send to RESIZER */
-		if (flags & MEDIA_LNK_FL_ENABLED) {
-			if (ipipe->output & ~IPIPE_OUTPUT_VP)
-				return -EBUSY;
-			ipipe->output |= IPIPE_OUTPUT_VP;
-		} else {
-			ipipe->output &= ~IPIPE_OUTPUT_VP;
-		}
-		break;
-
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-/* media operations */
-static const struct media_entity_operations ipipe_media_ops = {
-	.link_setup = ipipe_link_setup,
-	.link_validate = v4l2_subdev_link_validate,
-};
-
-/*
- * ipipe_init_entities - Initialize V4L2 subdev and media entity
- * @ipipe: ISS ISP IPIPE module
- *
- * Return 0 on success and a negative error code on failure.
- */
-static int ipipe_init_entities(struct iss_ipipe_device *ipipe)
-{
-	struct v4l2_subdev *sd = &ipipe->subdev;
-	struct media_pad *pads = ipipe->pads;
-	struct media_entity *me = &sd->entity;
-	int ret;
-
-	ipipe->input = IPIPE_INPUT_NONE;
-
-	v4l2_subdev_init(sd, &ipipe_v4l2_ops);
-	sd->internal_ops = &ipipe_v4l2_internal_ops;
-	strscpy(sd->name, "OMAP4 ISS ISP IPIPE", sizeof(sd->name));
-	sd->grp_id = BIT(16);	/* group ID for iss subdevs */
-	v4l2_set_subdevdata(sd, ipipe);
-	sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE;
-
-	pads[IPIPE_PAD_SINK].flags = MEDIA_PAD_FL_SINK;
-	pads[IPIPE_PAD_SOURCE_VP].flags = MEDIA_PAD_FL_SOURCE;
-
-	me->ops = &ipipe_media_ops;
-	ret = media_entity_pads_init(me, IPIPE_PADS_NUM, pads);
-	if (ret < 0)
-		return ret;
-
-	ipipe_init_formats(sd, NULL);
-
-	return 0;
-}
-
-void omap4iss_ipipe_unregister_entities(struct iss_ipipe_device *ipipe)
-{
-	v4l2_device_unregister_subdev(&ipipe->subdev);
-}
-
-int omap4iss_ipipe_register_entities(struct iss_ipipe_device *ipipe,
-				     struct v4l2_device *vdev)
-{
-	int ret;
-
-	/* Register the subdev and video node. */
-	ret = v4l2_device_register_subdev(vdev, &ipipe->subdev);
-	if (ret < 0)
-		goto error;
-
-	return 0;
-
-error:
-	omap4iss_ipipe_unregister_entities(ipipe);
-	return ret;
-}
-
-/* -----------------------------------------------------------------------------
- * ISP IPIPE initialisation and cleanup
- */
-
-/*
- * omap4iss_ipipe_init - IPIPE module initialization.
- * @iss: Device pointer specific to the OMAP4 ISS.
- *
- * TODO: Get the initialisation values from platform data.
- *
- * Return 0 on success or a negative error code otherwise.
- */
-int omap4iss_ipipe_init(struct iss_device *iss)
-{
-	struct iss_ipipe_device *ipipe = &iss->ipipe;
-
-	ipipe->state = ISS_PIPELINE_STREAM_STOPPED;
-	init_waitqueue_head(&ipipe->wait);
-
-	return ipipe_init_entities(ipipe);
-}
-
-/*
- * omap4iss_ipipe_cleanup - IPIPE module cleanup.
- * @iss: Device pointer specific to the OMAP4 ISS.
- */
-void omap4iss_ipipe_cleanup(struct iss_device *iss)
-{
-	struct iss_ipipe_device *ipipe = &iss->ipipe;
-
-	media_entity_cleanup(&ipipe->subdev.entity);
-}
diff --git a/drivers/staging/media/omap4iss/iss_ipipe.h b/drivers/staging/media/omap4iss/iss_ipipe.h
deleted file mode 100644
index 53b42aac1696..000000000000
--- a/drivers/staging/media/omap4iss/iss_ipipe.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * TI OMAP4 ISS V4L2 Driver - ISP IPIPE module
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#ifndef OMAP4_ISS_IPIPE_H
-#define OMAP4_ISS_IPIPE_H
-
-#include "iss_video.h"
-
-enum ipipe_input_entity {
-	IPIPE_INPUT_NONE,
-	IPIPE_INPUT_IPIPEIF,
-};
-
-#define IPIPE_OUTPUT_VP				BIT(0)
-
-/* Sink and source IPIPE pads */
-#define IPIPE_PAD_SINK				0
-#define IPIPE_PAD_SOURCE_VP			1
-#define IPIPE_PADS_NUM				2
-
-/*
- * struct iss_ipipe_device - Structure for the IPIPE module to store its own
- *			    information
- * @subdev: V4L2 subdevice
- * @pads: Sink and source media entity pads
- * @formats: Active video formats
- * @input: Active input
- * @output: Active outputs
- * @error: A hardware error occurred during capture
- * @state: Streaming state
- * @wait: Wait queue used to stop the module
- * @stopping: Stopping state
- */
-struct iss_ipipe_device {
-	struct v4l2_subdev subdev;
-	struct media_pad pads[IPIPE_PADS_NUM];
-	struct v4l2_mbus_framefmt formats[IPIPE_PADS_NUM];
-
-	enum ipipe_input_entity input;
-	unsigned int output;
-	unsigned int error;
-
-	enum iss_pipeline_stream_state state;
-	wait_queue_head_t wait;
-	atomic_t stopping;
-};
-
-struct iss_device;
-
-int omap4iss_ipipe_register_entities(struct iss_ipipe_device *ipipe,
-				     struct v4l2_device *vdev);
-void omap4iss_ipipe_unregister_entities(struct iss_ipipe_device *ipipe);
-
-int omap4iss_ipipe_init(struct iss_device *iss);
-void omap4iss_ipipe_cleanup(struct iss_device *iss);
-
-#endif	/* OMAP4_ISS_IPIPE_H */
diff --git a/drivers/staging/media/omap4iss/iss_ipipeif.c b/drivers/staging/media/omap4iss/iss_ipipeif.c
deleted file mode 100644
index 8fa99532d9d4..000000000000
--- a/drivers/staging/media/omap4iss/iss_ipipeif.c
+++ /dev/null
@@ -1,844 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * TI OMAP4 ISS V4L2 Driver - ISP IPIPEIF module
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/dma-mapping.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-
-#include "iss.h"
-#include "iss_regs.h"
-#include "iss_ipipeif.h"
-
-static const unsigned int ipipeif_fmts[] = {
-	MEDIA_BUS_FMT_SGRBG10_1X10,
-	MEDIA_BUS_FMT_SRGGB10_1X10,
-	MEDIA_BUS_FMT_SBGGR10_1X10,
-	MEDIA_BUS_FMT_SGBRG10_1X10,
-	MEDIA_BUS_FMT_UYVY8_1X16,
-	MEDIA_BUS_FMT_YUYV8_1X16,
-};
-
-/*
- * ipipeif_print_status - Print current IPIPEIF Module register values.
- * @ipipeif: Pointer to ISS ISP IPIPEIF device.
- *
- * Also prints other debug information stored in the IPIPEIF module.
- */
-#define IPIPEIF_PRINT_REGISTER(iss, name)\
-	dev_dbg(iss->dev, "###IPIPEIF " #name "=0x%08x\n", \
-		iss_reg_read(iss, OMAP4_ISS_MEM_ISP_IPIPEIF, IPIPEIF_##name))
-
-#define ISIF_PRINT_REGISTER(iss, name)\
-	dev_dbg(iss->dev, "###ISIF " #name "=0x%08x\n", \
-		iss_reg_read(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_##name))
-
-#define ISP5_PRINT_REGISTER(iss, name)\
-	dev_dbg(iss->dev, "###ISP5 " #name "=0x%08x\n", \
-		iss_reg_read(iss, OMAP4_ISS_MEM_ISP_SYS1, ISP5_##name))
-
-static void ipipeif_print_status(struct iss_ipipeif_device *ipipeif)
-{
-	struct iss_device *iss = to_iss_device(ipipeif);
-
-	dev_dbg(iss->dev, "-------------IPIPEIF Register dump-------------\n");
-
-	IPIPEIF_PRINT_REGISTER(iss, CFG1);
-	IPIPEIF_PRINT_REGISTER(iss, CFG2);
-
-	ISIF_PRINT_REGISTER(iss, SYNCEN);
-	ISIF_PRINT_REGISTER(iss, CADU);
-	ISIF_PRINT_REGISTER(iss, CADL);
-	ISIF_PRINT_REGISTER(iss, MODESET);
-	ISIF_PRINT_REGISTER(iss, CCOLP);
-	ISIF_PRINT_REGISTER(iss, SPH);
-	ISIF_PRINT_REGISTER(iss, LNH);
-	ISIF_PRINT_REGISTER(iss, LNV);
-	ISIF_PRINT_REGISTER(iss, VDINT(0));
-	ISIF_PRINT_REGISTER(iss, HSIZE);
-
-	ISP5_PRINT_REGISTER(iss, SYSCONFIG);
-	ISP5_PRINT_REGISTER(iss, CTRL);
-	ISP5_PRINT_REGISTER(iss, IRQSTATUS(0));
-	ISP5_PRINT_REGISTER(iss, IRQENABLE_SET(0));
-	ISP5_PRINT_REGISTER(iss, IRQENABLE_CLR(0));
-
-	dev_dbg(iss->dev, "-----------------------------------------------\n");
-}
-
-static void ipipeif_write_enable(struct iss_ipipeif_device *ipipeif, u8 enable)
-{
-	struct iss_device *iss = to_iss_device(ipipeif);
-
-	iss_reg_update(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_SYNCEN,
-		       ISIF_SYNCEN_DWEN, enable ? ISIF_SYNCEN_DWEN : 0);
-}
-
-/*
- * ipipeif_enable - Enable/Disable IPIPEIF.
- * @enable: enable flag
- *
- */
-static void ipipeif_enable(struct iss_ipipeif_device *ipipeif, u8 enable)
-{
-	struct iss_device *iss = to_iss_device(ipipeif);
-
-	iss_reg_update(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_SYNCEN,
-		       ISIF_SYNCEN_SYEN, enable ? ISIF_SYNCEN_SYEN : 0);
-}
-
-/* -----------------------------------------------------------------------------
- * Format- and pipeline-related configuration helpers
- */
-
-/*
- * ipipeif_set_outaddr - Set memory address to save output image
- * @ipipeif: Pointer to ISP IPIPEIF device.
- * @addr: 32-bit memory address aligned on 32 byte boundary.
- *
- * Sets the memory address where the output will be saved.
- */
-static void ipipeif_set_outaddr(struct iss_ipipeif_device *ipipeif, u32 addr)
-{
-	struct iss_device *iss = to_iss_device(ipipeif);
-
-	/* Save address split in Base Address H & L */
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_CADU,
-		      (addr >> (16 + 5)) & ISIF_CADU_MASK);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_CADL,
-		      (addr >> 5) & ISIF_CADL_MASK);
-}
-
-static void ipipeif_configure(struct iss_ipipeif_device *ipipeif)
-{
-	struct iss_device *iss = to_iss_device(ipipeif);
-	const struct iss_format_info *info;
-	struct v4l2_mbus_framefmt *format;
-	u32 isif_ccolp = 0;
-
-	omap4iss_configure_bridge(iss, ipipeif->input);
-
-	/* IPIPEIF_PAD_SINK */
-	format = &ipipeif->formats[IPIPEIF_PAD_SINK];
-
-	/* IPIPEIF with YUV422 input from ISIF */
-	iss_reg_clr(iss, OMAP4_ISS_MEM_ISP_IPIPEIF, IPIPEIF_CFG1,
-		    IPIPEIF_CFG1_INPSRC1_MASK | IPIPEIF_CFG1_INPSRC2_MASK);
-
-	/* Select ISIF/IPIPEIF input format */
-	switch (format->code) {
-	case MEDIA_BUS_FMT_UYVY8_1X16:
-	case MEDIA_BUS_FMT_YUYV8_1X16:
-		iss_reg_update(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_MODESET,
-			       ISIF_MODESET_CCDMD | ISIF_MODESET_INPMOD_MASK |
-			       ISIF_MODESET_CCDW_MASK,
-			       ISIF_MODESET_INPMOD_YCBCR16);
-
-		iss_reg_update(iss, OMAP4_ISS_MEM_ISP_IPIPEIF, IPIPEIF_CFG2,
-			       IPIPEIF_CFG2_YUV8, IPIPEIF_CFG2_YUV16);
-
-		break;
-	case MEDIA_BUS_FMT_SGRBG10_1X10:
-		isif_ccolp = ISIF_CCOLP_CP0_F0_GR |
-			ISIF_CCOLP_CP1_F0_R |
-			ISIF_CCOLP_CP2_F0_B |
-			ISIF_CCOLP_CP3_F0_GB;
-		goto cont_raw;
-	case MEDIA_BUS_FMT_SRGGB10_1X10:
-		isif_ccolp = ISIF_CCOLP_CP0_F0_R |
-			ISIF_CCOLP_CP1_F0_GR |
-			ISIF_CCOLP_CP2_F0_GB |
-			ISIF_CCOLP_CP3_F0_B;
-		goto cont_raw;
-	case MEDIA_BUS_FMT_SBGGR10_1X10:
-		isif_ccolp = ISIF_CCOLP_CP0_F0_B |
-			ISIF_CCOLP_CP1_F0_GB |
-			ISIF_CCOLP_CP2_F0_GR |
-			ISIF_CCOLP_CP3_F0_R;
-		goto cont_raw;
-	case MEDIA_BUS_FMT_SGBRG10_1X10:
-		isif_ccolp = ISIF_CCOLP_CP0_F0_GB |
-			ISIF_CCOLP_CP1_F0_B |
-			ISIF_CCOLP_CP2_F0_R |
-			ISIF_CCOLP_CP3_F0_GR;
-cont_raw:
-		iss_reg_clr(iss, OMAP4_ISS_MEM_ISP_IPIPEIF, IPIPEIF_CFG2,
-			    IPIPEIF_CFG2_YUV16);
-
-		iss_reg_update(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_MODESET,
-			       ISIF_MODESET_CCDMD | ISIF_MODESET_INPMOD_MASK |
-			       ISIF_MODESET_CCDW_MASK, ISIF_MODESET_INPMOD_RAW |
-			       ISIF_MODESET_CCDW_2BIT);
-
-		info = omap4iss_video_format_info(format->code);
-		iss_reg_update(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_CGAMMAWD,
-			       ISIF_CGAMMAWD_GWDI_MASK,
-			       ISIF_CGAMMAWD_GWDI(info->bpp));
-
-		/* Set RAW Bayer pattern */
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_CCOLP,
-			      isif_ccolp);
-		break;
-	}
-
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_SPH, 0 & ISIF_SPH_MASK);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_LNH,
-		      (format->width - 1) & ISIF_LNH_MASK);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_LNV,
-		      (format->height - 1) & ISIF_LNV_MASK);
-
-	/* Generate ISIF0 on the last line of the image */
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_VDINT(0),
-		      format->height - 1);
-
-	/* IPIPEIF_PAD_SOURCE_ISIF_SF */
-	format = &ipipeif->formats[IPIPEIF_PAD_SOURCE_ISIF_SF];
-
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_ISIF, ISIF_HSIZE,
-		      (ipipeif->video_out.bpl_value >> 5) &
-		      ISIF_HSIZE_HSIZE_MASK);
-
-	/* IPIPEIF_PAD_SOURCE_VP */
-	/* Do nothing? */
-}
-
-/* -----------------------------------------------------------------------------
- * Interrupt handling
- */
-
-static void ipipeif_isr_buffer(struct iss_ipipeif_device *ipipeif)
-{
-	struct iss_buffer *buffer;
-
-	/* The ISIF generates VD0 interrupts even when writes are disabled.
-	 * deal with it anyway). Disabling the ISIF when no buffer is available
-	 * is thus not be enough, we need to handle the situation explicitly.
-	 */
-	if (list_empty(&ipipeif->video_out.dmaqueue))
-		return;
-
-	ipipeif_write_enable(ipipeif, 0);
-
-	buffer = omap4iss_video_buffer_next(&ipipeif->video_out);
-	if (!buffer)
-		return;
-
-	ipipeif_set_outaddr(ipipeif, buffer->iss_addr);
-
-	ipipeif_write_enable(ipipeif, 1);
-}
-
-/*
- * omap4iss_ipipeif_isr - Configure ipipeif during interframe time.
- * @ipipeif: Pointer to ISP IPIPEIF device.
- * @events: IPIPEIF events
- */
-void omap4iss_ipipeif_isr(struct iss_ipipeif_device *ipipeif, u32 events)
-{
-	if (omap4iss_module_sync_is_stopping(&ipipeif->wait,
-					     &ipipeif->stopping))
-		return;
-
-	if ((events & ISP5_IRQ_ISIF_INT(0)) &&
-	    (ipipeif->output & IPIPEIF_OUTPUT_MEMORY))
-		ipipeif_isr_buffer(ipipeif);
-}
-
-/* -----------------------------------------------------------------------------
- * ISP video operations
- */
-
-static int ipipeif_video_queue(struct iss_video *video,
-			       struct iss_buffer *buffer)
-{
-	struct iss_ipipeif_device *ipipeif = container_of(video,
-				struct iss_ipipeif_device, video_out);
-
-	if (!(ipipeif->output & IPIPEIF_OUTPUT_MEMORY))
-		return -ENODEV;
-
-	ipipeif_set_outaddr(ipipeif, buffer->iss_addr);
-
-	/*
-	 * If streaming was enabled before there was a buffer queued
-	 * or underrun happened in the ISR, the hardware was not enabled
-	 * and DMA queue flag ISS_VIDEO_DMAQUEUE_UNDERRUN is still set.
-	 * Enable it now.
-	 */
-	if (video->dmaqueue_flags & ISS_VIDEO_DMAQUEUE_UNDERRUN) {
-		if (ipipeif->output & IPIPEIF_OUTPUT_MEMORY)
-			ipipeif_write_enable(ipipeif, 1);
-		ipipeif_enable(ipipeif, 1);
-		iss_video_dmaqueue_flags_clr(video);
-	}
-
-	return 0;
-}
-
-static const struct iss_video_operations ipipeif_video_ops = {
-	.queue = ipipeif_video_queue,
-};
-
-/* -----------------------------------------------------------------------------
- * V4L2 subdev operations
- */
-
-#define IPIPEIF_DRV_SUBCLK_MASK	(OMAP4_ISS_ISP_SUBCLK_IPIPEIF |\
-				 OMAP4_ISS_ISP_SUBCLK_ISIF)
-/*
- * ipipeif_set_stream - Enable/Disable streaming on the IPIPEIF module
- * @sd: ISP IPIPEIF V4L2 subdevice
- * @enable: Enable/disable stream
- */
-static int ipipeif_set_stream(struct v4l2_subdev *sd, int enable)
-{
-	struct iss_ipipeif_device *ipipeif = v4l2_get_subdevdata(sd);
-	struct iss_device *iss = to_iss_device(ipipeif);
-	struct iss_video *video_out = &ipipeif->video_out;
-	int ret = 0;
-
-	if (ipipeif->state == ISS_PIPELINE_STREAM_STOPPED) {
-		if (enable == ISS_PIPELINE_STREAM_STOPPED)
-			return 0;
-
-		omap4iss_isp_subclk_enable(iss, IPIPEIF_DRV_SUBCLK_MASK);
-	}
-
-	switch (enable) {
-	case ISS_PIPELINE_STREAM_CONTINUOUS:
-
-		ipipeif_configure(ipipeif);
-		ipipeif_print_status(ipipeif);
-
-		/*
-		 * When outputting to memory with no buffer available, let the
-		 * buffer queue handler start the hardware. A DMA queue flag
-		 * ISS_VIDEO_DMAQUEUE_QUEUED will be set as soon as there is
-		 * a buffer available.
-		 */
-		if (ipipeif->output & IPIPEIF_OUTPUT_MEMORY &&
-		    !(video_out->dmaqueue_flags & ISS_VIDEO_DMAQUEUE_QUEUED))
-			break;
-
-		atomic_set(&ipipeif->stopping, 0);
-		if (ipipeif->output & IPIPEIF_OUTPUT_MEMORY)
-			ipipeif_write_enable(ipipeif, 1);
-		ipipeif_enable(ipipeif, 1);
-		iss_video_dmaqueue_flags_clr(video_out);
-		break;
-
-	case ISS_PIPELINE_STREAM_STOPPED:
-		if (ipipeif->state == ISS_PIPELINE_STREAM_STOPPED)
-			return 0;
-		if (omap4iss_module_sync_idle(&sd->entity, &ipipeif->wait,
-					      &ipipeif->stopping))
-			ret = -ETIMEDOUT;
-
-		if (ipipeif->output & IPIPEIF_OUTPUT_MEMORY)
-			ipipeif_write_enable(ipipeif, 0);
-		ipipeif_enable(ipipeif, 0);
-		omap4iss_isp_subclk_disable(iss, IPIPEIF_DRV_SUBCLK_MASK);
-		iss_video_dmaqueue_flags_clr(video_out);
-		break;
-	}
-
-	ipipeif->state = enable;
-	return ret;
-}
-
-static struct v4l2_mbus_framefmt *
-__ipipeif_get_format(struct iss_ipipeif_device *ipipeif,
-		     struct v4l2_subdev_state *sd_state, unsigned int pad,
-		     enum v4l2_subdev_format_whence which)
-{
-	if (which == V4L2_SUBDEV_FORMAT_TRY)
-		return v4l2_subdev_state_get_format(sd_state, pad);
-	return &ipipeif->formats[pad];
-}
-
-/*
- * ipipeif_try_format - Try video format on a pad
- * @ipipeif: ISS IPIPEIF device
- * @sd_state: V4L2 subdev state
- * @pad: Pad number
- * @fmt: Format
- */
-static void
-ipipeif_try_format(struct iss_ipipeif_device *ipipeif,
-		   struct v4l2_subdev_state *sd_state, unsigned int pad,
-		   struct v4l2_mbus_framefmt *fmt,
-		   enum v4l2_subdev_format_whence which)
-{
-	struct v4l2_mbus_framefmt *format;
-	unsigned int width = fmt->width;
-	unsigned int height = fmt->height;
-	unsigned int i;
-
-	switch (pad) {
-	case IPIPEIF_PAD_SINK:
-		/* TODO: If the IPIPEIF output formatter pad is connected
-		 * directly to the resizer, only YUV formats can be used.
-		 */
-		for (i = 0; i < ARRAY_SIZE(ipipeif_fmts); i++) {
-			if (fmt->code == ipipeif_fmts[i])
-				break;
-		}
-
-		/* If not found, use SGRBG10 as default */
-		if (i >= ARRAY_SIZE(ipipeif_fmts))
-			fmt->code = MEDIA_BUS_FMT_SGRBG10_1X10;
-
-		/* Clamp the input size. */
-		fmt->width = clamp_t(u32, width, 1, 8192);
-		fmt->height = clamp_t(u32, height, 1, 8192);
-		break;
-
-	case IPIPEIF_PAD_SOURCE_ISIF_SF:
-		format = __ipipeif_get_format(ipipeif, sd_state,
-					      IPIPEIF_PAD_SINK,
-					      which);
-		memcpy(fmt, format, sizeof(*fmt));
-
-		/* The data formatter truncates the number of horizontal output
-		 * pixels to a multiple of 16. To avoid clipping data, allow
-		 * callers to request an output size bigger than the input size
-		 * up to the nearest multiple of 16.
-		 */
-		fmt->width = clamp_t(u32, width, 32, (fmt->width + 15) & ~15);
-		fmt->width &= ~15;
-		fmt->height = clamp_t(u32, height, 32, fmt->height);
-		break;
-
-	case IPIPEIF_PAD_SOURCE_VP:
-		format = __ipipeif_get_format(ipipeif, sd_state,
-					      IPIPEIF_PAD_SINK,
-					      which);
-		memcpy(fmt, format, sizeof(*fmt));
-
-		fmt->width = clamp_t(u32, width, 32, fmt->width);
-		fmt->height = clamp_t(u32, height, 32, fmt->height);
-		break;
-	}
-
-	/* Data is written to memory unpacked, each 10-bit or 12-bit pixel is
-	 * stored on 2 bytes.
-	 */
-	fmt->colorspace = V4L2_COLORSPACE_SRGB;
-	fmt->field = V4L2_FIELD_NONE;
-}
-
-/*
- * ipipeif_enum_mbus_code - Handle pixel format enumeration
- * @sd     : pointer to v4l2 subdev structure
- * @sd_state: V4L2 subdev state
- * @code   : pointer to v4l2_subdev_mbus_code_enum structure
- * return -EINVAL or zero on success
- */
-static int ipipeif_enum_mbus_code(struct v4l2_subdev *sd,
-				  struct v4l2_subdev_state *sd_state,
-				  struct v4l2_subdev_mbus_code_enum *code)
-{
-	struct iss_ipipeif_device *ipipeif = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *format;
-
-	switch (code->pad) {
-	case IPIPEIF_PAD_SINK:
-		if (code->index >= ARRAY_SIZE(ipipeif_fmts))
-			return -EINVAL;
-
-		code->code = ipipeif_fmts[code->index];
-		break;
-
-	case IPIPEIF_PAD_SOURCE_ISIF_SF:
-	case IPIPEIF_PAD_SOURCE_VP:
-		/* No format conversion inside IPIPEIF */
-		if (code->index != 0)
-			return -EINVAL;
-
-		format = __ipipeif_get_format(ipipeif, sd_state,
-					      IPIPEIF_PAD_SINK,
-					      code->which);
-
-		code->code = format->code;
-		break;
-
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int ipipeif_enum_frame_size(struct v4l2_subdev *sd,
-				   struct v4l2_subdev_state *sd_state,
-				   struct v4l2_subdev_frame_size_enum *fse)
-{
-	struct iss_ipipeif_device *ipipeif = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt format;
-
-	if (fse->index != 0)
-		return -EINVAL;
-
-	format.code = fse->code;
-	format.width = 1;
-	format.height = 1;
-	ipipeif_try_format(ipipeif, sd_state, fse->pad, &format, fse->which);
-	fse->min_width = format.width;
-	fse->min_height = format.height;
-
-	if (format.code != fse->code)
-		return -EINVAL;
-
-	format.code = fse->code;
-	format.width = -1;
-	format.height = -1;
-	ipipeif_try_format(ipipeif, sd_state, fse->pad, &format, fse->which);
-	fse->max_width = format.width;
-	fse->max_height = format.height;
-
-	return 0;
-}
-
-/*
- * ipipeif_get_format - Retrieve the video format on a pad
- * @sd : ISP IPIPEIF V4L2 subdevice
- * @sd_state: V4L2 subdev state
- * @fmt: Format
- *
- * Return 0 on success or -EINVAL if the pad is invalid or doesn't correspond
- * to the format type.
- */
-static int ipipeif_get_format(struct v4l2_subdev *sd,
-			      struct v4l2_subdev_state *sd_state,
-			      struct v4l2_subdev_format *fmt)
-{
-	struct iss_ipipeif_device *ipipeif = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *format;
-
-	format = __ipipeif_get_format(ipipeif, sd_state, fmt->pad, fmt->which);
-	if (!format)
-		return -EINVAL;
-
-	fmt->format = *format;
-	return 0;
-}
-
-/*
- * ipipeif_set_format - Set the video format on a pad
- * @sd : ISP IPIPEIF V4L2 subdevice
- * @sd_state: V4L2 subdev state
- * @fmt: Format
- *
- * Return 0 on success or -EINVAL if the pad is invalid or doesn't correspond
- * to the format type.
- */
-static int ipipeif_set_format(struct v4l2_subdev *sd,
-			      struct v4l2_subdev_state *sd_state,
-			      struct v4l2_subdev_format *fmt)
-{
-	struct iss_ipipeif_device *ipipeif = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *format;
-
-	format = __ipipeif_get_format(ipipeif, sd_state, fmt->pad, fmt->which);
-	if (!format)
-		return -EINVAL;
-
-	ipipeif_try_format(ipipeif, sd_state, fmt->pad, &fmt->format,
-			   fmt->which);
-	*format = fmt->format;
-
-	/* Propagate the format from sink to source */
-	if (fmt->pad == IPIPEIF_PAD_SINK) {
-		format = __ipipeif_get_format(ipipeif, sd_state,
-					      IPIPEIF_PAD_SOURCE_ISIF_SF,
-					      fmt->which);
-		*format = fmt->format;
-		ipipeif_try_format(ipipeif, sd_state,
-				   IPIPEIF_PAD_SOURCE_ISIF_SF,
-				   format, fmt->which);
-
-		format = __ipipeif_get_format(ipipeif, sd_state,
-					      IPIPEIF_PAD_SOURCE_VP,
-					      fmt->which);
-		*format = fmt->format;
-		ipipeif_try_format(ipipeif, sd_state, IPIPEIF_PAD_SOURCE_VP,
-				   format,
-				   fmt->which);
-	}
-
-	return 0;
-}
-
-static int ipipeif_link_validate(struct v4l2_subdev *sd,
-				 struct media_link *link,
-				 struct v4l2_subdev_format *source_fmt,
-				 struct v4l2_subdev_format *sink_fmt)
-{
-	/* Check if the two ends match */
-	if (source_fmt->format.width != sink_fmt->format.width ||
-	    source_fmt->format.height != sink_fmt->format.height)
-		return -EPIPE;
-
-	if (source_fmt->format.code != sink_fmt->format.code)
-		return -EPIPE;
-
-	return 0;
-}
-
-/*
- * ipipeif_init_formats - Initialize formats on all pads
- * @sd: ISP IPIPEIF V4L2 subdevice
- * @fh: V4L2 subdev file handle
- *
- * Initialize all pad formats with default values. If fh is not NULL, try
- * formats are initialized on the file handle. Otherwise active formats are
- * initialized on the device.
- */
-static int ipipeif_init_formats(struct v4l2_subdev *sd,
-				struct v4l2_subdev_fh *fh)
-{
-	struct v4l2_subdev_format format;
-
-	memset(&format, 0, sizeof(format));
-	format.pad = IPIPEIF_PAD_SINK;
-	format.which = fh ? V4L2_SUBDEV_FORMAT_TRY : V4L2_SUBDEV_FORMAT_ACTIVE;
-	format.format.code = MEDIA_BUS_FMT_SGRBG10_1X10;
-	format.format.width = 4096;
-	format.format.height = 4096;
-	ipipeif_set_format(sd, fh ? fh->state : NULL, &format);
-
-	return 0;
-}
-
-/* V4L2 subdev video operations */
-static const struct v4l2_subdev_video_ops ipipeif_v4l2_video_ops = {
-	.s_stream = ipipeif_set_stream,
-};
-
-/* V4L2 subdev pad operations */
-static const struct v4l2_subdev_pad_ops ipipeif_v4l2_pad_ops = {
-	.enum_mbus_code = ipipeif_enum_mbus_code,
-	.enum_frame_size = ipipeif_enum_frame_size,
-	.get_fmt = ipipeif_get_format,
-	.set_fmt = ipipeif_set_format,
-	.link_validate = ipipeif_link_validate,
-};
-
-/* V4L2 subdev operations */
-static const struct v4l2_subdev_ops ipipeif_v4l2_ops = {
-	.video = &ipipeif_v4l2_video_ops,
-	.pad = &ipipeif_v4l2_pad_ops,
-};
-
-/* V4L2 subdev internal operations */
-static const struct v4l2_subdev_internal_ops ipipeif_v4l2_internal_ops = {
-	.open = ipipeif_init_formats,
-};
-
-/* -----------------------------------------------------------------------------
- * Media entity operations
- */
-
-/*
- * ipipeif_link_setup - Setup IPIPEIF connections
- * @entity: IPIPEIF media entity
- * @local: Pad at the local end of the link
- * @remote: Pad at the remote end of the link
- * @flags: Link flags
- *
- * return -EINVAL or zero on success
- */
-static int ipipeif_link_setup(struct media_entity *entity,
-			      const struct media_pad *local,
-			      const struct media_pad *remote, u32 flags)
-{
-	struct v4l2_subdev *sd = media_entity_to_v4l2_subdev(entity);
-	struct iss_ipipeif_device *ipipeif = v4l2_get_subdevdata(sd);
-	struct iss_device *iss = to_iss_device(ipipeif);
-	unsigned int index = local->index;
-
-	/* FIXME: this is actually a hack! */
-	if (is_media_entity_v4l2_subdev(remote->entity))
-		index |= 2 << 16;
-
-	switch (index) {
-	case IPIPEIF_PAD_SINK | 2 << 16:
-		/* Read from the sensor CSI2a or CSI2b. */
-		if (!(flags & MEDIA_LNK_FL_ENABLED)) {
-			ipipeif->input = IPIPEIF_INPUT_NONE;
-			break;
-		}
-
-		if (ipipeif->input != IPIPEIF_INPUT_NONE)
-			return -EBUSY;
-
-		if (remote->entity == &iss->csi2a.subdev.entity)
-			ipipeif->input = IPIPEIF_INPUT_CSI2A;
-		else if (remote->entity == &iss->csi2b.subdev.entity)
-			ipipeif->input = IPIPEIF_INPUT_CSI2B;
-
-		break;
-
-	case IPIPEIF_PAD_SOURCE_ISIF_SF:
-		/* Write to memory */
-		if (flags & MEDIA_LNK_FL_ENABLED) {
-			if (ipipeif->output & ~IPIPEIF_OUTPUT_MEMORY)
-				return -EBUSY;
-			ipipeif->output |= IPIPEIF_OUTPUT_MEMORY;
-		} else {
-			ipipeif->output &= ~IPIPEIF_OUTPUT_MEMORY;
-		}
-		break;
-
-	case IPIPEIF_PAD_SOURCE_VP | 2 << 16:
-		/* Send to IPIPE/RESIZER */
-		if (flags & MEDIA_LNK_FL_ENABLED) {
-			if (ipipeif->output & ~IPIPEIF_OUTPUT_VP)
-				return -EBUSY;
-			ipipeif->output |= IPIPEIF_OUTPUT_VP;
-		} else {
-			ipipeif->output &= ~IPIPEIF_OUTPUT_VP;
-		}
-		break;
-
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-/* media operations */
-static const struct media_entity_operations ipipeif_media_ops = {
-	.link_setup = ipipeif_link_setup,
-	.link_validate = v4l2_subdev_link_validate,
-};
-
-/*
- * ipipeif_init_entities - Initialize V4L2 subdev and media entity
- * @ipipeif: ISS ISP IPIPEIF module
- *
- * Return 0 on success and a negative error code on failure.
- */
-static int ipipeif_init_entities(struct iss_ipipeif_device *ipipeif)
-{
-	struct v4l2_subdev *sd = &ipipeif->subdev;
-	struct media_pad *pads = ipipeif->pads;
-	struct media_entity *me = &sd->entity;
-	int ret;
-
-	ipipeif->input = IPIPEIF_INPUT_NONE;
-
-	v4l2_subdev_init(sd, &ipipeif_v4l2_ops);
-	sd->internal_ops = &ipipeif_v4l2_internal_ops;
-	strscpy(sd->name, "OMAP4 ISS ISP IPIPEIF", sizeof(sd->name));
-	sd->grp_id = BIT(16);	/* group ID for iss subdevs */
-	v4l2_set_subdevdata(sd, ipipeif);
-	sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE;
-
-	pads[IPIPEIF_PAD_SINK].flags = MEDIA_PAD_FL_SINK;
-	pads[IPIPEIF_PAD_SOURCE_ISIF_SF].flags = MEDIA_PAD_FL_SOURCE;
-	pads[IPIPEIF_PAD_SOURCE_VP].flags = MEDIA_PAD_FL_SOURCE;
-
-	me->ops = &ipipeif_media_ops;
-	ret = media_entity_pads_init(me, IPIPEIF_PADS_NUM, pads);
-	if (ret < 0)
-		return ret;
-
-	ipipeif_init_formats(sd, NULL);
-
-	ipipeif->video_out.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-	ipipeif->video_out.ops = &ipipeif_video_ops;
-	ipipeif->video_out.iss = to_iss_device(ipipeif);
-	ipipeif->video_out.capture_mem = PAGE_ALIGN(4096 * 4096) * 3;
-	ipipeif->video_out.bpl_alignment = 32;
-	ipipeif->video_out.bpl_zero_padding = 1;
-	ipipeif->video_out.bpl_max = 0x1ffe0;
-
-	return omap4iss_video_init(&ipipeif->video_out, "ISP IPIPEIF");
-}
-
-void omap4iss_ipipeif_unregister_entities(struct iss_ipipeif_device *ipipeif)
-{
-	v4l2_device_unregister_subdev(&ipipeif->subdev);
-	omap4iss_video_unregister(&ipipeif->video_out);
-}
-
-int omap4iss_ipipeif_register_entities(struct iss_ipipeif_device *ipipeif,
-				       struct v4l2_device *vdev)
-{
-	int ret;
-
-	/* Register the subdev and video node. */
-	ret = v4l2_device_register_subdev(vdev, &ipipeif->subdev);
-	if (ret < 0)
-		goto error;
-
-	ret = omap4iss_video_register(&ipipeif->video_out, vdev);
-	if (ret < 0)
-		goto error;
-
-	return 0;
-
-error:
-	omap4iss_ipipeif_unregister_entities(ipipeif);
-	return ret;
-}
-
-/* -----------------------------------------------------------------------------
- * ISP IPIPEIF initialisation and cleanup
- */
-
-/*
- * omap4iss_ipipeif_init - IPIPEIF module initialization.
- * @iss: Device pointer specific to the OMAP4 ISS.
- *
- * TODO: Get the initialisation values from platform data.
- *
- * Return 0 on success or a negative error code otherwise.
- */
-int omap4iss_ipipeif_init(struct iss_device *iss)
-{
-	struct iss_ipipeif_device *ipipeif = &iss->ipipeif;
-
-	ipipeif->state = ISS_PIPELINE_STREAM_STOPPED;
-	init_waitqueue_head(&ipipeif->wait);
-
-	return ipipeif_init_entities(ipipeif);
-}
-
-/*
- * omap4iss_ipipeif_create_links() - IPIPEIF pads links creation
- * @iss: Pointer to ISS device
- *
- * return negative error code or zero on success
- */
-int omap4iss_ipipeif_create_links(struct iss_device *iss)
-{
-	struct iss_ipipeif_device *ipipeif = &iss->ipipeif;
-
-	/* Connect the IPIPEIF subdev to the video node. */
-	return media_create_pad_link(&ipipeif->subdev.entity,
-				     IPIPEIF_PAD_SOURCE_ISIF_SF,
-				     &ipipeif->video_out.video.entity, 0, 0);
-}
-
-/*
- * omap4iss_ipipeif_cleanup - IPIPEIF module cleanup.
- * @iss: Device pointer specific to the OMAP4 ISS.
- */
-void omap4iss_ipipeif_cleanup(struct iss_device *iss)
-{
-	struct iss_ipipeif_device *ipipeif = &iss->ipipeif;
-
-	media_entity_cleanup(&ipipeif->subdev.entity);
-}
diff --git a/drivers/staging/media/omap4iss/iss_ipipeif.h b/drivers/staging/media/omap4iss/iss_ipipeif.h
deleted file mode 100644
index 69792333a62e..000000000000
--- a/drivers/staging/media/omap4iss/iss_ipipeif.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * TI OMAP4 ISS V4L2 Driver - ISP IPIPEIF module
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#ifndef OMAP4_ISS_IPIPEIF_H
-#define OMAP4_ISS_IPIPEIF_H
-
-#include "iss_video.h"
-
-enum ipipeif_input_entity {
-	IPIPEIF_INPUT_NONE,
-	IPIPEIF_INPUT_CSI2A,
-	IPIPEIF_INPUT_CSI2B
-};
-
-#define IPIPEIF_OUTPUT_MEMORY			BIT(0)
-#define IPIPEIF_OUTPUT_VP			BIT(1)
-
-/* Sink and source IPIPEIF pads */
-#define IPIPEIF_PAD_SINK			0
-#define IPIPEIF_PAD_SOURCE_ISIF_SF		1
-#define IPIPEIF_PAD_SOURCE_VP			2
-#define IPIPEIF_PADS_NUM			3
-
-/*
- * struct iss_ipipeif_device - Structure for the IPIPEIF module to store its own
- *			    information
- * @subdev: V4L2 subdevice
- * @pads: Sink and source media entity pads
- * @formats: Active video formats
- * @input: Active input
- * @output: Active outputs
- * @video_out: Output video node
- * @error: A hardware error occurred during capture
- * @alaw: A-law compression enabled (1) or disabled (0)
- * @lpf: Low pass filter enabled (1) or disabled (0)
- * @obclamp: Optical-black clamp enabled (1) or disabled (0)
- * @fpc_en: Faulty pixels correction enabled (1) or disabled (0)
- * @blcomp: Black level compensation configuration
- * @clamp: Optical-black or digital clamp configuration
- * @fpc: Faulty pixels correction configuration
- * @lsc: Lens shading compensation configuration
- * @update: Bitmask of controls to update during the next interrupt
- * @shadow_update: Controls update in progress by userspace
- * @syncif: Interface synchronization configuration
- * @vpcfg: Video port configuration
- * @underrun: A buffer underrun occurred and a new buffer has been queued
- * @state: Streaming state
- * @lock: Serializes shadow_update with interrupt handler
- * @wait: Wait queue used to stop the module
- * @stopping: Stopping state
- * @ioctl_lock: Serializes ioctl calls and LSC requests freeing
- */
-struct iss_ipipeif_device {
-	struct v4l2_subdev subdev;
-	struct media_pad pads[IPIPEIF_PADS_NUM];
-	struct v4l2_mbus_framefmt formats[IPIPEIF_PADS_NUM];
-
-	enum ipipeif_input_entity input;
-	unsigned int output;
-	struct iss_video video_out;
-	unsigned int error;
-
-	enum iss_pipeline_stream_state state;
-	wait_queue_head_t wait;
-	atomic_t stopping;
-};
-
-struct iss_device;
-
-int omap4iss_ipipeif_init(struct iss_device *iss);
-int omap4iss_ipipeif_create_links(struct iss_device *iss);
-void omap4iss_ipipeif_cleanup(struct iss_device *iss);
-int omap4iss_ipipeif_register_entities(struct iss_ipipeif_device *ipipeif,
-				       struct v4l2_device *vdev);
-void omap4iss_ipipeif_unregister_entities(struct iss_ipipeif_device *ipipeif);
-
-int omap4iss_ipipeif_busy(struct iss_ipipeif_device *ipipeif);
-void omap4iss_ipipeif_isr(struct iss_ipipeif_device *ipipeif, u32 events);
-void omap4iss_ipipeif_restore_context(struct iss_device *iss);
-void omap4iss_ipipeif_max_rate(struct iss_ipipeif_device *ipipeif,
-			       unsigned int *max_rate);
-
-#endif	/* OMAP4_ISS_IPIPEIF_H */
diff --git a/drivers/staging/media/omap4iss/iss_regs.h b/drivers/staging/media/omap4iss/iss_regs.h
deleted file mode 100644
index cfe0bb075072..000000000000
--- a/drivers/staging/media/omap4iss/iss_regs.h
+++ /dev/null
@@ -1,899 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * TI OMAP4 ISS V4L2 Driver - Register defines
- *
- * Copyright (C) 2012 Texas Instruments.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#ifndef _OMAP4_ISS_REGS_H_
-#define _OMAP4_ISS_REGS_H_
-
-/* ISS */
-#define ISS_HL_REVISION					0x0
-
-#define ISS_HL_SYSCONFIG				0x10
-#define ISS_HL_SYSCONFIG_IDLEMODE_SHIFT			2
-#define ISS_HL_SYSCONFIG_IDLEMODE_FORCEIDLE		0x0
-#define ISS_HL_SYSCONFIG_IDLEMODE_NOIDLE		0x1
-#define ISS_HL_SYSCONFIG_IDLEMODE_SMARTIDLE		0x2
-#define ISS_HL_SYSCONFIG_SOFTRESET			BIT(0)
-
-#define ISS_HL_IRQSTATUS_RAW(i)				(0x20 + (0x10 * (i)))
-#define ISS_HL_IRQSTATUS(i)				(0x24 + (0x10 * (i)))
-#define ISS_HL_IRQENABLE_SET(i)				(0x28 + (0x10 * (i)))
-#define ISS_HL_IRQENABLE_CLR(i)				(0x2c + (0x10 * (i)))
-
-#define ISS_HL_IRQ_HS_VS			BIT(17)
-#define ISS_HL_IRQ_SIMCOP(i)			BIT(12 + (i))
-#define ISS_HL_IRQ_BTE				BIT(11)
-#define ISS_HL_IRQ_CBUFF			BIT(10)
-#define ISS_HL_IRQ_CCP2(i)			BIT((i) > 3 ? 16 : 14 + (i))
-#define ISS_HL_IRQ_CSIB				BIT(5)
-#define ISS_HL_IRQ_CSIA				BIT(4)
-#define ISS_HL_IRQ_ISP(i)			BIT(i)
-
-#define ISS_CTRL					0x80
-#define ISS_CTRL_CLK_DIV_MASK				(3 << 4)
-#define ISS_CTRL_INPUT_SEL_MASK				(3 << 2)
-#define ISS_CTRL_INPUT_SEL_CSI2A			(0 << 2)
-#define ISS_CTRL_INPUT_SEL_CSI2B			(1 << 2)
-#define ISS_CTRL_SYNC_DETECT_VS_RAISING			(3 << 0)
-
-#define ISS_CLKCTRL					0x84
-#define ISS_CLKCTRL_VPORT2_CLK				BIT(30)
-#define ISS_CLKCTRL_VPORT1_CLK				BIT(29)
-#define ISS_CLKCTRL_VPORT0_CLK				BIT(28)
-#define ISS_CLKCTRL_CCP2				BIT(4)
-#define ISS_CLKCTRL_CSI2_B				BIT(3)
-#define ISS_CLKCTRL_CSI2_A				BIT(2)
-#define ISS_CLKCTRL_ISP					BIT(1)
-#define ISS_CLKCTRL_SIMCOP				BIT(0)
-
-#define ISS_CLKSTAT					0x88
-#define ISS_CLKSTAT_VPORT2_CLK				BIT(30)
-#define ISS_CLKSTAT_VPORT1_CLK				BIT(29)
-#define ISS_CLKSTAT_VPORT0_CLK				BIT(28)
-#define ISS_CLKSTAT_CCP2				BIT(4)
-#define ISS_CLKSTAT_CSI2_B				BIT(3)
-#define ISS_CLKSTAT_CSI2_A				BIT(2)
-#define ISS_CLKSTAT_ISP					BIT(1)
-#define ISS_CLKSTAT_SIMCOP				BIT(0)
-
-#define ISS_PM_STATUS					0x8c
-#define ISS_PM_STATUS_CBUFF_PM_MASK			(3 << 12)
-#define ISS_PM_STATUS_BTE_PM_MASK			(3 << 10)
-#define ISS_PM_STATUS_SIMCOP_PM_MASK			(3 << 8)
-#define ISS_PM_STATUS_ISP_PM_MASK			(3 << 6)
-#define ISS_PM_STATUS_CCP2_PM_MASK			(3 << 4)
-#define ISS_PM_STATUS_CSI2_B_PM_MASK			(3 << 2)
-#define ISS_PM_STATUS_CSI2_A_PM_MASK			(3 << 0)
-
-#define REGISTER0					0x0
-#define REGISTER0_HSCLOCKCONFIG				BIT(24)
-#define REGISTER0_THS_TERM_MASK				(0xff << 8)
-#define REGISTER0_THS_TERM_SHIFT			8
-#define REGISTER0_THS_SETTLE_MASK			(0xff << 0)
-#define REGISTER0_THS_SETTLE_SHIFT			0
-
-#define REGISTER1					0x4
-#define REGISTER1_RESET_DONE_CTRLCLK			BIT(29)
-#define REGISTER1_CLOCK_MISS_DETECTOR_STATUS		BIT(25)
-#define REGISTER1_TCLK_TERM_MASK			(0x3f << 18)
-#define REGISTER1_TCLK_TERM_SHIFT			18
-#define REGISTER1_DPHY_HS_SYNC_PATTERN_SHIFT		10
-#define REGISTER1_CTRLCLK_DIV_FACTOR_MASK		(0x3 << 8)
-#define REGISTER1_CTRLCLK_DIV_FACTOR_SHIFT		8
-#define REGISTER1_TCLK_SETTLE_MASK			(0xff << 0)
-#define REGISTER1_TCLK_SETTLE_SHIFT			0
-
-#define REGISTER2					0x8
-
-#define CSI2_SYSCONFIG					0x10
-#define CSI2_SYSCONFIG_MSTANDBY_MODE_MASK		(3 << 12)
-#define CSI2_SYSCONFIG_MSTANDBY_MODE_FORCE		(0 << 12)
-#define CSI2_SYSCONFIG_MSTANDBY_MODE_NO			(1 << 12)
-#define CSI2_SYSCONFIG_MSTANDBY_MODE_SMART		(2 << 12)
-#define CSI2_SYSCONFIG_SOFT_RESET			(1 << 1)
-#define CSI2_SYSCONFIG_AUTO_IDLE			(1 << 0)
-
-#define CSI2_SYSSTATUS					0x14
-#define CSI2_SYSSTATUS_RESET_DONE			BIT(0)
-
-#define CSI2_IRQSTATUS					0x18
-#define CSI2_IRQENABLE					0x1c
-
-/* Shared bits across CSI2_IRQENABLE and IRQSTATUS */
-
-#define CSI2_IRQ_OCP_ERR				BIT(14)
-#define CSI2_IRQ_SHORT_PACKET				BIT(13)
-#define CSI2_IRQ_ECC_CORRECTION				BIT(12)
-#define CSI2_IRQ_ECC_NO_CORRECTION			BIT(11)
-#define CSI2_IRQ_COMPLEXIO_ERR				BIT(9)
-#define CSI2_IRQ_FIFO_OVF				BIT(8)
-#define CSI2_IRQ_CONTEXT0				BIT(0)
-
-#define CSI2_CTRL					0x40
-#define CSI2_CTRL_MFLAG_LEVH_MASK			(7 << 20)
-#define CSI2_CTRL_MFLAG_LEVH_SHIFT			20
-#define CSI2_CTRL_MFLAG_LEVL_MASK			(7 << 17)
-#define CSI2_CTRL_MFLAG_LEVL_SHIFT			17
-#define CSI2_CTRL_BURST_SIZE_EXPAND			(1 << 16)
-#define CSI2_CTRL_VP_CLK_EN				(1 << 15)
-#define CSI2_CTRL_NON_POSTED_WRITE			(1 << 13)
-#define CSI2_CTRL_VP_ONLY_EN				(1 << 11)
-#define CSI2_CTRL_VP_OUT_CTRL_MASK			(3 << 8)
-#define CSI2_CTRL_VP_OUT_CTRL_SHIFT			8
-#define CSI2_CTRL_DBG_EN				(1 << 7)
-#define CSI2_CTRL_BURST_SIZE_MASK			(3 << 5)
-#define CSI2_CTRL_ENDIANNESS				(1 << 4)
-#define CSI2_CTRL_FRAME					(1 << 3)
-#define CSI2_CTRL_ECC_EN				(1 << 2)
-#define CSI2_CTRL_IF_EN					(1 << 0)
-
-#define CSI2_DBG_H					0x44
-
-#define CSI2_COMPLEXIO_CFG				0x50
-#define CSI2_COMPLEXIO_CFG_RESET_CTRL			(1 << 30)
-#define CSI2_COMPLEXIO_CFG_RESET_DONE			(1 << 29)
-#define CSI2_COMPLEXIO_CFG_PWD_CMD_MASK			(3 << 27)
-#define CSI2_COMPLEXIO_CFG_PWD_CMD_OFF			(0 << 27)
-#define CSI2_COMPLEXIO_CFG_PWD_CMD_ON			(1 << 27)
-#define CSI2_COMPLEXIO_CFG_PWD_CMD_ULP			(2 << 27)
-#define CSI2_COMPLEXIO_CFG_PWD_STATUS_MASK		(3 << 25)
-#define CSI2_COMPLEXIO_CFG_PWD_STATUS_OFF		(0 << 25)
-#define CSI2_COMPLEXIO_CFG_PWD_STATUS_ON		(1 << 25)
-#define CSI2_COMPLEXIO_CFG_PWD_STATUS_ULP		(2 << 25)
-#define CSI2_COMPLEXIO_CFG_PWR_AUTO			(1 << 24)
-#define CSI2_COMPLEXIO_CFG_DATA_POL(i)			(1 << (((i) * 4) + 3))
-#define CSI2_COMPLEXIO_CFG_DATA_POSITION_MASK(i)	(7 << ((i) * 4))
-#define CSI2_COMPLEXIO_CFG_DATA_POSITION_SHIFT(i)	((i) * 4)
-#define CSI2_COMPLEXIO_CFG_CLOCK_POL			(1 << 3)
-#define CSI2_COMPLEXIO_CFG_CLOCK_POSITION_MASK		(7 << 0)
-#define CSI2_COMPLEXIO_CFG_CLOCK_POSITION_SHIFT		0
-
-#define CSI2_COMPLEXIO_IRQSTATUS			0x54
-
-#define CSI2_SHORT_PACKET				0x5c
-
-#define CSI2_COMPLEXIO_IRQENABLE			0x60
-
-/* Shared bits across CSI2_COMPLEXIO_IRQENABLE and IRQSTATUS */
-#define CSI2_COMPLEXIO_IRQ_STATEALLULPMEXIT		BIT(26)
-#define CSI2_COMPLEXIO_IRQ_STATEALLULPMENTER		BIT(25)
-#define CSI2_COMPLEXIO_IRQ_STATEULPM5			BIT(24)
-#define CSI2_COMPLEXIO_IRQ_STATEULPM4			BIT(23)
-#define CSI2_COMPLEXIO_IRQ_STATEULPM3			BIT(22)
-#define CSI2_COMPLEXIO_IRQ_STATEULPM2			BIT(21)
-#define CSI2_COMPLEXIO_IRQ_STATEULPM1			BIT(20)
-#define CSI2_COMPLEXIO_IRQ_ERRCONTROL5			BIT(19)
-#define CSI2_COMPLEXIO_IRQ_ERRCONTROL4			BIT(18)
-#define CSI2_COMPLEXIO_IRQ_ERRCONTROL3			BIT(17)
-#define CSI2_COMPLEXIO_IRQ_ERRCONTROL2			BIT(16)
-#define CSI2_COMPLEXIO_IRQ_ERRCONTROL1			BIT(15)
-#define CSI2_COMPLEXIO_IRQ_ERRESC5			BIT(14)
-#define CSI2_COMPLEXIO_IRQ_ERRESC4			BIT(13)
-#define CSI2_COMPLEXIO_IRQ_ERRESC3			BIT(12)
-#define CSI2_COMPLEXIO_IRQ_ERRESC2			BIT(11)
-#define CSI2_COMPLEXIO_IRQ_ERRESC1			BIT(10)
-#define CSI2_COMPLEXIO_IRQ_ERRSOTSYNCHS5		BIT(9)
-#define CSI2_COMPLEXIO_IRQ_ERRSOTSYNCHS4		BIT(8)
-#define CSI2_COMPLEXIO_IRQ_ERRSOTSYNCHS3		BIT(7)
-#define CSI2_COMPLEXIO_IRQ_ERRSOTSYNCHS2		BIT(6)
-#define CSI2_COMPLEXIO_IRQ_ERRSOTSYNCHS1		BIT(5)
-#define CSI2_COMPLEXIO_IRQ_ERRSOTHS5			BIT(4)
-#define CSI2_COMPLEXIO_IRQ_ERRSOTHS4			BIT(3)
-#define CSI2_COMPLEXIO_IRQ_ERRSOTHS3			BIT(2)
-#define CSI2_COMPLEXIO_IRQ_ERRSOTHS2			BIT(1)
-#define CSI2_COMPLEXIO_IRQ_ERRSOTHS1			BIT(0)
-
-#define CSI2_DBG_P					0x68
-
-#define CSI2_TIMING					0x6c
-#define CSI2_TIMING_FORCE_RX_MODE_IO1			BIT(15)
-#define CSI2_TIMING_STOP_STATE_X16_IO1			BIT(14)
-#define CSI2_TIMING_STOP_STATE_X4_IO1			BIT(13)
-#define CSI2_TIMING_STOP_STATE_COUNTER_IO1_MASK		(0x1fff << 0)
-#define CSI2_TIMING_STOP_STATE_COUNTER_IO1_SHIFT	0
-
-#define CSI2_CTX_CTRL1(i)				(0x70 + (0x20 * (i)))
-#define CSI2_CTX_CTRL1_GENERIC				BIT(30)
-#define CSI2_CTX_CTRL1_TRANSCODE			(0xf << 24)
-#define CSI2_CTX_CTRL1_FEC_NUMBER_MASK			(0xff << 16)
-#define CSI2_CTX_CTRL1_COUNT_MASK			(0xff << 8)
-#define CSI2_CTX_CTRL1_COUNT_SHIFT			8
-#define CSI2_CTX_CTRL1_EOF_EN				BIT(7)
-#define CSI2_CTX_CTRL1_EOL_EN				BIT(6)
-#define CSI2_CTX_CTRL1_CS_EN				BIT(5)
-#define CSI2_CTX_CTRL1_COUNT_UNLOCK			BIT(4)
-#define CSI2_CTX_CTRL1_PING_PONG			BIT(3)
-#define CSI2_CTX_CTRL1_CTX_EN				BIT(0)
-
-#define CSI2_CTX_CTRL2(i)				(0x74 + (0x20 * (i)))
-#define CSI2_CTX_CTRL2_FRAME_MASK			(0xffff << 16)
-#define CSI2_CTX_CTRL2_FRAME_SHIFT			16
-#define CSI2_CTX_CTRL2_USER_DEF_MAP_SHIFT		13
-#define CSI2_CTX_CTRL2_USER_DEF_MAP_MASK		\
-		(0x3 << CSI2_CTX_CTRL2_USER_DEF_MAP_SHIFT)
-#define CSI2_CTX_CTRL2_VIRTUAL_ID_MASK			(3 << 11)
-#define CSI2_CTX_CTRL2_VIRTUAL_ID_SHIFT			11
-#define CSI2_CTX_CTRL2_DPCM_PRED			(1 << 10)
-#define CSI2_CTX_CTRL2_FORMAT_MASK			(0x3ff << 0)
-#define CSI2_CTX_CTRL2_FORMAT_SHIFT			0
-
-#define CSI2_CTX_DAT_OFST(i)				(0x78 + (0x20 * (i)))
-#define CSI2_CTX_DAT_OFST_MASK				(0xfff << 5)
-
-#define CSI2_CTX_PING_ADDR(i)				(0x7c + (0x20 * (i)))
-#define CSI2_CTX_PING_ADDR_MASK				0xffffffe0
-
-#define CSI2_CTX_PONG_ADDR(i)				(0x80 + (0x20 * (i)))
-#define CSI2_CTX_PONG_ADDR_MASK				CSI2_CTX_PING_ADDR_MASK
-
-#define CSI2_CTX_IRQENABLE(i)				(0x84 + (0x20 * (i)))
-#define CSI2_CTX_IRQSTATUS(i)				(0x88 + (0x20 * (i)))
-
-#define CSI2_CTX_CTRL3(i)				(0x8c + (0x20 * (i)))
-#define CSI2_CTX_CTRL3_ALPHA_SHIFT			5
-#define CSI2_CTX_CTRL3_ALPHA_MASK			\
-		(0x3fff << CSI2_CTX_CTRL3_ALPHA_SHIFT)
-
-/* Shared bits across CSI2_CTX_IRQENABLE and IRQSTATUS */
-#define CSI2_CTX_IRQ_ECC_CORRECTION			BIT(8)
-#define CSI2_CTX_IRQ_LINE_NUMBER			BIT(7)
-#define CSI2_CTX_IRQ_FRAME_NUMBER			BIT(6)
-#define CSI2_CTX_IRQ_CS					BIT(5)
-#define CSI2_CTX_IRQ_LE					BIT(3)
-#define CSI2_CTX_IRQ_LS					BIT(2)
-#define CSI2_CTX_IRQ_FE					BIT(1)
-#define CSI2_CTX_IRQ_FS					BIT(0)
-
-/* ISS BTE */
-#define BTE_CTRL					(0x0030)
-#define BTE_CTRL_BW_LIMITER_MASK			(0x3ff << 22)
-#define BTE_CTRL_BW_LIMITER_SHIFT			22
-
-/* ISS ISP_SYS1 */
-#define ISP5_REVISION					(0x0000)
-#define ISP5_SYSCONFIG					(0x0010)
-#define ISP5_SYSCONFIG_STANDBYMODE_MASK			(3 << 4)
-#define ISP5_SYSCONFIG_STANDBYMODE_FORCE		(0 << 4)
-#define ISP5_SYSCONFIG_STANDBYMODE_NO			(1 << 4)
-#define ISP5_SYSCONFIG_STANDBYMODE_SMART		(2 << 4)
-#define ISP5_SYSCONFIG_SOFTRESET			(1 << 1)
-
-#define ISP5_IRQSTATUS(i)				(0x0028 + (0x10 * (i)))
-#define ISP5_IRQENABLE_SET(i)				(0x002c + (0x10 * (i)))
-#define ISP5_IRQENABLE_CLR(i)				(0x0030 + (0x10 * (i)))
-
-/* Bits shared for ISP5_IRQ* registers */
-#define ISP5_IRQ_OCP_ERR				BIT(31)
-#define ISP5_IRQ_IPIPE_INT_DPC_RNEW1			BIT(29)
-#define ISP5_IRQ_IPIPE_INT_DPC_RNEW0			BIT(28)
-#define ISP5_IRQ_IPIPE_INT_DPC_INIT			BIT(27)
-#define ISP5_IRQ_IPIPE_INT_EOF				BIT(25)
-#define ISP5_IRQ_H3A_INT_EOF				BIT(24)
-#define ISP5_IRQ_RSZ_INT_EOF1				BIT(23)
-#define ISP5_IRQ_RSZ_INT_EOF0				BIT(22)
-#define ISP5_IRQ_RSZ_FIFO_IN_BLK_ERR			BIT(19)
-#define ISP5_IRQ_RSZ_FIFO_OVF				BIT(18)
-#define ISP5_IRQ_RSZ_INT_CYC_RSZB			BIT(17)
-#define ISP5_IRQ_RSZ_INT_CYC_RSZA			BIT(16)
-#define ISP5_IRQ_RSZ_INT_DMA				BIT(15)
-#define ISP5_IRQ_RSZ_INT_LAST_PIX			BIT(14)
-#define ISP5_IRQ_RSZ_INT_REG				BIT(13)
-#define ISP5_IRQ_H3A_INT				BIT(12)
-#define ISP5_IRQ_AF_INT					BIT(11)
-#define ISP5_IRQ_AEW_INT				BIT(10)
-#define ISP5_IRQ_IPIPEIF_IRQ				BIT(9)
-#define ISP5_IRQ_IPIPE_INT_HST				BIT(8)
-#define ISP5_IRQ_IPIPE_INT_BSC				BIT(7)
-#define ISP5_IRQ_IPIPE_INT_DMA				BIT(6)
-#define ISP5_IRQ_IPIPE_INT_LAST_PIX			BIT(5)
-#define ISP5_IRQ_IPIPE_INT_REG				BIT(4)
-#define ISP5_IRQ_ISIF_INT(i)				BIT(i)
-
-#define ISP5_CTRL					(0x006c)
-#define ISP5_CTRL_MSTANDBY				BIT(24)
-#define ISP5_CTRL_VD_PULSE_EXT				BIT(23)
-#define ISP5_CTRL_MSTANDBY_WAIT				BIT(20)
-#define ISP5_CTRL_BL_CLK_ENABLE				BIT(15)
-#define ISP5_CTRL_ISIF_CLK_ENABLE			BIT(14)
-#define ISP5_CTRL_H3A_CLK_ENABLE			BIT(13)
-#define ISP5_CTRL_RSZ_CLK_ENABLE			BIT(12)
-#define ISP5_CTRL_IPIPE_CLK_ENABLE			BIT(11)
-#define ISP5_CTRL_IPIPEIF_CLK_ENABLE			BIT(10)
-#define ISP5_CTRL_SYNC_ENABLE				BIT(9)
-#define ISP5_CTRL_PSYNC_CLK_SEL				BIT(8)
-
-/* ISS ISP ISIF register offsets */
-#define ISIF_SYNCEN					(0x0000)
-#define ISIF_SYNCEN_DWEN				BIT(1)
-#define ISIF_SYNCEN_SYEN				BIT(0)
-
-#define ISIF_MODESET					(0x0004)
-#define ISIF_MODESET_INPMOD_MASK			(3 << 12)
-#define ISIF_MODESET_INPMOD_RAW				(0 << 12)
-#define ISIF_MODESET_INPMOD_YCBCR16			(1 << 12)
-#define ISIF_MODESET_INPMOD_YCBCR8			(2 << 12)
-#define ISIF_MODESET_CCDW_MASK				(7 << 8)
-#define ISIF_MODESET_CCDW_2BIT				(2 << 8)
-#define ISIF_MODESET_CCDMD				(1 << 7)
-#define ISIF_MODESET_SWEN				(1 << 5)
-#define ISIF_MODESET_HDPOL				(1 << 3)
-#define ISIF_MODESET_VDPOL				(1 << 2)
-
-#define ISIF_SPH					(0x0018)
-#define ISIF_SPH_MASK					(0x7fff)
-
-#define ISIF_LNH					(0x001c)
-#define ISIF_LNH_MASK					(0x7fff)
-
-#define ISIF_LNV					(0x0028)
-#define ISIF_LNV_MASK					(0x7fff)
-
-#define ISIF_HSIZE					(0x0034)
-#define ISIF_HSIZE_ADCR					BIT(12)
-#define ISIF_HSIZE_HSIZE_MASK				(0xfff)
-
-#define ISIF_CADU					(0x003c)
-#define ISIF_CADU_MASK					(0x7ff)
-
-#define ISIF_CADL					(0x0040)
-#define ISIF_CADL_MASK					(0xffff)
-
-#define ISIF_CCOLP					(0x004c)
-#define ISIF_CCOLP_CP0_F0_R				(0 << 6)
-#define ISIF_CCOLP_CP0_F0_GR				(1 << 6)
-#define ISIF_CCOLP_CP0_F0_B				(3 << 6)
-#define ISIF_CCOLP_CP0_F0_GB				(2 << 6)
-#define ISIF_CCOLP_CP1_F0_R				(0 << 4)
-#define ISIF_CCOLP_CP1_F0_GR				(1 << 4)
-#define ISIF_CCOLP_CP1_F0_B				(3 << 4)
-#define ISIF_CCOLP_CP1_F0_GB				(2 << 4)
-#define ISIF_CCOLP_CP2_F0_R				(0 << 2)
-#define ISIF_CCOLP_CP2_F0_GR				(1 << 2)
-#define ISIF_CCOLP_CP2_F0_B				(3 << 2)
-#define ISIF_CCOLP_CP2_F0_GB				(2 << 2)
-#define ISIF_CCOLP_CP3_F0_R				(0 << 0)
-#define ISIF_CCOLP_CP3_F0_GR				(1 << 0)
-#define ISIF_CCOLP_CP3_F0_B				(3 << 0)
-#define ISIF_CCOLP_CP3_F0_GB				(2 << 0)
-
-#define ISIF_VDINT(i)					(0x0070 + (i) * 4)
-#define ISIF_VDINT_MASK					(0x7fff)
-
-#define ISIF_CGAMMAWD					(0x0080)
-#define ISIF_CGAMMAWD_GWDI_MASK				(0xf << 1)
-#define ISIF_CGAMMAWD_GWDI(bpp)				((16 - (bpp)) << 1)
-
-#define ISIF_CCDCFG					(0x0088)
-#define ISIF_CCDCFG_Y8POS				BIT(11)
-
-/* ISS ISP IPIPEIF register offsets */
-#define IPIPEIF_ENABLE					(0x0000)
-
-#define IPIPEIF_CFG1					(0x0004)
-#define IPIPEIF_CFG1_INPSRC1_MASK			(3 << 14)
-#define IPIPEIF_CFG1_INPSRC1_VPORT_RAW			(0 << 14)
-#define IPIPEIF_CFG1_INPSRC1_SDRAM_RAW			(1 << 14)
-#define IPIPEIF_CFG1_INPSRC1_ISIF_DARKFM		(2 << 14)
-#define IPIPEIF_CFG1_INPSRC1_SDRAM_YUV			(3 << 14)
-#define IPIPEIF_CFG1_INPSRC2_MASK			(3 << 2)
-#define IPIPEIF_CFG1_INPSRC2_ISIF			(0 << 2)
-#define IPIPEIF_CFG1_INPSRC2_SDRAM_RAW			(1 << 2)
-#define IPIPEIF_CFG1_INPSRC2_ISIF_DARKFM		(2 << 2)
-#define IPIPEIF_CFG1_INPSRC2_SDRAM_YUV			(3 << 2)
-
-#define IPIPEIF_CFG2					(0x0030)
-#define IPIPEIF_CFG2_YUV8P				BIT(7)
-#define IPIPEIF_CFG2_YUV8				BIT(6)
-#define IPIPEIF_CFG2_YUV16				BIT(3)
-#define IPIPEIF_CFG2_VDPOL				BIT(2)
-#define IPIPEIF_CFG2_HDPOL				BIT(1)
-#define IPIPEIF_CFG2_INTSW				BIT(0)
-
-#define IPIPEIF_CLKDIV					(0x0040)
-
-/* ISS ISP IPIPE register offsets */
-#define IPIPE_SRC_EN					(0x0000)
-#define IPIPE_SRC_EN_EN					BIT(0)
-
-#define IPIPE_SRC_MODE					(0x0004)
-#define IPIPE_SRC_MODE_WRT				BIT(1)
-#define IPIPE_SRC_MODE_OST				BIT(0)
-
-#define IPIPE_SRC_FMT					(0x0008)
-#define IPIPE_SRC_FMT_RAW2YUV				(0 << 0)
-#define IPIPE_SRC_FMT_RAW2RAW				(1 << 0)
-#define IPIPE_SRC_FMT_RAW2STATS				(2 << 0)
-#define IPIPE_SRC_FMT_YUV2YUV				(3 << 0)
-
-#define IPIPE_SRC_COL					(0x000c)
-#define IPIPE_SRC_COL_OO_R				(0 << 6)
-#define IPIPE_SRC_COL_OO_GR				(1 << 6)
-#define IPIPE_SRC_COL_OO_B				(3 << 6)
-#define IPIPE_SRC_COL_OO_GB				(2 << 6)
-#define IPIPE_SRC_COL_OE_R				(0 << 4)
-#define IPIPE_SRC_COL_OE_GR				(1 << 4)
-#define IPIPE_SRC_COL_OE_B				(3 << 4)
-#define IPIPE_SRC_COL_OE_GB				(2 << 4)
-#define IPIPE_SRC_COL_EO_R				(0 << 2)
-#define IPIPE_SRC_COL_EO_GR				(1 << 2)
-#define IPIPE_SRC_COL_EO_B				(3 << 2)
-#define IPIPE_SRC_COL_EO_GB				(2 << 2)
-#define IPIPE_SRC_COL_EE_R				(0 << 0)
-#define IPIPE_SRC_COL_EE_GR				(1 << 0)
-#define IPIPE_SRC_COL_EE_B				(3 << 0)
-#define IPIPE_SRC_COL_EE_GB				(2 << 0)
-
-#define IPIPE_SRC_VPS					(0x0010)
-#define IPIPE_SRC_VPS_MASK				(0xffff)
-
-#define IPIPE_SRC_VSZ					(0x0014)
-#define IPIPE_SRC_VSZ_MASK				(0x1fff)
-
-#define IPIPE_SRC_HPS					(0x0018)
-#define IPIPE_SRC_HPS_MASK				(0xffff)
-
-#define IPIPE_SRC_HSZ					(0x001c)
-#define IPIPE_SRC_HSZ_MASK				(0x1ffe)
-
-#define IPIPE_SEL_SBU					(0x0020)
-
-#define IPIPE_SRC_STA					(0x0024)
-
-#define IPIPE_GCK_MMR					(0x0028)
-#define IPIPE_GCK_MMR_REG				BIT(0)
-
-#define IPIPE_GCK_PIX					(0x002c)
-#define IPIPE_GCK_PIX_G3				BIT(3)
-#define IPIPE_GCK_PIX_G2				BIT(2)
-#define IPIPE_GCK_PIX_G1				BIT(1)
-#define IPIPE_GCK_PIX_G0				BIT(0)
-
-#define IPIPE_DPC_LUT_EN				(0x0034)
-#define IPIPE_DPC_LUT_SEL				(0x0038)
-#define IPIPE_DPC_LUT_ADR				(0x003c)
-#define IPIPE_DPC_LUT_SIZ				(0x0040)
-
-#define IPIPE_DPC_OTF_EN				(0x0044)
-#define IPIPE_DPC_OTF_TYP				(0x0048)
-#define IPIPE_DPC_OTF_2_D_THR_R				(0x004c)
-#define IPIPE_DPC_OTF_2_D_THR_GR			(0x0050)
-#define IPIPE_DPC_OTF_2_D_THR_GB			(0x0054)
-#define IPIPE_DPC_OTF_2_D_THR_B				(0x0058)
-#define IPIPE_DPC_OTF_2_C_THR_R				(0x005c)
-#define IPIPE_DPC_OTF_2_C_THR_GR			(0x0060)
-#define IPIPE_DPC_OTF_2_C_THR_GB			(0x0064)
-#define IPIPE_DPC_OTF_2_C_THR_B				(0x0068)
-#define IPIPE_DPC_OTF_3_SHF				(0x006c)
-#define IPIPE_DPC_OTF_3_D_THR				(0x0070)
-#define IPIPE_DPC_OTF_3_D_SPL				(0x0074)
-#define IPIPE_DPC_OTF_3_D_MIN				(0x0078)
-#define IPIPE_DPC_OTF_3_D_MAX				(0x007c)
-#define IPIPE_DPC_OTF_3_C_THR				(0x0080)
-#define IPIPE_DPC_OTF_3_C_SLP				(0x0084)
-#define IPIPE_DPC_OTF_3_C_MIN				(0x0088)
-#define IPIPE_DPC_OTF_3_C_MAX				(0x008c)
-
-#define IPIPE_LSC_VOFT					(0x0090)
-#define IPIPE_LSC_VA2					(0x0094)
-#define IPIPE_LSC_VA1					(0x0098)
-#define IPIPE_LSC_VS					(0x009c)
-#define IPIPE_LSC_HOFT					(0x00a0)
-#define IPIPE_LSC_HA2					(0x00a4)
-#define IPIPE_LSC_HA1					(0x00a8)
-#define IPIPE_LSC_HS					(0x00ac)
-#define IPIPE_LSC_GAN_R					(0x00b0)
-#define IPIPE_LSC_GAN_GR				(0x00b4)
-#define IPIPE_LSC_GAN_GB				(0x00b8)
-#define IPIPE_LSC_GAN_B					(0x00bc)
-#define IPIPE_LSC_OFT_R					(0x00c0)
-#define IPIPE_LSC_OFT_GR				(0x00c4)
-#define IPIPE_LSC_OFT_GB				(0x00c8)
-#define IPIPE_LSC_OFT_B					(0x00cc)
-#define IPIPE_LSC_SHF					(0x00d0)
-#define IPIPE_LSC_MAX					(0x00d4)
-
-#define IPIPE_D2F_1ST_EN				(0x00d8)
-#define IPIPE_D2F_1ST_TYP				(0x00dc)
-#define IPIPE_D2F_1ST_THR_00				(0x00e0)
-#define IPIPE_D2F_1ST_THR_01				(0x00e4)
-#define IPIPE_D2F_1ST_THR_02				(0x00e8)
-#define IPIPE_D2F_1ST_THR_03				(0x00ec)
-#define IPIPE_D2F_1ST_THR_04				(0x00f0)
-#define IPIPE_D2F_1ST_THR_05				(0x00f4)
-#define IPIPE_D2F_1ST_THR_06				(0x00f8)
-#define IPIPE_D2F_1ST_THR_07				(0x00fc)
-#define IPIPE_D2F_1ST_STR_00				(0x0100)
-#define IPIPE_D2F_1ST_STR_01				(0x0104)
-#define IPIPE_D2F_1ST_STR_02				(0x0108)
-#define IPIPE_D2F_1ST_STR_03				(0x010c)
-#define IPIPE_D2F_1ST_STR_04				(0x0110)
-#define IPIPE_D2F_1ST_STR_05				(0x0114)
-#define IPIPE_D2F_1ST_STR_06				(0x0118)
-#define IPIPE_D2F_1ST_STR_07				(0x011c)
-#define IPIPE_D2F_1ST_SPR_00				(0x0120)
-#define IPIPE_D2F_1ST_SPR_01				(0x0124)
-#define IPIPE_D2F_1ST_SPR_02				(0x0128)
-#define IPIPE_D2F_1ST_SPR_03				(0x012c)
-#define IPIPE_D2F_1ST_SPR_04				(0x0130)
-#define IPIPE_D2F_1ST_SPR_05				(0x0134)
-#define IPIPE_D2F_1ST_SPR_06				(0x0138)
-#define IPIPE_D2F_1ST_SPR_07				(0x013c)
-#define IPIPE_D2F_1ST_EDG_MIN				(0x0140)
-#define IPIPE_D2F_1ST_EDG_MAX				(0x0144)
-#define IPIPE_D2F_2ND_EN				(0x0148)
-#define IPIPE_D2F_2ND_TYP				(0x014c)
-#define IPIPE_D2F_2ND_THR00				(0x0150)
-#define IPIPE_D2F_2ND_THR01				(0x0154)
-#define IPIPE_D2F_2ND_THR02				(0x0158)
-#define IPIPE_D2F_2ND_THR03				(0x015c)
-#define IPIPE_D2F_2ND_THR04				(0x0160)
-#define IPIPE_D2F_2ND_THR05				(0x0164)
-#define IPIPE_D2F_2ND_THR06				(0x0168)
-#define IPIPE_D2F_2ND_THR07				(0x016c)
-#define IPIPE_D2F_2ND_STR_00				(0x0170)
-#define IPIPE_D2F_2ND_STR_01				(0x0174)
-#define IPIPE_D2F_2ND_STR_02				(0x0178)
-#define IPIPE_D2F_2ND_STR_03				(0x017c)
-#define IPIPE_D2F_2ND_STR_04				(0x0180)
-#define IPIPE_D2F_2ND_STR_05				(0x0184)
-#define IPIPE_D2F_2ND_STR_06				(0x0188)
-#define IPIPE_D2F_2ND_STR_07				(0x018c)
-#define IPIPE_D2F_2ND_SPR_00				(0x0190)
-#define IPIPE_D2F_2ND_SPR_01				(0x0194)
-#define IPIPE_D2F_2ND_SPR_02				(0x0198)
-#define IPIPE_D2F_2ND_SPR_03				(0x019c)
-#define IPIPE_D2F_2ND_SPR_04				(0x01a0)
-#define IPIPE_D2F_2ND_SPR_05				(0x01a4)
-#define IPIPE_D2F_2ND_SPR_06				(0x01a8)
-#define IPIPE_D2F_2ND_SPR_07				(0x01ac)
-#define IPIPE_D2F_2ND_EDG_MIN				(0x01b0)
-#define IPIPE_D2F_2ND_EDG_MAX				(0x01b4)
-
-#define IPIPE_GIC_EN					(0x01b8)
-#define IPIPE_GIC_TYP					(0x01bc)
-#define IPIPE_GIC_GAN					(0x01c0)
-#define IPIPE_GIC_NFGAIN				(0x01c4)
-#define IPIPE_GIC_THR					(0x01c8)
-#define IPIPE_GIC_SLP					(0x01cc)
-
-#define IPIPE_WB2_OFT_R					(0x01d0)
-#define IPIPE_WB2_OFT_GR				(0x01d4)
-#define IPIPE_WB2_OFT_GB				(0x01d8)
-#define IPIPE_WB2_OFT_B					(0x01dc)
-
-#define IPIPE_WB2_WGN_R					(0x01e0)
-#define IPIPE_WB2_WGN_GR				(0x01e4)
-#define IPIPE_WB2_WGN_GB				(0x01e8)
-#define IPIPE_WB2_WGN_B					(0x01ec)
-
-#define IPIPE_CFA_MODE					(0x01f0)
-#define IPIPE_CFA_2DIR_HPF_THR				(0x01f4)
-#define IPIPE_CFA_2DIR_HPF_SLP				(0x01f8)
-#define IPIPE_CFA_2DIR_MIX_THR				(0x01fc)
-#define IPIPE_CFA_2DIR_MIX_SLP				(0x0200)
-#define IPIPE_CFA_2DIR_DIR_TRH				(0x0204)
-#define IPIPE_CFA_2DIR_DIR_SLP				(0x0208)
-#define IPIPE_CFA_2DIR_NDWT				(0x020c)
-#define IPIPE_CFA_MONO_HUE_FRA				(0x0210)
-#define IPIPE_CFA_MONO_EDG_THR				(0x0214)
-#define IPIPE_CFA_MONO_THR_MIN				(0x0218)
-
-#define IPIPE_CFA_MONO_THR_SLP				(0x021c)
-#define IPIPE_CFA_MONO_SLP_MIN				(0x0220)
-#define IPIPE_CFA_MONO_SLP_SLP				(0x0224)
-#define IPIPE_CFA_MONO_LPWT				(0x0228)
-
-#define IPIPE_RGB1_MUL_RR				(0x022c)
-#define IPIPE_RGB1_MUL_GR				(0x0230)
-#define IPIPE_RGB1_MUL_BR				(0x0234)
-#define IPIPE_RGB1_MUL_RG				(0x0238)
-#define IPIPE_RGB1_MUL_GG				(0x023c)
-#define IPIPE_RGB1_MUL_BG				(0x0240)
-#define IPIPE_RGB1_MUL_RB				(0x0244)
-#define IPIPE_RGB1_MUL_GB				(0x0248)
-#define IPIPE_RGB1_MUL_BB				(0x024c)
-#define IPIPE_RGB1_OFT_OR				(0x0250)
-#define IPIPE_RGB1_OFT_OG				(0x0254)
-#define IPIPE_RGB1_OFT_OB				(0x0258)
-#define IPIPE_GMM_CFG					(0x025c)
-#define IPIPE_RGB2_MUL_RR				(0x0260)
-#define IPIPE_RGB2_MUL_GR				(0x0264)
-#define IPIPE_RGB2_MUL_BR				(0x0268)
-#define IPIPE_RGB2_MUL_RG				(0x026c)
-#define IPIPE_RGB2_MUL_GG				(0x0270)
-#define IPIPE_RGB2_MUL_BG				(0x0274)
-#define IPIPE_RGB2_MUL_RB				(0x0278)
-#define IPIPE_RGB2_MUL_GB				(0x027c)
-#define IPIPE_RGB2_MUL_BB				(0x0280)
-#define IPIPE_RGB2_OFT_OR				(0x0284)
-#define IPIPE_RGB2_OFT_OG				(0x0288)
-#define IPIPE_RGB2_OFT_OB				(0x028c)
-
-#define IPIPE_YUV_ADJ					(0x0294)
-#define IPIPE_YUV_MUL_RY				(0x0298)
-#define IPIPE_YUV_MUL_GY				(0x029c)
-#define IPIPE_YUV_MUL_BY				(0x02a0)
-#define IPIPE_YUV_MUL_RCB				(0x02a4)
-#define IPIPE_YUV_MUL_GCB				(0x02a8)
-#define IPIPE_YUV_MUL_BCB				(0x02ac)
-#define IPIPE_YUV_MUL_RCR				(0x02b0)
-#define IPIPE_YUV_MUL_GCR				(0x02b4)
-#define IPIPE_YUV_MUL_BCR				(0x02b8)
-#define IPIPE_YUV_OFT_Y					(0x02bc)
-#define IPIPE_YUV_OFT_CB				(0x02c0)
-#define IPIPE_YUV_OFT_CR				(0x02c4)
-
-#define IPIPE_YUV_PHS					(0x02c8)
-#define IPIPE_YUV_PHS_LPF				BIT(1)
-#define IPIPE_YUV_PHS_POS				BIT(0)
-
-#define IPIPE_YEE_EN					(0x02d4)
-#define IPIPE_YEE_TYP					(0x02d8)
-#define IPIPE_YEE_SHF					(0x02dc)
-#define IPIPE_YEE_MUL_00				(0x02e0)
-#define IPIPE_YEE_MUL_01				(0x02e4)
-#define IPIPE_YEE_MUL_02				(0x02e8)
-#define IPIPE_YEE_MUL_10				(0x02ec)
-#define IPIPE_YEE_MUL_11				(0x02f0)
-#define IPIPE_YEE_MUL_12				(0x02f4)
-#define IPIPE_YEE_MUL_20				(0x02f8)
-#define IPIPE_YEE_MUL_21				(0x02fc)
-#define IPIPE_YEE_MUL_22				(0x0300)
-#define IPIPE_YEE_THR					(0x0304)
-#define IPIPE_YEE_E_GAN					(0x0308)
-#define IPIPE_YEE_E_THR_1				(0x030c)
-#define IPIPE_YEE_E_THR_2				(0x0310)
-#define IPIPE_YEE_G_GAN					(0x0314)
-#define IPIPE_YEE_G_OFT					(0x0318)
-
-#define IPIPE_CAR_EN					(0x031c)
-#define IPIPE_CAR_TYP					(0x0320)
-#define IPIPE_CAR_SW					(0x0324)
-#define IPIPE_CAR_HPF_TYP				(0x0328)
-#define IPIPE_CAR_HPF_SHF				(0x032c)
-#define IPIPE_CAR_HPF_THR				(0x0330)
-#define IPIPE_CAR_GN1_GAN				(0x0334)
-#define IPIPE_CAR_GN1_SHF				(0x0338)
-#define IPIPE_CAR_GN1_MIN				(0x033c)
-#define IPIPE_CAR_GN2_GAN				(0x0340)
-#define IPIPE_CAR_GN2_SHF				(0x0344)
-#define IPIPE_CAR_GN2_MIN				(0x0348)
-#define IPIPE_CGS_EN					(0x034c)
-#define IPIPE_CGS_GN1_L_THR				(0x0350)
-#define IPIPE_CGS_GN1_L_GAIN				(0x0354)
-#define IPIPE_CGS_GN1_L_SHF				(0x0358)
-#define IPIPE_CGS_GN1_L_MIN				(0x035c)
-#define IPIPE_CGS_GN1_H_THR				(0x0360)
-#define IPIPE_CGS_GN1_H_GAIN				(0x0364)
-#define IPIPE_CGS_GN1_H_SHF				(0x0368)
-#define IPIPE_CGS_GN1_H_MIN				(0x036c)
-#define IPIPE_CGS_GN2_L_THR				(0x0370)
-#define IPIPE_CGS_GN2_L_GAIN				(0x0374)
-#define IPIPE_CGS_GN2_L_SHF				(0x0378)
-#define IPIPE_CGS_GN2_L_MIN				(0x037c)
-
-#define IPIPE_BOX_EN					(0x0380)
-#define IPIPE_BOX_MODE					(0x0384)
-#define IPIPE_BOX_TYP					(0x0388)
-#define IPIPE_BOX_SHF					(0x038c)
-#define IPIPE_BOX_SDR_SAD_H				(0x0390)
-#define IPIPE_BOX_SDR_SAD_L				(0x0394)
-
-#define IPIPE_HST_EN					(0x039c)
-#define IPIPE_HST_MODE					(0x03a0)
-#define IPIPE_HST_SEL					(0x03a4)
-#define IPIPE_HST_PARA					(0x03a8)
-#define IPIPE_HST_0_VPS					(0x03ac)
-#define IPIPE_HST_0_VSZ					(0x03b0)
-#define IPIPE_HST_0_HPS					(0x03b4)
-#define IPIPE_HST_0_HSZ					(0x03b8)
-#define IPIPE_HST_1_VPS					(0x03bc)
-#define IPIPE_HST_1_VSZ					(0x03c0)
-#define IPIPE_HST_1_HPS					(0x03c4)
-#define IPIPE_HST_1_HSZ					(0x03c8)
-#define IPIPE_HST_2_VPS					(0x03cc)
-#define IPIPE_HST_2_VSZ					(0x03d0)
-#define IPIPE_HST_2_HPS					(0x03d4)
-#define IPIPE_HST_2_HSZ					(0x03d8)
-#define IPIPE_HST_3_VPS					(0x03dc)
-#define IPIPE_HST_3_VSZ					(0x03e0)
-#define IPIPE_HST_3_HPS					(0x03e4)
-#define IPIPE_HST_3_HSZ					(0x03e8)
-#define IPIPE_HST_TBL					(0x03ec)
-#define IPIPE_HST_MUL_R					(0x03f0)
-#define IPIPE_HST_MUL_GR				(0x03f4)
-#define IPIPE_HST_MUL_GB				(0x03f8)
-#define IPIPE_HST_MUL_B					(0x03fc)
-
-#define IPIPE_BSC_EN					(0x0400)
-#define IPIPE_BSC_MODE					(0x0404)
-#define IPIPE_BSC_TYP					(0x0408)
-#define IPIPE_BSC_ROW_VCT				(0x040c)
-#define IPIPE_BSC_ROW_SHF				(0x0410)
-#define IPIPE_BSC_ROW_VPO				(0x0414)
-#define IPIPE_BSC_ROW_VNU				(0x0418)
-#define IPIPE_BSC_ROW_VSKIP				(0x041c)
-#define IPIPE_BSC_ROW_HPO				(0x0420)
-#define IPIPE_BSC_ROW_HNU				(0x0424)
-#define IPIPE_BSC_ROW_HSKIP				(0x0428)
-#define IPIPE_BSC_COL_VCT				(0x042c)
-#define IPIPE_BSC_COL_SHF				(0x0430)
-#define IPIPE_BSC_COL_VPO				(0x0434)
-#define IPIPE_BSC_COL_VNU				(0x0438)
-#define IPIPE_BSC_COL_VSKIP				(0x043c)
-#define IPIPE_BSC_COL_HPO				(0x0440)
-#define IPIPE_BSC_COL_HNU				(0x0444)
-#define IPIPE_BSC_COL_HSKIP				(0x0448)
-
-#define IPIPE_BSC_EN					(0x0400)
-
-/* ISS ISP Resizer register offsets */
-#define RSZ_REVISION					(0x0000)
-#define RSZ_SYSCONFIG					(0x0004)
-#define RSZ_SYSCONFIG_RSZB_CLK_EN			BIT(9)
-#define RSZ_SYSCONFIG_RSZA_CLK_EN			BIT(8)
-
-#define RSZ_IN_FIFO_CTRL				(0x000c)
-#define RSZ_IN_FIFO_CTRL_THRLD_LOW_MASK			(0x1ff << 16)
-#define RSZ_IN_FIFO_CTRL_THRLD_LOW_SHIFT		16
-#define RSZ_IN_FIFO_CTRL_THRLD_HIGH_MASK		(0x1ff << 0)
-#define RSZ_IN_FIFO_CTRL_THRLD_HIGH_SHIFT		0
-
-#define RSZ_FRACDIV					(0x0008)
-#define RSZ_FRACDIV_MASK				(0xffff)
-
-#define RSZ_SRC_EN					(0x0020)
-#define RSZ_SRC_EN_SRC_EN				BIT(0)
-
-#define RSZ_SRC_MODE					(0x0024)
-#define RSZ_SRC_MODE_OST				BIT(0)
-#define RSZ_SRC_MODE_WRT				BIT(1)
-
-#define RSZ_SRC_FMT0					(0x0028)
-#define RSZ_SRC_FMT0_BYPASS				BIT(1)
-#define RSZ_SRC_FMT0_SEL				BIT(0)
-
-#define RSZ_SRC_FMT1					(0x002c)
-#define RSZ_SRC_FMT1_IN420				BIT(1)
-
-#define RSZ_SRC_VPS					(0x0030)
-#define RSZ_SRC_VSZ					(0x0034)
-#define RSZ_SRC_HPS					(0x0038)
-#define RSZ_SRC_HSZ					(0x003c)
-#define RSZ_DMA_RZA					(0x0040)
-#define RSZ_DMA_RZB					(0x0044)
-#define RSZ_DMA_STA					(0x0048)
-#define RSZ_GCK_MMR					(0x004c)
-#define RSZ_GCK_MMR_MMR					BIT(0)
-
-#define RSZ_GCK_SDR					(0x0054)
-#define RSZ_GCK_SDR_CORE				BIT(0)
-
-#define RSZ_IRQ_RZA					(0x0058)
-#define RSZ_IRQ_RZA_MASK				(0x1fff)
-
-#define RSZ_IRQ_RZB					(0x005c)
-#define RSZ_IRQ_RZB_MASK				(0x1fff)
-
-#define RSZ_YUV_Y_MIN					(0x0060)
-#define RSZ_YUV_Y_MAX					(0x0064)
-#define RSZ_YUV_C_MIN					(0x0068)
-#define RSZ_YUV_C_MAX					(0x006c)
-
-#define RSZ_SEQ						(0x0074)
-#define RSZ_SEQ_HRVB					BIT(2)
-#define RSZ_SEQ_HRVA					BIT(0)
-
-#define RZA_EN						(0x0078)
-#define RZA_MODE					(0x007c)
-#define RZA_MODE_ONE_SHOT				BIT(0)
-
-#define RZA_420						(0x0080)
-#define RZA_I_VPS					(0x0084)
-#define RZA_I_HPS					(0x0088)
-#define RZA_O_VSZ					(0x008c)
-#define RZA_O_HSZ					(0x0090)
-#define RZA_V_PHS_Y					(0x0094)
-#define RZA_V_PHS_C					(0x0098)
-#define RZA_V_DIF					(0x009c)
-#define RZA_V_TYP					(0x00a0)
-#define RZA_V_LPF					(0x00a4)
-#define RZA_H_PHS					(0x00a8)
-#define RZA_H_DIF					(0x00b0)
-#define RZA_H_TYP					(0x00b4)
-#define RZA_H_LPF					(0x00b8)
-#define RZA_DWN_EN					(0x00bc)
-#define RZA_SDR_Y_BAD_H					(0x00d0)
-#define RZA_SDR_Y_BAD_L					(0x00d4)
-#define RZA_SDR_Y_SAD_H					(0x00d8)
-#define RZA_SDR_Y_SAD_L					(0x00dc)
-#define RZA_SDR_Y_OFT					(0x00e0)
-#define RZA_SDR_Y_PTR_S					(0x00e4)
-#define RZA_SDR_Y_PTR_E					(0x00e8)
-#define RZA_SDR_C_BAD_H					(0x00ec)
-#define RZA_SDR_C_BAD_L					(0x00f0)
-#define RZA_SDR_C_SAD_H					(0x00f4)
-#define RZA_SDR_C_SAD_L					(0x00f8)
-#define RZA_SDR_C_OFT					(0x00fc)
-#define RZA_SDR_C_PTR_S					(0x0100)
-#define RZA_SDR_C_PTR_E					(0x0104)
-
-#define RZB_EN						(0x0108)
-#define RZB_MODE					(0x010c)
-#define RZB_420						(0x0110)
-#define RZB_I_VPS					(0x0114)
-#define RZB_I_HPS					(0x0118)
-#define RZB_O_VSZ					(0x011c)
-#define RZB_O_HSZ					(0x0120)
-
-#define RZB_V_DIF					(0x012c)
-#define RZB_V_TYP					(0x0130)
-#define RZB_V_LPF					(0x0134)
-
-#define RZB_H_DIF					(0x0140)
-#define RZB_H_TYP					(0x0144)
-#define RZB_H_LPF					(0x0148)
-
-#define RZB_SDR_Y_BAD_H					(0x0160)
-#define RZB_SDR_Y_BAD_L					(0x0164)
-#define RZB_SDR_Y_SAD_H					(0x0168)
-#define RZB_SDR_Y_SAD_L					(0x016c)
-#define RZB_SDR_Y_OFT					(0x0170)
-#define RZB_SDR_Y_PTR_S					(0x0174)
-#define RZB_SDR_Y_PTR_E					(0x0178)
-#define RZB_SDR_C_BAD_H					(0x017c)
-#define RZB_SDR_C_BAD_L					(0x0180)
-#define RZB_SDR_C_SAD_H					(0x0184)
-#define RZB_SDR_C_SAD_L					(0x0188)
-
-#define RZB_SDR_C_PTR_S					(0x0190)
-#define RZB_SDR_C_PTR_E					(0x0194)
-
-/* Shared Bitmasks between RZA & RZB */
-#define RSZ_EN_EN					BIT(0)
-
-#define RSZ_420_CEN					BIT(1)
-#define RSZ_420_YEN					BIT(0)
-
-#define RSZ_I_VPS_MASK					(0x1fff)
-
-#define RSZ_I_HPS_MASK					(0x1fff)
-
-#define RSZ_O_VSZ_MASK					(0x1fff)
-
-#define RSZ_O_HSZ_MASK					(0x1ffe)
-
-#define RSZ_V_PHS_Y_MASK				(0x3fff)
-
-#define RSZ_V_PHS_C_MASK				(0x3fff)
-
-#define RSZ_V_DIF_MASK					(0x3fff)
-
-#define RSZ_V_TYP_C					BIT(1)
-#define RSZ_V_TYP_Y					BIT(0)
-
-#define RSZ_V_LPF_C_MASK				(0x3f << 6)
-#define RSZ_V_LPF_C_SHIFT				6
-#define RSZ_V_LPF_Y_MASK				(0x3f << 0)
-#define RSZ_V_LPF_Y_SHIFT				0
-
-#define RSZ_H_PHS_MASK					(0x3fff)
-
-#define RSZ_H_DIF_MASK					(0x3fff)
-
-#define RSZ_H_TYP_C					BIT(1)
-#define RSZ_H_TYP_Y					BIT(0)
-
-#define RSZ_H_LPF_C_MASK				(0x3f << 6)
-#define RSZ_H_LPF_C_SHIFT				6
-#define RSZ_H_LPF_Y_MASK				(0x3f << 0)
-#define RSZ_H_LPF_Y_SHIFT				0
-
-#define RSZ_DWN_EN_DWN_EN				BIT(0)
-
-#endif /* _OMAP4_ISS_REGS_H_ */
diff --git a/drivers/staging/media/omap4iss/iss_resizer.c b/drivers/staging/media/omap4iss/iss_resizer.c
deleted file mode 100644
index 58e698ef9108..000000000000
--- a/drivers/staging/media/omap4iss/iss_resizer.c
+++ /dev/null
@@ -1,884 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * TI OMAP4 ISS V4L2 Driver - ISP RESIZER module
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/dma-mapping.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-
-#include "iss.h"
-#include "iss_regs.h"
-#include "iss_resizer.h"
-
-static const unsigned int resizer_fmts[] = {
-	MEDIA_BUS_FMT_UYVY8_1X16,
-	MEDIA_BUS_FMT_YUYV8_1X16,
-};
-
-/*
- * resizer_print_status - Print current RESIZER Module register values.
- * @resizer: Pointer to ISS ISP RESIZER device.
- *
- * Also prints other debug information stored in the RESIZER module.
- */
-#define RSZ_PRINT_REGISTER(iss, name)\
-	dev_dbg(iss->dev, "###RSZ " #name "=0x%08x\n", \
-		iss_reg_read(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_##name))
-
-#define RZA_PRINT_REGISTER(iss, name)\
-	dev_dbg(iss->dev, "###RZA " #name "=0x%08x\n", \
-		iss_reg_read(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_##name))
-
-static void resizer_print_status(struct iss_resizer_device *resizer)
-{
-	struct iss_device *iss = to_iss_device(resizer);
-
-	dev_dbg(iss->dev, "-------------RESIZER Register dump-------------\n");
-
-	RSZ_PRINT_REGISTER(iss, SYSCONFIG);
-	RSZ_PRINT_REGISTER(iss, IN_FIFO_CTRL);
-	RSZ_PRINT_REGISTER(iss, FRACDIV);
-	RSZ_PRINT_REGISTER(iss, SRC_EN);
-	RSZ_PRINT_REGISTER(iss, SRC_MODE);
-	RSZ_PRINT_REGISTER(iss, SRC_FMT0);
-	RSZ_PRINT_REGISTER(iss, SRC_FMT1);
-	RSZ_PRINT_REGISTER(iss, SRC_VPS);
-	RSZ_PRINT_REGISTER(iss, SRC_VSZ);
-	RSZ_PRINT_REGISTER(iss, SRC_HPS);
-	RSZ_PRINT_REGISTER(iss, SRC_HSZ);
-	RSZ_PRINT_REGISTER(iss, DMA_RZA);
-	RSZ_PRINT_REGISTER(iss, DMA_RZB);
-	RSZ_PRINT_REGISTER(iss, DMA_STA);
-	RSZ_PRINT_REGISTER(iss, GCK_MMR);
-	RSZ_PRINT_REGISTER(iss, GCK_SDR);
-	RSZ_PRINT_REGISTER(iss, IRQ_RZA);
-	RSZ_PRINT_REGISTER(iss, IRQ_RZB);
-	RSZ_PRINT_REGISTER(iss, YUV_Y_MIN);
-	RSZ_PRINT_REGISTER(iss, YUV_Y_MAX);
-	RSZ_PRINT_REGISTER(iss, YUV_C_MIN);
-	RSZ_PRINT_REGISTER(iss, YUV_C_MAX);
-	RSZ_PRINT_REGISTER(iss, SEQ);
-
-	RZA_PRINT_REGISTER(iss, EN);
-	RZA_PRINT_REGISTER(iss, MODE);
-	RZA_PRINT_REGISTER(iss, 420);
-	RZA_PRINT_REGISTER(iss, I_VPS);
-	RZA_PRINT_REGISTER(iss, I_HPS);
-	RZA_PRINT_REGISTER(iss, O_VSZ);
-	RZA_PRINT_REGISTER(iss, O_HSZ);
-	RZA_PRINT_REGISTER(iss, V_PHS_Y);
-	RZA_PRINT_REGISTER(iss, V_PHS_C);
-	RZA_PRINT_REGISTER(iss, V_DIF);
-	RZA_PRINT_REGISTER(iss, V_TYP);
-	RZA_PRINT_REGISTER(iss, V_LPF);
-	RZA_PRINT_REGISTER(iss, H_PHS);
-	RZA_PRINT_REGISTER(iss, H_DIF);
-	RZA_PRINT_REGISTER(iss, H_TYP);
-	RZA_PRINT_REGISTER(iss, H_LPF);
-	RZA_PRINT_REGISTER(iss, DWN_EN);
-	RZA_PRINT_REGISTER(iss, SDR_Y_BAD_H);
-	RZA_PRINT_REGISTER(iss, SDR_Y_BAD_L);
-	RZA_PRINT_REGISTER(iss, SDR_Y_SAD_H);
-	RZA_PRINT_REGISTER(iss, SDR_Y_SAD_L);
-	RZA_PRINT_REGISTER(iss, SDR_Y_OFT);
-	RZA_PRINT_REGISTER(iss, SDR_Y_PTR_S);
-	RZA_PRINT_REGISTER(iss, SDR_Y_PTR_E);
-	RZA_PRINT_REGISTER(iss, SDR_C_BAD_H);
-	RZA_PRINT_REGISTER(iss, SDR_C_BAD_L);
-	RZA_PRINT_REGISTER(iss, SDR_C_SAD_H);
-	RZA_PRINT_REGISTER(iss, SDR_C_SAD_L);
-	RZA_PRINT_REGISTER(iss, SDR_C_OFT);
-	RZA_PRINT_REGISTER(iss, SDR_C_PTR_S);
-	RZA_PRINT_REGISTER(iss, SDR_C_PTR_E);
-
-	dev_dbg(iss->dev, "-----------------------------------------------\n");
-}
-
-/*
- * resizer_enable - Enable/Disable RESIZER.
- * @enable: enable flag
- *
- */
-static void resizer_enable(struct iss_resizer_device *resizer, u8 enable)
-{
-	struct iss_device *iss = to_iss_device(resizer);
-
-	iss_reg_update(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_SRC_EN,
-		       RSZ_SRC_EN_SRC_EN, enable ? RSZ_SRC_EN_SRC_EN : 0);
-
-	/* TODO: Enable RSZB */
-	iss_reg_update(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_EN, RSZ_EN_EN,
-		       enable ? RSZ_EN_EN : 0);
-}
-
-/* -----------------------------------------------------------------------------
- * Format- and pipeline-related configuration helpers
- */
-
-/*
- * resizer_set_outaddr - Set memory address to save output image
- * @resizer: Pointer to ISP RESIZER device.
- * @addr: 32-bit memory address aligned on 32 byte boundary.
- *
- * Sets the memory address where the output will be saved.
- */
-static void resizer_set_outaddr(struct iss_resizer_device *resizer, u32 addr)
-{
-	struct iss_device *iss = to_iss_device(resizer);
-	struct v4l2_mbus_framefmt *informat, *outformat;
-
-	informat = &resizer->formats[RESIZER_PAD_SINK];
-	outformat = &resizer->formats[RESIZER_PAD_SOURCE_MEM];
-
-	/* Save address split in Base Address H & L */
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_Y_BAD_H,
-		      (addr >> 16) & 0xffff);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_Y_BAD_L,
-		      addr & 0xffff);
-
-	/* SAD = BAD */
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_Y_SAD_H,
-		      (addr >> 16) & 0xffff);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_Y_SAD_L,
-		      addr & 0xffff);
-
-	/* Program UV buffer address... Hardcoded to be contiguous! */
-	if ((informat->code == MEDIA_BUS_FMT_UYVY8_1X16) &&
-	    (outformat->code == MEDIA_BUS_FMT_YUYV8_1_5X8)) {
-		u32 c_addr = addr + resizer->video_out.bpl_value
-			   * outformat->height;
-
-		/* Ensure Y_BAD_L[6:0] = C_BAD_L[6:0]*/
-		if ((c_addr ^ addr) & 0x7f) {
-			c_addr &= ~0x7f;
-			c_addr += 0x80;
-			c_addr |= addr & 0x7f;
-		}
-
-		/* Save address split in Base Address H & L */
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_C_BAD_H,
-			      (c_addr >> 16) & 0xffff);
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_C_BAD_L,
-			      c_addr & 0xffff);
-
-		/* SAD = BAD */
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_C_SAD_H,
-			      (c_addr >> 16) & 0xffff);
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_C_SAD_L,
-			      c_addr & 0xffff);
-	}
-}
-
-static void resizer_configure(struct iss_resizer_device *resizer)
-{
-	struct iss_device *iss = to_iss_device(resizer);
-	struct v4l2_mbus_framefmt *informat, *outformat;
-
-	informat = &resizer->formats[RESIZER_PAD_SINK];
-	outformat = &resizer->formats[RESIZER_PAD_SOURCE_MEM];
-
-	/* Disable pass-through more. Despite its name, the BYPASS bit controls
-	 * pass-through mode, not bypass mode.
-	 */
-	iss_reg_clr(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_SRC_FMT0,
-		    RSZ_SRC_FMT0_BYPASS);
-
-	/* Select RSZ input */
-	iss_reg_update(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_SRC_FMT0,
-		       RSZ_SRC_FMT0_SEL,
-		       resizer->input == RESIZER_INPUT_IPIPEIF ?
-		       RSZ_SRC_FMT0_SEL : 0);
-
-	/* RSZ ignores WEN signal from IPIPE/IPIPEIF */
-	iss_reg_clr(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_SRC_MODE,
-		    RSZ_SRC_MODE_WRT);
-
-	/* Set Resizer in free-running mode */
-	iss_reg_clr(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_SRC_MODE,
-		    RSZ_SRC_MODE_OST);
-
-	/* Init Resizer A */
-	iss_reg_clr(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_MODE,
-		    RZA_MODE_ONE_SHOT);
-
-	/* Set size related things now */
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_SRC_VPS, 0);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_SRC_HPS, 0);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_SRC_VSZ,
-		      informat->height - 2);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_SRC_HSZ,
-		      informat->width - 1);
-
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_I_VPS, 0);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_I_HPS, 0);
-
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_O_VSZ,
-		      outformat->height - 2);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_O_HSZ,
-		      outformat->width - 1);
-
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_V_DIF, 0x100);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_H_DIF, 0x100);
-
-	/* Buffer output settings */
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_Y_PTR_S, 0);
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_Y_PTR_E,
-		      outformat->height - 1);
-
-	iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_Y_OFT,
-		      resizer->video_out.bpl_value);
-
-	/* UYVY -> NV12 conversion */
-	if ((informat->code == MEDIA_BUS_FMT_UYVY8_1X16) &&
-	    (outformat->code == MEDIA_BUS_FMT_YUYV8_1_5X8)) {
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_420,
-			      RSZ_420_CEN | RSZ_420_YEN);
-
-		/* UV Buffer output settings */
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_C_PTR_S,
-			      0);
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_C_PTR_E,
-			      outformat->height - 1);
-
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_SDR_C_OFT,
-			      resizer->video_out.bpl_value);
-	} else {
-		iss_reg_write(iss, OMAP4_ISS_MEM_ISP_RESIZER, RZA_420, 0);
-	}
-}
-
-/* -----------------------------------------------------------------------------
- * Interrupt handling
- */
-
-static void resizer_isr_buffer(struct iss_resizer_device *resizer)
-{
-	struct iss_buffer *buffer;
-
-	/* The whole resizer needs to be stopped. Disabling RZA only produces
-	 * input FIFO overflows, most probably when the next frame is received.
-	 */
-	resizer_enable(resizer, 0);
-
-	buffer = omap4iss_video_buffer_next(&resizer->video_out);
-	if (!buffer)
-		return;
-
-	resizer_set_outaddr(resizer, buffer->iss_addr);
-
-	resizer_enable(resizer, 1);
-}
-
-/*
- * omap4iss_resizer_isr - Configure resizer during interframe time.
- * @resizer: Pointer to ISP RESIZER device.
- * @events: RESIZER events
- */
-void omap4iss_resizer_isr(struct iss_resizer_device *resizer, u32 events)
-{
-	struct iss_device *iss = to_iss_device(resizer);
-	struct iss_pipeline *pipe =
-			     to_iss_pipeline(&resizer->subdev.entity);
-
-	if (events & (ISP5_IRQ_RSZ_FIFO_IN_BLK_ERR |
-		      ISP5_IRQ_RSZ_FIFO_OVF)) {
-		dev_dbg(iss->dev, "RSZ Err: FIFO_IN_BLK:%d, FIFO_OVF:%d\n",
-			events & ISP5_IRQ_RSZ_FIFO_IN_BLK_ERR ? 1 : 0,
-			events & ISP5_IRQ_RSZ_FIFO_OVF ? 1 : 0);
-		omap4iss_pipeline_cancel_stream(pipe);
-	}
-
-	if (omap4iss_module_sync_is_stopping(&resizer->wait,
-					     &resizer->stopping))
-		return;
-
-	if (events & ISP5_IRQ_RSZ_INT_DMA)
-		resizer_isr_buffer(resizer);
-}
-
-/* -----------------------------------------------------------------------------
- * ISS video operations
- */
-
-static int resizer_video_queue(struct iss_video *video,
-			       struct iss_buffer *buffer)
-{
-	struct iss_resizer_device *resizer = container_of(video,
-				struct iss_resizer_device, video_out);
-
-	if (!(resizer->output & RESIZER_OUTPUT_MEMORY))
-		return -ENODEV;
-
-	resizer_set_outaddr(resizer, buffer->iss_addr);
-
-	/*
-	 * If streaming was enabled before there was a buffer queued
-	 * or underrun happened in the ISR, the hardware was not enabled
-	 * and DMA queue flag ISS_VIDEO_DMAQUEUE_UNDERRUN is still set.
-	 * Enable it now.
-	 */
-	if (video->dmaqueue_flags & ISS_VIDEO_DMAQUEUE_UNDERRUN) {
-		resizer_enable(resizer, 1);
-		iss_video_dmaqueue_flags_clr(video);
-	}
-
-	return 0;
-}
-
-static const struct iss_video_operations resizer_video_ops = {
-	.queue = resizer_video_queue,
-};
-
-/* -----------------------------------------------------------------------------
- * V4L2 subdev operations
- */
-
-/*
- * resizer_set_stream - Enable/Disable streaming on the RESIZER module
- * @sd: ISP RESIZER V4L2 subdevice
- * @enable: Enable/disable stream
- */
-static int resizer_set_stream(struct v4l2_subdev *sd, int enable)
-{
-	struct iss_resizer_device *resizer = v4l2_get_subdevdata(sd);
-	struct iss_device *iss = to_iss_device(resizer);
-	struct iss_video *video_out = &resizer->video_out;
-	int ret = 0;
-
-	if (resizer->state == ISS_PIPELINE_STREAM_STOPPED) {
-		if (enable == ISS_PIPELINE_STREAM_STOPPED)
-			return 0;
-
-		omap4iss_isp_subclk_enable(iss, OMAP4_ISS_ISP_SUBCLK_RSZ);
-
-		iss_reg_set(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_GCK_MMR,
-			    RSZ_GCK_MMR_MMR);
-		iss_reg_set(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_GCK_SDR,
-			    RSZ_GCK_SDR_CORE);
-
-		/* FIXME: Enable RSZB also */
-		iss_reg_set(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_SYSCONFIG,
-			    RSZ_SYSCONFIG_RSZA_CLK_EN);
-	}
-
-	switch (enable) {
-	case ISS_PIPELINE_STREAM_CONTINUOUS:
-
-		resizer_configure(resizer);
-		resizer_print_status(resizer);
-
-		/*
-		 * When outputting to memory with no buffer available, let the
-		 * buffer queue handler start the hardware. A DMA queue flag
-		 * ISS_VIDEO_DMAQUEUE_QUEUED will be set as soon as there is
-		 * a buffer available.
-		 */
-		if (resizer->output & RESIZER_OUTPUT_MEMORY &&
-		    !(video_out->dmaqueue_flags & ISS_VIDEO_DMAQUEUE_QUEUED))
-			break;
-
-		atomic_set(&resizer->stopping, 0);
-		resizer_enable(resizer, 1);
-		iss_video_dmaqueue_flags_clr(video_out);
-		break;
-
-	case ISS_PIPELINE_STREAM_STOPPED:
-		if (resizer->state == ISS_PIPELINE_STREAM_STOPPED)
-			return 0;
-		if (omap4iss_module_sync_idle(&sd->entity, &resizer->wait,
-					      &resizer->stopping))
-			ret = -ETIMEDOUT;
-
-		resizer_enable(resizer, 0);
-		iss_reg_clr(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_SYSCONFIG,
-			    RSZ_SYSCONFIG_RSZA_CLK_EN);
-		iss_reg_clr(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_GCK_SDR,
-			    RSZ_GCK_SDR_CORE);
-		iss_reg_clr(iss, OMAP4_ISS_MEM_ISP_RESIZER, RSZ_GCK_MMR,
-			    RSZ_GCK_MMR_MMR);
-		omap4iss_isp_subclk_disable(iss, OMAP4_ISS_ISP_SUBCLK_RSZ);
-		iss_video_dmaqueue_flags_clr(video_out);
-		break;
-	}
-
-	resizer->state = enable;
-	return ret;
-}
-
-static struct v4l2_mbus_framefmt *
-__resizer_get_format(struct iss_resizer_device *resizer,
-		     struct v4l2_subdev_state *sd_state, unsigned int pad,
-		     enum v4l2_subdev_format_whence which)
-{
-	if (which == V4L2_SUBDEV_FORMAT_TRY)
-		return v4l2_subdev_state_get_format(sd_state, pad);
-	return &resizer->formats[pad];
-}
-
-/*
- * resizer_try_format - Try video format on a pad
- * @resizer: ISS RESIZER device
- * @sd_state: V4L2 subdev state
- * @pad: Pad number
- * @fmt: Format
- */
-static void
-resizer_try_format(struct iss_resizer_device *resizer,
-		   struct v4l2_subdev_state *sd_state, unsigned int pad,
-		   struct v4l2_mbus_framefmt *fmt,
-		   enum v4l2_subdev_format_whence which)
-{
-	u32 pixelcode;
-	struct v4l2_mbus_framefmt *format;
-	unsigned int width = fmt->width;
-	unsigned int height = fmt->height;
-	unsigned int i;
-
-	switch (pad) {
-	case RESIZER_PAD_SINK:
-		for (i = 0; i < ARRAY_SIZE(resizer_fmts); i++) {
-			if (fmt->code == resizer_fmts[i])
-				break;
-		}
-
-		/* If not found, use UYVY as default */
-		if (i >= ARRAY_SIZE(resizer_fmts))
-			fmt->code = MEDIA_BUS_FMT_UYVY8_1X16;
-
-		/* Clamp the input size. */
-		fmt->width = clamp_t(u32, width, 1, 8192);
-		fmt->height = clamp_t(u32, height, 1, 8192);
-		break;
-
-	case RESIZER_PAD_SOURCE_MEM:
-		pixelcode = fmt->code;
-		format = __resizer_get_format(resizer, sd_state,
-					      RESIZER_PAD_SINK,
-					      which);
-		memcpy(fmt, format, sizeof(*fmt));
-
-		if ((pixelcode == MEDIA_BUS_FMT_YUYV8_1_5X8) &&
-		    (fmt->code == MEDIA_BUS_FMT_UYVY8_1X16))
-			fmt->code = pixelcode;
-
-		/* The data formatter truncates the number of horizontal output
-		 * pixels to a multiple of 16. To avoid clipping data, allow
-		 * callers to request an output size bigger than the input size
-		 * up to the nearest multiple of 16.
-		 */
-		fmt->width = clamp_t(u32, width, 32, (fmt->width + 15) & ~15);
-		fmt->width &= ~15;
-		fmt->height = clamp_t(u32, height, 32, fmt->height);
-		break;
-	}
-
-	fmt->colorspace = V4L2_COLORSPACE_JPEG;
-	fmt->field = V4L2_FIELD_NONE;
-}
-
-/*
- * resizer_enum_mbus_code - Handle pixel format enumeration
- * @sd     : pointer to v4l2 subdev structure
- * @sd_state: V4L2 subdev state
- * @code   : pointer to v4l2_subdev_mbus_code_enum structure
- * return -EINVAL or zero on success
- */
-static int resizer_enum_mbus_code(struct v4l2_subdev *sd,
-				  struct v4l2_subdev_state *sd_state,
-				  struct v4l2_subdev_mbus_code_enum *code)
-{
-	struct iss_resizer_device *resizer = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *format;
-
-	switch (code->pad) {
-	case RESIZER_PAD_SINK:
-		if (code->index >= ARRAY_SIZE(resizer_fmts))
-			return -EINVAL;
-
-		code->code = resizer_fmts[code->index];
-		break;
-
-	case RESIZER_PAD_SOURCE_MEM:
-		format = __resizer_get_format(resizer, sd_state,
-					      RESIZER_PAD_SINK,
-					      code->which);
-
-		if (code->index == 0) {
-			code->code = format->code;
-			break;
-		}
-
-		switch (format->code) {
-		case MEDIA_BUS_FMT_UYVY8_1X16:
-			if (code->index == 1)
-				code->code = MEDIA_BUS_FMT_YUYV8_1_5X8;
-			else
-				return -EINVAL;
-			break;
-		default:
-			if (code->index != 0)
-				return -EINVAL;
-		}
-
-		break;
-
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int resizer_enum_frame_size(struct v4l2_subdev *sd,
-				   struct v4l2_subdev_state *sd_state,
-				   struct v4l2_subdev_frame_size_enum *fse)
-{
-	struct iss_resizer_device *resizer = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt format;
-
-	if (fse->index != 0)
-		return -EINVAL;
-
-	format.code = fse->code;
-	format.width = 1;
-	format.height = 1;
-	resizer_try_format(resizer, sd_state, fse->pad, &format, fse->which);
-	fse->min_width = format.width;
-	fse->min_height = format.height;
-
-	if (format.code != fse->code)
-		return -EINVAL;
-
-	format.code = fse->code;
-	format.width = -1;
-	format.height = -1;
-	resizer_try_format(resizer, sd_state, fse->pad, &format, fse->which);
-	fse->max_width = format.width;
-	fse->max_height = format.height;
-
-	return 0;
-}
-
-/*
- * resizer_get_format - Retrieve the video format on a pad
- * @sd : ISP RESIZER V4L2 subdevice
- * @sd_state: V4L2 subdev state
- * @fmt: Format
- *
- * Return 0 on success or -EINVAL if the pad is invalid or doesn't correspond
- * to the format type.
- */
-static int resizer_get_format(struct v4l2_subdev *sd,
-			      struct v4l2_subdev_state *sd_state,
-			      struct v4l2_subdev_format *fmt)
-{
-	struct iss_resizer_device *resizer = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *format;
-
-	format = __resizer_get_format(resizer, sd_state, fmt->pad, fmt->which);
-	if (!format)
-		return -EINVAL;
-
-	fmt->format = *format;
-	return 0;
-}
-
-/*
- * resizer_set_format - Set the video format on a pad
- * @sd : ISP RESIZER V4L2 subdevice
- * @sd_state: V4L2 subdev state
- * @fmt: Format
- *
- * Return 0 on success or -EINVAL if the pad is invalid or doesn't correspond
- * to the format type.
- */
-static int resizer_set_format(struct v4l2_subdev *sd,
-			      struct v4l2_subdev_state *sd_state,
-			      struct v4l2_subdev_format *fmt)
-{
-	struct iss_resizer_device *resizer = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *format;
-
-	format = __resizer_get_format(resizer, sd_state, fmt->pad, fmt->which);
-	if (!format)
-		return -EINVAL;
-
-	resizer_try_format(resizer, sd_state, fmt->pad, &fmt->format,
-			   fmt->which);
-	*format = fmt->format;
-
-	/* Propagate the format from sink to source */
-	if (fmt->pad == RESIZER_PAD_SINK) {
-		format = __resizer_get_format(resizer, sd_state,
-					      RESIZER_PAD_SOURCE_MEM,
-					      fmt->which);
-		*format = fmt->format;
-		resizer_try_format(resizer, sd_state, RESIZER_PAD_SOURCE_MEM,
-				   format,
-				   fmt->which);
-	}
-
-	return 0;
-}
-
-static int resizer_link_validate(struct v4l2_subdev *sd,
-				 struct media_link *link,
-				 struct v4l2_subdev_format *source_fmt,
-				 struct v4l2_subdev_format *sink_fmt)
-{
-	/* Check if the two ends match */
-	if (source_fmt->format.width != sink_fmt->format.width ||
-	    source_fmt->format.height != sink_fmt->format.height)
-		return -EPIPE;
-
-	if (source_fmt->format.code != sink_fmt->format.code)
-		return -EPIPE;
-
-	return 0;
-}
-
-/*
- * resizer_init_formats - Initialize formats on all pads
- * @sd: ISP RESIZER V4L2 subdevice
- * @fh: V4L2 subdev file handle
- *
- * Initialize all pad formats with default values. If fh is not NULL, try
- * formats are initialized on the file handle. Otherwise active formats are
- * initialized on the device.
- */
-static int resizer_init_formats(struct v4l2_subdev *sd,
-				struct v4l2_subdev_fh *fh)
-{
-	struct v4l2_subdev_format format;
-
-	memset(&format, 0, sizeof(format));
-	format.pad = RESIZER_PAD_SINK;
-	format.which = fh ? V4L2_SUBDEV_FORMAT_TRY : V4L2_SUBDEV_FORMAT_ACTIVE;
-	format.format.code = MEDIA_BUS_FMT_UYVY8_1X16;
-	format.format.width = 4096;
-	format.format.height = 4096;
-	resizer_set_format(sd, fh ? fh->state : NULL, &format);
-
-	return 0;
-}
-
-/* V4L2 subdev video operations */
-static const struct v4l2_subdev_video_ops resizer_v4l2_video_ops = {
-	.s_stream = resizer_set_stream,
-};
-
-/* V4L2 subdev pad operations */
-static const struct v4l2_subdev_pad_ops resizer_v4l2_pad_ops = {
-	.enum_mbus_code = resizer_enum_mbus_code,
-	.enum_frame_size = resizer_enum_frame_size,
-	.get_fmt = resizer_get_format,
-	.set_fmt = resizer_set_format,
-	.link_validate = resizer_link_validate,
-};
-
-/* V4L2 subdev operations */
-static const struct v4l2_subdev_ops resizer_v4l2_ops = {
-	.video = &resizer_v4l2_video_ops,
-	.pad = &resizer_v4l2_pad_ops,
-};
-
-/* V4L2 subdev internal operations */
-static const struct v4l2_subdev_internal_ops resizer_v4l2_internal_ops = {
-	.open = resizer_init_formats,
-};
-
-/* -----------------------------------------------------------------------------
- * Media entity operations
- */
-
-/*
- * resizer_link_setup - Setup RESIZER connections
- * @entity: RESIZER media entity
- * @local: Pad at the local end of the link
- * @remote: Pad at the remote end of the link
- * @flags: Link flags
- *
- * return -EINVAL or zero on success
- */
-static int resizer_link_setup(struct media_entity *entity,
-			      const struct media_pad *local,
-			      const struct media_pad *remote, u32 flags)
-{
-	struct v4l2_subdev *sd = media_entity_to_v4l2_subdev(entity);
-	struct iss_resizer_device *resizer = v4l2_get_subdevdata(sd);
-	struct iss_device *iss = to_iss_device(resizer);
-	unsigned int index = local->index;
-
-	/* FIXME: this is actually a hack! */
-	if (is_media_entity_v4l2_subdev(remote->entity))
-		index |= 2 << 16;
-
-	switch (index) {
-	case RESIZER_PAD_SINK | 2 << 16:
-		/* Read from IPIPE or IPIPEIF. */
-		if (!(flags & MEDIA_LNK_FL_ENABLED)) {
-			resizer->input = RESIZER_INPUT_NONE;
-			break;
-		}
-
-		if (resizer->input != RESIZER_INPUT_NONE)
-			return -EBUSY;
-
-		if (remote->entity == &iss->ipipeif.subdev.entity)
-			resizer->input = RESIZER_INPUT_IPIPEIF;
-		else if (remote->entity == &iss->ipipe.subdev.entity)
-			resizer->input = RESIZER_INPUT_IPIPE;
-
-		break;
-
-	case RESIZER_PAD_SOURCE_MEM:
-		/* Write to memory */
-		if (flags & MEDIA_LNK_FL_ENABLED) {
-			if (resizer->output & ~RESIZER_OUTPUT_MEMORY)
-				return -EBUSY;
-			resizer->output |= RESIZER_OUTPUT_MEMORY;
-		} else {
-			resizer->output &= ~RESIZER_OUTPUT_MEMORY;
-		}
-		break;
-
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-/* media operations */
-static const struct media_entity_operations resizer_media_ops = {
-	.link_setup = resizer_link_setup,
-	.link_validate = v4l2_subdev_link_validate,
-};
-
-/*
- * resizer_init_entities - Initialize V4L2 subdev and media entity
- * @resizer: ISS ISP RESIZER module
- *
- * Return 0 on success and a negative error code on failure.
- */
-static int resizer_init_entities(struct iss_resizer_device *resizer)
-{
-	struct v4l2_subdev *sd = &resizer->subdev;
-	struct media_pad *pads = resizer->pads;
-	struct media_entity *me = &sd->entity;
-	int ret;
-
-	resizer->input = RESIZER_INPUT_NONE;
-
-	v4l2_subdev_init(sd, &resizer_v4l2_ops);
-	sd->internal_ops = &resizer_v4l2_internal_ops;
-	strscpy(sd->name, "OMAP4 ISS ISP resizer", sizeof(sd->name));
-	sd->grp_id = BIT(16);	/* group ID for iss subdevs */
-	v4l2_set_subdevdata(sd, resizer);
-	sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE;
-
-	pads[RESIZER_PAD_SINK].flags = MEDIA_PAD_FL_SINK;
-	pads[RESIZER_PAD_SOURCE_MEM].flags = MEDIA_PAD_FL_SOURCE;
-
-	me->ops = &resizer_media_ops;
-	ret = media_entity_pads_init(me, RESIZER_PADS_NUM, pads);
-	if (ret < 0)
-		return ret;
-
-	resizer_init_formats(sd, NULL);
-
-	resizer->video_out.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-	resizer->video_out.ops = &resizer_video_ops;
-	resizer->video_out.iss = to_iss_device(resizer);
-	resizer->video_out.capture_mem = PAGE_ALIGN(4096 * 4096) * 3;
-	resizer->video_out.bpl_alignment = 32;
-	resizer->video_out.bpl_zero_padding = 1;
-	resizer->video_out.bpl_max = 0x1ffe0;
-
-	return omap4iss_video_init(&resizer->video_out, "ISP resizer a");
-}
-
-void omap4iss_resizer_unregister_entities(struct iss_resizer_device *resizer)
-{
-	v4l2_device_unregister_subdev(&resizer->subdev);
-	omap4iss_video_unregister(&resizer->video_out);
-}
-
-int omap4iss_resizer_register_entities(struct iss_resizer_device *resizer,
-				       struct v4l2_device *vdev)
-{
-	int ret;
-
-	/* Register the subdev and video node. */
-	ret = v4l2_device_register_subdev(vdev, &resizer->subdev);
-	if (ret < 0)
-		goto error;
-
-	ret = omap4iss_video_register(&resizer->video_out, vdev);
-	if (ret < 0)
-		goto error;
-
-	return 0;
-
-error:
-	omap4iss_resizer_unregister_entities(resizer);
-	return ret;
-}
-
-/* -----------------------------------------------------------------------------
- * ISP RESIZER initialisation and cleanup
- */
-
-/*
- * omap4iss_resizer_init - RESIZER module initialization.
- * @iss: Device pointer specific to the OMAP4 ISS.
- *
- * TODO: Get the initialisation values from platform data.
- *
- * Return 0 on success or a negative error code otherwise.
- */
-int omap4iss_resizer_init(struct iss_device *iss)
-{
-	struct iss_resizer_device *resizer = &iss->resizer;
-
-	resizer->state = ISS_PIPELINE_STREAM_STOPPED;
-	init_waitqueue_head(&resizer->wait);
-
-	return resizer_init_entities(resizer);
-}
-
-/*
- * omap4iss_resizer_create_links() - RESIZER pads links creation
- * @iss: Pointer to ISS device
- *
- * return negative error code or zero on success
- */
-int omap4iss_resizer_create_links(struct iss_device *iss)
-{
-	struct iss_resizer_device *resizer = &iss->resizer;
-
-	/* Connect the RESIZER subdev to the video node. */
-	return media_create_pad_link(&resizer->subdev.entity,
-				     RESIZER_PAD_SOURCE_MEM,
-				     &resizer->video_out.video.entity, 0, 0);
-}
-
-/*
- * omap4iss_resizer_cleanup - RESIZER module cleanup.
- * @iss: Device pointer specific to the OMAP4 ISS.
- */
-void omap4iss_resizer_cleanup(struct iss_device *iss)
-{
-	struct iss_resizer_device *resizer = &iss->resizer;
-
-	media_entity_cleanup(&resizer->subdev.entity);
-}
diff --git a/drivers/staging/media/omap4iss/iss_resizer.h b/drivers/staging/media/omap4iss/iss_resizer.h
deleted file mode 100644
index cb937fccc21f..000000000000
--- a/drivers/staging/media/omap4iss/iss_resizer.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * TI OMAP4 ISS V4L2 Driver - ISP RESIZER module
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#ifndef OMAP4_ISS_RESIZER_H
-#define OMAP4_ISS_RESIZER_H
-
-#include "iss_video.h"
-
-enum resizer_input_entity {
-	RESIZER_INPUT_NONE,
-	RESIZER_INPUT_IPIPE,
-	RESIZER_INPUT_IPIPEIF
-};
-
-#define RESIZER_OUTPUT_MEMORY			BIT(0)
-
-/* Sink and source RESIZER pads */
-#define RESIZER_PAD_SINK			0
-#define RESIZER_PAD_SOURCE_MEM			1
-#define RESIZER_PADS_NUM			2
-
-/*
- * struct iss_resizer_device - Structure for the RESIZER module to store its own
- *			    information
- * @subdev: V4L2 subdevice
- * @pads: Sink and source media entity pads
- * @formats: Active video formats
- * @input: Active input
- * @output: Active outputs
- * @video_out: Output video node
- * @error: A hardware error occurred during capture
- * @state: Streaming state
- * @wait: Wait queue used to stop the module
- * @stopping: Stopping state
- */
-struct iss_resizer_device {
-	struct v4l2_subdev subdev;
-	struct media_pad pads[RESIZER_PADS_NUM];
-	struct v4l2_mbus_framefmt formats[RESIZER_PADS_NUM];
-
-	enum resizer_input_entity input;
-	unsigned int output;
-	struct iss_video video_out;
-	unsigned int error;
-
-	enum iss_pipeline_stream_state state;
-	wait_queue_head_t wait;
-	atomic_t stopping;
-};
-
-struct iss_device;
-
-int omap4iss_resizer_init(struct iss_device *iss);
-int omap4iss_resizer_create_links(struct iss_device *iss);
-void omap4iss_resizer_cleanup(struct iss_device *iss);
-int omap4iss_resizer_register_entities(struct iss_resizer_device *resizer,
-				       struct v4l2_device *vdev);
-void omap4iss_resizer_unregister_entities(struct iss_resizer_device *resizer);
-
-int omap4iss_resizer_busy(struct iss_resizer_device *resizer);
-void omap4iss_resizer_isr(struct iss_resizer_device *resizer, u32 events);
-void omap4iss_resizer_restore_context(struct iss_device *iss);
-void omap4iss_resizer_max_rate(struct iss_resizer_device *resizer,
-			       unsigned int *max_rate);
-
-#endif	/* OMAP4_ISS_RESIZER_H */
diff --git a/drivers/staging/media/omap4iss/iss_video.c b/drivers/staging/media/omap4iss/iss_video.c
deleted file mode 100644
index 22fa4d6cae10..000000000000
--- a/drivers/staging/media/omap4iss/iss_video.c
+++ /dev/null
@@ -1,1274 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * TI OMAP4 ISS V4L2 Driver - Generic video node
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#include <linux/clk.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-
-#include <media/v4l2-dev.h>
-#include <media/v4l2-ioctl.h>
-#include <media/v4l2-mc.h>
-
-#include "iss_video.h"
-#include "iss.h"
-
-/* -----------------------------------------------------------------------------
- * Helper functions
- */
-
-static struct iss_format_info formats[] = {
-	{ MEDIA_BUS_FMT_Y8_1X8, MEDIA_BUS_FMT_Y8_1X8,
-	  MEDIA_BUS_FMT_Y8_1X8, MEDIA_BUS_FMT_Y8_1X8,
-	  V4L2_PIX_FMT_GREY, 8, },
-	{ MEDIA_BUS_FMT_Y10_1X10, MEDIA_BUS_FMT_Y10_1X10,
-	  MEDIA_BUS_FMT_Y10_1X10, MEDIA_BUS_FMT_Y8_1X8,
-	  V4L2_PIX_FMT_Y10, 10, },
-	{ MEDIA_BUS_FMT_Y12_1X12, MEDIA_BUS_FMT_Y10_1X10,
-	  MEDIA_BUS_FMT_Y12_1X12, MEDIA_BUS_FMT_Y8_1X8,
-	  V4L2_PIX_FMT_Y12, 12, },
-	{ MEDIA_BUS_FMT_SBGGR8_1X8, MEDIA_BUS_FMT_SBGGR8_1X8,
-	  MEDIA_BUS_FMT_SBGGR8_1X8, MEDIA_BUS_FMT_SBGGR8_1X8,
-	  V4L2_PIX_FMT_SBGGR8, 8, },
-	{ MEDIA_BUS_FMT_SGBRG8_1X8, MEDIA_BUS_FMT_SGBRG8_1X8,
-	  MEDIA_BUS_FMT_SGBRG8_1X8, MEDIA_BUS_FMT_SGBRG8_1X8,
-	  V4L2_PIX_FMT_SGBRG8, 8, },
-	{ MEDIA_BUS_FMT_SGRBG8_1X8, MEDIA_BUS_FMT_SGRBG8_1X8,
-	  MEDIA_BUS_FMT_SGRBG8_1X8, MEDIA_BUS_FMT_SGRBG8_1X8,
-	  V4L2_PIX_FMT_SGRBG8, 8, },
-	{ MEDIA_BUS_FMT_SRGGB8_1X8, MEDIA_BUS_FMT_SRGGB8_1X8,
-	  MEDIA_BUS_FMT_SRGGB8_1X8, MEDIA_BUS_FMT_SRGGB8_1X8,
-	  V4L2_PIX_FMT_SRGGB8, 8, },
-	{ MEDIA_BUS_FMT_SGRBG10_DPCM8_1X8, MEDIA_BUS_FMT_SGRBG10_DPCM8_1X8,
-	  MEDIA_BUS_FMT_SGRBG10_1X10, 0,
-	  V4L2_PIX_FMT_SGRBG10DPCM8, 8, },
-	{ MEDIA_BUS_FMT_SBGGR10_1X10, MEDIA_BUS_FMT_SBGGR10_1X10,
-	  MEDIA_BUS_FMT_SBGGR10_1X10, MEDIA_BUS_FMT_SBGGR8_1X8,
-	  V4L2_PIX_FMT_SBGGR10, 10, },
-	{ MEDIA_BUS_FMT_SGBRG10_1X10, MEDIA_BUS_FMT_SGBRG10_1X10,
-	  MEDIA_BUS_FMT_SGBRG10_1X10, MEDIA_BUS_FMT_SGBRG8_1X8,
-	  V4L2_PIX_FMT_SGBRG10, 10, },
-	{ MEDIA_BUS_FMT_SGRBG10_1X10, MEDIA_BUS_FMT_SGRBG10_1X10,
-	  MEDIA_BUS_FMT_SGRBG10_1X10, MEDIA_BUS_FMT_SGRBG8_1X8,
-	  V4L2_PIX_FMT_SGRBG10, 10, },
-	{ MEDIA_BUS_FMT_SRGGB10_1X10, MEDIA_BUS_FMT_SRGGB10_1X10,
-	  MEDIA_BUS_FMT_SRGGB10_1X10, MEDIA_BUS_FMT_SRGGB8_1X8,
-	  V4L2_PIX_FMT_SRGGB10, 10, },
-	{ MEDIA_BUS_FMT_SBGGR12_1X12, MEDIA_BUS_FMT_SBGGR10_1X10,
-	  MEDIA_BUS_FMT_SBGGR12_1X12, MEDIA_BUS_FMT_SBGGR8_1X8,
-	  V4L2_PIX_FMT_SBGGR12, 12, },
-	{ MEDIA_BUS_FMT_SGBRG12_1X12, MEDIA_BUS_FMT_SGBRG10_1X10,
-	  MEDIA_BUS_FMT_SGBRG12_1X12, MEDIA_BUS_FMT_SGBRG8_1X8,
-	  V4L2_PIX_FMT_SGBRG12, 12, },
-	{ MEDIA_BUS_FMT_SGRBG12_1X12, MEDIA_BUS_FMT_SGRBG10_1X10,
-	  MEDIA_BUS_FMT_SGRBG12_1X12, MEDIA_BUS_FMT_SGRBG8_1X8,
-	  V4L2_PIX_FMT_SGRBG12, 12, },
-	{ MEDIA_BUS_FMT_SRGGB12_1X12, MEDIA_BUS_FMT_SRGGB10_1X10,
-	  MEDIA_BUS_FMT_SRGGB12_1X12, MEDIA_BUS_FMT_SRGGB8_1X8,
-	  V4L2_PIX_FMT_SRGGB12, 12, },
-	{ MEDIA_BUS_FMT_UYVY8_1X16, MEDIA_BUS_FMT_UYVY8_1X16,
-	  MEDIA_BUS_FMT_UYVY8_1X16, 0,
-	  V4L2_PIX_FMT_UYVY, 16, },
-	{ MEDIA_BUS_FMT_YUYV8_1X16, MEDIA_BUS_FMT_YUYV8_1X16,
-	  MEDIA_BUS_FMT_YUYV8_1X16, 0,
-	  V4L2_PIX_FMT_YUYV, 16, },
-	{ MEDIA_BUS_FMT_YUYV8_1_5X8, MEDIA_BUS_FMT_YUYV8_1_5X8,
-	  MEDIA_BUS_FMT_YUYV8_1_5X8, 0,
-	  V4L2_PIX_FMT_NV12, 8, },
-};
-
-const struct iss_format_info *
-omap4iss_video_format_info(u32 code)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(formats); ++i) {
-		if (formats[i].code == code)
-			return &formats[i];
-	}
-
-	return NULL;
-}
-
-/*
- * iss_video_mbus_to_pix - Convert v4l2_mbus_framefmt to v4l2_pix_format
- * @video: ISS video instance
- * @mbus: v4l2_mbus_framefmt format (input)
- * @pix: v4l2_pix_format format (output)
- *
- * Fill the output pix structure with information from the input mbus format.
- * The bytesperline and sizeimage fields are computed from the requested bytes
- * per line value in the pix format and information from the video instance.
- *
- * Return the number of padding bytes at end of line.
- */
-static unsigned int iss_video_mbus_to_pix(const struct iss_video *video,
-					  const struct v4l2_mbus_framefmt *mbus,
-					  struct v4l2_pix_format *pix)
-{
-	unsigned int bpl = pix->bytesperline;
-	unsigned int min_bpl;
-	unsigned int i;
-
-	memset(pix, 0, sizeof(*pix));
-	pix->width = mbus->width;
-	pix->height = mbus->height;
-
-	/*
-	 * Skip the last format in the loop so that it will be selected if no
-	 * match is found.
-	 */
-	for (i = 0; i < ARRAY_SIZE(formats) - 1; ++i) {
-		if (formats[i].code == mbus->code)
-			break;
-	}
-
-	min_bpl = pix->width * ALIGN(formats[i].bpp, 8) / 8;
-
-	/*
-	 * Clamp the requested bytes per line value. If the maximum bytes per
-	 * line value is zero, the module doesn't support user configurable line
-	 * sizes. Override the requested value with the minimum in that case.
-	 */
-	if (video->bpl_max)
-		bpl = clamp(bpl, min_bpl, video->bpl_max);
-	else
-		bpl = min_bpl;
-
-	if (!video->bpl_zero_padding || bpl != min_bpl)
-		bpl = ALIGN(bpl, video->bpl_alignment);
-
-	pix->pixelformat = formats[i].pixelformat;
-	pix->bytesperline = bpl;
-	pix->sizeimage = pix->bytesperline * pix->height;
-	pix->colorspace = mbus->colorspace;
-	pix->field = mbus->field;
-
-	/* FIXME: Special case for NV12! We should make this nicer... */
-	if (pix->pixelformat == V4L2_PIX_FMT_NV12)
-		pix->sizeimage += (pix->bytesperline * pix->height) / 2;
-
-	return bpl - min_bpl;
-}
-
-static void iss_video_pix_to_mbus(const struct v4l2_pix_format *pix,
-				  struct v4l2_mbus_framefmt *mbus)
-{
-	unsigned int i;
-
-	memset(mbus, 0, sizeof(*mbus));
-	mbus->width = pix->width;
-	mbus->height = pix->height;
-
-	/*
-	 * Skip the last format in the loop so that it will be selected if no
-	 * match is found.
-	 */
-	for (i = 0; i < ARRAY_SIZE(formats) - 1; ++i) {
-		if (formats[i].pixelformat == pix->pixelformat)
-			break;
-	}
-
-	mbus->code = formats[i].code;
-	mbus->colorspace = pix->colorspace;
-	mbus->field = pix->field;
-}
-
-static struct v4l2_subdev *
-iss_video_remote_subdev(struct iss_video *video, u32 *pad)
-{
-	struct media_pad *remote;
-
-	remote = media_pad_remote_pad_first(&video->pad);
-
-	if (!remote || !is_media_entity_v4l2_subdev(remote->entity))
-		return NULL;
-
-	if (pad)
-		*pad = remote->index;
-
-	return media_entity_to_v4l2_subdev(remote->entity);
-}
-
-/* Return a pointer to the ISS video instance at the far end of the pipeline. */
-static struct iss_video *
-iss_video_far_end(struct iss_video *video, struct iss_pipeline *pipe)
-{
-	struct media_pipeline_entity_iter iter;
-	struct media_entity *entity;
-	struct iss_video *far_end = NULL;
-	int ret;
-
-	ret = media_pipeline_entity_iter_init(&pipe->pipe, &iter);
-	if (ret)
-		return ERR_PTR(-ENOMEM);
-
-	media_pipeline_for_each_entity(&pipe->pipe, &iter, entity) {
-		struct iss_video *other;
-
-		if (entity == &video->video.entity)
-			continue;
-
-		if (!is_media_entity_v4l2_video_device(entity))
-			continue;
-
-		other = to_iss_video(media_entity_to_video_device(entity));
-		if (other->type != video->type) {
-			far_end = other;
-			break;
-		}
-	}
-
-	media_pipeline_entity_iter_cleanup(&iter);
-
-	return far_end;
-}
-
-static int
-__iss_video_get_format(struct iss_video *video,
-		       struct v4l2_mbus_framefmt *format)
-{
-	struct v4l2_subdev_format fmt = {
-		.which = V4L2_SUBDEV_FORMAT_ACTIVE,
-	};
-	struct v4l2_subdev *subdev;
-	u32 pad;
-	int ret;
-
-	subdev = iss_video_remote_subdev(video, &pad);
-	if (!subdev)
-		return -EINVAL;
-
-	fmt.pad = pad;
-
-	mutex_lock(&video->mutex);
-	ret = v4l2_subdev_call(subdev, pad, get_fmt, NULL, &fmt);
-	mutex_unlock(&video->mutex);
-
-	if (ret)
-		return ret;
-
-	*format = fmt.format;
-	return 0;
-}
-
-static int
-iss_video_check_format(struct iss_video *video, struct iss_video_fh *vfh)
-{
-	struct v4l2_mbus_framefmt format;
-	struct v4l2_pix_format pixfmt;
-	int ret;
-
-	ret = __iss_video_get_format(video, &format);
-	if (ret < 0)
-		return ret;
-
-	pixfmt.bytesperline = 0;
-	ret = iss_video_mbus_to_pix(video, &format, &pixfmt);
-
-	if (vfh->format.fmt.pix.pixelformat != pixfmt.pixelformat ||
-	    vfh->format.fmt.pix.height != pixfmt.height ||
-	    vfh->format.fmt.pix.width != pixfmt.width ||
-	    vfh->format.fmt.pix.bytesperline != pixfmt.bytesperline ||
-	    vfh->format.fmt.pix.sizeimage != pixfmt.sizeimage)
-		return -EINVAL;
-
-	return ret;
-}
-
-/* -----------------------------------------------------------------------------
- * Video queue operations
- */
-
-static int iss_video_queue_setup(struct vb2_queue *vq,
-				 unsigned int *count, unsigned int *num_planes,
-				 unsigned int sizes[],
-				 struct device *alloc_devs[])
-{
-	struct iss_video_fh *vfh = vb2_get_drv_priv(vq);
-	struct iss_video *video = vfh->video;
-
-	/* Revisit multi-planar support for NV12 */
-	*num_planes = 1;
-
-	sizes[0] = vfh->format.fmt.pix.sizeimage;
-	if (sizes[0] == 0)
-		return -EINVAL;
-
-	*count = min(*count, video->capture_mem / PAGE_ALIGN(sizes[0]));
-
-	return 0;
-}
-
-static void iss_video_buf_cleanup(struct vb2_buffer *vb)
-{
-	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
-	struct iss_buffer *buffer = container_of(vbuf, struct iss_buffer, vb);
-
-	if (buffer->iss_addr)
-		buffer->iss_addr = 0;
-}
-
-static int iss_video_buf_prepare(struct vb2_buffer *vb)
-{
-	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
-	struct iss_video_fh *vfh = vb2_get_drv_priv(vb->vb2_queue);
-	struct iss_buffer *buffer = container_of(vbuf, struct iss_buffer, vb);
-	struct iss_video *video = vfh->video;
-	unsigned long size = vfh->format.fmt.pix.sizeimage;
-	dma_addr_t addr;
-
-	if (vb2_plane_size(vb, 0) < size)
-		return -ENOBUFS;
-
-	addr = vb2_dma_contig_plane_dma_addr(vb, 0);
-	if (!IS_ALIGNED(addr, 32)) {
-		dev_dbg(video->iss->dev,
-			"Buffer address must be aligned to 32 bytes boundary.\n");
-		return -EINVAL;
-	}
-
-	vb2_set_plane_payload(vb, 0, size);
-	buffer->iss_addr = addr;
-	return 0;
-}
-
-static void iss_video_buf_queue(struct vb2_buffer *vb)
-{
-	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
-	struct iss_video_fh *vfh = vb2_get_drv_priv(vb->vb2_queue);
-	struct iss_video *video = vfh->video;
-	struct iss_buffer *buffer = container_of(vbuf, struct iss_buffer, vb);
-	struct iss_pipeline *pipe = to_iss_pipeline(&video->video.entity);
-	unsigned long flags;
-	bool empty;
-
-	spin_lock_irqsave(&video->qlock, flags);
-
-	/*
-	 * Mark the buffer is faulty and give it back to the queue immediately
-	 * if the video node has registered an error. vb2 will perform the same
-	 * check when preparing the buffer, but that is inherently racy, so we
-	 * need to handle the race condition with an authoritative check here.
-	 */
-	if (unlikely(video->error)) {
-		vb2_buffer_done(vb, VB2_BUF_STATE_ERROR);
-		spin_unlock_irqrestore(&video->qlock, flags);
-		return;
-	}
-
-	empty = list_empty(&video->dmaqueue);
-	list_add_tail(&buffer->list, &video->dmaqueue);
-
-	spin_unlock_irqrestore(&video->qlock, flags);
-
-	if (empty) {
-		enum iss_pipeline_state state;
-		unsigned int start;
-
-		if (video->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-			state = ISS_PIPELINE_QUEUE_OUTPUT;
-		else
-			state = ISS_PIPELINE_QUEUE_INPUT;
-
-		spin_lock_irqsave(&pipe->lock, flags);
-		pipe->state |= state;
-		video->ops->queue(video, buffer);
-		video->dmaqueue_flags |= ISS_VIDEO_DMAQUEUE_QUEUED;
-
-		start = iss_pipeline_ready(pipe);
-		if (start)
-			pipe->state |= ISS_PIPELINE_STREAM;
-		spin_unlock_irqrestore(&pipe->lock, flags);
-
-		if (start)
-			omap4iss_pipeline_set_stream(pipe,
-						     ISS_PIPELINE_STREAM_SINGLESHOT);
-	}
-}
-
-static const struct vb2_ops iss_video_vb2ops = {
-	.queue_setup	= iss_video_queue_setup,
-	.buf_prepare	= iss_video_buf_prepare,
-	.buf_queue	= iss_video_buf_queue,
-	.buf_cleanup	= iss_video_buf_cleanup,
-};
-
-/*
- * omap4iss_video_buffer_next - Complete the current buffer and return the next
- * @video: ISS video object
- *
- * Remove the current video buffer from the DMA queue and fill its timestamp,
- * field count and state fields before waking up its completion handler.
- *
- * For capture video nodes, the buffer state is set to VB2_BUF_STATE_DONE if no
- * error has been flagged in the pipeline, or to VB2_BUF_STATE_ERROR otherwise.
- *
- * The DMA queue is expected to contain at least one buffer.
- *
- * Return a pointer to the next buffer in the DMA queue, or NULL if the queue is
- * empty.
- */
-struct iss_buffer *omap4iss_video_buffer_next(struct iss_video *video)
-{
-	struct iss_pipeline *pipe = to_iss_pipeline(&video->video.entity);
-	enum iss_pipeline_state state;
-	struct iss_buffer *buf;
-	unsigned long flags;
-
-	spin_lock_irqsave(&video->qlock, flags);
-	if (WARN_ON(list_empty(&video->dmaqueue))) {
-		spin_unlock_irqrestore(&video->qlock, flags);
-		return NULL;
-	}
-
-	buf = list_first_entry(&video->dmaqueue, struct iss_buffer,
-			       list);
-	list_del(&buf->list);
-	spin_unlock_irqrestore(&video->qlock, flags);
-
-	buf->vb.vb2_buf.timestamp = ktime_get_ns();
-
-	/*
-	 * Do frame number propagation only if this is the output video node.
-	 * Frame number either comes from the CSI receivers or it gets
-	 * incremented here if H3A is not active.
-	 * Note: There is no guarantee that the output buffer will finish
-	 * first, so the input number might lag behind by 1 in some cases.
-	 */
-	if (video == pipe->output && !pipe->do_propagation)
-		buf->vb.sequence =
-			atomic_inc_return(&pipe->frame_number);
-	else
-		buf->vb.sequence = atomic_read(&pipe->frame_number);
-
-	vb2_buffer_done(&buf->vb.vb2_buf, pipe->error ?
-			VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE);
-	pipe->error = false;
-
-	spin_lock_irqsave(&video->qlock, flags);
-	if (list_empty(&video->dmaqueue)) {
-		spin_unlock_irqrestore(&video->qlock, flags);
-		if (video->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-			state = ISS_PIPELINE_QUEUE_OUTPUT
-			      | ISS_PIPELINE_STREAM;
-		else
-			state = ISS_PIPELINE_QUEUE_INPUT
-			      | ISS_PIPELINE_STREAM;
-
-		spin_lock_irqsave(&pipe->lock, flags);
-		pipe->state &= ~state;
-		if (video->pipe.stream_state == ISS_PIPELINE_STREAM_CONTINUOUS)
-			video->dmaqueue_flags |= ISS_VIDEO_DMAQUEUE_UNDERRUN;
-		spin_unlock_irqrestore(&pipe->lock, flags);
-		return NULL;
-	}
-
-	if (video->type == V4L2_BUF_TYPE_VIDEO_CAPTURE && pipe->input) {
-		spin_lock(&pipe->lock);
-		pipe->state &= ~ISS_PIPELINE_STREAM;
-		spin_unlock(&pipe->lock);
-	}
-
-	buf = list_first_entry(&video->dmaqueue, struct iss_buffer,
-			       list);
-	spin_unlock_irqrestore(&video->qlock, flags);
-	buf->vb.vb2_buf.state = VB2_BUF_STATE_ACTIVE;
-	return buf;
-}
-
-/*
- * omap4iss_video_cancel_stream - Cancel stream on a video node
- * @video: ISS video object
- *
- * Cancelling a stream mark all buffers on the video node as erroneous and makes
- * sure no new buffer can be queued.
- */
-void omap4iss_video_cancel_stream(struct iss_video *video)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&video->qlock, flags);
-
-	while (!list_empty(&video->dmaqueue)) {
-		struct iss_buffer *buf;
-
-		buf = list_first_entry(&video->dmaqueue, struct iss_buffer,
-				       list);
-		list_del(&buf->list);
-		vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR);
-	}
-
-	vb2_queue_error(video->queue);
-	video->error = true;
-
-	spin_unlock_irqrestore(&video->qlock, flags);
-}
-
-/* -----------------------------------------------------------------------------
- * V4L2 ioctls
- */
-
-static int
-iss_video_querycap(struct file *file, void *fh, struct v4l2_capability *cap)
-{
-	struct iss_video *video = video_drvdata(file);
-
-	strscpy(cap->driver, ISS_VIDEO_DRIVER_NAME, sizeof(cap->driver));
-	strscpy(cap->card, video->video.name, sizeof(cap->card));
-	strscpy(cap->bus_info, "media", sizeof(cap->bus_info));
-	cap->capabilities = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_STREAMING
-			  | V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT;
-
-	return 0;
-}
-
-static int
-iss_video_enum_format(struct file *file, void *fh, struct v4l2_fmtdesc *f)
-{
-	struct iss_video *video = video_drvdata(file);
-	struct v4l2_mbus_framefmt format;
-	unsigned int index = f->index;
-	unsigned int i;
-	int ret;
-
-	if (f->type != video->type)
-		return -EINVAL;
-
-	ret = __iss_video_get_format(video, &format);
-	if (ret < 0)
-		return ret;
-
-	for (i = 0; i < ARRAY_SIZE(formats); ++i) {
-		const struct iss_format_info *info = &formats[i];
-
-		if (format.code != info->code)
-			continue;
-
-		if (index == 0) {
-			f->pixelformat = info->pixelformat;
-			return 0;
-		}
-
-		index--;
-	}
-
-	return -EINVAL;
-}
-
-static int
-iss_video_get_format(struct file *file, void *fh, struct v4l2_format *format)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(fh);
-	struct iss_video *video = video_drvdata(file);
-
-	if (format->type != video->type)
-		return -EINVAL;
-
-	mutex_lock(&video->mutex);
-	*format = vfh->format;
-	mutex_unlock(&video->mutex);
-
-	return 0;
-}
-
-static int
-iss_video_set_format(struct file *file, void *fh, struct v4l2_format *format)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(fh);
-	struct iss_video *video = video_drvdata(file);
-	struct v4l2_mbus_framefmt fmt;
-
-	if (format->type != video->type)
-		return -EINVAL;
-
-	mutex_lock(&video->mutex);
-
-	/*
-	 * Fill the bytesperline and sizeimage fields by converting to media bus
-	 * format and back to pixel format.
-	 */
-	iss_video_pix_to_mbus(&format->fmt.pix, &fmt);
-	iss_video_mbus_to_pix(video, &fmt, &format->fmt.pix);
-
-	vfh->format = *format;
-
-	mutex_unlock(&video->mutex);
-	return 0;
-}
-
-static int
-iss_video_try_format(struct file *file, void *fh, struct v4l2_format *format)
-{
-	struct iss_video *video = video_drvdata(file);
-	struct v4l2_subdev_format fmt = {
-		.which = V4L2_SUBDEV_FORMAT_ACTIVE,
-	};
-	struct v4l2_subdev *subdev;
-	u32 pad;
-	int ret;
-
-	if (format->type != video->type)
-		return -EINVAL;
-
-	subdev = iss_video_remote_subdev(video, &pad);
-	if (!subdev)
-		return -EINVAL;
-
-	iss_video_pix_to_mbus(&format->fmt.pix, &fmt.format);
-
-	fmt.pad = pad;
-	ret = v4l2_subdev_call(subdev, pad, get_fmt, NULL, &fmt);
-	if (ret)
-		return ret;
-
-	iss_video_mbus_to_pix(video, &fmt.format, &format->fmt.pix);
-	return 0;
-}
-
-static int
-iss_video_get_selection(struct file *file, void *fh, struct v4l2_selection *sel)
-{
-	struct iss_video *video = video_drvdata(file);
-	struct v4l2_subdev_format format = {
-		.which = V4L2_SUBDEV_FORMAT_ACTIVE,
-	};
-	struct v4l2_subdev *subdev;
-	struct v4l2_subdev_selection sdsel = {
-		.which = V4L2_SUBDEV_FORMAT_ACTIVE,
-		.target = sel->target,
-	};
-	u32 pad;
-	int ret;
-
-	switch (sel->target) {
-	case V4L2_SEL_TGT_CROP:
-	case V4L2_SEL_TGT_CROP_BOUNDS:
-	case V4L2_SEL_TGT_CROP_DEFAULT:
-		if (video->type == V4L2_BUF_TYPE_VIDEO_OUTPUT)
-			return -EINVAL;
-		break;
-	case V4L2_SEL_TGT_COMPOSE:
-	case V4L2_SEL_TGT_COMPOSE_BOUNDS:
-	case V4L2_SEL_TGT_COMPOSE_DEFAULT:
-		if (video->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-			return -EINVAL;
-		break;
-	default:
-		return -EINVAL;
-	}
-	subdev = iss_video_remote_subdev(video, &pad);
-	if (!subdev)
-		return -EINVAL;
-
-	/*
-	 * Try the get selection operation first and fallback to get format if
-	 * not implemented.
-	 */
-	sdsel.pad = pad;
-	ret = v4l2_subdev_call(subdev, pad, get_selection, NULL, &sdsel);
-	if (!ret)
-		sel->r = sdsel.r;
-	if (ret != -ENOIOCTLCMD)
-		return ret;
-
-	format.pad = pad;
-	ret = v4l2_subdev_call(subdev, pad, get_fmt, NULL, &format);
-	if (ret < 0)
-		return ret == -ENOIOCTLCMD ? -ENOTTY : ret;
-
-	sel->r.left = 0;
-	sel->r.top = 0;
-	sel->r.width = format.format.width;
-	sel->r.height = format.format.height;
-
-	return 0;
-}
-
-static int
-iss_video_set_selection(struct file *file, void *fh, struct v4l2_selection *sel)
-{
-	struct iss_video *video = video_drvdata(file);
-	struct v4l2_subdev *subdev;
-	struct v4l2_subdev_selection sdsel = {
-		.which = V4L2_SUBDEV_FORMAT_ACTIVE,
-		.target = sel->target,
-		.flags = sel->flags,
-		.r = sel->r,
-	};
-	u32 pad;
-	int ret;
-
-	switch (sel->target) {
-	case V4L2_SEL_TGT_CROP:
-		if (video->type == V4L2_BUF_TYPE_VIDEO_OUTPUT)
-			return -EINVAL;
-		break;
-	case V4L2_SEL_TGT_COMPOSE:
-		if (video->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-			return -EINVAL;
-		break;
-	default:
-		return -EINVAL;
-	}
-	subdev = iss_video_remote_subdev(video, &pad);
-	if (!subdev)
-		return -EINVAL;
-
-	sdsel.pad = pad;
-	mutex_lock(&video->mutex);
-	ret = v4l2_subdev_call(subdev, pad, set_selection, NULL, &sdsel);
-	mutex_unlock(&video->mutex);
-	if (!ret)
-		sel->r = sdsel.r;
-
-	return ret == -ENOIOCTLCMD ? -ENOTTY : ret;
-}
-
-static int
-iss_video_get_param(struct file *file, void *fh, struct v4l2_streamparm *a)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(fh);
-	struct iss_video *video = video_drvdata(file);
-
-	if (video->type != V4L2_BUF_TYPE_VIDEO_OUTPUT ||
-	    video->type != a->type)
-		return -EINVAL;
-
-	memset(a, 0, sizeof(*a));
-	a->type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
-	a->parm.output.capability = V4L2_CAP_TIMEPERFRAME;
-	a->parm.output.timeperframe = vfh->timeperframe;
-
-	return 0;
-}
-
-static int
-iss_video_set_param(struct file *file, void *fh, struct v4l2_streamparm *a)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(fh);
-	struct iss_video *video = video_drvdata(file);
-
-	if (video->type != V4L2_BUF_TYPE_VIDEO_OUTPUT ||
-	    video->type != a->type)
-		return -EINVAL;
-
-	if (a->parm.output.timeperframe.denominator == 0)
-		a->parm.output.timeperframe.denominator = 1;
-
-	vfh->timeperframe = a->parm.output.timeperframe;
-
-	return 0;
-}
-
-static int
-iss_video_reqbufs(struct file *file, void *fh, struct v4l2_requestbuffers *rb)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(fh);
-
-	return vb2_reqbufs(&vfh->queue, rb);
-}
-
-static int
-iss_video_querybuf(struct file *file, void *fh, struct v4l2_buffer *b)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(fh);
-
-	return vb2_querybuf(&vfh->queue, b);
-}
-
-static int
-iss_video_qbuf(struct file *file, void *fh, struct v4l2_buffer *b)
-{
-	struct iss_video *video = video_drvdata(file);
-	struct iss_video_fh *vfh = to_iss_video_fh(fh);
-
-	return vb2_qbuf(&vfh->queue, video->video.v4l2_dev->mdev, b);
-}
-
-static int
-iss_video_expbuf(struct file *file, void *fh, struct v4l2_exportbuffer *e)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(fh);
-
-	return vb2_expbuf(&vfh->queue, e);
-}
-
-static int
-iss_video_dqbuf(struct file *file, void *fh, struct v4l2_buffer *b)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(fh);
-
-	return vb2_dqbuf(&vfh->queue, b, file->f_flags & O_NONBLOCK);
-}
-
-/*
- * Stream management
- *
- * Every ISS pipeline has a single input and a single output. The input can be
- * either a sensor or a video node. The output is always a video node.
- *
- * As every pipeline has an output video node, the ISS video objects at the
- * pipeline output stores the pipeline state. It tracks the streaming state of
- * both the input and output, as well as the availability of buffers.
- *
- * In sensor-to-memory mode, frames are always available at the pipeline input.
- * Starting the sensor usually requires I2C transfers and must be done in
- * interruptible context. The pipeline is started and stopped synchronously
- * to the stream on/off commands. All modules in the pipeline will get their
- * subdev set stream handler called. The module at the end of the pipeline must
- * delay starting the hardware until buffers are available at its output.
- *
- * In memory-to-memory mode, starting/stopping the stream requires
- * synchronization between the input and output. ISS modules can't be stopped
- * in the middle of a frame, and at least some of the modules seem to become
- * busy as soon as they're started, even if they don't receive a frame start
- * event. For that reason frames need to be processed in single-shot mode. The
- * driver needs to wait until a frame is completely processed and written to
- * memory before restarting the pipeline for the next frame. Pipelined
- * processing might be possible but requires more testing.
- *
- * Stream start must be delayed until buffers are available at both the input
- * and output. The pipeline must be started in the vb2 queue callback with
- * the buffers queue spinlock held. The modules subdev set stream operation must
- * not sleep.
- */
-static int
-iss_video_streamon(struct file *file, void *fh, enum v4l2_buf_type type)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(fh);
-	struct iss_video *video = video_drvdata(file);
-	struct media_device *mdev = video->video.entity.graph_obj.mdev;
-	struct media_pipeline_pad_iter iter;
-	enum iss_pipeline_state state;
-	struct iss_pipeline *pipe;
-	struct iss_video *far_end;
-	struct media_pad *pad;
-	unsigned long flags;
-	int ret;
-
-	if (type != video->type)
-		return -EINVAL;
-
-	mutex_lock(&video->stream_lock);
-
-	/*
-	 * Start streaming on the pipeline. No link touching an entity in the
-	 * pipeline can be activated or deactivated once streaming is started.
-	 */
-	pipe = to_iss_pipeline(&video->video.entity) ? : &video->pipe;
-	pipe->external = NULL;
-	pipe->external_rate = 0;
-	pipe->external_bpp = 0;
-
-	ret = media_entity_enum_init(&pipe->ent_enum, mdev);
-	if (ret)
-		goto err_entity_enum_init;
-
-	if (video->iss->pdata->set_constraints)
-		video->iss->pdata->set_constraints(video->iss, true);
-
-	ret = video_device_pipeline_start(&video->video, &pipe->pipe);
-	if (ret < 0)
-		goto err_media_pipeline_start;
-
-	media_pipeline_for_each_pad(&pipe->pipe, &iter, pad)
-		media_entity_enum_set(&pipe->ent_enum, pad->entity);
-
-	/*
-	 * Verify that the currently configured format matches the output of
-	 * the connected subdev.
-	 */
-	ret = iss_video_check_format(video, vfh);
-	if (ret < 0)
-		goto err_iss_video_check_format;
-
-	video->bpl_padding = ret;
-	video->bpl_value = vfh->format.fmt.pix.bytesperline;
-
-	/*
-	 * Find the ISS video node connected at the far end of the pipeline and
-	 * update the pipeline.
-	 */
-	far_end = iss_video_far_end(video, pipe);
-	if (IS_ERR(far_end)) {
-		ret = PTR_ERR(far_end);
-		goto err_iss_video_check_format;
-	}
-
-	if (video->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) {
-		state = ISS_PIPELINE_STREAM_OUTPUT | ISS_PIPELINE_IDLE_OUTPUT;
-		pipe->input = far_end;
-		pipe->output = video;
-	} else {
-		if (!far_end) {
-			ret = -EPIPE;
-			goto err_iss_video_check_format;
-		}
-
-		state = ISS_PIPELINE_STREAM_INPUT | ISS_PIPELINE_IDLE_INPUT;
-		pipe->input = video;
-		pipe->output = far_end;
-	}
-
-	spin_lock_irqsave(&pipe->lock, flags);
-	pipe->state &= ~ISS_PIPELINE_STREAM;
-	pipe->state |= state;
-	spin_unlock_irqrestore(&pipe->lock, flags);
-
-	/*
-	 * Set the maximum time per frame as the value requested by userspace.
-	 * This is a soft limit that can be overridden if the hardware doesn't
-	 * support the request limit.
-	 */
-	if (video->type == V4L2_BUF_TYPE_VIDEO_OUTPUT)
-		pipe->max_timeperframe = vfh->timeperframe;
-
-	video->queue = &vfh->queue;
-	INIT_LIST_HEAD(&video->dmaqueue);
-	video->error = false;
-	atomic_set(&pipe->frame_number, -1);
-
-	ret = vb2_streamon(&vfh->queue, type);
-	if (ret < 0)
-		goto err_iss_video_check_format;
-
-	/*
-	 * In sensor-to-memory mode, the stream can be started synchronously
-	 * to the stream on command. In memory-to-memory mode, it will be
-	 * started when buffers are queued on both the input and output.
-	 */
-	if (!pipe->input) {
-		unsigned long flags;
-
-		ret = omap4iss_pipeline_set_stream(pipe,
-						   ISS_PIPELINE_STREAM_CONTINUOUS);
-		if (ret < 0)
-			goto err_omap4iss_set_stream;
-		spin_lock_irqsave(&video->qlock, flags);
-		if (list_empty(&video->dmaqueue))
-			video->dmaqueue_flags |= ISS_VIDEO_DMAQUEUE_UNDERRUN;
-		spin_unlock_irqrestore(&video->qlock, flags);
-	}
-
-	mutex_unlock(&video->stream_lock);
-
-	return 0;
-
-err_omap4iss_set_stream:
-	vb2_streamoff(&vfh->queue, type);
-err_iss_video_check_format:
-	video_device_pipeline_stop(&video->video);
-err_media_pipeline_start:
-	if (video->iss->pdata->set_constraints)
-		video->iss->pdata->set_constraints(video->iss, false);
-	video->queue = NULL;
-
-err_entity_enum_init:
-	media_entity_enum_cleanup(&pipe->ent_enum);
-
-	mutex_unlock(&video->stream_lock);
-
-	return ret;
-}
-
-static int
-iss_video_streamoff(struct file *file, void *fh, enum v4l2_buf_type type)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(fh);
-	struct iss_video *video = video_drvdata(file);
-	struct iss_pipeline *pipe = to_iss_pipeline(&video->video.entity);
-	enum iss_pipeline_state state;
-	unsigned long flags;
-
-	if (type != video->type)
-		return -EINVAL;
-
-	mutex_lock(&video->stream_lock);
-
-	if (!vb2_is_streaming(&vfh->queue))
-		goto done;
-
-	/* Update the pipeline state. */
-	if (video->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-		state = ISS_PIPELINE_STREAM_OUTPUT
-		      | ISS_PIPELINE_QUEUE_OUTPUT;
-	else
-		state = ISS_PIPELINE_STREAM_INPUT
-		      | ISS_PIPELINE_QUEUE_INPUT;
-
-	spin_lock_irqsave(&pipe->lock, flags);
-	pipe->state &= ~state;
-	spin_unlock_irqrestore(&pipe->lock, flags);
-
-	/* Stop the stream. */
-	omap4iss_pipeline_set_stream(pipe, ISS_PIPELINE_STREAM_STOPPED);
-	vb2_streamoff(&vfh->queue, type);
-	video->queue = NULL;
-
-	media_entity_enum_cleanup(&pipe->ent_enum);
-
-	if (video->iss->pdata->set_constraints)
-		video->iss->pdata->set_constraints(video->iss, false);
-	video_device_pipeline_stop(&video->video);
-
-done:
-	mutex_unlock(&video->stream_lock);
-	return 0;
-}
-
-static int
-iss_video_enum_input(struct file *file, void *fh, struct v4l2_input *input)
-{
-	if (input->index > 0)
-		return -EINVAL;
-
-	strscpy(input->name, "camera", sizeof(input->name));
-	input->type = V4L2_INPUT_TYPE_CAMERA;
-
-	return 0;
-}
-
-static int
-iss_video_g_input(struct file *file, void *fh, unsigned int *input)
-{
-	*input = 0;
-
-	return 0;
-}
-
-static int
-iss_video_s_input(struct file *file, void *fh, unsigned int input)
-{
-	return input == 0 ? 0 : -EINVAL;
-}
-
-static const struct v4l2_ioctl_ops iss_video_ioctl_ops = {
-	.vidioc_querycap		= iss_video_querycap,
-	.vidioc_enum_fmt_vid_cap        = iss_video_enum_format,
-	.vidioc_g_fmt_vid_cap		= iss_video_get_format,
-	.vidioc_s_fmt_vid_cap		= iss_video_set_format,
-	.vidioc_try_fmt_vid_cap		= iss_video_try_format,
-	.vidioc_g_fmt_vid_out		= iss_video_get_format,
-	.vidioc_s_fmt_vid_out		= iss_video_set_format,
-	.vidioc_try_fmt_vid_out		= iss_video_try_format,
-	.vidioc_g_selection		= iss_video_get_selection,
-	.vidioc_s_selection		= iss_video_set_selection,
-	.vidioc_g_parm			= iss_video_get_param,
-	.vidioc_s_parm			= iss_video_set_param,
-	.vidioc_reqbufs			= iss_video_reqbufs,
-	.vidioc_querybuf		= iss_video_querybuf,
-	.vidioc_qbuf			= iss_video_qbuf,
-	.vidioc_expbuf			= iss_video_expbuf,
-	.vidioc_dqbuf			= iss_video_dqbuf,
-	.vidioc_streamon		= iss_video_streamon,
-	.vidioc_streamoff		= iss_video_streamoff,
-	.vidioc_enum_input		= iss_video_enum_input,
-	.vidioc_g_input			= iss_video_g_input,
-	.vidioc_s_input			= iss_video_s_input,
-};
-
-/* -----------------------------------------------------------------------------
- * V4L2 file operations
- */
-
-static int iss_video_open(struct file *file)
-{
-	struct iss_video *video = video_drvdata(file);
-	struct iss_video_fh *handle;
-	struct vb2_queue *q;
-	int ret = 0;
-
-	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
-	if (!handle)
-		return -ENOMEM;
-
-	v4l2_fh_init(&handle->vfh, &video->video);
-	v4l2_fh_add(&handle->vfh);
-
-	/* If this is the first user, initialise the pipeline. */
-	if (!omap4iss_get(video->iss)) {
-		ret = -EBUSY;
-		goto done;
-	}
-
-	ret = v4l2_pipeline_pm_get(&video->video.entity);
-	if (ret < 0) {
-		omap4iss_put(video->iss);
-		goto done;
-	}
-
-	q = &handle->queue;
-
-	q->type = video->type;
-	q->io_modes = VB2_MMAP | VB2_DMABUF;
-	q->drv_priv = handle;
-	q->ops = &iss_video_vb2ops;
-	q->mem_ops = &vb2_dma_contig_memops;
-	q->buf_struct_size = sizeof(struct iss_buffer);
-	q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
-	q->dev = video->iss->dev;
-
-	ret = vb2_queue_init(q);
-	if (ret) {
-		omap4iss_put(video->iss);
-		goto done;
-	}
-
-	memset(&handle->format, 0, sizeof(handle->format));
-	handle->format.type = video->type;
-	handle->timeperframe.denominator = 1;
-
-	handle->video = video;
-	file->private_data = &handle->vfh;
-
-done:
-	if (ret < 0) {
-		v4l2_fh_del(&handle->vfh);
-		v4l2_fh_exit(&handle->vfh);
-		kfree(handle);
-	}
-
-	return ret;
-}
-
-static int iss_video_release(struct file *file)
-{
-	struct iss_video *video = video_drvdata(file);
-	struct v4l2_fh *vfh = file->private_data;
-	struct iss_video_fh *handle = to_iss_video_fh(vfh);
-
-	/* Disable streaming and free the buffers queue resources. */
-	iss_video_streamoff(file, vfh, video->type);
-
-	v4l2_pipeline_pm_put(&video->video.entity);
-
-	/* Release the videobuf2 queue */
-	vb2_queue_release(&handle->queue);
-
-	v4l2_fh_del(vfh);
-	v4l2_fh_exit(vfh);
-	kfree(handle);
-	file->private_data = NULL;
-
-	omap4iss_put(video->iss);
-
-	return 0;
-}
-
-static __poll_t iss_video_poll(struct file *file, poll_table *wait)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(file->private_data);
-
-	return vb2_poll(&vfh->queue, file, wait);
-}
-
-static int iss_video_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	struct iss_video_fh *vfh = to_iss_video_fh(file->private_data);
-
-	return vb2_mmap(&vfh->queue, vma);
-}
-
-static const struct v4l2_file_operations iss_video_fops = {
-	.owner = THIS_MODULE,
-	.unlocked_ioctl = video_ioctl2,
-	.open = iss_video_open,
-	.release = iss_video_release,
-	.poll = iss_video_poll,
-	.mmap = iss_video_mmap,
-};
-
-/* -----------------------------------------------------------------------------
- * ISS video core
- */
-
-static const struct iss_video_operations iss_video_dummy_ops = {
-};
-
-int omap4iss_video_init(struct iss_video *video, const char *name)
-{
-	const char *direction;
-	int ret;
-
-	switch (video->type) {
-	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
-		direction = "output";
-		video->pad.flags = MEDIA_PAD_FL_SINK;
-		break;
-	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
-		direction = "input";
-		video->pad.flags = MEDIA_PAD_FL_SOURCE;
-		break;
-
-	default:
-		return -EINVAL;
-	}
-
-	ret = media_entity_pads_init(&video->video.entity, 1, &video->pad);
-	if (ret < 0)
-		return ret;
-
-	spin_lock_init(&video->qlock);
-	mutex_init(&video->mutex);
-	atomic_set(&video->active, 0);
-
-	spin_lock_init(&video->pipe.lock);
-	mutex_init(&video->stream_lock);
-
-	/* Initialize the video device. */
-	if (!video->ops)
-		video->ops = &iss_video_dummy_ops;
-
-	video->video.fops = &iss_video_fops;
-	snprintf(video->video.name, sizeof(video->video.name),
-		 "OMAP4 ISS %s %s", name, direction);
-	video->video.vfl_type = VFL_TYPE_VIDEO;
-	video->video.release = video_device_release_empty;
-	video->video.ioctl_ops = &iss_video_ioctl_ops;
-	video->pipe.stream_state = ISS_PIPELINE_STREAM_STOPPED;
-
-	video_set_drvdata(&video->video, video);
-
-	return 0;
-}
-
-void omap4iss_video_cleanup(struct iss_video *video)
-{
-	media_entity_cleanup(&video->video.entity);
-	mutex_destroy(&video->stream_lock);
-	mutex_destroy(&video->mutex);
-}
-
-int omap4iss_video_register(struct iss_video *video, struct v4l2_device *vdev)
-{
-	int ret;
-
-	video->video.v4l2_dev = vdev;
-	if (video->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-		video->video.device_caps = V4L2_CAP_VIDEO_CAPTURE;
-	else
-		video->video.device_caps = V4L2_CAP_VIDEO_OUTPUT;
-	video->video.device_caps |= V4L2_CAP_STREAMING;
-
-	ret = video_register_device(&video->video, VFL_TYPE_VIDEO, -1);
-	if (ret < 0)
-		dev_err(video->iss->dev,
-			"could not register video device (%d)\n", ret);
-
-	return ret;
-}
-
-void omap4iss_video_unregister(struct iss_video *video)
-{
-	video_unregister_device(&video->video);
-}
diff --git a/drivers/staging/media/omap4iss/iss_video.h b/drivers/staging/media/omap4iss/iss_video.h
deleted file mode 100644
index 19668d28b682..000000000000
--- a/drivers/staging/media/omap4iss/iss_video.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * TI OMAP4 ISS V4L2 Driver - Generic video node
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Author: Sergio Aguirre <sergio.a.aguirre@gmail.com>
- */
-
-#ifndef OMAP4_ISS_VIDEO_H
-#define OMAP4_ISS_VIDEO_H
-
-#include <linux/v4l2-mediabus.h>
-#include <media/media-entity.h>
-#include <media/v4l2-dev.h>
-#include <media/v4l2-fh.h>
-#include <media/videobuf2-v4l2.h>
-#include <media/videobuf2-dma-contig.h>
-
-#define ISS_VIDEO_DRIVER_NAME		"issvideo"
-
-struct iss_device;
-struct iss_video;
-struct v4l2_mbus_framefmt;
-struct v4l2_pix_format;
-
-/*
- * struct iss_format_info - ISS media bus format information
- * @code: V4L2 media bus format code
- * @truncated: V4L2 media bus format code for the same format truncated to 10
- *	bits. Identical to @code if the format is 10 bits wide or less.
- * @uncompressed: V4L2 media bus format code for the corresponding uncompressed
- *	format. Identical to @code if the format is not DPCM compressed.
- * @flavor: V4L2 media bus format code for the same pixel layout but
- *	shifted to be 8 bits per pixel. =0 if format is not shiftable.
- * @pixelformat: V4L2 pixel format FCC identifier
- * @bpp: Bits per pixel
- */
-struct iss_format_info {
-	u32 code;
-	u32 truncated;
-	u32 uncompressed;
-	u32 flavor;
-	u32 pixelformat;
-	unsigned int bpp;
-};
-
-enum iss_pipeline_stream_state {
-	ISS_PIPELINE_STREAM_STOPPED = 0,
-	ISS_PIPELINE_STREAM_CONTINUOUS = 1,
-	ISS_PIPELINE_STREAM_SINGLESHOT = 2,
-};
-
-enum iss_pipeline_state {
-	/* The stream has been started on the input video node. */
-	ISS_PIPELINE_STREAM_INPUT = BIT(0),
-	/* The stream has been started on the output video node. */
-	ISS_PIPELINE_STREAM_OUTPUT = BIT(1),
-	/* At least one buffer is queued on the input video node. */
-	ISS_PIPELINE_QUEUE_INPUT = BIT(2),
-	/* At least one buffer is queued on the output video node. */
-	ISS_PIPELINE_QUEUE_OUTPUT = BIT(3),
-	/* The input entity is idle, ready to be started. */
-	ISS_PIPELINE_IDLE_INPUT = BIT(4),
-	/* The output entity is idle, ready to be started. */
-	ISS_PIPELINE_IDLE_OUTPUT = BIT(5),
-	/* The pipeline is currently streaming. */
-	ISS_PIPELINE_STREAM = BIT(6),
-};
-
-/*
- * struct iss_pipeline - An OMAP4 ISS hardware pipeline
- * @ent_enum: Entities in the pipeline
- * @error: A hardware error occurred during capture
- */
-struct iss_pipeline {
-	struct media_pipeline pipe;
-	spinlock_t lock;		/* Pipeline state and queue flags */
-	unsigned int state;
-	enum iss_pipeline_stream_state stream_state;
-	struct iss_video *input;
-	struct iss_video *output;
-	struct media_entity_enum ent_enum;
-	atomic_t frame_number;
-	bool do_propagation; /* of frame number */
-	bool error;
-	struct v4l2_fract max_timeperframe;
-	struct v4l2_subdev *external;
-	unsigned int external_rate;
-	int external_bpp;
-};
-
-static inline struct iss_pipeline *to_iss_pipeline(struct media_entity *entity)
-{
-	struct media_pipeline *pipe = media_entity_pipeline(entity);
-
-	if (!pipe)
-		return NULL;
-
-	return container_of(pipe, struct iss_pipeline, pipe);
-}
-
-static inline int iss_pipeline_ready(struct iss_pipeline *pipe)
-{
-	return pipe->state == (ISS_PIPELINE_STREAM_INPUT |
-			       ISS_PIPELINE_STREAM_OUTPUT |
-			       ISS_PIPELINE_QUEUE_INPUT |
-			       ISS_PIPELINE_QUEUE_OUTPUT |
-			       ISS_PIPELINE_IDLE_INPUT |
-			       ISS_PIPELINE_IDLE_OUTPUT);
-}
-
-/*
- * struct iss_buffer - ISS buffer
- * @buffer: ISS video buffer
- * @iss_addr: Physical address of the buffer.
- */
-struct iss_buffer {
-	/* common v4l buffer stuff -- must be first */
-	struct vb2_v4l2_buffer	vb;
-	struct list_head	list;
-	dma_addr_t iss_addr;
-};
-
-#define to_iss_buffer(buf)	container_of(buf, struct iss_buffer, vb)
-
-enum iss_video_dmaqueue_flags {
-	/* Set if DMA queue becomes empty when ISS_PIPELINE_STREAM_CONTINUOUS */
-	ISS_VIDEO_DMAQUEUE_UNDERRUN = BIT(0),
-	/* Set when queuing buffer to an empty DMA queue */
-	ISS_VIDEO_DMAQUEUE_QUEUED = BIT(1),
-};
-
-#define iss_video_dmaqueue_flags_clr(video)	\
-			({ (video)->dmaqueue_flags = 0; })
-
-/*
- * struct iss_video_operations - ISS video operations
- * @queue:	Resume streaming when a buffer is queued. Called on VIDIOC_QBUF
- *		if there was no buffer previously queued.
- */
-struct iss_video_operations {
-	int (*queue)(struct iss_video *video, struct iss_buffer *buffer);
-};
-
-struct iss_video {
-	struct video_device video;
-	enum v4l2_buf_type type;
-	struct media_pad pad;
-
-	struct mutex mutex;		/* format and crop settings */
-	atomic_t active;
-
-	struct iss_device *iss;
-
-	unsigned int capture_mem;
-	unsigned int bpl_alignment;	/* alignment value */
-	unsigned int bpl_zero_padding;	/* whether the alignment is optional */
-	unsigned int bpl_max;		/* maximum bytes per line value */
-	unsigned int bpl_value;		/* bytes per line value */
-	unsigned int bpl_padding;	/* padding at end of line */
-
-	/* Pipeline state */
-	struct iss_pipeline pipe;
-	struct mutex stream_lock;	/* pipeline and stream states */
-	bool error;
-
-	/* Video buffers queue */
-	struct vb2_queue *queue;
-	spinlock_t qlock;		/* protects dmaqueue and error */
-	struct list_head dmaqueue;
-	enum iss_video_dmaqueue_flags dmaqueue_flags;
-
-	const struct iss_video_operations *ops;
-};
-
-#define to_iss_video(vdev)	container_of(vdev, struct iss_video, video)
-
-struct iss_video_fh {
-	struct v4l2_fh vfh;
-	struct iss_video *video;
-	struct vb2_queue queue;
-	struct v4l2_format format;
-	struct v4l2_fract timeperframe;
-};
-
-#define to_iss_video_fh(fh)	container_of(fh, struct iss_video_fh, vfh)
-#define iss_video_queue_to_iss_video_fh(q) \
-				container_of(q, struct iss_video_fh, queue)
-
-int omap4iss_video_init(struct iss_video *video, const char *name);
-void omap4iss_video_cleanup(struct iss_video *video);
-int omap4iss_video_register(struct iss_video *video,
-			    struct v4l2_device *vdev);
-void omap4iss_video_unregister(struct iss_video *video);
-struct iss_buffer *omap4iss_video_buffer_next(struct iss_video *video);
-void omap4iss_video_cancel_stream(struct iss_video *video);
-struct media_pad *omap4iss_video_remote_pad(struct iss_video *video);
-
-const struct iss_format_info *
-omap4iss_video_format_info(u32 code);
-
-#endif /* OMAP4_ISS_VIDEO_H */
diff --git a/include/linux/platform_data/media/omap4iss.h b/include/linux/platform_data/media/omap4iss.h
deleted file mode 100644
index 2a511a8fcda7..000000000000
--- a/include/linux/platform_data/media/omap4iss.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH_ARM_PLAT_OMAP4_ISS_H
-#define ARCH_ARM_PLAT_OMAP4_ISS_H
-
-#include <linux/i2c.h>
-
-struct iss_device;
-
-enum iss_interface_type {
-	ISS_INTERFACE_CSI2A_PHY1,
-	ISS_INTERFACE_CSI2B_PHY2,
-};
-
-/**
- * struct iss_csiphy_lane: CSI2 lane position and polarity
- * @pos: position of the lane
- * @pol: polarity of the lane
- */
-struct iss_csiphy_lane {
-	u8 pos;
-	u8 pol;
-};
-
-#define ISS_CSIPHY1_NUM_DATA_LANES	4
-#define ISS_CSIPHY2_NUM_DATA_LANES	1
-
-/**
- * struct iss_csiphy_lanes_cfg - CSI2 lane configuration
- * @data: Configuration of one or two data lanes
- * @clk: Clock lane configuration
- */
-struct iss_csiphy_lanes_cfg {
-	struct iss_csiphy_lane data[ISS_CSIPHY1_NUM_DATA_LANES];
-	struct iss_csiphy_lane clk;
-};
-
-/**
- * struct iss_csi2_platform_data - CSI2 interface platform data
- * @crc: Enable the cyclic redundancy check
- * @vpclk_div: Video port output clock control
- */
-struct iss_csi2_platform_data {
-	unsigned crc:1;
-	unsigned vpclk_div:2;
-	struct iss_csiphy_lanes_cfg lanecfg;
-};
-
-struct iss_subdev_i2c_board_info {
-	struct i2c_board_info *board_info;
-	int i2c_adapter_id;
-};
-
-struct iss_v4l2_subdevs_group {
-	struct iss_subdev_i2c_board_info *subdevs;
-	enum iss_interface_type interface;
-	union {
-		struct iss_csi2_platform_data csi2;
-	} bus; /* gcc < 4.6.0 chokes on anonymous union initializers */
-};
-
-struct iss_platform_data {
-	struct iss_v4l2_subdevs_group *subdevs;
-	void (*set_constraints)(struct iss_device *iss, bool enable);
-};
-
-#endif
-- 
cgit v1.2.3


From ec763c234d7f60c5bce0fa2611ba79f5be1af76b Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Fri, 4 Oct 2024 15:10:28 -0700
Subject: Revert "rtnetlink: add guard for RTNL"

This reverts commit 464eb03c4a7cfb32cb3324249193cf6bb5b35152.

Once we have a per-netns RTNL, we won't use guard(rtnl).

Also, there's no users for now.

  $ grep -rnI "guard(rtnl" || true
  $

Suggested-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/netdev/CANn89i+KoYzUH+VPLdGmLABYf5y4TW0hrM4UAeQQJ9AREty0iw@mail.gmail.com/
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/rtnetlink.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index a7da7dfc06a2..cdfc897f1e3c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -7,7 +7,6 @@
 #include <linux/netdevice.h>
 #include <linux/wait.h>
 #include <linux/refcount.h>
-#include <linux/cleanup.h>
 #include <uapi/linux/rtnetlink.h>
 
 extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
@@ -47,8 +46,6 @@ extern int rtnl_is_locked(void);
 extern int rtnl_lock_killable(void);
 extern bool refcount_dec_and_rtnl_lock(refcount_t *r);
 
-DEFINE_LOCK_GUARD_0(rtnl, rtnl_lock(), rtnl_unlock())
-
 extern wait_queue_head_t netdev_unregistering_wq;
 extern atomic_t dev_unreg_count;
 extern struct rw_semaphore pernet_ops_rwsem;
-- 
cgit v1.2.3


From 76aed95319da25d6884dff01d5f0149e4b542f96 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Fri, 4 Oct 2024 15:10:29 -0700
Subject: rtnetlink: Add per-netns RTNL.

The goal is to break RTNL down into per-netns mutex.

This patch adds per-netns mutex and its helper functions, rtnl_net_lock()
and rtnl_net_unlock().

rtnl_net_lock() acquires the global RTNL and per-netns RTNL mutex, and
rtnl_net_unlock() releases them.

We will replace 800+ rtnl_lock() with rtnl_net_lock() and finally removes
rtnl_lock() in rtnl_net_lock().

When we need to nest per-netns RTNL mutex, we will use __rtnl_net_lock(),
and its locking order is defined by rtnl_net_lock_cmp_fn() as follows:

  1. init_net is first
  2. netns address ascending order

Note that the conversion will be done under CONFIG_DEBUG_NET_SMALL_RTNL
with LOCKDEP so that we can carefully add the extra mutex without slowing
down RTNL operations during conversion.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/rtnetlink.h   | 21 ++++++++++++++++
 include/net/net_namespace.h |  4 ++++
 net/Kconfig.debug           | 15 ++++++++++++
 net/core/net_namespace.c    |  6 +++++
 net/core/rtnetlink.c        | 58 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 104 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index cdfc897f1e3c..edd840a49989 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -92,6 +92,27 @@ static inline bool lockdep_rtnl_is_held(void)
 #define rcu_replace_pointer_rtnl(rp, p)			\
 	rcu_replace_pointer(rp, p, lockdep_rtnl_is_held())
 
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+void __rtnl_net_lock(struct net *net);
+void __rtnl_net_unlock(struct net *net);
+void rtnl_net_lock(struct net *net);
+void rtnl_net_unlock(struct net *net);
+int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b);
+#else
+static inline void __rtnl_net_lock(struct net *net) {}
+static inline void __rtnl_net_unlock(struct net *net) {}
+
+static inline void rtnl_net_lock(struct net *net)
+{
+	rtnl_lock();
+}
+
+static inline void rtnl_net_unlock(struct net *net)
+{
+	rtnl_unlock();
+}
+#endif
+
 static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
 {
 	return rtnl_dereference(dev->ingress_queue);
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index e67b483cc8bb..873c0f9fdac6 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -188,6 +188,10 @@ struct net {
 #if IS_ENABLED(CONFIG_SMC)
 	struct netns_smc	smc;
 #endif
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+	/* Move to a better place when the config guard is removed. */
+	struct mutex		rtnl_mutex;
+#endif
 } __randomize_layout;
 
 #include <linux/seq_file_net.h>
diff --git a/net/Kconfig.debug b/net/Kconfig.debug
index 5e3fffe707dd..277fab8c4d77 100644
--- a/net/Kconfig.debug
+++ b/net/Kconfig.debug
@@ -24,3 +24,18 @@ config DEBUG_NET
 	help
 	  Enable extra sanity checks in networking.
 	  This is mostly used by fuzzers, but is safe to select.
+
+config DEBUG_NET_SMALL_RTNL
+	bool "Add extra per-netns mutex inside RTNL"
+	depends on DEBUG_KERNEL && NET && LOCK_DEBUGGING_SUPPORT
+	select PROVE_LOCKING
+	default n
+	help
+	  rtnl_lock() is being replaced with rtnl_net_lock() that
+	  acquires the global RTNL and a small per-netns RTNL mutex.
+
+	  During the conversion, rtnl_net_lock() just adds an extra
+	  mutex in every RTNL scope and slows down the operations.
+
+	  Once the conversion completes, rtnl_lock() will be removed
+	  and rtnetlink will gain per-netns scalability.
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index e39479f1c9a4..105e3cd26763 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -334,6 +334,12 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
 	idr_init(&net->netns_ids);
 	spin_lock_init(&net->nsid_lock);
 	mutex_init(&net->ipv4.ra_mutex);
+
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+	mutex_init(&net->rtnl_mutex);
+	lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
+#endif
+
 	preinit_net_sysctl(net);
 }
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 682d8d3127db..46567fa54e42 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -179,6 +179,64 @@ bool lockdep_rtnl_is_held(void)
 EXPORT_SYMBOL(lockdep_rtnl_is_held);
 #endif /* #ifdef CONFIG_PROVE_LOCKING */
 
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+void __rtnl_net_lock(struct net *net)
+{
+	ASSERT_RTNL();
+
+	mutex_lock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_lock);
+
+void __rtnl_net_unlock(struct net *net)
+{
+	ASSERT_RTNL();
+
+	mutex_unlock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_unlock);
+
+void rtnl_net_lock(struct net *net)
+{
+	rtnl_lock();
+	__rtnl_net_lock(net);
+}
+EXPORT_SYMBOL(rtnl_net_lock);
+
+void rtnl_net_unlock(struct net *net)
+{
+	__rtnl_net_unlock(net);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL(rtnl_net_unlock);
+
+static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
+{
+	if (net_eq(net_a, net_b))
+		return 0;
+
+	/* always init_net first */
+	if (net_eq(net_a, &init_net))
+		return -1;
+
+	if (net_eq(net_b, &init_net))
+		return 1;
+
+	/* otherwise lock in ascending order */
+	return net_a < net_b ? -1 : 1;
+}
+
+int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b)
+{
+	const struct net *net_a, *net_b;
+
+	net_a = container_of(a, struct net, rtnl_mutex.dep_map);
+	net_b = container_of(b, struct net, rtnl_mutex.dep_map);
+
+	return rtnl_net_cmp_locks(net_a, net_b);
+}
+#endif
+
 static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
 
 static inline int rtm_msgindex(int msgtype)
-- 
cgit v1.2.3


From 844e5e7e656d3a7a904fd5607f8491d6fd01db8e Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Fri, 4 Oct 2024 15:10:30 -0700
Subject: rtnetlink: Add assertion helpers for per-netns RTNL.

Once an RTNL scope is converted with rtnl_net_lock(), we will replace
RTNL helper functions inside the scope with the following per-netns
alternatives:

  ASSERT_RTNL()           -> ASSERT_RTNL_NET(net)
  rcu_dereference_rtnl(p) -> rcu_dereference_rtnl_net(net, p)

Note that the per-netns helpers are equivalent to the conventional
helpers unless CONFIG_DEBUG_NET_SMALL_RTNL is enabled.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/rtnetlink.h | 45 +++++++++++++++++++++++++++++++++++++++++----
 net/core/rtnetlink.c      | 12 ++++++++++++
 2 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index edd840a49989..8468a4ce8510 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -51,6 +51,10 @@ extern atomic_t dev_unreg_count;
 extern struct rw_semaphore pernet_ops_rwsem;
 extern struct rw_semaphore net_rwsem;
 
+#define ASSERT_RTNL() \
+	WARN_ONCE(!rtnl_is_locked(), \
+		  "RTNL: assertion failed at %s (%d)\n", __FILE__,  __LINE__)
+
 #ifdef CONFIG_PROVE_LOCKING
 extern bool lockdep_rtnl_is_held(void);
 #else
@@ -98,6 +102,22 @@ void __rtnl_net_unlock(struct net *net);
 void rtnl_net_lock(struct net *net);
 void rtnl_net_unlock(struct net *net);
 int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b);
+
+bool rtnl_net_is_locked(struct net *net);
+
+#define ASSERT_RTNL_NET(net)						\
+	WARN_ONCE(!rtnl_net_is_locked(net),				\
+		  "RTNL_NET: assertion failed at %s (%d)\n",		\
+		  __FILE__,  __LINE__)
+
+bool lockdep_rtnl_net_is_held(struct net *net);
+
+#define rcu_dereference_rtnl_net(net, p)				\
+	rcu_dereference_check(p, lockdep_rtnl_net_is_held(net))
+#define rtnl_net_dereference(net, p)					\
+	rcu_dereference_protected(p, lockdep_rtnl_net_is_held(net))
+#define rcu_replace_pointer_rtnl_net(net, rp, p)			\
+	rcu_replace_pointer(rp, p, lockdep_rtnl_net_is_held(net))
 #else
 static inline void __rtnl_net_lock(struct net *net) {}
 static inline void __rtnl_net_unlock(struct net *net) {}
@@ -111,6 +131,27 @@ static inline void rtnl_net_unlock(struct net *net)
 {
 	rtnl_unlock();
 }
+
+static inline void ASSERT_RTNL_NET(struct net *net)
+{
+	ASSERT_RTNL();
+}
+
+static inline void *rcu_dereference_rtnl_net(struct net *net, void *p)
+{
+	return rcu_dereference_rtnl(p);
+}
+
+static inline void *rtnl_net_dereference(struct net *net, void *p)
+{
+	return rtnl_dereference(p);
+}
+
+static inline void *rcu_replace_pointer_rtnl_net(struct net *net,
+						 void *rp, void *p)
+{
+	return rcu_replace_pointer_rtnl(rp, p);
+}
 #endif
 
 static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
@@ -140,10 +181,6 @@ void rtnetlink_init(void);
 void __rtnl_unlock(void);
 void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail);
 
-#define ASSERT_RTNL() \
-	WARN_ONCE(!rtnl_is_locked(), \
-		  "RTNL: assertion failed at %s (%d)\n", __FILE__,  __LINE__)
-
 extern int ndo_dflt_fdb_dump(struct sk_buff *skb,
 			     struct netlink_callback *cb,
 			     struct net_device *dev,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 46567fa54e42..6d68247aea70 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -235,6 +235,18 @@ int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *
 
 	return rtnl_net_cmp_locks(net_a, net_b);
 }
+
+bool rtnl_net_is_locked(struct net *net)
+{
+	return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_net_is_locked);
+
+bool lockdep_rtnl_net_is_held(struct net *net)
+{
+	return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_net_is_held);
 #endif
 
 static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
-- 
cgit v1.2.3


From 55b834873e800a5809787ce6aedcb70ee49a596f Mon Sep 17 00:00:00 2001
From: Dmitry Perchanov <dmitry.perchanov@intel.com>
Date: Mon, 26 Aug 2024 16:05:04 +0300
Subject: media: uvcvideo: Add luma 16-bit interlaced pixel format

The formats added by this patch are:

        UVC_GUID_FORMAT_Y16I

Interlaced lumina format primary use in RealSense Depth cameras with
stereo stream for left and right image sensors.

Signed-off-by: Dmitry Perchanov <dmitry.perchanov@intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Link: https://lore.kernel.org/r/a717a912035b0a0f82b2f35719cca0c5269e995f.camel@intel.com
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/common/uvc.c | 4 ++++
 include/linux/usb/uvc.h    | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/media/common/uvc.c b/drivers/media/common/uvc.c
index c54c2268fee6..027498d37464 100644
--- a/drivers/media/common/uvc.c
+++ b/drivers/media/common/uvc.c
@@ -120,6 +120,10 @@ static const struct uvc_format_desc uvc_fmts[] = {
 		.guid		= UVC_GUID_FORMAT_Y12I,
 		.fcc		= V4L2_PIX_FMT_Y12I,
 	},
+	{
+		.guid		= UVC_GUID_FORMAT_Y16I,
+		.fcc		= V4L2_PIX_FMT_Y16I,
+	},
 	{
 		.guid		= UVC_GUID_FORMAT_Z16,
 		.fcc		= V4L2_PIX_FMT_Z16,
diff --git a/include/linux/usb/uvc.h b/include/linux/usb/uvc.h
index 88d96095bcb1..1c16be20c966 100644
--- a/include/linux/usb/uvc.h
+++ b/include/linux/usb/uvc.h
@@ -118,6 +118,9 @@
 #define UVC_GUID_FORMAT_Y12I \
 	{ 'Y',  '1',  '2',  'I', 0x00, 0x00, 0x10, 0x00, \
 	 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}
+#define UVC_GUID_FORMAT_Y16I \
+	{ 'Y',  '1',  '6',  'I', 0x00, 0x00, 0x10, 0x00, \
+	 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}
 #define UVC_GUID_FORMAT_Z16 \
 	{ 'Z',  '1',  '6',  ' ', 0x00, 0x00, 0x10, 0x00, \
 	 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}
-- 
cgit v1.2.3


From a7e742e416bc331c0f409d6a3630471896e696d3 Mon Sep 17 00:00:00 2001
From: David Given <dg@cowlark.com>
Date: Wed, 18 Sep 2024 20:05:39 +0200
Subject: media: uvcvideo: Add support for the D3DFMT_R5G6B5 pixmap type

This media format is used by the NXP Semiconductors 1fc9:009b chipset,
used by the Kaiweets KTI-W02 infrared camera.

Signed-off-by: David Given <dg@cowlark.com>
Reviewed-by: Ricardo Ribalda <ribalda@chromium.org>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Link: https://lore.kernel.org/r/20240918180540.10830-1-dg@cowlark.com
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/common/uvc.c | 4 ++++
 include/linux/usb/uvc.h    | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/media/common/uvc.c b/drivers/media/common/uvc.c
index 027498d37464..1ad4604474ac 100644
--- a/drivers/media/common/uvc.c
+++ b/drivers/media/common/uvc.c
@@ -96,6 +96,10 @@ static const struct uvc_format_desc uvc_fmts[] = {
 		.guid		= UVC_GUID_FORMAT_RGBP,
 		.fcc		= V4L2_PIX_FMT_RGB565,
 	},
+	{
+		.guid		= UVC_GUID_FORMAT_D3DFMT_R5G6B5,
+		.fcc		= V4L2_PIX_FMT_RGB565,
+	},
 	{
 		.guid		= UVC_GUID_FORMAT_BGR3,
 		.fcc		= V4L2_PIX_FMT_BGR24,
diff --git a/include/linux/usb/uvc.h b/include/linux/usb/uvc.h
index 1c16be20c966..bce95153e5a6 100644
--- a/include/linux/usb/uvc.h
+++ b/include/linux/usb/uvc.h
@@ -143,6 +143,9 @@
 #define UVC_GUID_FORMAT_D3DFMT_L8 \
 	{0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, \
 	 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}
+#define UVC_GUID_FORMAT_D3DFMT_R5G6B5 \
+	{0x7b, 0xeb, 0x36, 0xe4, 0x4f, 0x52, 0xce, 0x11, \
+	 0x9f, 0x53, 0x00, 0x20, 0xaf, 0x0b, 0xa7, 0x70}
 #define UVC_GUID_FORMAT_KSMEDIA_L8_IR \
 	{0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x10, 0x00, \
 	 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}
-- 
cgit v1.2.3


From 36e69b160705b65bf136c2fb6a1194447eeb8478 Mon Sep 17 00:00:00 2001
From: Dragan Simic <dsimic@manjaro.org>
Date: Sun, 29 Sep 2024 11:21:16 +0200
Subject: driver core: Add device probe log helper dev_warn_probe()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some drivers can still provide their functionality to a certain extent
even when some of their resource acquisitions eventually fail.  In such
cases, emitting errors isn't the desired action, but warnings should be
emitted instead.

To solve this, introduce dev_warn_probe() as a new device probe log helper,
which behaves identically as the already existing dev_err_probe(), while it
produces warnings instead of errors.  The intended use is with the resources
that are actually optional for a particular driver.

While there, copyedit the kerneldoc for dev_err_probe() a bit, to simplify
its wording a bit, and reuse it as the kerneldoc for dev_warn_probe(), with
the necessary wording adjustments, of course.

Signed-off-by: Dragan Simic <dsimic@manjaro.org>
Tested-by: Hélène Vulquin <oss@helene.moe>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/2be0a28538bb2a3d1bcc91e2ca1f2d0dc09146d9.1727601608.git.dsimic@manjaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/core.c        | 129 +++++++++++++++++++++++++++++++++++----------
 include/linux/dev_printk.h |   1 +
 2 files changed, 102 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index a4c853411a6b..a84a7b952cfd 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -4980,6 +4980,49 @@ define_dev_printk_level(_dev_info, KERN_INFO);
 
 #endif
 
+static void __dev_probe_failed(const struct device *dev, int err, bool fatal,
+			       const char *fmt, va_list vargsp)
+{
+	struct va_format vaf;
+	va_list vargs;
+
+	/*
+	 * On x86_64 and possibly on other architectures, va_list is actually a
+	 * size-1 array containing a structure.  As a result, function parameter
+	 * vargsp decays from T[1] to T*, and &vargsp has type T** rather than
+	 * T(*)[1], which is expected by its assignment to vaf.va below.
+	 *
+	 * One standard way to solve this mess is by creating a copy in a local
+	 * variable of type va_list and then using a pointer to that local copy
+	 * instead, which is the approach employed here.
+	 */
+	va_copy(vargs, vargsp);
+
+	vaf.fmt = fmt;
+	vaf.va = &vargs;
+
+	switch (err) {
+	case -EPROBE_DEFER:
+		device_set_deferred_probe_reason(dev, &vaf);
+		dev_dbg(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
+		break;
+
+	case -ENOMEM:
+		/* Don't print anything on -ENOMEM, there's already enough output */
+		break;
+
+	default:
+		/* Log fatal final failures as errors, otherwise produce warnings */
+		if (fatal)
+			dev_err(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
+		else
+			dev_warn(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
+		break;
+	}
+
+	va_end(vargs);
+}
+
 /**
  * dev_err_probe - probe error check and log helper
  * @dev: the pointer to the struct device
@@ -4992,7 +5035,7 @@ define_dev_printk_level(_dev_info, KERN_INFO);
  * -EPROBE_DEFER and propagate error upwards.
  * In case of -EPROBE_DEFER it sets also defer probe reason, which can be
  * checked later by reading devices_deferred debugfs attribute.
- * It replaces code sequence::
+ * It replaces the following code sequence::
  *
  * 	if (err != -EPROBE_DEFER)
  * 		dev_err(dev, ...);
@@ -5004,47 +5047,77 @@ define_dev_printk_level(_dev_info, KERN_INFO);
  *
  * 	return dev_err_probe(dev, err, ...);
  *
- * Using this helper in your probe function is totally fine even if @err is
- * known to never be -EPROBE_DEFER.
+ * Using this helper in your probe function is totally fine even if @err
+ * is known to never be -EPROBE_DEFER.
  * The benefit compared to a normal dev_err() is the standardized format
- * of the error code, it being emitted symbolically (i.e. you get "EAGAIN"
- * instead of "-35") and the fact that the error code is returned which allows
- * more compact error paths.
+ * of the error code, which is emitted symbolically (i.e. you get "EAGAIN"
+ * instead of "-35"), and having the error code returned allows more
+ * compact error paths.
  *
  * Returns @err.
  */
 int dev_err_probe(const struct device *dev, int err, const char *fmt, ...)
 {
-	struct va_format vaf;
-	va_list args;
+	va_list vargs;
 
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
+	va_start(vargs, fmt);
 
-	switch (err) {
-	case -EPROBE_DEFER:
-		device_set_deferred_probe_reason(dev, &vaf);
-		dev_dbg(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
-		break;
+	/* Use dev_err() for logging when err doesn't equal -EPROBE_DEFER */
+	__dev_probe_failed(dev, err, true, fmt, vargs);
 
-	case -ENOMEM:
-		/*
-		 * We don't print anything on -ENOMEM, there is already enough
-		 * output.
-		 */
-		break;
+	va_end(vargs);
 
-	default:
-		dev_err(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
-		break;
-	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(dev_err_probe);
 
-	va_end(args);
+/**
+ * dev_warn_probe - probe error check and log helper
+ * @dev: the pointer to the struct device
+ * @err: error value to test
+ * @fmt: printf-style format string
+ * @...: arguments as specified in the format string
+ *
+ * This helper implements common pattern present in probe functions for error
+ * checking: print debug or warning message depending if the error value is
+ * -EPROBE_DEFER and propagate error upwards.
+ * In case of -EPROBE_DEFER it sets also defer probe reason, which can be
+ * checked later by reading devices_deferred debugfs attribute.
+ * It replaces the following code sequence::
+ *
+ * 	if (err != -EPROBE_DEFER)
+ * 		dev_warn(dev, ...);
+ * 	else
+ * 		dev_dbg(dev, ...);
+ * 	return err;
+ *
+ * with::
+ *
+ * 	return dev_warn_probe(dev, err, ...);
+ *
+ * Using this helper in your probe function is totally fine even if @err
+ * is known to never be -EPROBE_DEFER.
+ * The benefit compared to a normal dev_warn() is the standardized format
+ * of the error code, which is emitted symbolically (i.e. you get "EAGAIN"
+ * instead of "-35"), and having the error code returned allows more
+ * compact error paths.
+ *
+ * Returns @err.
+ */
+int dev_warn_probe(const struct device *dev, int err, const char *fmt, ...)
+{
+	va_list vargs;
+
+	va_start(vargs, fmt);
+
+	/* Use dev_warn() for logging when err doesn't equal -EPROBE_DEFER */
+	__dev_probe_failed(dev, err, false, fmt, vargs);
+
+	va_end(vargs);
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(dev_err_probe);
+EXPORT_SYMBOL_GPL(dev_warn_probe);
 
 static inline bool fwnode_is_primary(struct fwnode_handle *fwnode)
 {
diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h
index ca32b5bb28eb..eb2094e43050 100644
--- a/include/linux/dev_printk.h
+++ b/include/linux/dev_printk.h
@@ -276,6 +276,7 @@ do {									\
 			dev_driver_string(dev), dev_name(dev), ## arg)
 
 __printf(3, 4) int dev_err_probe(const struct device *dev, int err, const char *fmt, ...);
+__printf(3, 4) int dev_warn_probe(const struct device *dev, int err, const char *fmt, ...);
 
 /* Simple helper for dev_err_probe() when ERR_PTR() is to be returned. */
 #define dev_err_ptr_probe(dev, ___err, fmt, ...) \
-- 
cgit v1.2.3


From 0b028ff7e70ecbe5240ad92e36a664af5cf7f382 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sun, 6 Oct 2024 23:55:11 +0100
Subject: auxdisplay: Remove unused functions

cfag12864b_getrate() and cfag12864b_isenabled() were both added
in commit 70e840499aae ("[PATCH] drivers: add LCD support")
but never used.

Remove them.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/auxdisplay/cfag12864b.c | 12 ------------
 include/linux/cfag12864b.h      | 17 -----------------
 2 files changed, 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/auxdisplay/cfag12864b.c b/drivers/auxdisplay/cfag12864b.c
index 6526aa51fb1d..e1a94ae3eb0c 100644
--- a/drivers/auxdisplay/cfag12864b.c
+++ b/drivers/auxdisplay/cfag12864b.c
@@ -37,11 +37,6 @@ module_param(cfag12864b_rate, uint, 0444);
 MODULE_PARM_DESC(cfag12864b_rate,
 	"Refresh rate (hertz)");
 
-unsigned int cfag12864b_getrate(void)
-{
-	return cfag12864b_rate;
-}
-
 /*
  * cfag12864b Commands
  *
@@ -249,11 +244,6 @@ void cfag12864b_disable(void)
 	mutex_unlock(&cfag12864b_mutex);
 }
 
-unsigned char cfag12864b_isenabled(void)
-{
-	return cfag12864b_updating;
-}
-
 static void cfag12864b_update(struct work_struct *work)
 {
 	unsigned char c;
@@ -293,10 +283,8 @@ static void cfag12864b_update(struct work_struct *work)
  */
 
 EXPORT_SYMBOL_GPL(cfag12864b_buffer);
-EXPORT_SYMBOL_GPL(cfag12864b_getrate);
 EXPORT_SYMBOL_GPL(cfag12864b_enable);
 EXPORT_SYMBOL_GPL(cfag12864b_disable);
-EXPORT_SYMBOL_GPL(cfag12864b_isenabled);
 
 /*
  * Is the module inited?
diff --git a/include/linux/cfag12864b.h b/include/linux/cfag12864b.h
index 6617d9c68d86..83e6613d12ae 100644
--- a/include/linux/cfag12864b.h
+++ b/include/linux/cfag12864b.h
@@ -27,13 +27,6 @@
  */
 extern unsigned char * cfag12864b_buffer;
 
-/*
- * Get the refresh rate of the LCD
- *
- * Returns the refresh rate (hertz).
- */
-extern unsigned int cfag12864b_getrate(void);
-
 /*
  * Enable refreshing
  *
@@ -49,16 +42,6 @@ extern unsigned char cfag12864b_enable(void);
  */
 extern void cfag12864b_disable(void);
 
-/*
- * Is enabled refreshing? (is anyone using the module?)
- *
- * Returns 0 if refreshing is not enabled (anyone is using it),
- * or != 0 if refreshing is enabled (someone is using it).
- *
- * Useful for buffer read-only modules.
- */
-extern unsigned char cfag12864b_isenabled(void);
-
 /*
  * Is the module inited?
  */
-- 
cgit v1.2.3


From 581434654e01ec79dd02c21448ac84e2ce2d1a64 Mon Sep 17 00:00:00 2001
From: Chen Ridong <chenridong@huawei.com>
Date: Tue, 8 Oct 2024 11:24:58 +0000
Subject: workqueue: Adjust WQ_MAX_ACTIVE from 512 to 2048

WQ_MAX_ACTIVE is currently set to 512, which was established approximately
15 yeas ago. However, with the significant increase in machine sizes and
capabilities, the previous limit of 256 concurrent tasks is no longer
sufficient. Therefore, we propose to increase WQ_MAX_ACTIVE to 2048.
and WQ_DFL_ACTIVE is 1024 now.

Signed-off-by: Chen Ridong <chenridong@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/core-api/workqueue.rst | 4 ++--
 include/linux/workqueue.h            | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst
index 2b813f80ce39..e295835fc116 100644
--- a/Documentation/core-api/workqueue.rst
+++ b/Documentation/core-api/workqueue.rst
@@ -245,8 +245,8 @@ CPU which can be assigned to the work items of a wq. For example, with
 at the same time per CPU. This is always a per-CPU attribute, even for
 unbound workqueues.
 
-The maximum limit for ``@max_active`` is 512 and the default value used
-when 0 is specified is 256. These values are chosen sufficiently high
+The maximum limit for ``@max_active`` is 2048 and the default value used
+when 0 is specified is 1024. These values are chosen sufficiently high
 such that they are not the limiting factor while providing protection in
 runaway cases.
 
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 59c2695e12e7..b0dc957c3e56 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -412,7 +412,7 @@ enum wq_flags {
 };
 
 enum wq_consts {
-	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
+	WQ_MAX_ACTIVE		= 2048,	  /* I like 2048, better ideas? */
 	WQ_UNBOUND_MAX_ACTIVE	= WQ_MAX_ACTIVE,
 	WQ_DFL_ACTIVE		= WQ_MAX_ACTIVE / 2,
 
-- 
cgit v1.2.3


From aefa398d93d5db7c555be78a605ff015357f127d Mon Sep 17 00:00:00 2001
From: Joshua Hahn <joshua.hahn6@gmail.com>
Date: Wed, 2 Oct 2024 11:47:16 -0700
Subject: cgroup/rstat: Tracking cgroup-level niced CPU time

Cgroup-level CPU statistics currently include time spent on
user/system processes, but do not include niced CPU time (despite
already being tracked). This patch exposes niced CPU time to the
userspace, allowing users to get a better understanding of their
hardware limits and can facilitate more informed workload distribution.

A new field 'ntime' is added to struct cgroup_base_stat as opposed to
struct task_cputime to minimize footprint.

Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |  1 +
 kernel/cgroup/rstat.c       | 19 ++++++++++++++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 47ae4c4d924c..0a80ef9191a6 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -327,6 +327,7 @@ struct cgroup_base_stat {
 #ifdef CONFIG_SCHED_CORE
 	u64 forceidle_sum;
 #endif
+	u64 ntime;
 };
 
 /*
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index a06b45272411..5877974ece92 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -444,6 +444,7 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
 #ifdef CONFIG_SCHED_CORE
 	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
 #endif
+	dst_bstat->ntime += src_bstat->ntime;
 }
 
 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -455,6 +456,7 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 #ifdef CONFIG_SCHED_CORE
 	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
 #endif
+	dst_bstat->ntime -= src_bstat->ntime;
 }
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -534,8 +536,10 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
 	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
 
 	switch (index) {
-	case CPUTIME_USER:
 	case CPUTIME_NICE:
+		rstatc->bstat.ntime += delta_exec;
+		fallthrough;
+	case CPUTIME_USER:
 		rstatc->bstat.cputime.utime += delta_exec;
 		break;
 	case CPUTIME_SYSTEM:
@@ -591,6 +595,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
 #ifdef CONFIG_SCHED_CORE
 		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
 #endif
+		bstat->ntime += cpustat[CPUTIME_NICE];
 	}
 }
 
@@ -608,13 +613,14 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
 void cgroup_base_stat_cputime_show(struct seq_file *seq)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	u64 usage, utime, stime;
+	u64 usage, utime, stime, ntime;
 
 	if (cgroup_parent(cgrp)) {
 		cgroup_rstat_flush_hold(cgrp);
 		usage = cgrp->bstat.cputime.sum_exec_runtime;
 		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
 			       &utime, &stime);
+		ntime = cgrp->bstat.ntime;
 		cgroup_rstat_flush_release(cgrp);
 	} else {
 		/* cgrp->bstat of root is not actually used, reuse it */
@@ -622,16 +628,19 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 		usage = cgrp->bstat.cputime.sum_exec_runtime;
 		utime = cgrp->bstat.cputime.utime;
 		stime = cgrp->bstat.cputime.stime;
+		ntime = cgrp->bstat.ntime;
 	}
 
 	do_div(usage, NSEC_PER_USEC);
 	do_div(utime, NSEC_PER_USEC);
 	do_div(stime, NSEC_PER_USEC);
+	do_div(ntime, NSEC_PER_USEC);
 
 	seq_printf(seq, "usage_usec %llu\n"
-		   "user_usec %llu\n"
-		   "system_usec %llu\n",
-		   usage, utime, stime);
+			"user_usec %llu\n"
+			"system_usec %llu\n"
+			"nice_usec %llu\n",
+			usage, utime, stime, ntime);
 
 	cgroup_force_idle_show(seq, &cgrp->bstat);
 }
-- 
cgit v1.2.3


From 49e4154f4b16345da5e219b23ed9737a6e735bc1 Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian@huaweicloud.com>
Date: Wed, 11 Sep 2024 09:00:26 +0800
Subject: tracing: Remove TRACE_EVENT_FL_FILTERED logic

After commit dcb0b5575d24 ("tracing: Remove TRACE_EVENT_FL_USE_CALL_FILTER
 logic"), no one's going to set the TRACE_EVENT_FL_FILTERED or change the
call->filter, so remove related logic.

Link: https://lore.kernel.org/20240911010026.2302849-1-zhengyejian@huaweicloud.com
Signed-off-by: Zheng Yejian <zhengyejian@huaweicloud.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h         |  4 ----
 kernel/trace/trace.c                 | 44 ++++++++----------------------------
 kernel/trace/trace.h                 |  4 ----
 kernel/trace/trace_branch.c          |  4 +---
 kernel/trace/trace_events.c          |  2 --
 kernel/trace/trace_functions_graph.c |  8 ++-----
 kernel/trace/trace_hwlat.c           |  4 +---
 kernel/trace/trace_mmiotrace.c       |  8 ++-----
 kernel/trace/trace_osnoise.c         | 12 +++-------
 kernel/trace/trace_sched_wakeup.c    |  8 ++-----
 10 files changed, 20 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 42bedcddd511..f8f2e52653df 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -326,7 +326,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
 void trace_event_buffer_commit(struct trace_event_buffer *fbuffer);
 
 enum {
-	TRACE_EVENT_FL_FILTERED_BIT,
 	TRACE_EVENT_FL_CAP_ANY_BIT,
 	TRACE_EVENT_FL_NO_SET_FILTER_BIT,
 	TRACE_EVENT_FL_IGNORE_ENABLE_BIT,
@@ -341,7 +340,6 @@ enum {
 
 /*
  * Event flags:
- *  FILTERED	  - The event has a filter attached
  *  CAP_ANY	  - Any user can enable for perf
  *  NO_SET_FILTER - Set when filter has error and is to be ignored
  *  IGNORE_ENABLE - For trace internal events, do not enable with debugfs file
@@ -356,7 +354,6 @@ enum {
  *                   to a tracepoint yet, then it is cleared when it is.
  */
 enum {
-	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
 	TRACE_EVENT_FL_CAP_ANY		= (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
 	TRACE_EVENT_FL_NO_SET_FILTER	= (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT),
 	TRACE_EVENT_FL_IGNORE_ENABLE	= (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
@@ -381,7 +378,6 @@ struct trace_event_call {
 	};
 	struct trace_event	event;
 	char			*print_fmt;
-	struct event_filter	*filter;
 	/*
 	 * Static events can disappear with modules,
 	 * where as dynamic ones need their own ref count.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1c69ca1f1088..bdb776e6ceb9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -593,19 +593,6 @@ int tracing_check_open_get_tr(struct trace_array *tr)
 	return 0;
 }
 
-int call_filter_check_discard(struct trace_event_call *call, void *rec,
-			      struct trace_buffer *buffer,
-			      struct ring_buffer_event *event)
-{
-	if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
-	    !filter_match_preds(call->filter, rec)) {
-		__trace_event_discard_commit(buffer, event);
-		return 1;
-	}
-
-	return 0;
-}
-
 /**
  * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
  * @filtered_pids: The list of pids to check
@@ -2889,7 +2876,6 @@ void
 trace_function(struct trace_array *tr, unsigned long ip, unsigned long
 	       parent_ip, unsigned int trace_ctx)
 {
-	struct trace_event_call *call = &event_function;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
@@ -2902,11 +2888,9 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 
-	if (!call_filter_check_discard(call, entry, buffer, event)) {
-		if (static_branch_unlikely(&trace_function_exports_enabled))
-			ftrace_exports(event, TRACE_EXPORT_FUNCTION);
-		__buffer_unlock_commit(buffer, event);
-	}
+	if (static_branch_unlikely(&trace_function_exports_enabled))
+		ftrace_exports(event, TRACE_EXPORT_FUNCTION);
+	__buffer_unlock_commit(buffer, event);
 }
 
 #ifdef CONFIG_STACKTRACE
@@ -2932,7 +2916,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
 				 unsigned int trace_ctx,
 				 int skip, struct pt_regs *regs)
 {
-	struct trace_event_call *call = &event_kernel_stack;
 	struct ring_buffer_event *event;
 	unsigned int size, nr_entries;
 	struct ftrace_stack *fstack;
@@ -2986,8 +2969,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
 	memcpy(&entry->caller, fstack->calls,
 	       flex_array_size(entry, caller, nr_entries));
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		__buffer_unlock_commit(buffer, event);
+	__buffer_unlock_commit(buffer, event);
 
  out:
 	/* Again, don't let gcc optimize things here */
@@ -3060,7 +3042,6 @@ static void
 ftrace_trace_userstack(struct trace_array *tr,
 		       struct trace_buffer *buffer, unsigned int trace_ctx)
 {
-	struct trace_event_call *call = &event_user_stack;
 	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 
@@ -3094,8 +3075,7 @@ ftrace_trace_userstack(struct trace_array *tr,
 	memset(&entry->caller, 0, sizeof(entry->caller));
 
 	stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		__buffer_unlock_commit(buffer, event);
+	__buffer_unlock_commit(buffer, event);
 
  out_drop_count:
 	__this_cpu_dec(user_stack_count);
@@ -3264,7 +3244,6 @@ static void trace_printk_start_stop_comm(int enabled)
  */
 int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 {
-	struct trace_event_call *call = &event_bprint;
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
 	struct trace_array *tr = READ_ONCE(printk_trace);
@@ -3308,10 +3287,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	entry->fmt			= fmt;
 
 	memcpy(entry->buf, tbuffer, sizeof(u32) * len);
-	if (!call_filter_check_discard(call, entry, buffer, event)) {
-		__buffer_unlock_commit(buffer, event);
-		ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
-	}
+	__buffer_unlock_commit(buffer, event);
+	ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
 
 out:
 	ring_buffer_nest_end(buffer);
@@ -3331,7 +3308,6 @@ static int
 __trace_array_vprintk(struct trace_buffer *buffer,
 		      unsigned long ip, const char *fmt, va_list args)
 {
-	struct trace_event_call *call = &event_print;
 	struct ring_buffer_event *event;
 	int len = 0, size;
 	struct print_entry *entry;
@@ -3366,10 +3342,8 @@ __trace_array_vprintk(struct trace_buffer *buffer,
 	entry->ip = ip;
 
 	memcpy(&entry->buf, tbuffer, len + 1);
-	if (!call_filter_check_discard(call, entry, buffer, event)) {
-		__buffer_unlock_commit(buffer, event);
-		ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
-	}
+	__buffer_unlock_commit(buffer, event);
+	ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
 
 out:
 	ring_buffer_nest_end(buffer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c866991b9c78..638f452eec10 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1429,10 +1429,6 @@ struct trace_subsystem_dir {
 	int				nr_events;
 };
 
-extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
-				     struct trace_buffer *buffer,
-				     struct ring_buffer_event *event);
-
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct trace_buffer *buffer,
 				     struct ring_buffer_event *event,
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index aa63548873c3..6d08a5523ce0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,7 +30,6 @@ static struct trace_array *branch_tracer;
 static void
 probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 {
-	struct trace_event_call *call = &event_branch;
 	struct trace_array *tr = branch_tracer;
 	struct trace_buffer *buffer;
 	struct trace_array_cpu *data;
@@ -80,8 +79,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 	entry->line = f->data.line;
 	entry->correct = val == expect;
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit_nostack(buffer, event);
+	trace_buffer_unlock_commit_nostack(buffer, event);
 
  out:
 	current->trace_recursion &= ~TRACE_BRANCH_BIT;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7266ec2a4eea..77e68efbd43e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3149,8 +3149,6 @@ static void __trace_remove_event_call(struct trace_event_call *call)
 {
 	event_remove(call);
 	trace_destroy_fields(call);
-	free_event_filter(call->filter);
-	call->filter = NULL;
 }
 
 static int probe_remove_event_call(struct trace_event_call *call)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a569daaac4c4..ab57ec78ca04 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -102,7 +102,6 @@ int __trace_graph_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
 				unsigned int trace_ctx)
 {
-	struct trace_event_call *call = &event_funcgraph_entry;
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ftrace_graph_ent_entry *entry;
@@ -113,8 +112,7 @@ int __trace_graph_entry(struct trace_array *tr,
 		return 0;
 	entry	= ring_buffer_event_data(event);
 	entry->graph_ent			= *trace;
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit_nostack(buffer, event);
+	trace_buffer_unlock_commit_nostack(buffer, event);
 
 	return 1;
 }
@@ -223,7 +221,6 @@ void __trace_graph_return(struct trace_array *tr,
 				struct ftrace_graph_ret *trace,
 				unsigned int trace_ctx)
 {
-	struct trace_event_call *call = &event_funcgraph_exit;
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ftrace_graph_ret_entry *entry;
@@ -234,8 +231,7 @@ void __trace_graph_return(struct trace_array *tr,
 		return;
 	entry	= ring_buffer_event_data(event);
 	entry->ret				= *trace;
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit_nostack(buffer, event);
+	trace_buffer_unlock_commit_nostack(buffer, event);
 }
 
 void trace_graph_return(struct ftrace_graph_ret *trace,
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 3bd6071441ad..b65353ec2837 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -130,7 +130,6 @@ static bool hwlat_busy;
 static void trace_hwlat_sample(struct hwlat_sample *sample)
 {
 	struct trace_array *tr = hwlat_trace;
-	struct trace_event_call *call = &event_hwlat;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct hwlat_entry *entry;
@@ -148,8 +147,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
 	entry->nmi_count		= sample->nmi_count;
 	entry->count			= sample->count;
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit_nostack(buffer, event);
+	trace_buffer_unlock_commit_nostack(buffer, event);
 }
 
 /* Macros to encapsulate the time capturing infrastructure */
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 64e77b513697..ba5858866b2f 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -294,7 +294,6 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct mmiotrace_rw *rw)
 {
-	struct trace_event_call *call = &event_mmiotrace_rw;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
@@ -310,8 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry->rw			= *rw;
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+	trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -325,7 +323,6 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct mmiotrace_map *map)
 {
-	struct trace_event_call *call = &event_mmiotrace_map;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
@@ -341,8 +338,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry->map			= *map;
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+	trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index a50ed23bee77..b9f96c77527d 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -499,7 +499,6 @@ static void print_osnoise_headers(struct seq_file *s)
 static void
 __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
 {
-	struct trace_event_call *call = &event_osnoise;
 	struct ring_buffer_event *event;
 	struct osnoise_entry *entry;
 
@@ -517,8 +516,7 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe
 	entry->softirq_count	= sample->softirq_count;
 	entry->thread_count	= sample->thread_count;
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit_nostack(buffer, event);
+	trace_buffer_unlock_commit_nostack(buffer, event);
 }
 
 /*
@@ -578,7 +576,6 @@ static void print_timerlat_headers(struct seq_file *s)
 static void
 __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
 {
-	struct trace_event_call *call = &event_osnoise;
 	struct ring_buffer_event *event;
 	struct timerlat_entry *entry;
 
@@ -591,8 +588,7 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf
 	entry->context			= sample->context;
 	entry->timer_latency		= sample->timer_latency;
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit_nostack(buffer, event);
+	trace_buffer_unlock_commit_nostack(buffer, event);
 }
 
 /*
@@ -654,7 +650,6 @@ static void timerlat_save_stack(int skip)
 static void
 __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size)
 {
-	struct trace_event_call *call = &event_osnoise;
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 
@@ -668,8 +663,7 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u
 	memcpy(&entry->caller, fstack->calls, size);
 	entry->size = fstack->nr_entries;
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit_nostack(buffer, event);
+	trace_buffer_unlock_commit_nostack(buffer, event);
 }
 
 /*
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ae2ace5e515a..d6c7f18daa15 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -378,7 +378,6 @@ tracing_sched_switch_trace(struct trace_array *tr,
 			   struct task_struct *next,
 			   unsigned int trace_ctx)
 {
-	struct trace_event_call *call = &event_context_switch;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
@@ -396,8 +395,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_state		= task_state_index(next);
 	entry->next_cpu	= task_cpu(next);
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+	trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 static void
@@ -406,7 +404,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 			   struct task_struct *curr,
 			   unsigned int trace_ctx)
 {
-	struct trace_event_call *call = &event_wakeup;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -424,8 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_state		= task_state_index(wakee);
 	entry->next_cpu			= task_cpu(wakee);
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+	trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 static void notrace
-- 
cgit v1.2.3


From 836265d31631e28000fc8917ce697fc687a58724 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 7 Oct 2024 21:35:25 +0200
Subject: wifi: remove iw_public_data from struct net_device

Given the previous patches, we no longer need the
struct iw_public_data etc., it's only used by the
old Intel drivers (and ps3_gelic creates it but
then doesn't use it). Remove all of that, including
the pointer in struct net_device.

Link: https://patch.msgid.link/20241007213525.8b2d52b60531.I6a27aaf30bded9a0977f07f47fba2bd31a3b3330@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 Documentation/networking/net_cachelines/net_device.rst |  1 -
 drivers/net/ethernet/toshiba/ps3_gelic_wireless.c      |  1 -
 drivers/net/ethernet/toshiba/ps3_gelic_wireless.h      |  1 -
 drivers/net/wireless/intel/ipw2x00/ipw2100.c           |  2 --
 drivers/net/wireless/intel/ipw2x00/ipw2100.h           |  2 --
 drivers/net/wireless/intel/ipw2x00/ipw2200.c           |  1 -
 drivers/net/wireless/intel/ipw2x00/ipw2200.h           |  2 --
 drivers/net/wireless/intel/ipw2x00/libipw_spy.c        |  7 ++++---
 include/linux/netdevice.h                              |  2 --
 include/net/iw_handler.h                               | 18 ------------------
 10 files changed, 4 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 22b07c814f4a..b7d1d4c7a1ab 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -51,7 +51,6 @@ struct_net_device_core_stats*       core_stats
 atomic_t                            carrier_up_count                                                
 atomic_t                            carrier_down_count                                              
 struct_iw_handler_def*              wireless_handlers                                               
-struct_iw_public_data*              wireless_data                                                   
 struct_ethtool_ops*                 ethtool_ops                                                     
 struct_l3mdev_ops*                  l3mdev_ops                                                      
 struct_ndisc_ops*                   ndisc_ops                                                       
diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c
index 44488c153ea2..4fbe4b7cd12a 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c
@@ -2566,7 +2566,6 @@ static void gelic_wl_setup_netdev_ops(struct net_device *netdev)
 
 	netdev->ethtool_ops = &gelic_wl_ethtool_ops;
 	netdev->netdev_ops = &gelic_wl_netdevice_ops;
-	netdev->wireless_data = &wl->wireless_data;
 	netdev->wireless_handlers = &gelic_wl_wext_handler_def;
 }
 
diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h
index 1f203d1ae8db..dbabf538e10a 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h
@@ -276,7 +276,6 @@ struct gelic_wl_info {
 	u8 active_bssid[ETH_ALEN]; /* associated bssid */
 	unsigned int essid_len;
 
-	struct iw_public_data wireless_data;
 	struct iw_statistics iwstat;
 };
 
diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
index a89e06c1b8ee..15bf35f2de2a 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
@@ -6022,8 +6022,6 @@ static struct net_device *ipw2100_alloc_device(struct pci_dev *pci_dev,
 	dev->netdev_ops = &ipw2100_netdev_ops;
 	dev->ethtool_ops = &ipw2100_ethtool_ops;
 	dev->wireless_handlers = &ipw2100_wx_handler_def;
-	priv->wireless_data.libipw = priv->ieee;
-	dev->wireless_data = &priv->wireless_data;
 	dev->watchdog_timeo = 3 * HZ;
 	dev->irq = 0;
 	dev->min_mtu = 68;
diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.h b/drivers/net/wireless/intel/ipw2x00/ipw2100.h
index b34085ade3aa..6f81f509b9cb 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.h
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.h
@@ -554,8 +554,6 @@ struct ipw2100_priv {
 	struct net_device *net_dev;
 	struct iw_statistics wstats;
 
-	struct iw_public_data wireless_data;
-
 	struct tasklet_struct irq_tasklet;
 
 	struct delayed_work reset_work;
diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c
index c0e9d2109e34..be1d971b3d32 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c
@@ -11645,7 +11645,6 @@ static int ipw_pci_probe(struct pci_dev *pdev,
 
 	net_dev->netdev_ops = &ipw_netdev_ops;
 	priv->ieee->spy_enabled = true;
-	net_dev->wireless_data = &priv->wireless_data;
 	net_dev->wireless_handlers = &ipw_wx_handler_def;
 	net_dev->ethtool_ops = &ipw_ethtool_ops;
 
diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.h b/drivers/net/wireless/intel/ipw2x00/ipw2200.h
index 46f119123b49..d3db54e6d37c 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2200.h
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.h
@@ -1274,8 +1274,6 @@ struct ipw_priv {
 
 	struct iw_statistics wstats;
 
-	struct iw_public_data wireless_data;
-
 	int user_requested_scan;
 	u8 direct_scan_ssid[IW_ESSID_MAX_SIZE];
 	u8 direct_scan_ssid_len;
diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_spy.c b/drivers/net/wireless/intel/ipw2x00/libipw_spy.c
index 979aeb10aeeb..ba876e92f7f6 100644
--- a/drivers/net/wireless/intel/ipw2x00/libipw_spy.c
+++ b/drivers/net/wireless/intel/ipw2x00/libipw_spy.c
@@ -18,9 +18,10 @@
 
 static struct iw_spy_data *get_spydata(struct net_device *dev)
 {
-	if (dev->wireless_data && dev->wireless_data->libipw &&
-	    dev->wireless_data->libipw->spy_enabled)
-		return &dev->wireless_data->libipw->spy_data;
+	struct libipw_device *ieee = netdev_priv(dev);
+
+	if (ieee->spy_enabled)
+		return &ieee->spy_data;
 	return NULL;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e87b5e488325..f0abed1e6e45 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1773,7 +1773,6 @@ enum netdev_reg_state {
  *	@wireless_handlers:	List of functions to handle Wireless Extensions,
  *				instead of ioctl,
  *				see <net/iw_handler.h> for details.
- *	@wireless_data:	Instance data managed by the core of wireless extensions
  *
  *	@netdev_ops:	Includes several pointers to callbacks,
  *			if one wants to override the ndo_*() functions
@@ -2150,7 +2149,6 @@ struct net_device {
 
 #ifdef CONFIG_WIRELESS_EXT
 	const struct iw_handler_def *wireless_handlers;
-	struct iw_public_data	*wireless_data;
 #endif
 	const struct ethtool_ops *ethtool_ops;
 #ifdef CONFIG_NET_L3_MASTER_DEV
diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h
index a7b502958d27..fc44fcca1d5c 100644
--- a/include/net/iw_handler.h
+++ b/include/net/iw_handler.h
@@ -404,24 +404,6 @@ struct iw_spy_data {
 	u_char			spy_thr_under[IW_MAX_SPY];
 };
 
-/* --------------------- DEVICE WIRELESS DATA --------------------- */
-/*
- * This is all the wireless data specific to a device instance that
- * is managed by the core of Wireless Extensions or the 802.11 layer.
- * We only keep pointer to those structures, so that a driver is free
- * to share them between instances.
- * This structure should be initialised before registering the device.
- * Access to this data follow the same rules as any other struct net_device
- * data (i.e. valid as long as struct net_device exist, same locking rules).
- */
-/* Forward declaration */
-struct libipw_device;
-/* The struct */
-struct iw_public_data {
-	/* Legacy structure managed by the ipw2x00-specific IEEE 802.11 layer */
-	struct libipw_device *		libipw;
-};
-
 /**************************** PROTOTYPES ****************************/
 /*
  * Functions part of the Wireless Extensions (defined in net/wireless/wext-core.c).
-- 
cgit v1.2.3


From 9e1a98aac11b757e2634304bbe6bed5f5a58e0ee Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 7 Oct 2024 21:50:25 +0200
Subject: wifi: wext: merge adjacent CONFIG_COMPAT ifdef blocks

Simplify this, and also add a comment at the #endif.

Link: https://patch.msgid.link/20241007215025.5ecdad1e02ed.I54efa895efc496e06ba41e1c39c9df9e23b0171f@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/wireless.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wireless.h b/include/linux/wireless.h
index e6e34d74dda0..03e5d3fe226d 100644
--- a/include/linux/wireless.h
+++ b/include/linux/wireless.h
@@ -21,8 +21,7 @@ struct compat_iw_point {
 	__u16 length;
 	__u16 flags;
 };
-#endif
-#ifdef CONFIG_COMPAT
+
 struct __compat_iw_event {
 	__u16		len;			/* Real length of this stuff */
 	__u16		cmd;			/* Wireless IOCTL */
@@ -49,5 +48,5 @@ struct __compat_iw_event {
 #define IW_EV_COMPAT_POINT_LEN	\
 	(IW_EV_COMPAT_LCP_LEN + sizeof(struct compat_iw_point) - \
 	 IW_EV_COMPAT_POINT_OFF)
-#endif
+#endif /* CONFIG_COMPAT */
 #endif	/* _LINUX_WIRELESS_H */
-- 
cgit v1.2.3


From 4a8840af5f53f2902eba91130fae650879f18e7a Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Tue, 8 Oct 2024 12:17:19 -0700
Subject: tracepoints: Use new static branch API

The old static key API is deprecated.  Switch to the new one.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Link: https://lore.kernel.org/7a08dae3c5eddb14b13864923c1b58ac1f4af83c.1728414936.git.jpoimboe@kernel.org
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint-defs.h  | 4 ++--
 include/linux/tracepoint.h       | 8 ++++----
 kernel/trace/trace_events_hist.c | 2 +-
 kernel/trace/trace_events_user.c | 4 ++--
 kernel/tracepoint.c              | 4 ++--
 5 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index 4dc4955f0fbf..60a6e8314d4c 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -31,7 +31,7 @@ struct tracepoint_func {
 
 struct tracepoint {
 	const char *name;		/* Tracepoint name */
-	struct static_key key;
+	struct static_key_false key;
 	struct static_call_key *static_call_key;
 	void *static_call_tramp;
 	void *iterator;
@@ -83,7 +83,7 @@ struct bpf_raw_event_map {
 
 #ifdef CONFIG_TRACEPOINTS
 # define tracepoint_enabled(tp) \
-	static_key_false(&(__tracepoint_##tp).key)
+	static_branch_unlikely(&(__tracepoint_##tp).key)
 #else
 # define tracepoint_enabled(tracepoint) false
 #endif
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 93a9f3070b48..2a29334bbc02 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -248,7 +248,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 #define __DECLARE_TRACE_RCU(name, proto, args, cond)			\
 	static inline void trace_##name##_rcuidle(proto)		\
 	{								\
-		if (static_key_false(&__tracepoint_##name.key))		\
+		if (static_branch_unlikely(&__tracepoint_##name.key))	\
 			__DO_TRACE(name,				\
 				TP_ARGS(args),				\
 				TP_CONDITION(cond), 1);			\
@@ -274,7 +274,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	extern struct tracepoint __tracepoint_##name;			\
 	static inline void trace_##name(proto)				\
 	{								\
-		if (static_key_false(&__tracepoint_##name.key))		\
+		if (static_branch_unlikely(&__tracepoint_##name.key))	\
 			__DO_TRACE(name,				\
 				TP_ARGS(args),				\
 				TP_CONDITION(cond), 0);			\
@@ -311,7 +311,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	static inline bool						\
 	trace_##name##_enabled(void)					\
 	{								\
-		return static_key_false(&__tracepoint_##name.key);	\
+		return static_branch_unlikely(&__tracepoint_##name.key);\
 	}
 
 /*
@@ -328,7 +328,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	struct tracepoint __tracepoint_##_name	__used			\
 	__section("__tracepoints") = {					\
 		.name = __tpstrtab_##_name,				\
-		.key = STATIC_KEY_INIT_FALSE,				\
+		.key = STATIC_KEY_FALSE_INIT,				\
 		.static_call_key = &STATIC_CALL_KEY(tp_func_##_name),	\
 		.static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \
 		.iterator = &__traceiter_##_name,			\
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5f9119eb7c67..cc2924ad32a3 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -822,7 +822,7 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
 {
 	struct tracepoint *tp = event->tp;
 
-	if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
+	if (unlikely(static_key_enabled(&tp->key))) {
 		struct tracepoint_func *probe_func_ptr;
 		synth_probe_func_t probe_func;
 		void *__data;
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 42b0d998d103..17bcad8f79de 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1676,7 +1676,7 @@ static void update_enable_bit_for(struct user_event *user)
 	struct tracepoint *tp = &user->tracepoint;
 	char status = 0;
 
-	if (atomic_read(&tp->key.enabled) > 0) {
+	if (static_key_enabled(&tp->key)) {
 		struct tracepoint_func *probe_func_ptr;
 		user_event_func_t probe_func;
 
@@ -2280,7 +2280,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 	 * It's possible key.enabled disables after this check, however
 	 * we don't mind if a few events are included in this condition.
 	 */
-	if (likely(atomic_read(&tp->key.enabled) > 0)) {
+	if (likely(static_key_enabled(&tp->key))) {
 		struct tracepoint_func *probe_func_ptr;
 		user_event_func_t probe_func;
 		struct iov_iter copy;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 8879da16ef4d..1e3de77ea6b3 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -358,7 +358,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
 		tracepoint_update_call(tp, tp_funcs);
 		/* Both iterator and static call handle NULL tp->funcs */
 		rcu_assign_pointer(tp->funcs, tp_funcs);
-		static_key_enable(&tp->key);
+		static_branch_enable(&tp->key);
 		break;
 	case TP_FUNC_2:		/* 1->2 */
 		/* Set iterator static call */
@@ -414,7 +414,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
 		if (tp->unregfunc && static_key_enabled(&tp->key))
 			tp->unregfunc();
 
-		static_key_disable(&tp->key);
+		static_branch_disable(&tp->key);
 		/* Set iterator static call */
 		tracepoint_update_call(tp, tp_funcs);
 		/* Both iterator and static call handle NULL tp->funcs */
-- 
cgit v1.2.3


From 48bcda6848232667f13b4e97588de488c83c37d4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 3 Oct 2024 18:16:29 -0400
Subject: tracing: Remove definition of trace_*_rcuidle()

The trace_*_rcuidle() variant of a tracepoint was to handle places where a
tracepoint was located but RCU was not "watching". All those locations
have been removed, and RCU should be watching where all tracepoints are
located. We can now remove the trace_*_rcuidle() variant.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241003181629.36209057@gandalf.local.home
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h        | 50 ++-------------------------------------
 include/trace/events/preemptirq.h |  8 -------
 kernel/trace/trace_preemptirq.c   | 26 +++++---------------
 scripts/tags.sh                   |  2 --
 4 files changed, 8 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 2a29334bbc02..7e4af7b3633c 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -196,67 +196,25 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 #define __DO_TRACE_CALL(name, args)	__traceiter_##name(NULL, args)
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
-/*
- * ARCH_WANTS_NO_INSTR archs are expected to have sanitized entry and idle
- * code that disallow any/all tracing/instrumentation when RCU isn't watching.
- */
-#ifdef CONFIG_ARCH_WANTS_NO_INSTR
-#define RCUIDLE_COND(rcuidle)	(rcuidle)
-#else
-/* srcu can't be used from NMI */
-#define RCUIDLE_COND(rcuidle)	(rcuidle && in_nmi())
-#endif
-
 /*
  * it_func[0] is never NULL because there is at least one element in the array
  * when the array itself is non NULL.
  */
-#define __DO_TRACE(name, args, cond, rcuidle)				\
+#define __DO_TRACE(name, args, cond)					\
 	do {								\
 		int __maybe_unused __idx = 0;				\
 									\
 		if (!(cond))						\
 			return;						\
 									\
-		if (WARN_ONCE(RCUIDLE_COND(rcuidle),			\
-			      "Bad RCU usage for tracepoint"))		\
-			return;						\
-									\
 		/* keep srcu and sched-rcu usage consistent */		\
 		preempt_disable_notrace();				\
 									\
-		/*							\
-		 * For rcuidle callers, use srcu since sched-rcu	\
-		 * doesn't work from the idle path.			\
-		 */							\
-		if (rcuidle) {						\
-			__idx = srcu_read_lock_notrace(&tracepoint_srcu);\
-			ct_irq_enter_irqson();				\
-		}							\
-									\
 		__DO_TRACE_CALL(name, TP_ARGS(args));			\
 									\
-		if (rcuidle) {						\
-			ct_irq_exit_irqson();				\
-			srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\
-		}							\
-									\
 		preempt_enable_notrace();				\
 	} while (0)
 
-#ifndef MODULE
-#define __DECLARE_TRACE_RCU(name, proto, args, cond)			\
-	static inline void trace_##name##_rcuidle(proto)		\
-	{								\
-		if (static_branch_unlikely(&__tracepoint_##name.key))	\
-			__DO_TRACE(name,				\
-				TP_ARGS(args),				\
-				TP_CONDITION(cond), 1);			\
-	}
-#else
-#define __DECLARE_TRACE_RCU(name, proto, args, cond)
-#endif
-
 /*
  * Make sure the alignment of the structure in the __tracepoints section will
  * not add unwanted padding between the beginning of the section and the
@@ -277,14 +235,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		if (static_branch_unlikely(&__tracepoint_##name.key))	\
 			__DO_TRACE(name,				\
 				TP_ARGS(args),				\
-				TP_CONDITION(cond), 0);			\
+				TP_CONDITION(cond));			\
 		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
 			WARN_ONCE(!rcu_is_watching(),			\
 				  "RCU not watching for tracepoint");	\
 		}							\
 	}								\
-	__DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args),		\
-			    PARAMS(cond))				\
 	static inline int						\
 	register_trace_##name(void (*probe)(data_proto), void *data)	\
 	{								\
@@ -375,8 +331,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 #define __DECLARE_TRACE(name, proto, args, cond, data_proto)		\
 	static inline void trace_##name(proto)				\
 	{ }								\
-	static inline void trace_##name##_rcuidle(proto)		\
-	{ }								\
 	static inline int						\
 	register_trace_##name(void (*probe)(data_proto),		\
 			      void *data)				\
diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h
index 3f249e150c0c..f99562d2b496 100644
--- a/include/trace/events/preemptirq.h
+++ b/include/trace/events/preemptirq.h
@@ -43,8 +43,6 @@ DEFINE_EVENT(preemptirq_template, irq_enable,
 #else
 #define trace_irq_enable(...)
 #define trace_irq_disable(...)
-#define trace_irq_enable_rcuidle(...)
-#define trace_irq_disable_rcuidle(...)
 #endif
 
 #ifdef CONFIG_TRACE_PREEMPT_TOGGLE
@@ -58,8 +56,6 @@ DEFINE_EVENT(preemptirq_template, preempt_enable,
 #else
 #define trace_preempt_enable(...)
 #define trace_preempt_disable(...)
-#define trace_preempt_enable_rcuidle(...)
-#define trace_preempt_disable_rcuidle(...)
 #endif
 
 #endif /* _TRACE_PREEMPTIRQ_H */
@@ -69,10 +65,6 @@ DEFINE_EVENT(preemptirq_template, preempt_enable,
 #else /* !CONFIG_PREEMPTIRQ_TRACEPOINTS */
 #define trace_irq_enable(...)
 #define trace_irq_disable(...)
-#define trace_irq_enable_rcuidle(...)
-#define trace_irq_disable_rcuidle(...)
 #define trace_preempt_enable(...)
 #define trace_preempt_disable(...)
-#define trace_preempt_enable_rcuidle(...)
-#define trace_preempt_disable_rcuidle(...)
 #endif
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index e37446f7916e..5c03633316a6 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -15,20 +15,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/preemptirq.h>
 
-/*
- * Use regular trace points on architectures that implement noinstr
- * tooling: these calls will only happen with RCU enabled, which can
- * use a regular tracepoint.
- *
- * On older architectures, use the rcuidle tracing methods (which
- * aren't NMI-safe - so exclude NMI contexts):
- */
-#ifdef CONFIG_ARCH_WANTS_NO_INSTR
-#define trace(point)	trace_##point
-#else
-#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
-#endif
-
 #ifdef CONFIG_TRACE_IRQFLAGS
 /* Per-cpu variable to prevent redundant calls when IRQs already off */
 static DEFINE_PER_CPU(int, tracing_irq_cpu);
@@ -42,7 +28,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
 void trace_hardirqs_on_prepare(void)
 {
 	if (this_cpu_read(tracing_irq_cpu)) {
-		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
+		trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
 		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
@@ -53,7 +39,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
 void trace_hardirqs_on(void)
 {
 	if (this_cpu_read(tracing_irq_cpu)) {
-		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
+		trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
 		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
@@ -75,7 +61,7 @@ void trace_hardirqs_off_finish(void)
 	if (!this_cpu_read(tracing_irq_cpu)) {
 		this_cpu_write(tracing_irq_cpu, 1);
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
-		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
+		trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
 	}
 
 }
@@ -89,7 +75,7 @@ void trace_hardirqs_off(void)
 	if (!this_cpu_read(tracing_irq_cpu)) {
 		this_cpu_write(tracing_irq_cpu, 1);
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
-		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
+		trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
 	}
 }
 EXPORT_SYMBOL(trace_hardirqs_off);
@@ -100,13 +86,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off);
 
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
-	trace(preempt_enable)(a0, a1);
+	trace_preempt_enable(a0, a1);
 	tracer_preempt_on(a0, a1);
 }
 
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
-	trace(preempt_disable)(a0, a1);
+	trace_preempt_disable(a0, a1);
 	tracer_preempt_off(a0, a1);
 }
 #endif
diff --git a/scripts/tags.sh b/scripts/tags.sh
index 191e0461d6d5..0d01c1cafb70 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -152,9 +152,7 @@ regex_c=(
 	'/^BPF_CALL_[0-9]([[:space:]]*\([[:alnum:]_]*\).*/\1/'
 	'/^COMPAT_SYSCALL_DEFINE[0-9]([[:space:]]*\([[:alnum:]_]*\).*/compat_sys_\1/'
 	'/^TRACE_EVENT([[:space:]]*\([[:alnum:]_]*\).*/trace_\1/'
-	'/^TRACE_EVENT([[:space:]]*\([[:alnum:]_]*\).*/trace_\1_rcuidle/'
 	'/^DEFINE_EVENT([^,)]*,[[:space:]]*\([[:alnum:]_]*\).*/trace_\1/'
-	'/^DEFINE_EVENT([^,)]*,[[:space:]]*\([[:alnum:]_]*\).*/trace_\1_rcuidle/'
 	'/^DEFINE_INSN_CACHE_OPS([[:space:]]*\([[:alnum:]_]*\).*/get_\1_slot/'
 	'/^DEFINE_INSN_CACHE_OPS([[:space:]]*\([[:alnum:]_]*\).*/free_\1_slot/'
 	'/^PAGEFLAG([[:space:]]*\([[:alnum:]_]*\).*/Page\1/'
-- 
cgit v1.2.3


From e53244e2c8931f9e80c1841293aea86ef8ad32a3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 3 Oct 2024 18:42:20 -0400
Subject: tracepoint: Remove SRCU protection

With the removal of the trace_*_rcuidle() tracepoints, there is no reason
to protect tracepoints with SRCU. The reason the SRCU protection was
added, was because it can protect tracepoints when RCU is not "watching".
Now that tracepoints are only used when RCU is watching, remove the SRCU
protection. It just made things more complex and confusing anyway.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241003184220.0dc21d35@gandalf.local.home
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h |  4 ----
 kernel/tracepoint.c        | 51 +---------------------------------------------
 2 files changed, 1 insertion(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 7e4af7b3633c..3d33b9872cec 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -32,8 +32,6 @@ struct trace_eval_map {
 
 #define TRACEPOINT_DEFAULT_PRIO	10
 
-extern struct srcu_struct tracepoint_srcu;
-
 extern int
 tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data);
 extern int
@@ -109,7 +107,6 @@ void for_each_tracepoint_in_module(struct module *mod,
 #ifdef CONFIG_TRACEPOINTS
 static inline void tracepoint_synchronize_unregister(void)
 {
-	synchronize_srcu(&tracepoint_srcu);
 	synchronize_rcu();
 }
 #else
@@ -207,7 +204,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		if (!(cond))						\
 			return;						\
 									\
-		/* keep srcu and sched-rcu usage consistent */		\
 		preempt_disable_notrace();				\
 									\
 		__DO_TRACE_CALL(name, TP_ARGS(args));			\
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1e3de77ea6b3..6474e2cf22c9 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,9 +25,6 @@ enum tp_func_state {
 extern tracepoint_ptr_t __start___tracepoints_ptrs[];
 extern tracepoint_ptr_t __stop___tracepoints_ptrs[];
 
-DEFINE_SRCU(tracepoint_srcu);
-EXPORT_SYMBOL_GPL(tracepoint_srcu);
-
 enum tp_transition_sync {
 	TP_TRANSITION_SYNC_1_0_1,
 	TP_TRANSITION_SYNC_N_2_1,
@@ -37,7 +34,6 @@ enum tp_transition_sync {
 
 struct tp_transition_snapshot {
 	unsigned long rcu;
-	unsigned long srcu;
 	bool ongoing;
 };
 
@@ -50,7 +46,6 @@ static void tp_rcu_get_state(enum tp_transition_sync sync)
 
 	/* Keep the latest get_state snapshot. */
 	snapshot->rcu = get_state_synchronize_rcu();
-	snapshot->srcu = start_poll_synchronize_srcu(&tracepoint_srcu);
 	snapshot->ongoing = true;
 }
 
@@ -61,8 +56,6 @@ static void tp_rcu_cond_sync(enum tp_transition_sync sync)
 	if (!snapshot->ongoing)
 		return;
 	cond_synchronize_rcu(snapshot->rcu);
-	if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu))
-		synchronize_srcu(&tracepoint_srcu);
 	snapshot->ongoing = false;
 }
 
@@ -85,9 +78,6 @@ static LIST_HEAD(tracepoint_module_list);
  */
 static DEFINE_MUTEX(tracepoints_mutex);
 
-static struct rcu_head *early_probes;
-static bool ok_to_free_tracepoints;
-
 /*
  * Note about RCU :
  * It is used to delay the free of multiple probes array until a quiescent
@@ -111,56 +101,17 @@ static inline void *allocate_probes(int count)
 	return p == NULL ? NULL : p->probes;
 }
 
-static void srcu_free_old_probes(struct rcu_head *head)
-{
-	kfree(container_of(head, struct tp_probes, rcu));
-}
-
 static void rcu_free_old_probes(struct rcu_head *head)
 {
-	call_srcu(&tracepoint_srcu, head, srcu_free_old_probes);
-}
-
-static __init int release_early_probes(void)
-{
-	struct rcu_head *tmp;
-
-	ok_to_free_tracepoints = true;
-
-	while (early_probes) {
-		tmp = early_probes;
-		early_probes = tmp->next;
-		call_rcu(tmp, rcu_free_old_probes);
-	}
-
-	return 0;
+	kfree(container_of(head, struct tp_probes, rcu));
 }
 
-/* SRCU is initialized at core_initcall */
-postcore_initcall(release_early_probes);
-
 static inline void release_probes(struct tracepoint_func *old)
 {
 	if (old) {
 		struct tp_probes *tp_probes = container_of(old,
 			struct tp_probes, probes[0]);
 
-		/*
-		 * We can't free probes if SRCU is not initialized yet.
-		 * Postpone the freeing till after SRCU is initialized.
-		 */
-		if (unlikely(!ok_to_free_tracepoints)) {
-			tp_probes->rcu.next = early_probes;
-			early_probes = &tp_probes->rcu;
-			return;
-		}
-
-		/*
-		 * Tracepoint probes are protected by both sched RCU and SRCU,
-		 * by calling the SRCU callback in the sched RCU callback we
-		 * cover both cases. So let us chain the SRCU and sched RCU
-		 * callbacks to wait for both grace periods.
-		 */
 		call_rcu(&tp_probes->rcu, rcu_free_old_probes);
 	}
 }
-- 
cgit v1.2.3


From f7a870d0be12e3ae38cbe899d858994c5b51f22b Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 9 Oct 2024 17:15:35 +0900
Subject: ata: libata: Remove unused macro definitions

ATA_TMOUT_BOOT and ATA_TMOUT_BOOT_QUICK are not used anywhere. Delete
these definitions.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20241009081535.376994-1-dlemoal@kernel.org
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 include/linux/libata.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 9b4a6ff03235..c1a85d46eba6 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -270,9 +270,7 @@ enum {
 
 	/* bits 24:31 of host->flags are reserved for LLD specific flags */
 
-	/* various lengths of time */
-	ATA_TMOUT_BOOT		= 30000,	/* heuristic */
-	ATA_TMOUT_BOOT_QUICK	=  7000,	/* heuristic */
+	/* Various lengths of time */
 	ATA_TMOUT_INTERNAL_QUICK = 5000,
 	ATA_TMOUT_MAX_PARK	= 30000,
 
-- 
cgit v1.2.3


From d12586e1072d92070783c854819a0ca8c82c5439 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sat, 5 Oct 2024 23:38:25 +0200
Subject: platform/x86: wmi: Implement proper shutdown handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When performing a system shutdown under Windows, all WMI clients are
terminated. This means that the ACPI BIOS might expect all WMI devices
to be disabled when shutting down.

Emulate this behaviour by disabling all active WMI devices during
shutdown. Also introduce a new WMI driver callback to allow WMI drivers
to perform any device-specific actions before disabling the WMI device.

Tested on a Dell Inspiron 3505.

Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20241005213825.701887-2-W_Armin@gmx.de
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 Documentation/wmi/driver-development-guide.rst |  7 ++++++-
 drivers/platform/x86/wmi.c                     | 27 ++++++++++++++++++++++++++
 include/linux/wmi.h                            |  2 ++
 3 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/wmi/driver-development-guide.rst b/Documentation/wmi/driver-development-guide.rst
index 429137b2f632..676873c98680 100644
--- a/Documentation/wmi/driver-development-guide.rst
+++ b/Documentation/wmi/driver-development-guide.rst
@@ -64,6 +64,7 @@ to matching WMI devices using a struct wmi_device_id table:
         .id_table = foo_id_table,
         .probe = foo_probe,
         .remove = foo_remove,         /* optional, devres is preferred */
+        .shutdown = foo_shutdown,     /* optional, called during shutdown */
         .notify = foo_notify,         /* optional, for event handling */
         .no_notify_data = true,       /* optional, enables events containing no additional data */
         .no_singleton = true,         /* required for new WMI drivers */
@@ -79,6 +80,10 @@ to unregister interfaces to other kernel subsystems and release resources, devre
 This simplifies error handling during probe and often allows to omit this callback entirely, see
 Documentation/driver-api/driver-model/devres.rst for details.
 
+The shutdown() callback is called during shutdown, reboot or kexec. Its sole purpose is to disable
+the WMI device and put it in a well-known state for the WMI driver to pick up later after reboot
+or kexec. Most WMI drivers need no special shutdown handling and can thus omit this callback.
+
 Please note that new WMI drivers are required to be able to be instantiated multiple times,
 and are forbidden from using any deprecated GUID-based WMI functions. This means that the
 WMI driver should be prepared for the scenario that multiple matching WMI devices are present
@@ -123,7 +128,7 @@ ACPI object is being done by the WMI subsystem, not the driver.
 
 The WMI driver core will take care that the notify() callback will only be called after
 the probe() callback has been called, and that no events are being received by the driver
-right before and after calling its remove() callback.
+right before and after calling its remove() or shutdown() callback.
 
 However WMI driver developers should be aware that multiple WMI events can be received concurrently,
 so any locking (if necessary) needs to be provided by the WMI driver itself.
diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index 3cbe180c3fc0..a01223c23d10 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -884,6 +884,32 @@ static void wmi_dev_remove(struct device *dev)
 		dev_warn(dev, "failed to disable device\n");
 }
 
+static void wmi_dev_shutdown(struct device *dev)
+{
+	struct wmi_driver *wdriver;
+	struct wmi_block *wblock;
+
+	if (dev->driver) {
+		wdriver = drv_to_wdrv(dev->driver);
+		wblock = dev_to_wblock(dev);
+
+		/*
+		 * Some machines return bogus WMI event data when disabling
+		 * the WMI event. Because of this we must prevent the associated
+		 * WMI driver from receiving new WMI events before disabling it.
+		 */
+		down_write(&wblock->notify_lock);
+		wblock->driver_ready = false;
+		up_write(&wblock->notify_lock);
+
+		if (wdriver->shutdown)
+			wdriver->shutdown(to_wmi_device(dev));
+
+		if (ACPI_FAILURE(wmi_method_enable(wblock, false)))
+			dev_warn(dev, "Failed to disable device\n");
+	}
+}
+
 static struct class wmi_bus_class = {
 	.name = "wmi_bus",
 };
@@ -895,6 +921,7 @@ static const struct bus_type wmi_bus_type = {
 	.uevent = wmi_dev_uevent,
 	.probe = wmi_dev_probe,
 	.remove = wmi_dev_remove,
+	.shutdown = wmi_dev_shutdown,
 };
 
 static const struct device_type wmi_type_event = {
diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 3275470b5531..120019677fc6 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -56,6 +56,7 @@ u8 wmidev_instance_count(struct wmi_device *wdev);
  * @no_singleton: Driver can be instantiated multiple times
  * @probe: Callback for device binding
  * @remove: Callback for device unbinding
+ * @shutdown: Callback for device shutdown
  * @notify: Callback for receiving WMI events
  *
  * This represents WMI drivers which handle WMI devices.
@@ -68,6 +69,7 @@ struct wmi_driver {
 
 	int (*probe)(struct wmi_device *wdev, const void *context);
 	void (*remove)(struct wmi_device *wdev);
+	void (*shutdown)(struct wmi_device *wdev);
 	void (*notify)(struct wmi_device *device, union acpi_object *data);
 };
 
-- 
cgit v1.2.3


From f042365a26b07006064c2cfa2293e16cad243b0d Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 4 Oct 2024 11:20:56 +0100
Subject: net: pcs: xpcs: provide a helper to get the phylink pcs given xpcs

Provide a helper to provide the pointer to the phylink_pcs struct
given a valid xpcs pointer. This will be necessary when we make
struct dw_xpcs private to pcs-xpcs.c

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 2 +-
 drivers/net/pcs/pcs-xpcs.c                        | 6 ++++++
 include/linux/pcs/pcs-xpcs.h                      | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index 83ad7c7935e3..48acba5eb178 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -451,7 +451,7 @@ static struct phylink_pcs *intel_mgbe_select_pcs(struct stmmac_priv *priv,
 	 * should always be an XPCS. The original code would always
 	 * return this if present.
 	 */
-	return &priv->hw->xpcs->pcs;
+	return xpcs_to_phylink_pcs(priv->hw->xpcs);
 }
 
 static int intel_mgbe_common_data(struct pci_dev *pdev,
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 8bde87ab971f..a7f6d56183a7 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -132,6 +132,12 @@ xpcs_find_compat(struct dw_xpcs *xpcs, phy_interface_t interface)
 	return NULL;
 }
 
+struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs)
+{
+	return &xpcs->pcs;
+}
+EXPORT_SYMBOL_GPL(xpcs_to_phylink_pcs);
+
 int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface)
 {
 	const struct dw_xpcs_compat *compat;
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index abda475111d1..868515f3cc88 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -64,6 +64,7 @@ struct dw_xpcs {
 	bool need_reset;
 };
 
+struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs);
 int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface);
 void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces);
 int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns,
-- 
cgit v1.2.3


From accd5f5cd2e1b0ed6805a7c24765648d213561e5 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 4 Oct 2024 11:21:01 +0100
Subject: net: pcs: xpcs: move definition of struct dw_xpcs to private header

There should be no reason for anything outside the XPCS code to know
the contents of struct dw_xpcs - this is a private structure to XPCS.
Move the definition to the private pcs-xpcs.h header, leaving a
declaration in the global pcs/pcs-xpcs.h

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pcs/pcs-xpcs.h   | 18 ++++++++++++++++++
 include/linux/pcs/pcs-xpcs.h | 18 +-----------------
 2 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/pcs/pcs-xpcs.h b/drivers/net/pcs/pcs-xpcs.h
index fa05adfae220..1b546eae8280 100644
--- a/drivers/net/pcs/pcs-xpcs.h
+++ b/drivers/net/pcs/pcs-xpcs.h
@@ -123,6 +123,24 @@
 #define DW_XPCS_INFO_DECLARE(_name, _pcs, _pma)				\
 	static const struct dw_xpcs_info _name = { .pcs = _pcs, .pma = _pma }
 
+struct dw_xpcs_desc;
+
+enum dw_xpcs_clock {
+	DW_XPCS_CORE_CLK,
+	DW_XPCS_PAD_CLK,
+	DW_XPCS_NUM_CLKS,
+};
+
+struct dw_xpcs {
+	struct dw_xpcs_info info;
+	const struct dw_xpcs_desc *desc;
+	struct mdio_device *mdiodev;
+	struct clk_bulk_data clks[DW_XPCS_NUM_CLKS];
+	struct phylink_pcs pcs;
+	phy_interface_t interface;
+	bool need_reset;
+};
+
 int xpcs_read(struct dw_xpcs *xpcs, int dev, u32 reg);
 int xpcs_write(struct dw_xpcs *xpcs, int dev, u32 reg, u16 val);
 int xpcs_read_vpcs(struct dw_xpcs *xpcs, int reg);
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 868515f3cc88..b5b5d17998b8 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -21,8 +21,6 @@
 #define DW_AN_C37_1000BASEX		4
 #define DW_10GBASER			5
 
-struct dw_xpcs_desc;
-
 enum dw_xpcs_pcs_id {
 	DW_XPCS_ID_NATIVE = 0,
 	NXP_SJA1105_XPCS_ID = 0x00000010,
@@ -48,21 +46,7 @@ struct dw_xpcs_info {
 	u32 pma;
 };
 
-enum dw_xpcs_clock {
-	DW_XPCS_CORE_CLK,
-	DW_XPCS_PAD_CLK,
-	DW_XPCS_NUM_CLKS,
-};
-
-struct dw_xpcs {
-	struct dw_xpcs_info info;
-	const struct dw_xpcs_desc *desc;
-	struct mdio_device *mdiodev;
-	struct clk_bulk_data clks[DW_XPCS_NUM_CLKS];
-	struct phylink_pcs pcs;
-	phy_interface_t interface;
-	bool need_reset;
-};
+struct dw_xpcs;
 
 struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs);
 int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface);
-- 
cgit v1.2.3


From 57e3707eb5e3d9a45eef9151f0378313b1d39a17 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Mon, 5 Aug 2024 11:39:36 +0200
Subject: bpf: Constify ctl_table argument of filter function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The sysctl core is moving to allow "struct ctl_table" in read-only memory.
As a preparation for that all functions handling "struct ctl_table" need
to be able to work with "const struct ctl_table".
As __cgroup_bpf_run_filter_sysctl() does not modify its table, it can be
adapted trivially.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/bpf-cgroup.h | 2 +-
 kernel/bpf/cgroup.c        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index ce91d9b2acb9..4dd17128b204 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -138,7 +138,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 				      short access, enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
-				   struct ctl_table *table, int write,
+				   const struct ctl_table *table, int write,
 				   char **buf, size_t *pcount, loff_t *ppos,
 				   enum cgroup_bpf_attach_type atype);
 
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index e7113d700b87..9c951b4212a2 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1691,7 +1691,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
  * returned value != 1 during execution. In all other cases 0 is returned.
  */
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
-				   struct ctl_table *table, int write,
+				   const struct ctl_table *table, int write,
 				   char **buf, size_t *pcount, loff_t *ppos,
 				   enum cgroup_bpf_attach_type atype)
 {
-- 
cgit v1.2.3


From 29e1095bb1ad149b5c417719338d9c81d58bf12b Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Mon, 5 Aug 2024 11:39:37 +0200
Subject: sysctl: move internal interfaces to const struct ctl_table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As a preparation to make all the core sysctl code work with const struct
ctl_table switch over the internal function to use the const variant.

Some pointers to "struct ctl_table" need to stay non-const as they are
newly allocated and modified before registration.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 fs/proc/internal.h     |  2 +-
 fs/proc/proc_sysctl.c  | 81 +++++++++++++++++++++++++-------------------------
 include/linux/sysctl.h |  2 +-
 3 files changed, 43 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 87e4d6282025..1695509370b8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -102,7 +102,7 @@ struct proc_inode {
 	union proc_op op;
 	struct proc_dir_entry *pde;
 	struct ctl_table_header *sysctl;
-	struct ctl_table *sysctl_entry;
+	const struct ctl_table *sysctl_entry;
 	struct hlist_node sibling_inodes;
 	const struct proc_ns_operations *ns_ops;
 	struct inode vfs_inode;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d11ebc055ce0..713abccbfcf9 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -88,7 +88,7 @@ static DEFINE_SPINLOCK(sysctl_lock);
 
 static void drop_sysctl_table(struct ctl_table_header *header);
 static int sysctl_follow_link(struct ctl_table_header **phead,
-	struct ctl_table **pentry);
+	const struct ctl_table **pentry);
 static int insert_links(struct ctl_table_header *head);
 static void put_links(struct ctl_table_header *header);
 
@@ -110,11 +110,11 @@ static int namecmp(const char *name1, int len1, const char *name2, int len2)
 }
 
 /* Called under sysctl_lock */
-static struct ctl_table *find_entry(struct ctl_table_header **phead,
+static const struct ctl_table *find_entry(struct ctl_table_header **phead,
 	struct ctl_dir *dir, const char *name, int namelen)
 {
 	struct ctl_table_header *head;
-	struct ctl_table *entry;
+	const struct ctl_table *entry;
 	struct rb_node *node = dir->root.rb_node;
 
 	while (node)
@@ -141,7 +141,7 @@ static struct ctl_table *find_entry(struct ctl_table_header **phead,
 	return NULL;
 }
 
-static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
+static int insert_entry(struct ctl_table_header *head, const struct ctl_table *entry)
 {
 	struct rb_node *node = &head->node[entry - head->ctl_table].node;
 	struct rb_node **p = &head->parent->root.rb_node;
@@ -151,7 +151,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
 
 	while (*p) {
 		struct ctl_table_header *parent_head;
-		struct ctl_table *parent_entry;
+		const struct ctl_table *parent_entry;
 		struct ctl_node *parent_node;
 		const char *parent_name;
 		int cmp;
@@ -180,7 +180,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
 	return 0;
 }
 
-static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
+static void erase_entry(struct ctl_table_header *head, const struct ctl_table *entry)
 {
 	struct rb_node *node = &head->node[entry - head->ctl_table].node;
 
@@ -189,7 +189,7 @@ static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
 
 static void init_header(struct ctl_table_header *head,
 	struct ctl_table_root *root, struct ctl_table_set *set,
-	struct ctl_node *node, struct ctl_table *table, size_t table_size)
+	struct ctl_node *node, const struct ctl_table *table, size_t table_size)
 {
 	head->ctl_table = table;
 	head->ctl_table_size = table_size;
@@ -204,7 +204,7 @@ static void init_header(struct ctl_table_header *head,
 	head->node = node;
 	INIT_HLIST_HEAD(&head->inodes);
 	if (node) {
-		struct ctl_table *entry;
+		const struct ctl_table *entry;
 
 		list_for_each_table_entry(entry, head) {
 			node->header = head;
@@ -217,7 +217,7 @@ static void init_header(struct ctl_table_header *head,
 
 static void erase_header(struct ctl_table_header *head)
 {
-	struct ctl_table *entry;
+	const struct ctl_table *entry;
 
 	list_for_each_table_entry(entry, head)
 		erase_entry(head, entry);
@@ -225,7 +225,7 @@ static void erase_header(struct ctl_table_header *head)
 
 static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
 {
-	struct ctl_table *entry;
+	const struct ctl_table *entry;
 	struct ctl_table_header *dir_h = &dir->header;
 	int err;
 
@@ -344,12 +344,12 @@ lookup_header_set(struct ctl_table_root *root)
 	return set;
 }
 
-static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
-				      struct ctl_dir *dir,
-				      const char *name, int namelen)
+static const struct ctl_table *lookup_entry(struct ctl_table_header **phead,
+					    struct ctl_dir *dir,
+					    const char *name, int namelen)
 {
 	struct ctl_table_header *head;
-	struct ctl_table *entry;
+	const struct ctl_table *entry;
 
 	spin_lock(&sysctl_lock);
 	entry = find_entry(&head, dir, name, namelen);
@@ -374,10 +374,10 @@ static struct ctl_node *first_usable_entry(struct rb_node *node)
 }
 
 static void first_entry(struct ctl_dir *dir,
-	struct ctl_table_header **phead, struct ctl_table **pentry)
+	struct ctl_table_header **phead, const struct ctl_table **pentry)
 {
 	struct ctl_table_header *head = NULL;
-	struct ctl_table *entry = NULL;
+	const struct ctl_table *entry = NULL;
 	struct ctl_node *ctl_node;
 
 	spin_lock(&sysctl_lock);
@@ -391,10 +391,10 @@ static void first_entry(struct ctl_dir *dir,
 	*pentry = entry;
 }
 
-static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
+static void next_entry(struct ctl_table_header **phead, const struct ctl_table **pentry)
 {
 	struct ctl_table_header *head = *phead;
-	struct ctl_table *entry = *pentry;
+	const struct ctl_table *entry = *pentry;
 	struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
 
 	spin_lock(&sysctl_lock);
@@ -427,7 +427,7 @@ static int test_perm(int mode, int op)
 	return -EACCES;
 }
 
-static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
+static int sysctl_perm(struct ctl_table_header *head, const struct ctl_table *table, int op)
 {
 	struct ctl_table_root *root = head->root;
 	int mode;
@@ -441,7 +441,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i
 }
 
 static struct inode *proc_sys_make_inode(struct super_block *sb,
-		struct ctl_table_header *head, struct ctl_table *table)
+		struct ctl_table_header *head, const struct ctl_table *table)
 {
 	struct ctl_table_root *root = head->root;
 	struct inode *inode;
@@ -512,7 +512,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 	struct ctl_table_header *head = grab_header(dir);
 	struct ctl_table_header *h = NULL;
 	const struct qstr *name = &dentry->d_name;
-	struct ctl_table *p;
+	const struct ctl_table *p;
 	struct inode *inode;
 	struct dentry *err = ERR_PTR(-ENOENT);
 	struct ctl_dir *ctl_dir;
@@ -550,7 +550,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	struct ctl_table_header *head = grab_header(inode);
-	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
 	size_t count = iov_iter_count(iter);
 	char *kbuf;
 	ssize_t error;
@@ -624,7 +624,7 @@ static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
 static int proc_sys_open(struct inode *inode, struct file *filp)
 {
 	struct ctl_table_header *head = grab_header(inode);
-	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
 
 	/* sysctl was unregistered */
 	if (IS_ERR(head))
@@ -642,7 +642,7 @@ static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
 {
 	struct inode *inode = file_inode(filp);
 	struct ctl_table_header *head = grab_header(inode);
-	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
 	__poll_t ret = DEFAULT_POLLMASK;
 	unsigned long event;
 
@@ -673,7 +673,7 @@ out:
 static bool proc_sys_fill_cache(struct file *file,
 				struct dir_context *ctx,
 				struct ctl_table_header *head,
-				struct ctl_table *table)
+				const struct ctl_table *table)
 {
 	struct dentry *child, *dir = file->f_path.dentry;
 	struct inode *inode;
@@ -717,7 +717,7 @@ static bool proc_sys_fill_cache(struct file *file,
 static bool proc_sys_link_fill_cache(struct file *file,
 				    struct dir_context *ctx,
 				    struct ctl_table_header *head,
-				    struct ctl_table *table)
+				    const struct ctl_table *table)
 {
 	bool ret = true;
 
@@ -735,7 +735,7 @@ out:
 	return ret;
 }
 
-static int scan(struct ctl_table_header *head, struct ctl_table *table,
+static int scan(struct ctl_table_header *head, const struct ctl_table *table,
 		unsigned long *pos, struct file *file,
 		struct dir_context *ctx)
 {
@@ -759,7 +759,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct ctl_table_header *head = grab_header(file_inode(file));
 	struct ctl_table_header *h = NULL;
-	struct ctl_table *entry;
+	const struct ctl_table *entry;
 	struct ctl_dir *ctl_dir;
 	unsigned long pos;
 
@@ -792,7 +792,7 @@ static int proc_sys_permission(struct mnt_idmap *idmap,
 	 * are _NOT_ writeable, capabilities or not.
 	 */
 	struct ctl_table_header *head;
-	struct ctl_table *table;
+	const struct ctl_table *table;
 	int error;
 
 	/* Executable files are not allowed under /proc/sys/ */
@@ -836,7 +836,7 @@ static int proc_sys_getattr(struct mnt_idmap *idmap,
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct ctl_table_header *head = grab_header(inode);
-	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);
@@ -935,7 +935,7 @@ static struct ctl_dir *find_subdir(struct ctl_dir *dir,
 				   const char *name, int namelen)
 {
 	struct ctl_table_header *head;
-	struct ctl_table *entry;
+	const struct ctl_table *entry;
 
 	entry = find_entry(&head, dir, name, namelen);
 	if (!entry)
@@ -1046,12 +1046,12 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
 }
 
 static int sysctl_follow_link(struct ctl_table_header **phead,
-	struct ctl_table **pentry)
+	const struct ctl_table **pentry)
 {
 	struct ctl_table_header *head;
+	const struct ctl_table *entry;
 	struct ctl_table_root *root;
 	struct ctl_table_set *set;
-	struct ctl_table *entry;
 	struct ctl_dir *dir;
 	int ret;
 
@@ -1078,7 +1078,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
 	return ret;
 }
 
-static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
+static int sysctl_err(const char *path, const struct ctl_table *table, char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
@@ -1094,7 +1094,7 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
 	return -EINVAL;
 }
 
-static int sysctl_check_table_array(const char *path, struct ctl_table *table)
+static int sysctl_check_table_array(const char *path, const struct ctl_table *table)
 {
 	unsigned int extra;
 	int err = 0;
@@ -1133,7 +1133,7 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
 
 static int sysctl_check_table(const char *path, struct ctl_table_header *header)
 {
-	struct ctl_table *entry;
+	const struct ctl_table *entry;
 	int err = 0;
 	list_for_each_table_entry(entry, header) {
 		if (!entry->procname)
@@ -1169,8 +1169,9 @@ static int sysctl_check_table(const char *path, struct ctl_table_header *header)
 
 static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head)
 {
-	struct ctl_table *link_table, *entry, *link;
+	struct ctl_table *link_table, *link;
 	struct ctl_table_header *links;
+	const struct ctl_table *entry;
 	struct ctl_node *node;
 	char *link_name;
 	int name_bytes;
@@ -1215,7 +1216,7 @@ static bool get_links(struct ctl_dir *dir,
 		      struct ctl_table_root *link_root)
 {
 	struct ctl_table_header *tmp_head;
-	struct ctl_table *entry, *link;
+	const struct ctl_table *entry, *link;
 
 	if (header->ctl_table_size == 0 ||
 	    sysctl_is_perm_empty_ctl_header(header))
@@ -1466,7 +1467,7 @@ static void put_links(struct ctl_table_header *header)
 	struct ctl_table_root *root = header->root;
 	struct ctl_dir *parent = header->parent;
 	struct ctl_dir *core_parent;
-	struct ctl_table *entry;
+	const struct ctl_table *entry;
 
 	if (header->set == root_set)
 		return;
@@ -1477,7 +1478,7 @@ static void put_links(struct ctl_table_header *header)
 
 	list_for_each_table_entry(entry, header) {
 		struct ctl_table_header *link_head;
-		struct ctl_table *link;
+		const struct ctl_table *link;
 		const char *name = entry->procname;
 
 		link = find_entry(&link_head, core_parent, name, strlen(name));
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index aa4c6d44aaa0..a473deaf5a91 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -162,7 +162,7 @@ struct ctl_node {
 struct ctl_table_header {
 	union {
 		struct {
-			struct ctl_table *ctl_table;
+			const struct ctl_table *ctl_table;
 			int ctl_table_size;
 			int used;
 			int count;
-- 
cgit v1.2.3


From 7abc9b53bd515d7953d1f4e069b062ec4b5ba9e7 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Mon, 5 Aug 2024 11:39:38 +0200
Subject: sysctl: allow registration of const struct ctl_table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Putting structure, especially those containing function pointers,
into read-only memory makes the safer and easier to reason about.
Change the sysctl registration APIs to allow registration of
"const struct ctl_table".

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org> # security/*
Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 fs/proc/proc_sysctl.c  |  6 +++---
 include/linux/sysctl.h | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 713abccbfcf9..968f8dcffd8f 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1359,7 +1359,7 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
  */
 struct ctl_table_header *__register_sysctl_table(
 	struct ctl_table_set *set,
-	const char *path, struct ctl_table *table, size_t table_size)
+	const char *path, const struct ctl_table *table, size_t table_size)
 {
 	struct ctl_table_root *root = set->dir.header.root;
 	struct ctl_table_header *header;
@@ -1420,7 +1420,7 @@ fail:
  *
  * See __register_sysctl_table for more details.
  */
-struct ctl_table_header *register_sysctl_sz(const char *path, struct ctl_table *table,
+struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table,
 					    size_t table_size)
 {
 	return __register_sysctl_table(&sysctl_table_root.default_set,
@@ -1449,7 +1449,7 @@ EXPORT_SYMBOL(register_sysctl_sz);
  *
  * Context: if your base directory does not exist it will be created for you.
  */
-void __init __register_sysctl_init(const char *path, struct ctl_table *table,
+void __init __register_sysctl_init(const char *path, const struct ctl_table *table,
 				 const char *table_name, size_t table_size)
 {
 	struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index a473deaf5a91..202855befa8b 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -223,13 +223,13 @@ extern void retire_sysctl_set(struct ctl_table_set *set);
 
 struct ctl_table_header *__register_sysctl_table(
 	struct ctl_table_set *set,
-	const char *path, struct ctl_table *table, size_t table_size);
-struct ctl_table_header *register_sysctl_sz(const char *path, struct ctl_table *table,
+	const char *path, const struct ctl_table *table, size_t table_size);
+struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table,
 					    size_t table_size);
 void unregister_sysctl_table(struct ctl_table_header * table);
 
 extern int sysctl_init_bases(void);
-extern void __register_sysctl_init(const char *path, struct ctl_table *table,
+extern void __register_sysctl_init(const char *path, const struct ctl_table *table,
 				 const char *table_name, size_t table_size);
 #define register_sysctl_init(path, table)	\
 	__register_sysctl_init(path, table, #table, ARRAY_SIZE(table))
@@ -251,7 +251,7 @@ extern int no_unaligned_warning;
 
 #else /* CONFIG_SYSCTL */
 
-static inline void register_sysctl_init(const char *path, struct ctl_table *table)
+static inline void register_sysctl_init(const char *path, const struct ctl_table *table)
 {
 }
 
@@ -261,7 +261,7 @@ static inline struct ctl_table_header *register_sysctl_mount_point(const char *p
 }
 
 static inline struct ctl_table_header *register_sysctl_sz(const char *path,
-							  struct ctl_table *table,
+							  const struct ctl_table *table,
 							  size_t table_size)
 {
 	return NULL;
-- 
cgit v1.2.3


From 823a566221a5639f6c69424897218e5d6431a970 Mon Sep 17 00:00:00 2001
From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Date: Wed, 9 Oct 2024 11:20:31 +0200
Subject: locking/ww_mutex: Adjust to lockdep nest_lock requirements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When using mutex_acquire_nest() with a nest_lock, lockdep refcounts the
number of acquired lockdep_maps of mutexes of the same class, and also
keeps a pointer to the first acquired lockdep_map of a class. That pointer
is then used for various comparison-, printing- and checking purposes,
but there is no mechanism to actively ensure that lockdep_map stays in
memory. Instead, a warning is printed if the lockdep_map is freed and
there are still held locks of the same lock class, even if the lockdep_map
itself has been released.

In the context of WW/WD transactions that means that if a user unlocks
and frees a ww_mutex from within an ongoing ww transaction, and that
mutex happens to be the first ww_mutex grabbed in the transaction,
such a warning is printed and there might be a risk of a UAF.

Note that this is only problem when lockdep is enabled and affects only
dereferences of struct lockdep_map.

Adjust to this by adding a fake lockdep_map to the acquired context and
make sure it is the first acquired lockdep map of the associated
ww_mutex class. Then hold it for the duration of the WW/WD transaction.

This has the side effect that trying to lock a ww mutex *without* a
ww_acquire_context but where a such context has been acquire, we'd see
a lockdep splat. The test-ww_mutex.c selftest attempts to do that, so
modify that particular test to not acquire a ww_acquire_context if it
is not going to be used.

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20241009092031.6356-1-thomas.hellstrom@linux.intel.com
---
 include/linux/ww_mutex.h       | 14 ++++++++++++++
 kernel/locking/test-ww_mutex.c |  8 +++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index bb763085479a..a401a2f31a77 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -65,6 +65,16 @@ struct ww_acquire_ctx {
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
+	/**
+	 * @first_lock_dep_map: fake lockdep_map for first locked ww_mutex.
+	 *
+	 * lockdep requires the lockdep_map for the first locked ww_mutex
+	 * in a ww transaction to remain in memory until all ww_mutexes of
+	 * the transaction have been unlocked. Ensure this by keeping a
+	 * fake locked ww_mutex lockdep map between ww_acquire_init() and
+	 * ww_acquire_fini().
+	 */
+	struct lockdep_map first_lock_dep_map;
 #endif
 #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
 	unsigned int deadlock_inject_interval;
@@ -146,7 +156,10 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
 	debug_check_no_locks_freed((void *)ctx, sizeof(*ctx));
 	lockdep_init_map(&ctx->dep_map, ww_class->acquire_name,
 			 &ww_class->acquire_key, 0);
+	lockdep_init_map(&ctx->first_lock_dep_map, ww_class->mutex_name,
+			 &ww_class->mutex_key, 0);
 	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);
+	mutex_acquire_nest(&ctx->first_lock_dep_map, 0, 0, &ctx->dep_map, _RET_IP_);
 #endif
 #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
 	ctx->deadlock_inject_interval = 1;
@@ -185,6 +198,7 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
 static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
+	mutex_release(&ctx->first_lock_dep_map, _THIS_IP_);
 	mutex_release(&ctx->dep_map, _THIS_IP_);
 #endif
 #ifdef DEBUG_WW_MUTEXES
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 10a5736a21c2..5d58b2c0ef98 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -62,7 +62,8 @@ static int __test_mutex(unsigned int flags)
 	int ret;
 
 	ww_mutex_init(&mtx.mutex, &ww_class);
-	ww_acquire_init(&ctx, &ww_class);
+	if (flags & TEST_MTX_CTX)
+		ww_acquire_init(&ctx, &ww_class);
 
 	INIT_WORK_ONSTACK(&mtx.work, test_mutex_work);
 	init_completion(&mtx.ready);
@@ -90,7 +91,8 @@ static int __test_mutex(unsigned int flags)
 		ret = wait_for_completion_timeout(&mtx.done, TIMEOUT);
 	}
 	ww_mutex_unlock(&mtx.mutex);
-	ww_acquire_fini(&ctx);
+	if (flags & TEST_MTX_CTX)
+		ww_acquire_fini(&ctx);
 
 	if (ret) {
 		pr_err("%s(flags=%x): mutual exclusion failure\n",
@@ -679,7 +681,7 @@ static int __init test_ww_mutex_init(void)
 	if (ret)
 		return ret;
 
-	ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+	ret = stress(2046, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From 5461f3fd74a89757f95f351eb0bc26aafc2a2e91 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Fri, 20 Sep 2024 00:27:58 +0100
Subject: backlight: Remove notifier

backlight_register_notifier and backlight_unregister_notifier have
been unused since commit 6cb634d0dc85 ("ACPI: video: Remove code to
unregister acpi_video backlight when a native backlight registers")

With those not being called, it means that the backlight_notifier
list is always empty.

Remove the functions, the list itself and the enum used in the
notifications.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Reviewed-by: Simona Vetter <simona.vetter@ffwll.ch>
Link: https://lore.kernel.org/r/20240919232758.639925-1-linux@treblig.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/backlight.c | 42 -------------------------------------
 include/linux/backlight.h           | 20 ------------------
 2 files changed, 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index a82934694d05..f699e5827ccb 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -65,7 +65,6 @@
 
 static struct list_head backlight_dev_list;
 static struct mutex backlight_dev_list_mutex;
-static struct blocking_notifier_head backlight_notifier;
 
 static const char *const backlight_types[] = {
 	[BACKLIGHT_RAW] = "raw",
@@ -467,9 +466,6 @@ struct backlight_device *backlight_device_register(const char *name,
 	list_add(&new_bd->entry, &backlight_dev_list);
 	mutex_unlock(&backlight_dev_list_mutex);
 
-	blocking_notifier_call_chain(&backlight_notifier,
-				     BACKLIGHT_REGISTERED, new_bd);
-
 	return new_bd;
 }
 EXPORT_SYMBOL(backlight_device_register);
@@ -539,9 +535,6 @@ void backlight_device_unregister(struct backlight_device *bd)
 	mutex_unlock(&pmac_backlight_mutex);
 #endif
 
-	blocking_notifier_call_chain(&backlight_notifier,
-				     BACKLIGHT_UNREGISTERED, bd);
-
 	mutex_lock(&bd->ops_lock);
 	bd->ops = NULL;
 	mutex_unlock(&bd->ops_lock);
@@ -566,40 +559,6 @@ static int devm_backlight_device_match(struct device *dev, void *res,
 	return *r == data;
 }
 
-/**
- * backlight_register_notifier - get notified of backlight (un)registration
- * @nb: notifier block with the notifier to call on backlight (un)registration
- *
- * Register a notifier to get notified when backlight devices get registered
- * or unregistered.
- *
- * RETURNS:
- *
- * 0 on success, otherwise a negative error code
- */
-int backlight_register_notifier(struct notifier_block *nb)
-{
-	return blocking_notifier_chain_register(&backlight_notifier, nb);
-}
-EXPORT_SYMBOL(backlight_register_notifier);
-
-/**
- * backlight_unregister_notifier - unregister a backlight notifier
- * @nb: notifier block to unregister
- *
- * Register a notifier to get notified when backlight devices get registered
- * or unregistered.
- *
- * RETURNS:
- *
- * 0 on success, otherwise a negative error code
- */
-int backlight_unregister_notifier(struct notifier_block *nb)
-{
-	return blocking_notifier_chain_unregister(&backlight_notifier, nb);
-}
-EXPORT_SYMBOL(backlight_unregister_notifier);
-
 /**
  * devm_backlight_device_register - register a new backlight device
  * @dev: the device to register
@@ -767,7 +726,6 @@ static int __init backlight_class_init(void)
 
 	INIT_LIST_HEAD(&backlight_dev_list);
 	mutex_init(&backlight_dev_list_mutex);
-	BLOCKING_INIT_NOTIFIER_HEAD(&backlight_notifier);
 
 	return 0;
 }
diff --git a/include/linux/backlight.h b/include/linux/backlight.h
index ea9c1bc8148e..f5652e5a9060 100644
--- a/include/linux/backlight.h
+++ b/include/linux/backlight.h
@@ -66,24 +66,6 @@ enum backlight_type {
 	BACKLIGHT_TYPE_MAX,
 };
 
-/**
- * enum backlight_notification - the type of notification
- *
- * The notifications that is used for notification sent to the receiver
- * that registered notifications using backlight_register_notifier().
- */
-enum backlight_notification {
-	/**
-	 * @BACKLIGHT_REGISTERED: The backlight device is registered.
-	 */
-	BACKLIGHT_REGISTERED,
-
-	/**
-	 * @BACKLIGHT_UNREGISTERED: The backlight revice is unregistered.
-	 */
-	BACKLIGHT_UNREGISTERED,
-};
-
 /** enum backlight_scale - the type of scale used for brightness values
  *
  * The type of scale used for brightness values.
@@ -421,8 +403,6 @@ void devm_backlight_device_unregister(struct device *dev,
 				      struct backlight_device *bd);
 void backlight_force_update(struct backlight_device *bd,
 			    enum backlight_update_reason reason);
-int backlight_register_notifier(struct notifier_block *nb);
-int backlight_unregister_notifier(struct notifier_block *nb);
 struct backlight_device *backlight_device_get_by_name(const char *name);
 struct backlight_device *backlight_device_get_by_type(enum backlight_type type);
 int backlight_device_set_brightness(struct backlight_device *bd,
-- 
cgit v1.2.3


From 6f1067cfbee72b04fc42234f7f1588f838cec0b6 Mon Sep 17 00:00:00 2001
From: Thomas Richard <thomas.richard@bootlin.com>
Date: Tue, 1 Oct 2024 13:53:27 +0200
Subject: mfd: Add Congatec Board Controller driver

Add core MFD driver for the Board Controller found on some Congatec SMARC
module. This Board Controller provides functions like watchdog, GPIO, and
I2C busses.

This commit adds support only for the conga-SA7 module.

Signed-off-by: Thomas Richard <thomas.richard@bootlin.com>
Link: https://lore.kernel.org/r/20241001-congatec-board-controller-v3-1-39ceceed5c47@bootlin.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/Kconfig      |  12 ++
 drivers/mfd/Makefile     |   1 +
 drivers/mfd/cgbc-core.c  | 411 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/cgbc.h |  44 +++++
 4 files changed, 468 insertions(+)
 create mode 100644 drivers/mfd/cgbc-core.c
 create mode 100644 include/linux/mfd/cgbc.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index f9325bcce1b9..03c1e4e3eea4 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -236,6 +236,18 @@ config MFD_AXP20X_RSB
 	  components like regulators or the PEK (Power Enable Key) under the
 	  corresponding menus.
 
+config MFD_CGBC
+	tristate "Congatec Board Controller"
+	select MFD_CORE
+	depends on X86
+	help
+	  This is the core driver of the Board Controller found on some Congatec
+	  SMARC modules. The Board Controller provides functions like watchdog,
+	  I2C busses, and GPIO controller.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called cgbc-core.
+
 config MFD_CROS_EC_DEV
 	tristate "ChromeOS Embedded Controller multifunction device"
 	select MFD_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 2a9f91e81af8..e057d6d6faef 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_MFD_SM501)		+= sm501.o
 obj-$(CONFIG_ARCH_BCM2835)	+= bcm2835-pm.o
 obj-$(CONFIG_MFD_BCM590XX)	+= bcm590xx.o
 obj-$(CONFIG_MFD_BD9571MWV)	+= bd9571mwv.o
+obj-$(CONFIG_MFD_CGBC)		+= cgbc-core.o
 obj-$(CONFIG_MFD_CROS_EC_DEV)	+= cros_ec_dev.o
 obj-$(CONFIG_MFD_CS42L43)	+= cs42l43.o
 obj-$(CONFIG_MFD_CS42L43_I2C)	+= cs42l43-i2c.o
diff --git a/drivers/mfd/cgbc-core.c b/drivers/mfd/cgbc-core.c
new file mode 100644
index 000000000000..93004a6b29c1
--- /dev/null
+++ b/drivers/mfd/cgbc-core.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Congatec Board Controller core driver.
+ *
+ * The x86 Congatec modules have an embedded micro controller named Board
+ * Controller. This Board Controller has a Watchdog timer, some GPIOs, and two
+ * I2C busses.
+ *
+ * Copyright (C) 2024 Bootlin
+ *
+ * Author: Thomas Richard <thomas.richard@bootlin.com>
+ */
+
+#include <linux/dmi.h>
+#include <linux/iopoll.h>
+#include <linux/mfd/cgbc.h>
+#include <linux/mfd/core.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/sysfs.h>
+
+#define CGBC_IO_SESSION_BASE	0x0E20
+#define CGBC_IO_SESSION_END	0x0E30
+#define CGBC_IO_CMD_BASE	0x0E00
+#define CGBC_IO_CMD_END		0x0E10
+
+#define CGBC_MASK_STATUS	(BIT(6) | BIT(7))
+#define CGBC_MASK_DATA_COUNT	0x1F
+#define CGBC_MASK_ERROR_CODE	0x1F
+
+#define CGBC_STATUS_DATA_READY	0x00
+#define CGBC_STATUS_CMD_READY	BIT(6)
+#define CGBC_STATUS_ERROR	(BIT(6) | BIT(7))
+
+#define CGBC_SESSION_CMD		0x00
+#define CGBC_SESSION_CMD_IDLE		0x00
+#define CGBC_SESSION_CMD_REQUEST	0x01
+#define CGBC_SESSION_DATA		0x01
+#define CGBC_SESSION_STATUS		0x02
+#define CGBC_SESSION_STATUS_FREE	0x03
+#define CGBC_SESSION_ACCESS		0x04
+#define CGBC_SESSION_ACCESS_GAINED	0x00
+
+#define CGBC_SESSION_VALID_MIN  0x02
+#define CGBC_SESSION_VALID_MAX  0xFE
+
+#define CGBC_CMD_STROBE			0x00
+#define CGBC_CMD_INDEX			0x02
+#define CGBC_CMD_INDEX_CBM_MAN8		0x00
+#define CGBC_CMD_INDEX_CBM_AUTO32	0x03
+#define CGBC_CMD_DATA			0x04
+#define CGBC_CMD_ACCESS			0x0C
+
+#define CGBC_CMD_GET_FW_REV	0x21
+
+static struct platform_device *cgbc_pdev;
+
+/* Wait the Board Controller is ready to receive some session commands */
+static int cgbc_wait_device(struct cgbc_device_data *cgbc)
+{
+	u16 status;
+	int ret;
+
+	ret = readx_poll_timeout(ioread16, cgbc->io_session + CGBC_SESSION_STATUS, status,
+				 status == CGBC_SESSION_STATUS_FREE, 0, 500000);
+
+	if (ret || ioread32(cgbc->io_session + CGBC_SESSION_ACCESS))
+		ret = -ENODEV;
+
+	return ret;
+}
+
+static int cgbc_session_command(struct cgbc_device_data *cgbc, u8 cmd)
+{
+	int ret;
+	u8 val;
+
+	ret = readx_poll_timeout(ioread8, cgbc->io_session + CGBC_SESSION_CMD, val,
+				 val == CGBC_SESSION_CMD_IDLE, 0, 100000);
+	if (ret)
+		return ret;
+
+	iowrite8(cmd, cgbc->io_session + CGBC_SESSION_CMD);
+
+	ret = readx_poll_timeout(ioread8, cgbc->io_session + CGBC_SESSION_CMD, val,
+				 val == CGBC_SESSION_CMD_IDLE, 0, 100000);
+	if (ret)
+		return ret;
+
+	ret = (int)ioread8(cgbc->io_session + CGBC_SESSION_DATA);
+
+	iowrite8(CGBC_SESSION_STATUS_FREE, cgbc->io_session + CGBC_SESSION_STATUS);
+
+	return ret;
+}
+
+static int cgbc_session_request(struct cgbc_device_data *cgbc)
+{
+	unsigned int ret;
+
+	ret = cgbc_wait_device(cgbc);
+
+	if (ret)
+		return dev_err_probe(cgbc->dev, ret, "device not found or not ready\n");
+
+	cgbc->session = cgbc_session_command(cgbc, CGBC_SESSION_CMD_REQUEST);
+
+	/* The Board Controller sent us a wrong session handle, we cannot communicate with it */
+	if (cgbc->session < CGBC_SESSION_VALID_MIN || cgbc->session > CGBC_SESSION_VALID_MAX)
+		return dev_err_probe(cgbc->dev, -ECONNREFUSED,
+				     "failed to get a valid session handle\n");
+
+	return 0;
+}
+
+static void cgbc_session_release(struct cgbc_device_data *cgbc)
+{
+	if (cgbc_session_command(cgbc, cgbc->session) != cgbc->session)
+		dev_warn(cgbc->dev, "failed to release session\n");
+}
+
+static bool cgbc_command_lock(struct cgbc_device_data *cgbc)
+{
+	iowrite8(cgbc->session, cgbc->io_cmd + CGBC_CMD_ACCESS);
+
+	return ioread8(cgbc->io_cmd + CGBC_CMD_ACCESS) == cgbc->session;
+}
+
+static void cgbc_command_unlock(struct cgbc_device_data *cgbc)
+{
+	iowrite8(cgbc->session, cgbc->io_cmd + CGBC_CMD_ACCESS);
+}
+
+int cgbc_command(struct cgbc_device_data *cgbc, void *cmd, unsigned int cmd_size, void *data,
+		 unsigned int data_size, u8 *status)
+{
+	u8 checksum = 0, data_checksum = 0, istatus = 0, val;
+	u8 *_data = (u8 *)data;
+	u8 *_cmd = (u8 *)cmd;
+	int mode_change = -1;
+	bool lock;
+	int ret, i;
+
+	mutex_lock(&cgbc->lock);
+
+	/* Request access */
+	ret = readx_poll_timeout(cgbc_command_lock, cgbc, lock, lock, 0, 100000);
+	if (ret)
+		goto out;
+
+	/* Wait board controller is ready */
+	ret = readx_poll_timeout(ioread8, cgbc->io_cmd + CGBC_CMD_STROBE, val,
+				 val == CGBC_CMD_STROBE, 0, 100000);
+	if (ret)
+		goto release;
+
+	/* Write command packet */
+	if (cmd_size <= 2) {
+		iowrite8(CGBC_CMD_INDEX_CBM_MAN8, cgbc->io_cmd + CGBC_CMD_INDEX);
+	} else {
+		iowrite8(CGBC_CMD_INDEX_CBM_AUTO32, cgbc->io_cmd + CGBC_CMD_INDEX);
+		if ((cmd_size % 4) != 0x03)
+			mode_change = (cmd_size & 0xFFFC) - 1;
+	}
+
+	for (i = 0; i < cmd_size; i++) {
+		iowrite8(_cmd[i], cgbc->io_cmd + CGBC_CMD_DATA + (i % 4));
+		checksum ^= _cmd[i];
+		if (mode_change == i)
+			iowrite8((i + 1) | CGBC_CMD_INDEX_CBM_MAN8, cgbc->io_cmd + CGBC_CMD_INDEX);
+	}
+
+	/* Append checksum byte */
+	iowrite8(checksum, cgbc->io_cmd + CGBC_CMD_DATA + (i % 4));
+
+	/* Perform command strobe */
+	iowrite8(cgbc->session, cgbc->io_cmd + CGBC_CMD_STROBE);
+
+	/* Rewind cmd buffer index */
+	iowrite8(CGBC_CMD_INDEX_CBM_AUTO32, cgbc->io_cmd + CGBC_CMD_INDEX);
+
+	/* Wait command completion */
+	ret = read_poll_timeout(ioread8, val, val == CGBC_CMD_STROBE, 0, 100000, false,
+				cgbc->io_cmd + CGBC_CMD_STROBE);
+	if (ret)
+		goto release;
+
+	istatus = ioread8(cgbc->io_cmd + CGBC_CMD_DATA);
+	checksum = istatus;
+
+	/* Check command status */
+	switch (istatus & CGBC_MASK_STATUS) {
+	case CGBC_STATUS_DATA_READY:
+		if (istatus > data_size)
+			istatus = data_size;
+		for (i = 0; i < istatus; i++) {
+			_data[i] = ioread8(cgbc->io_cmd + CGBC_CMD_DATA + ((i + 1) % 4));
+			checksum ^= _data[i];
+		}
+		data_checksum = ioread8(cgbc->io_cmd + CGBC_CMD_DATA + ((i + 1) % 4));
+		istatus &= CGBC_MASK_DATA_COUNT;
+		break;
+	case CGBC_STATUS_ERROR:
+	case CGBC_STATUS_CMD_READY:
+		data_checksum = ioread8(cgbc->io_cmd + CGBC_CMD_DATA + 1);
+		if ((istatus & CGBC_MASK_STATUS) == CGBC_STATUS_ERROR)
+			ret = -EIO;
+		istatus = istatus & CGBC_MASK_ERROR_CODE;
+		break;
+	default:
+		data_checksum = ioread8(cgbc->io_cmd + CGBC_CMD_DATA + 1);
+		istatus &= CGBC_MASK_ERROR_CODE;
+		ret = -EIO;
+		break;
+	}
+
+	/* Checksum verification */
+	if (ret == 0 && data_checksum != checksum)
+		ret = -EIO;
+
+release:
+	cgbc_command_unlock(cgbc);
+
+out:
+	mutex_unlock(&cgbc->lock);
+
+	if (status)
+		*status = istatus;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cgbc_command);
+
+static struct mfd_cell cgbc_devs[] = {
+	{ .name = "cgbc-wdt"	},
+	{ .name = "cgbc-gpio"	},
+	{ .name = "cgbc-i2c", .id = 1 },
+	{ .name = "cgbc-i2c", .id = 2 },
+};
+
+static int cgbc_map(struct cgbc_device_data *cgbc)
+{
+	struct device *dev = cgbc->dev;
+	struct platform_device *pdev = to_platform_device(dev);
+	struct resource *ioport;
+
+	ioport = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (!ioport)
+		return -EINVAL;
+
+	cgbc->io_session = devm_ioport_map(dev, ioport->start, resource_size(ioport));
+	if (!cgbc->io_session)
+		return -ENOMEM;
+
+	ioport = platform_get_resource(pdev, IORESOURCE_IO, 1);
+	if (!ioport)
+		return -EINVAL;
+
+	cgbc->io_cmd = devm_ioport_map(dev, ioport->start, resource_size(ioport));
+	if (!cgbc->io_cmd)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static const struct resource cgbc_resources[] = {
+	{
+		.start  = CGBC_IO_SESSION_BASE,
+		.end    = CGBC_IO_SESSION_END,
+		.flags  = IORESOURCE_IO,
+	},
+	{
+		.start  = CGBC_IO_CMD_BASE,
+		.end    = CGBC_IO_CMD_END,
+		.flags  = IORESOURCE_IO,
+	},
+};
+
+static ssize_t cgbc_version_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct cgbc_device_data *cgbc = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "CGBCP%c%c%c\n", cgbc->version.feature, cgbc->version.major,
+			  cgbc->version.minor);
+}
+
+static DEVICE_ATTR_RO(cgbc_version);
+
+static struct attribute *cgbc_attrs[] = {
+	&dev_attr_cgbc_version.attr,
+	NULL
+};
+
+ATTRIBUTE_GROUPS(cgbc);
+
+static int cgbc_get_version(struct cgbc_device_data *cgbc)
+{
+	u8 cmd = CGBC_CMD_GET_FW_REV;
+	u8 data[4];
+	int ret;
+
+	ret = cgbc_command(cgbc, &cmd, 1, &data, sizeof(data), NULL);
+	if (ret)
+		return ret;
+
+	cgbc->version.feature = data[0];
+	cgbc->version.major = data[1];
+	cgbc->version.minor = data[2];
+
+	return 0;
+}
+
+static int cgbc_init_device(struct cgbc_device_data *cgbc)
+{
+	int ret;
+
+	ret = cgbc_session_request(cgbc);
+	if (ret)
+		return ret;
+
+	ret = cgbc_get_version(cgbc);
+	if (ret)
+		return ret;
+
+	return mfd_add_devices(cgbc->dev, -1, cgbc_devs, ARRAY_SIZE(cgbc_devs), NULL, 0, NULL);
+}
+
+static int cgbc_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct cgbc_device_data *cgbc;
+	int ret;
+
+	cgbc = devm_kzalloc(dev, sizeof(*cgbc), GFP_KERNEL);
+	if (!cgbc)
+		return -ENOMEM;
+
+	cgbc->dev = dev;
+
+	ret = cgbc_map(cgbc);
+	if (ret)
+		return ret;
+
+	mutex_init(&cgbc->lock);
+
+	platform_set_drvdata(pdev, cgbc);
+
+	return cgbc_init_device(cgbc);
+}
+
+static void cgbc_remove(struct platform_device *pdev)
+{
+	struct cgbc_device_data *cgbc = platform_get_drvdata(pdev);
+
+	cgbc_session_release(cgbc);
+
+	mfd_remove_devices(&pdev->dev);
+}
+
+static struct platform_driver cgbc_driver = {
+	.driver		= {
+		.name		= "cgbc",
+		.dev_groups	= cgbc_groups,
+	},
+	.probe		= cgbc_probe,
+	.remove_new	= cgbc_remove,
+};
+
+static const struct dmi_system_id cgbc_dmi_table[] __initconst = {
+	{
+		.ident = "SA7",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "congatec"),
+			DMI_MATCH(DMI_BOARD_NAME, "conga-SA7"),
+		},
+	},
+	{}
+};
+MODULE_DEVICE_TABLE(dmi, cgbc_dmi_table);
+
+static int __init cgbc_init(void)
+{
+	const struct dmi_system_id *id;
+	int ret = -ENODEV;
+
+	id = dmi_first_match(cgbc_dmi_table);
+	if (IS_ERR_OR_NULL(id))
+		return ret;
+
+	cgbc_pdev = platform_device_register_simple("cgbc", PLATFORM_DEVID_NONE, cgbc_resources,
+						    ARRAY_SIZE(cgbc_resources));
+	if (IS_ERR(cgbc_pdev))
+		return PTR_ERR(cgbc_pdev);
+
+	return platform_driver_register(&cgbc_driver);
+}
+
+static void __exit cgbc_exit(void)
+{
+	platform_device_unregister(cgbc_pdev);
+	platform_driver_unregister(&cgbc_driver);
+}
+
+module_init(cgbc_init);
+module_exit(cgbc_exit);
+
+MODULE_DESCRIPTION("Congatec Board Controller Core Driver");
+MODULE_AUTHOR("Thomas Richard <thomas.richard@bootlin.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:cgbc-core");
diff --git a/include/linux/mfd/cgbc.h b/include/linux/mfd/cgbc.h
new file mode 100644
index 000000000000..badbec4c7033
--- /dev/null
+++ b/include/linux/mfd/cgbc.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Congatec Board Controller driver definitions
+ *
+ * Copyright (C) 2024 Bootlin
+ * Author: Thomas Richard <thomas.richard@bootlin.com>
+ */
+
+#ifndef _LINUX_MFD_CGBC_H_
+
+/**
+ * struct cgbc_version - Board Controller device version structure
+ * @feature:	Board Controller feature number
+ * @major:	Board Controller major revision
+ * @minor:	Board Controller minor revision
+ */
+struct cgbc_version {
+	unsigned char feature;
+	unsigned char major;
+	unsigned char minor;
+};
+
+/**
+ * struct cgbc_device_data - Internal representation of the Board Controller device
+ * @io_session:		Pointer to the session IO memory
+ * @io_cmd:		Pointer to the command IO memory
+ * @session:		Session id returned by the Board Controller
+ * @dev:		Pointer to kernel device structure
+ * @cgbc_version:	Board Controller version structure
+ * @mutex:		Board Controller mutex
+ */
+struct cgbc_device_data {
+	void __iomem		*io_session;
+	void __iomem		*io_cmd;
+	u8			session;
+	struct device		*dev;
+	struct cgbc_version	version;
+	struct mutex		lock;
+};
+
+int cgbc_command(struct cgbc_device_data *cgbc, void *cmd, unsigned int cmd_size,
+		 void *data, unsigned int data_size, u8 *status);
+
+#endif /*_LINUX_MFD_CGBC_H_*/
-- 
cgit v1.2.3


From 0e6caab8db8bc9359d1a8363e1ee9b2205ed704d Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 8 Oct 2024 21:07:11 -0400
Subject: tracing: Declare system call tracepoints with TRACE_EVENT_SYSCALL

In preparation for allowing system call tracepoints to handle page
faults, introduce TRACE_EVENT_SYSCALL to declare the sys_enter/sys_exit
tracepoints.

Move the common code between __DECLARE_TRACE and __DECLARE_TRACE_SYSCALL
into __DECLARE_TRACE_COMMON.

This change is not meant to alter the generated code, and only prepares
the following modifications.

Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241009010718.2050182-2-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h      | 53 +++++++++++++++++++++++++++++++----------
 include/trace/bpf_probe.h       |  3 +++
 include/trace/define_trace.h    |  5 ++++
 include/trace/events/syscalls.h |  4 ++--
 include/trace/perf.h            |  3 +++
 include/trace/trace_events.h    | 28 ++++++++++++++++++++++
 6 files changed, 81 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 3d33b9872cec..76e441b39a96 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -197,7 +197,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  * it_func[0] is never NULL because there is at least one element in the array
  * when the array itself is non NULL.
  */
-#define __DO_TRACE(name, args, cond)					\
+#define __DO_TRACE(name, args, cond, syscall)				\
 	do {								\
 		int __maybe_unused __idx = 0;				\
 									\
@@ -222,21 +222,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  * site if it is not watching, as it will need to be active when the
  * tracepoint is enabled.
  */
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto)		\
+#define __DECLARE_TRACE_COMMON(name, proto, args, cond, data_proto)	\
 	extern int __traceiter_##name(data_proto);			\
 	DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name);	\
 	extern struct tracepoint __tracepoint_##name;			\
-	static inline void trace_##name(proto)				\
-	{								\
-		if (static_branch_unlikely(&__tracepoint_##name.key))	\
-			__DO_TRACE(name,				\
-				TP_ARGS(args),				\
-				TP_CONDITION(cond));			\
-		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
-			WARN_ONCE(!rcu_is_watching(),			\
-				  "RCU not watching for tracepoint");	\
-		}							\
-	}								\
 	static inline int						\
 	register_trace_##name(void (*probe)(data_proto), void *data)	\
 	{								\
@@ -266,6 +255,34 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		return static_branch_unlikely(&__tracepoint_##name.key);\
 	}
 
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto)		\
+	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
+	static inline void trace_##name(proto)				\
+	{								\
+		if (static_branch_unlikely(&__tracepoint_##name.key))	\
+			__DO_TRACE(name,				\
+				TP_ARGS(args),				\
+				TP_CONDITION(cond), 0);			\
+		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
+			WARN_ONCE(!rcu_is_watching(),			\
+				  "RCU not watching for tracepoint");	\
+		}							\
+	}
+
+#define __DECLARE_TRACE_SYSCALL(name, proto, args, cond, data_proto)	\
+	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
+	static inline void trace_##name(proto)				\
+	{								\
+		if (static_branch_unlikely(&__tracepoint_##name.key))	\
+			__DO_TRACE(name,				\
+				TP_ARGS(args),				\
+				TP_CONDITION(cond), 1);			\
+		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
+			WARN_ONCE(!rcu_is_watching(),			\
+				  "RCU not watching for tracepoint");	\
+		}							\
+	}
+
 /*
  * We have no guarantee that gcc and the linker won't up-align the tracepoint
  * structures, so we create an array of pointers that will be used for iteration
@@ -348,6 +365,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		return false;						\
 	}
 
+#define __DECLARE_TRACE_SYSCALL	__DECLARE_TRACE
+
 #define DEFINE_TRACE_FN(name, reg, unreg, proto, args)
 #define DEFINE_TRACE(name, proto, args)
 #define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
@@ -409,6 +428,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \
 			PARAMS(void *__data, proto))
 
+#define DECLARE_TRACE_SYSCALL(name, proto, args)			\
+	__DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args),	\
+				cpu_online(raw_smp_processor_id()),	\
+				PARAMS(void *__data, proto))
+
 #define TRACE_EVENT_FLAGS(event, flag)
 
 #define TRACE_EVENT_PERF_PERM(event, expr...)
@@ -546,6 +570,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			      struct, assign, print)		\
 	DECLARE_TRACE_CONDITION(name, PARAMS(proto),		\
 				PARAMS(args), PARAMS(cond))
+#define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign,	\
+			    print, reg, unreg)			\
+	DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args))
 
 #define TRACE_EVENT_FLAGS(event, flag)
 
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index a2ea11cc912e..c85bbce5aaa5 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -53,6 +53,9 @@ __bpf_trace_##call(void *__data, proto)					\
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 	__BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args))
 
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
 /*
  * This part is compiled out, it is only here as a build time check
  * to make sure that if the tracepoint handling changes, the
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 00723935dcc7..ff5fa17a6259 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -46,6 +46,10 @@
 		assign, print, reg, unreg)			\
 	DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args))
 
+#undef TRACE_EVENT_SYSCALL
+#define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, print, reg, unreg) \
+	DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args))
+
 #undef TRACE_EVENT_NOP
 #define TRACE_EVENT_NOP(name, proto, args, struct, assign, print)
 
@@ -107,6 +111,7 @@
 #undef TRACE_EVENT
 #undef TRACE_EVENT_FN
 #undef TRACE_EVENT_FN_COND
+#undef TRACE_EVENT_SYSCALL
 #undef TRACE_EVENT_CONDITION
 #undef TRACE_EVENT_NOP
 #undef DEFINE_EVENT_NOP
diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h
index b6e0cbc2c71f..f31ff446b468 100644
--- a/include/trace/events/syscalls.h
+++ b/include/trace/events/syscalls.h
@@ -15,7 +15,7 @@
 
 #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
 
-TRACE_EVENT_FN(sys_enter,
+TRACE_EVENT_SYSCALL(sys_enter,
 
 	TP_PROTO(struct pt_regs *regs, long id),
 
@@ -41,7 +41,7 @@ TRACE_EVENT_FN(sys_enter,
 
 TRACE_EVENT_FLAGS(sys_enter, TRACE_EVENT_FL_CAP_ANY)
 
-TRACE_EVENT_FN(sys_exit,
+TRACE_EVENT_SYSCALL(sys_exit,
 
 	TP_PROTO(struct pt_regs *regs, long ret),
 
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 2c11181c82e0..ded997af481e 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -55,6 +55,9 @@ perf_trace_##call(void *__data, proto)					\
 				  head, __task);			\
 }
 
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
 /*
  * This part is compiled out, it is only here as a build time check
  * to make sure that if the tracepoint handling changes, the
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index c2f9cabf154d..8bcbb9ee44de 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -45,6 +45,16 @@
 			     PARAMS(print));		       \
 	DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));
 
+#undef TRACE_EVENT_SYSCALL
+#define TRACE_EVENT_SYSCALL(name, proto, args, tstruct, assign, print, reg, unreg) \
+	DECLARE_EVENT_SYSCALL_CLASS(name,		       \
+			     PARAMS(proto),		       \
+			     PARAMS(args),		       \
+			     PARAMS(tstruct),		       \
+			     PARAMS(assign),		       \
+			     PARAMS(print));		       \
+	DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));
+
 #include "stages/stage1_struct_define.h"
 
 #undef DECLARE_EVENT_CLASS
@@ -57,6 +67,9 @@
 									\
 	static struct trace_event_class event_class_##name;
 
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args)	\
 	static struct trace_event_call	__used		\
@@ -117,6 +130,9 @@
 		tstruct;						\
 	};
 
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args)
 
@@ -208,6 +224,9 @@ static struct trace_event_functions trace_event_type_funcs_##call = {	\
 	.trace			= trace_raw_output_##call,		\
 };
 
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, call, proto, args, print)		\
 static notrace enum print_line_t					\
@@ -265,6 +284,9 @@ static inline notrace int trace_event_get_offsets_##call(		\
 	return __data_size;						\
 }
 
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
@@ -409,6 +431,9 @@ trace_event_raw_event_##call(void *__data, proto)			\
  * fail to compile unless it too is updated.
  */
 
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, call, proto, args)			\
 static inline void ftrace_test_probe_##call(void)			\
@@ -434,6 +459,9 @@ static struct trace_event_class __used __refdata event_class_##call = { \
 	_TRACE_PERF_INIT(call)						\
 };
 
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, call, proto, args)			\
 									\
-- 
cgit v1.2.3


From a363d27cdbc2bc2d1899b5a1520b64e3590fcd9a Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 8 Oct 2024 21:07:15 -0400
Subject: tracing: Allow system call tracepoints to handle page faults

Use Tasks Trace RCU to protect iteration of system call enter/exit
tracepoint probes to allow those probes to handle page faults.

In preparation for this change, all tracers registering to system call
enter/exit tracepoints should expect those to be called with preemption
enabled.

This allows tracers to fault-in userspace system call arguments such as
path strings within their probe callbacks.

Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241009010718.2050182-6-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 18 ++++++++++++++++--
 init/Kconfig               |  1 +
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 76e441b39a96..0dc67fad706c 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
 #include <linux/tracepoint-defs.h>
 #include <linux/static_call.h>
 
@@ -107,6 +108,7 @@ void for_each_tracepoint_in_module(struct module *mod,
 #ifdef CONFIG_TRACEPOINTS
 static inline void tracepoint_synchronize_unregister(void)
 {
+	synchronize_rcu_tasks_trace();
 	synchronize_rcu();
 }
 #else
@@ -196,6 +198,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 /*
  * it_func[0] is never NULL because there is at least one element in the array
  * when the array itself is non NULL.
+ *
+ * With @syscall=0, the tracepoint callback array dereference is
+ * protected by disabling preemption.
+ * With @syscall=1, the tracepoint callback array dereference is
+ * protected by Tasks Trace RCU, which allows probes to handle page
+ * faults.
  */
 #define __DO_TRACE(name, args, cond, syscall)				\
 	do {								\
@@ -204,11 +212,17 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		if (!(cond))						\
 			return;						\
 									\
-		preempt_disable_notrace();				\
+		if (syscall)						\
+			rcu_read_lock_trace();				\
+		else							\
+			preempt_disable_notrace();			\
 									\
 		__DO_TRACE_CALL(name, TP_ARGS(args));			\
 									\
-		preempt_enable_notrace();				\
+		if (syscall)						\
+			rcu_read_unlock_trace();			\
+		else							\
+			preempt_enable_notrace();			\
 	} while (0)
 
 /*
diff --git a/init/Kconfig b/init/Kconfig
index 530a382ee0fe..4ac3d1b48278 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1985,6 +1985,7 @@ config BINDGEN_VERSION_TEXT
 #
 config TRACEPOINTS
 	bool
+	select TASKS_TRACE_RCU
 
 source "kernel/Kconfig.kexec"
 
-- 
cgit v1.2.3


From 21291491e3f39d1dac6453e376f7619b21239b5e Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Wed, 9 Oct 2024 01:35:52 +0100
Subject: clk: Remove unused clk_hw_rate_is_protected

clk_hw_rate_is_protected() was added in 2017's commit
e55a839a7a1c ("clk: add clock protection mechanism to clk core")

but has been unused.

Remove it.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Link: https://lore.kernel.org/r/20241009003552.254675-1-linux@treblig.org
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk.c            | 6 ------
 include/linux/clk-provider.h | 1 -
 2 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index d02451f951cf..9b45fa005030 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -608,12 +608,6 @@ bool clk_hw_is_prepared(const struct clk_hw *hw)
 }
 EXPORT_SYMBOL_GPL(clk_hw_is_prepared);
 
-bool clk_hw_rate_is_protected(const struct clk_hw *hw)
-{
-	return clk_core_rate_is_protected(hw->core);
-}
-EXPORT_SYMBOL_GPL(clk_hw_rate_is_protected);
-
 bool clk_hw_is_enabled(const struct clk_hw *hw)
 {
 	return clk_core_is_enabled(hw->core);
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 7e43caabb54b..5cbd112d1187 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -1360,7 +1360,6 @@ unsigned long clk_hw_get_flags(const struct clk_hw *hw);
 	(clk_hw_get_flags((hw)) & CLK_SET_RATE_PARENT)
 
 bool clk_hw_is_prepared(const struct clk_hw *hw);
-bool clk_hw_rate_is_protected(const struct clk_hw *hw);
 bool clk_hw_is_enabled(const struct clk_hw *hw);
 bool __clk_is_enabled(struct clk *clk);
 struct clk *__clk_lookup(const char *name);
-- 
cgit v1.2.3


From 87173021f1583ee37f4801fcde354729da8db3dc Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 8 Oct 2024 10:29:03 -0700
Subject: ipv4: Link IPv4 address to per-netns hash table.

As a prep for per-netns RTNL conversion, we want to namespacify
the IPv4 address hash table and the GC work.

Let's allocate the per-netns IPv4 address hash table to
net->ipv4.inet_addr_lst and link IPv4 addresses into it.

The actual users will be converted later.

Note that the IPv6 address hash table is already namespacified.

Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20241008172906.1326-2-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/inetdevice.h |  1 +
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/devinet.c         | 22 +++++++++++++++++++---
 3 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index cb5280e6cc21..d0c2bf67a9b0 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -142,6 +142,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 
 struct in_ifaddr {
 	struct hlist_node	hash;
+	struct hlist_node	addr_lst;
 	struct in_ifaddr	__rcu *ifa_next;
 	struct in_device	*ifa_dev;
 	struct rcu_head		rcu_head;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 276f622f3516..29eba2eaaa26 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -270,5 +270,6 @@ struct netns_ipv4 {
 
 	atomic_t	rt_genid;
 	siphash_key_t	ip_id_key;
+	struct hlist_head	*inet_addr_lst;
 };
 #endif
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ab76744383cf..059807a627a6 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -134,11 +134,13 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
 
 	ASSERT_RTNL();
 	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+	hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]);
 }
 
 static void inet_hash_remove(struct in_ifaddr *ifa)
 {
 	ASSERT_RTNL();
+	hlist_del_init_rcu(&ifa->addr_lst);
 	hlist_del_init_rcu(&ifa->hash);
 }
 
@@ -228,6 +230,7 @@ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev)
 	ifa->ifa_dev = in_dev;
 
 	INIT_HLIST_NODE(&ifa->hash);
+	INIT_HLIST_NODE(&ifa->addr_lst);
 
 	return ifa;
 }
@@ -2663,14 +2666,21 @@ static struct ctl_table ctl_forward_entry[] = {
 
 static __net_init int devinet_init_net(struct net *net)
 {
-	int err;
-	struct ipv4_devconf *all, *dflt;
 #ifdef CONFIG_SYSCTL
-	struct ctl_table *tbl;
 	struct ctl_table_header *forw_hdr;
+	struct ctl_table *tbl;
 #endif
+	struct ipv4_devconf *all, *dflt;
+	int err;
+	int i;
 
 	err = -ENOMEM;
+	net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE,
+						sizeof(struct hlist_head),
+						GFP_KERNEL);
+	if (!net->ipv4.inet_addr_lst)
+		goto err_alloc_hash;
+
 	all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL);
 	if (!all)
 		goto err_alloc_all;
@@ -2731,6 +2741,9 @@ static __net_init int devinet_init_net(struct net *net)
 	net->ipv4.forw_hdr = forw_hdr;
 #endif
 
+	for (i = 0; i < IN4_ADDR_HSIZE; i++)
+		INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]);
+
 	net->ipv4.devconf_all = all;
 	net->ipv4.devconf_dflt = dflt;
 	return 0;
@@ -2748,6 +2761,8 @@ err_alloc_ctl:
 err_alloc_dflt:
 	kfree(all);
 err_alloc_all:
+	kfree(net->ipv4.inet_addr_lst);
+err_alloc_hash:
 	return err;
 }
 
@@ -2766,6 +2781,7 @@ static __net_exit void devinet_exit_net(struct net *net)
 #endif
 	kfree(net->ipv4.devconf_dflt);
 	kfree(net->ipv4.devconf_all);
+	kfree(net->ipv4.inet_addr_lst);
 }
 
 static __net_initdata struct pernet_operations devinet_ops = {
-- 
cgit v1.2.3


From 99ee348e6a41cf24b334a1bb7cde87239e8e2d95 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 8 Oct 2024 10:29:06 -0700
Subject: ipv4: Retire global IPv4 hash table inet_addr_lst.

No one uses inet_addr_lst anymore, so let's remove it.

Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20241008172906.1326-5-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/inetdevice.h |  1 -
 net/ipv4/devinet.c         | 10 ----------
 2 files changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index d0c2bf67a9b0..d9c690c8c80b 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -141,7 +141,6 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 							  ARP_EVICT_NOCARRIER)
 
 struct in_ifaddr {
-	struct hlist_node	hash;
 	struct hlist_node	addr_lst;
 	struct in_ifaddr	__rcu *ifa_next;
 	struct in_device	*ifa_dev;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ac245944e89e..7c156f85b7d2 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -119,8 +119,6 @@ struct inet_fill_args {
 #define IN4_ADDR_HSIZE_SHIFT	8
 #define IN4_ADDR_HSIZE		(1U << IN4_ADDR_HSIZE_SHIFT)
 
-static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
-
 static u32 inet_addr_hash(const struct net *net, __be32 addr)
 {
 	u32 val = (__force u32) addr ^ net_hash_mix(net);
@@ -133,7 +131,6 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
 	u32 hash = inet_addr_hash(net, ifa->ifa_local);
 
 	ASSERT_RTNL();
-	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
 	hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]);
 }
 
@@ -141,7 +138,6 @@ static void inet_hash_remove(struct in_ifaddr *ifa)
 {
 	ASSERT_RTNL();
 	hlist_del_init_rcu(&ifa->addr_lst);
-	hlist_del_init_rcu(&ifa->hash);
 }
 
 /**
@@ -228,7 +224,6 @@ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev)
 	in_dev_hold(in_dev);
 	ifa->ifa_dev = in_dev;
 
-	INIT_HLIST_NODE(&ifa->hash);
 	INIT_HLIST_NODE(&ifa->addr_lst);
 
 	return ifa;
@@ -2804,11 +2799,6 @@ static struct rtnl_af_ops inet_af_ops __read_mostly = {
 
 void __init devinet_init(void)
 {
-	int i;
-
-	for (i = 0; i < IN4_ADDR_HSIZE; i++)
-		INIT_HLIST_HEAD(&inet_addr_lst[i]);
-
 	register_pernet_subsys(&devinet_ops);
 	register_netdevice_notifier(&ip_netdev_notifier);
 
-- 
cgit v1.2.3


From ee3283c608dfa21251b0821d7bb198c7ae3189f6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 2 Oct 2024 17:27:16 -0400
Subject: timekeeping: Add interfaces for handling timestamps with a floor
 value

Multigrain timestamps allow the kernel to use fine-grained timestamps when
an inode's attributes is being actively observed via ->getattr().  With
this support, it's possible for a file to get a fine-grained timestamp, and
another modified after it to get a coarse-grained stamp that is earlier
than the fine-grained time.  If this happens then the files can appear to
have been modified in reverse order, which breaks VFS ordering guarantees
[1].

To prevent this, maintain a floor value for multigrain timestamps.
Whenever a fine-grained timestamp is handed out, record it, and when later
coarse-grained stamps are handed out, ensure they are not earlier than that
value. If the coarse-grained timestamp is earlier than the fine-grained
floor, return the floor value instead.

Add a static singleton atomic64_t into timekeeper.c that is used to keep
track of the latest fine-grained time ever handed out. This is tracked as a
monotonic ktime_t value to ensure that it isn't affected by clock
jumps. Because it is updated at different times than the rest of the
timekeeper object, the floor value is managed independently of the
timekeeper via a cmpxchg() operation, and sits on its own cacheline.

Add two new public interfaces:

- ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the
  coarse-grained clock and the floor time

- ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries
  to swap it into the floor. A timespec64 is filled with the result.

The floor value is global and updated via a single try_cmpxchg(). If
that fails then the operation raced with a concurrent update. Any
concurrent update must be later than the existing floor value, so any
racing tasks can accept any resulting floor value without retrying.

[1]: POSIX requires that files be stamped with realtime clock values, and
     makes no provision for dealing with backward clock jumps. If a backward
     realtime clock jump occurs, then files can appear to have been modified
     in reverse order.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Randy Dunlap <rdunlap@infradead.org> # documentation bits
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20241002-mgtime-v10-1-d1c4717f5284@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/timekeeping.h |   4 ++
 kernel/time/timekeeping.c   | 104 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index fc12a9ba2c88..7aa85246c183 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -45,6 +45,10 @@ extern void ktime_get_real_ts64(struct timespec64 *tv);
 extern void ktime_get_coarse_ts64(struct timespec64 *ts);
 extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
 
+/* Multigrain timestamp interfaces */
+extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
+extern void ktime_get_real_ts64_mg(struct timespec64 *ts);
+
 void getboottime64(struct timespec64 *ts);
 
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e6f409bf311..441792c907fa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -114,6 +114,23 @@ static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
 	.base[1] = FAST_TK_INIT,
 };
 
+/*
+ * Multigrain timestamps require tracking the latest fine-grained timestamp
+ * that has been issued, and never returning a coarse-grained timestamp that is
+ * earlier than that value.
+ *
+ * mg_floor represents the latest fine-grained time that has been handed out as
+ * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
+ * converted to a realtime clock value on an as-needed basis.
+ *
+ * Maintaining mg_floor ensures the multigrain interfaces never issue a
+ * timestamp earlier than one that has been previously issued.
+ *
+ * The exception to this rule is when there is a backward realtime clock jump. If
+ * such an event occurs, a timestamp can appear to be earlier than a previous one.
+ */
+static __cacheline_aligned_in_smp atomic64_t mg_floor;
+
 static inline void tk_normalize_xtime(struct timekeeper *tk)
 {
 	while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -2394,6 +2411,93 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
 }
 EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
 
+/**
+ * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
+ * @ts:		timespec64 to be filled
+ *
+ * Fetch the global mg_floor value, convert it to realtime and compare it
+ * to the current coarse-grained time. Fill @ts with whichever is
+ * latest. Note that this is a filesystem-specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	u64 floor = atomic64_read(&mg_floor);
+	ktime_t f_real, offset, coarse;
+	unsigned int seq;
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+		*ts = tk_xtime(tk);
+		offset = tk_core.timekeeper.offs_real;
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	coarse = timespec64_to_ktime(*ts);
+	f_real = ktime_add(floor, offset);
+	if (ktime_after(f_real, coarse))
+		*ts = ktime_to_timespec64(f_real);
+}
+
+/**
+ * ktime_get_real_ts64_mg - attempt to update floor value and return result
+ * @ts:		pointer to the timespec to be set
+ *
+ * Get a monotonic fine-grained time value and attempt to swap it into
+ * mg_floor. If that succeeds then accept the new floor value. If it fails
+ * then another task raced in during the interim time and updated the
+ * floor.  Since any update to the floor must be later than the previous
+ * floor, either outcome is acceptable.
+ *
+ * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
+ * and determining that the resulting coarse-grained timestamp did not effect
+ * a change in ctime. Any more recent floor value would effect a change to
+ * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
+ *
+ * @ts will be filled with the latest floor value, regardless of the outcome of
+ * the cmpxchg. Note that this is a filesystem specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_real_ts64_mg(struct timespec64 *ts)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	ktime_t old = atomic64_read(&mg_floor);
+	ktime_t offset, mono;
+	unsigned int seq;
+	u64 nsecs;
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+
+		ts->tv_sec = tk->xtime_sec;
+		mono = tk->tkr_mono.base;
+		nsecs = timekeeping_get_ns(&tk->tkr_mono);
+		offset = tk_core.timekeeper.offs_real;
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	mono = ktime_add_ns(mono, nsecs);
+
+	/*
+	 * Attempt to update the floor with the new time value. As any
+	 * update must be later then the existing floor, and would effect
+	 * a change to ctime from the perspective of the current task,
+	 * accept the resulting floor value regardless of the outcome of
+	 * the swap.
+	 */
+	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
+		ts->tv_nsec = 0;
+		timespec64_add_ns(ts, nsecs);
+	} else {
+		/*
+		 * Another task changed mg_floor since "old" was fetched.
+		 * "old" has been updated with the latest value of "mg_floor".
+		 * That value is newer than the previous floor value, which
+		 * is enough to effect a change to ctime. Accept it.
+		 */
+		*ts = ktime_to_timespec64(ktime_add(old, offset));
+	}
+}
+
 void ktime_get_coarse_ts64(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
-- 
cgit v1.2.3


From 2a15385742c689a271345dcbb4c28b9c568bc7ce Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 2 Oct 2024 17:27:17 -0400
Subject: timekeeping: Add percpu counter for tracking floor swap events

The mgtime_floor value is a global variable for tracking the latest
fine-grained timestamp handed out. Because it's a global, track the
number of times that a new floor value is assigned.

Add a new percpu counter to the timekeeping code to track the number of
floor swap events that have occurred. A later patch will add a debugfs
file to display this counter alongside other stats involving multigrain
timestamps.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Randy Dunlap <rdunlap@infradead.org> # documentation bits
Link: https://lore.kernel.org/all/20241002-mgtime-v10-2-d1c4717f5284@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/timekeeping.h        |  1 +
 kernel/time/timekeeping.c          |  1 +
 kernel/time/timekeeping_debug.c    | 13 +++++++++++++
 kernel/time/timekeeping_internal.h | 15 +++++++++++++++
 4 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 7aa85246c183..84a035e86ac8 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -48,6 +48,7 @@ extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
 /* Multigrain timestamp interfaces */
 extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
 extern void ktime_get_real_ts64_mg(struct timespec64 *ts);
+extern unsigned long timekeeping_get_mg_floor_swaps(void);
 
 void getboottime64(struct timespec64 *ts);
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 441792c907fa..962b2a31f015 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2487,6 +2487,7 @@ void ktime_get_real_ts64_mg(struct timespec64 *ts)
 	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
 		ts->tv_nsec = 0;
 		timespec64_add_ns(ts, nsecs);
+		timekeeping_inc_mg_floor_swaps();
 	} else {
 		/*
 		 * Another task changed mg_floor since "old" was fetched.
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index b73e8850e58d..badeb222eab9 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -17,6 +17,9 @@
 
 #define NUM_BINS 32
 
+/* Incremented every time mg_floor is updated */
+DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
 static unsigned int sleep_time_bin[NUM_BINS] = {0};
 
 static int tk_debug_sleep_time_show(struct seq_file *s, void *data)
@@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t)
 			   (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
 }
 
+unsigned long timekeeping_get_mg_floor_swaps(void)
+{
+	unsigned long sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		sum += data_race(per_cpu(timekeeping_mg_floor_swaps, cpu));
+
+	return sum;
+}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ca2787d1642..0bbae825bc02 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -10,9 +10,24 @@
  * timekeeping debug functions
  */
 #ifdef CONFIG_DEBUG_FS
+
+DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+	this_cpu_inc(timekeeping_mg_floor_swaps);
+}
+
 extern void tk_debug_account_sleep_time(const struct timespec64 *t);
+
 #else
+
 #define tk_debug_account_sleep_time(x)
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+}
+
 #endif
 
 #ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
-- 
cgit v1.2.3


From 7f2c86cba3c584c7227cddaabdf0ab54c8151e60 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 2 Oct 2024 17:27:20 -0400
Subject: fs: handle delegated timestamps in setattr_copy_mgtime

An update to the inode ctime typically requires the latest clock
value possible. The exception to this rule is when there is a nfsd write
delegation and the server is proxying timestamps from the client.

When nfsd gets a CB_GETATTR response, update the timestamp value in the
inode to the values that the client is tracking. The client doesn't send
a ctime value (since that's always determined by the exported
filesystem), but it can send a mtime value. In the case where it does,
update the ctime to a value commensurate with that instead of the
current time.

If ATTR_DELEG is set, then use ia_ctime value instead of setting the
timestamp to the current time.

With the addition of delegated timestamps, the server may receive a
request to update only the atime, which doesn't involve a ctime update.
Trust the ATTR_CTIME flag in the update and only update the ctime when
it's set.

Tested-by: Randy Dunlap <rdunlap@infradead.org> # documentation bits
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20241002-mgtime-v10-5-d1c4717f5284@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/attr.c          | 29 ++++++++++++++--------
 fs/inode.c         | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  2 ++
 3 files changed, 94 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/attr.c b/fs/attr.c
index 0309c2bd8afa..9caf63d20d03 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -286,16 +286,21 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr)
 	unsigned int ia_valid = attr->ia_valid;
 	struct timespec64 now;
 
-	/*
-	 * If the ctime isn't being updated then nothing else should be
-	 * either.
-	 */
-	if (!(ia_valid & ATTR_CTIME)) {
-		WARN_ON_ONCE(ia_valid & (ATTR_ATIME|ATTR_MTIME));
-		return;
+	if (ia_valid & ATTR_CTIME) {
+		/*
+		 * In the case of an update for a write delegation, we must respect
+		 * the value in ia_ctime and not use the current time.
+		 */
+		if (ia_valid & ATTR_DELEG)
+			now = inode_set_ctime_deleg(inode, attr->ia_ctime);
+		else
+			now = inode_set_ctime_current(inode);
+	} else {
+		/* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */
+		WARN_ON_ONCE(ia_valid & ATTR_MTIME);
+		now = current_time(inode);
 	}
 
-	now = inode_set_ctime_current(inode);
 	if (ia_valid & ATTR_ATIME_SET)
 		inode_set_atime_to_ts(inode, attr->ia_atime);
 	else if (ia_valid & ATTR_ATIME)
@@ -354,8 +359,12 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
 		inode_set_atime_to_ts(inode, attr->ia_atime);
 	if (ia_valid & ATTR_MTIME)
 		inode_set_mtime_to_ts(inode, attr->ia_mtime);
-	if (ia_valid & ATTR_CTIME)
-		inode_set_ctime_to_ts(inode, attr->ia_ctime);
+	if (ia_valid & ATTR_CTIME) {
+		if (ia_valid & ATTR_DELEG)
+			inode_set_ctime_deleg(inode, attr->ia_ctime);
+		else
+			inode_set_ctime_to_ts(inode, attr->ia_ctime);
+	}
 }
 EXPORT_SYMBOL(setattr_copy);
 
diff --git a/fs/inode.c b/fs/inode.c
index 53f56f6e1ff2..7d1ede60e549 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2717,6 +2717,79 @@ out:
 }
 EXPORT_SYMBOL(inode_set_ctime_current);
 
+/**
+ * inode_set_ctime_deleg - try to update the ctime on a delegated inode
+ * @inode: inode to update
+ * @update: timespec64 to set the ctime
+ *
+ * Attempt to atomically update the ctime on behalf of a delegation holder.
+ *
+ * The nfs server can call back the holder of a delegation to get updated
+ * inode attributes, including the mtime. When updating the mtime, update
+ * the ctime to a value at least equal to that.
+ *
+ * This can race with concurrent updates to the inode, in which
+ * case the update is skipped.
+ *
+ * Note that this works even when multigrain timestamps are not enabled,
+ * so it is used in either case.
+ */
+struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update)
+{
+	struct timespec64 now, cur_ts;
+	u32 cur, old;
+
+	/* pairs with try_cmpxchg below */
+	cur = smp_load_acquire(&inode->i_ctime_nsec);
+	cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
+	cur_ts.tv_sec = inode->i_ctime_sec;
+
+	/* If the update is older than the existing value, skip it. */
+	if (timespec64_compare(&update, &cur_ts) <= 0)
+		return cur_ts;
+
+	ktime_get_coarse_real_ts64_mg(&now);
+
+	/* Clamp the update to "now" if it's in the future */
+	if (timespec64_compare(&update, &now) > 0)
+		update = now;
+
+	update = timestamp_truncate(update, inode);
+
+	/* No need to update if the values are already the same */
+	if (timespec64_equal(&update, &cur_ts))
+		return cur_ts;
+
+	/*
+	 * Try to swap the nsec value into place. If it fails, that means
+	 * it raced with an update due to a write or similar activity. That
+	 * stamp takes precedence, so just skip the update.
+	 */
+retry:
+	old = cur;
+	if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) {
+		inode->i_ctime_sec = update.tv_sec;
+		mgtime_counter_inc(mg_ctime_swaps);
+		return update;
+	}
+
+	/*
+	 * Was the change due to another task marking the old ctime QUERIED?
+	 *
+	 * If so, then retry the swap. This can only happen once since
+	 * the only way to clear I_CTIME_QUERIED is to stamp the inode
+	 * with a new ctime.
+	 */
+	if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED)))
+		goto retry;
+
+	/* Otherwise, it was a new timestamp. */
+	cur_ts.tv_sec = inode->i_ctime_sec;
+	cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
+	return cur_ts;
+}
+EXPORT_SYMBOL(inode_set_ctime_deleg);
+
 /**
  * in_group_or_capable - check whether caller is CAP_FSETID privileged
  * @idmap:	idmap of the mount @inode was found from
diff --git a/include/linux/fs.h b/include/linux/fs.h
index eff688e75f2f..ea7ed437d2b1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1544,6 +1544,8 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
 
 struct timespec64 current_time(struct inode *inode);
 struct timespec64 inode_set_ctime_current(struct inode *inode);
+struct timespec64 inode_set_ctime_deleg(struct inode *inode,
+					struct timespec64 update);
 
 static inline time64_t inode_get_atime_sec(const struct inode *inode)
 {
-- 
cgit v1.2.3


From 016f426a14f09faa8bdb68b063c2947edf3108a1 Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Tue, 8 Oct 2024 21:32:09 +0300
Subject: net/mlx5: qos: Flesh out element_attributes in mlx5_ifc.h

This is used for multiple purposes, depending on the scheduling element
created. There are a few helper struct defined a long time ago, but they
are not easy to find in the file and they are about to get new members.
This commit cleans up this area a bit by:
- moving the helper structs closer to where they are relevant.
- defining a helper union to include all of them to help
  discoverability.
- making use of it everywhere element_attributes is used.
- using a consistent 'attr' name.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 18 +++---
 include/linux/mlx5/mlx5_ifc.h                     | 67 +++++++++++++----------
 2 files changed, 45 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 02a3563f51ad..7154eeff4fd4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -339,7 +339,7 @@ static int esw_qos_vport_create_sched_element(struct mlx5_eswitch *esw,
 	struct mlx5_esw_rate_group *group = vport->qos.group;
 	struct mlx5_core_dev *dev = esw->dev;
 	u32 parent_tsar_ix;
-	void *vport_elem;
+	void *attr;
 	int err;
 
 	if (!esw_qos_element_type_supported(dev, SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT))
@@ -348,8 +348,8 @@ static int esw_qos_vport_create_sched_element(struct mlx5_eswitch *esw,
 	parent_tsar_ix = group ? group->tsar_ix : esw->qos.root_tsar_ix;
 	MLX5_SET(scheduling_context, sched_ctx, element_type,
 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
-	vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes);
-	MLX5_SET(vport_element, vport_elem, vport_number, vport->vport);
+	attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes);
+	MLX5_SET(vport_element, attr, vport_number, vport->vport);
 	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_tsar_ix);
 	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_rate);
 	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
@@ -443,8 +443,8 @@ __esw_qos_create_rate_group(struct mlx5_eswitch *esw, struct netlink_ext_ack *ex
 {
 	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
 	struct mlx5_esw_rate_group *group;
-	__be32 *attr;
 	u32 divider;
+	void *attr;
 	int err;
 
 	group = kzalloc(sizeof(*group), GFP_KERNEL);
@@ -453,12 +453,10 @@ __esw_qos_create_rate_group(struct mlx5_eswitch *esw, struct netlink_ext_ack *ex
 
 	MLX5_SET(scheduling_context, tsar_ctx, element_type,
 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);
-
-	attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes);
-	*attr = cpu_to_be32(TSAR_ELEMENT_TSAR_TYPE_DWRR << 16);
-
 	MLX5_SET(scheduling_context, tsar_ctx, parent_element_id,
 		 esw->qos.root_tsar_ix);
+	attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes);
+	MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_DWRR);
 	err = mlx5_create_scheduling_element_cmd(esw->dev,
 						 SCHEDULING_HIERARCHY_E_SWITCH,
 						 tsar_ctx,
@@ -559,7 +557,7 @@ static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta
 {
 	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
 	struct mlx5_core_dev *dev = esw->dev;
-	__be32 *attr;
+	void *attr;
 	int err;
 
 	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling))
@@ -573,7 +571,7 @@ static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta
 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);
 
 	attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes);
-	*attr = cpu_to_be32(TSAR_ELEMENT_TSAR_TYPE_DWRR << 16);
+	MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_DWRR);
 
 	err = mlx5_create_scheduling_element_cmd(dev,
 						 SCHEDULING_HIERARCHY_E_SWITCH,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 96d369112bfa..c79ba6197673 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4105,11 +4105,47 @@ enum {
 	ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP	= 1 << 4,
 };
 
+enum {
+	TSAR_ELEMENT_TSAR_TYPE_DWRR = 0x0,
+	TSAR_ELEMENT_TSAR_TYPE_ROUND_ROBIN = 0x1,
+	TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2,
+};
+
+enum {
+	TSAR_TYPE_CAP_MASK_DWRR		= 1 << 0,
+	TSAR_TYPE_CAP_MASK_ROUND_ROBIN	= 1 << 1,
+	TSAR_TYPE_CAP_MASK_ETS		= 1 << 2,
+};
+
+struct mlx5_ifc_tsar_element_bits {
+	u8         reserved_at_0[0x8];
+	u8         tsar_type[0x8];
+	u8         reserved_at_10[0x10];
+};
+
+struct mlx5_ifc_vport_element_bits {
+	u8         reserved_at_0[0x10];
+	u8         vport_number[0x10];
+};
+
+struct mlx5_ifc_vport_tc_element_bits {
+	u8         traffic_class[0x4];
+	u8         reserved_at_4[0xc];
+	u8         vport_number[0x10];
+};
+
+union mlx5_ifc_element_attributes_bits {
+	struct mlx5_ifc_tsar_element_bits tsar;
+	struct mlx5_ifc_vport_element_bits vport;
+	struct mlx5_ifc_vport_tc_element_bits vport_tc;
+	u8 reserved_at_0[0x20];
+};
+
 struct mlx5_ifc_scheduling_context_bits {
 	u8         element_type[0x8];
 	u8         reserved_at_8[0x18];
 
-	u8         element_attributes[0x20];
+	union mlx5_ifc_element_attributes_bits element_attributes;
 
 	u8         parent_element_id[0x20];
 
@@ -4798,35 +4834,6 @@ struct mlx5_ifc_register_loopback_control_bits {
 	u8         reserved_at_20[0x60];
 };
 
-struct mlx5_ifc_vport_tc_element_bits {
-	u8         traffic_class[0x4];
-	u8         reserved_at_4[0xc];
-	u8         vport_number[0x10];
-};
-
-struct mlx5_ifc_vport_element_bits {
-	u8         reserved_at_0[0x10];
-	u8         vport_number[0x10];
-};
-
-enum {
-	TSAR_ELEMENT_TSAR_TYPE_DWRR = 0x0,
-	TSAR_ELEMENT_TSAR_TYPE_ROUND_ROBIN = 0x1,
-	TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2,
-};
-
-enum {
-	TSAR_TYPE_CAP_MASK_DWRR		= 1 << 0,
-	TSAR_TYPE_CAP_MASK_ROUND_ROBIN	= 1 << 1,
-	TSAR_TYPE_CAP_MASK_ETS		= 1 << 2,
-};
-
-struct mlx5_ifc_tsar_element_bits {
-	u8         reserved_at_0[0x8];
-	u8         tsar_type[0x8];
-	u8         reserved_at_10[0x10];
-};
-
 enum {
 	MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_SUCCESS = 0x0,
 	MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL = 0x1,
-- 
cgit v1.2.3


From fceffbfe57af7d9941d08e1a995cccf558d08451 Mon Sep 17 00:00:00 2001
From: Patrick Rudolph <patrick.rudolph@9elements.com>
Date: Wed, 2 Oct 2024 14:54:58 +0200
Subject: regulator: max5970: Drop unused structs

After splitting the max5970 into a MFD device clean the remaining
code and drop unused structs.

The struct max5970_data and enum max5970_chip_type aren't used.

Signed-off-by: Patrick Rudolph <patrick.rudolph@9elements.com>
Acked-by: Lee Jones <lee@kernel.org>
Link: https://patch.msgid.link/20241002125500.78278-1-patrick.rudolph@9elements.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/max5970-regulator.c | 21 ++++-----------------
 include/linux/mfd/max5970.h           | 12 ------------
 2 files changed, 4 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/max5970-regulator.c b/drivers/regulator/max5970-regulator.c
index 4a568b1b0107..fc14177ddf5d 100644
--- a/drivers/regulator/max5970-regulator.c
+++ b/drivers/regulator/max5970-regulator.c
@@ -485,7 +485,7 @@ static int max597x_irq_handler(int irq, struct regulator_irq_data *rid,
 }
 
 static int max597x_adc_range(struct regmap *regmap, const int ch,
-			     u32 *irng, u32 *mon_rng)
+			     int *irng, int *mon_rng)
 {
 	unsigned int reg;
 	int ret;
@@ -552,7 +552,6 @@ static int max597x_setup_irq(struct device *dev,
 
 static int max597x_regulator_probe(struct platform_device *pdev)
 {
-	struct max5970_data *max597x;
 	struct regmap *regmap = dev_get_regmap(pdev->dev.parent, NULL);
 	struct max5970_regulator *data;
 	struct i2c_client *i2c = to_i2c_client(pdev->dev.parent);
@@ -566,26 +565,18 @@ static int max597x_regulator_probe(struct platform_device *pdev)
 	if (!regmap)
 		return -EPROBE_DEFER;
 
-	max597x = devm_kzalloc(&i2c->dev, sizeof(struct max5970_data), GFP_KERNEL);
-	if (!max597x)
-		return -ENOMEM;
-
 	rdevs = devm_kcalloc(&i2c->dev, MAX5970_NUM_SWITCHES, sizeof(struct regulator_dev *),
 			     GFP_KERNEL);
 	if (!rdevs)
 		return -ENOMEM;
 
-	i2c_set_clientdata(i2c, max597x);
-
 	if (of_device_is_compatible(i2c->dev.of_node, "maxim,max5978"))
-		max597x->num_switches = MAX5978_NUM_SWITCHES;
+		num_switches = MAX5978_NUM_SWITCHES;
 	else if (of_device_is_compatible(i2c->dev.of_node, "maxim,max5970"))
-		max597x->num_switches = MAX5970_NUM_SWITCHES;
+		num_switches = MAX5970_NUM_SWITCHES;
 	else
 		return -ENODEV;
 
-	num_switches = max597x->num_switches;
-
 	for (i = 0; i < num_switches; i++) {
 		data =
 		    devm_kzalloc(&i2c->dev, sizeof(struct max5970_regulator),
@@ -596,13 +587,10 @@ static int max597x_regulator_probe(struct platform_device *pdev)
 		data->num_switches = num_switches;
 		data->regmap = regmap;
 
-		ret = max597x_adc_range(regmap, i, &max597x->irng[i], &max597x->mon_rng[i]);
+		ret = max597x_adc_range(regmap, i, &data->irng, &data->mon_rng);
 		if (ret < 0)
 			return ret;
 
-		data->irng = max597x->irng[i];
-		data->mon_rng = max597x->mon_rng[i];
-
 		config.dev = &i2c->dev;
 		config.driver_data = (void *)data;
 		config.regmap = data->regmap;
@@ -614,7 +602,6 @@ static int max597x_regulator_probe(struct platform_device *pdev)
 			return PTR_ERR(rdev);
 		}
 		rdevs[i] = rdev;
-		max597x->shunt_micro_ohms[i] = data->shunt_micro_ohms;
 	}
 
 	if (IS_REACHABLE(CONFIG_HWMON)) {
diff --git a/include/linux/mfd/max5970.h b/include/linux/mfd/max5970.h
index 762a7d40c843..fc50e89edfaa 100644
--- a/include/linux/mfd/max5970.h
+++ b/include/linux/mfd/max5970.h
@@ -16,18 +16,6 @@
 #define MAX5978_NUM_SWITCHES 1
 #define MAX5970_NUM_LEDS     4
 
-struct max5970_data {
-	int num_switches;
-	u32 irng[MAX5970_NUM_SWITCHES];
-	u32 mon_rng[MAX5970_NUM_SWITCHES];
-	u32 shunt_micro_ohms[MAX5970_NUM_SWITCHES];
-};
-
-enum max5970_chip_type {
-	TYPE_MAX5978 = 1,
-	TYPE_MAX5970,
-};
-
 #define MAX5970_REG_CURRENT_L(ch)		(0x01 + (ch) * 4)
 #define MAX5970_REG_CURRENT_H(ch)		(0x00 + (ch) * 4)
 #define MAX5970_REG_VOLTAGE_L(ch)		(0x03 + (ch) * 4)
-- 
cgit v1.2.3


From 0e8158b4a82eb5bfb69df17510d210b073896f00 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 2 Oct 2024 14:22:24 +0200
Subject: OPP: Rework _set_required_devs() to manage a single device per call

At this point there are no consumer drivers that makes use of
_set_required_devs(), hence it should be straightforward to rework the code
to enable it to better integrate with the PM domain attach procedure.

During attach, one device at the time is being hooked up to its
corresponding PM domain. Therefore, let's update the _set_required_devs()
to align to this behaviour, allowing callers to fill out one required_dev
per call. Subsequent changes starts making use of this.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lore.kernel.org/r/20241002122232.194245-4-ulf.hansson@linaro.org
---
 drivers/opp/core.c     | 84 +++++++++++++++++++++++++++++++++-----------------
 drivers/opp/opp.h      |  4 ++-
 include/linux/pm_opp.h | 10 +++---
 3 files changed, 65 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 3aa18737470f..40e5f29a2670 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2473,47 +2473,73 @@ err:
 
 }
 
-static int _opp_set_required_devs(struct opp_table *opp_table,
-				  struct device *dev,
-				  struct device **required_devs)
+static int _opp_set_required_dev(struct opp_table *opp_table,
+				 struct device *dev,
+				 struct device *required_dev,
+				 unsigned int index)
 {
-	int i;
+	struct opp_table *required_table, *pd_table;
+	struct device *gdev;
 
-	if (!opp_table->required_devs) {
+	/* Genpd core takes care of propagation to parent genpd */
+	if (opp_table->is_genpd) {
+		dev_err(dev, "%s: Operation not supported for genpds\n", __func__);
+		return -EOPNOTSUPP;
+	}
+
+	if (index >= opp_table->required_opp_count) {
 		dev_err(dev, "Required OPPs not available, can't set required devs\n");
 		return -EINVAL;
 	}
 
-	/* Another device that shares the OPP table has set the required devs ? */
-	if (opp_table->required_devs[0])
-		return 0;
+	required_table = opp_table->required_opp_tables[index];
+	if (IS_ERR(required_table)) {
+		dev_err(dev, "Missing OPP table, unable to set the required devs\n");
+		return -ENODEV;
+	}
 
-	for (i = 0; i < opp_table->required_opp_count; i++) {
-		/* Genpd core takes care of propagation to parent genpd */
-		if (required_devs[i] && opp_table->is_genpd &&
-		    opp_table->required_opp_tables[i]->is_genpd) {
-			dev_err(dev, "%s: Operation not supported for genpds\n", __func__);
-			return -EOPNOTSUPP;
+	/*
+	 * The required_opp_tables parsing is not perfect, as the OPP core does
+	 * the parsing solely based on the DT node pointers. The core sets the
+	 * required_opp_tables entry to the first OPP table in the "opp_tables"
+	 * list, that matches with the node pointer.
+	 *
+	 * If the target DT OPP table is used by multiple devices and they all
+	 * create separate instances of 'struct opp_table' from it, then it is
+	 * possible that the required_opp_tables entry may be set to the
+	 * incorrect sibling device.
+	 *
+	 * Cross check it again and fix if required.
+	 */
+	gdev = dev_to_genpd_dev(required_dev);
+	if (IS_ERR(gdev))
+		return PTR_ERR(gdev);
+
+	pd_table = _find_opp_table(gdev);
+	if (!IS_ERR(pd_table)) {
+		if (pd_table != required_table) {
+			dev_pm_opp_put_opp_table(required_table);
+			opp_table->required_opp_tables[index] = pd_table;
+		} else {
+			dev_pm_opp_put_opp_table(pd_table);
 		}
-
-		opp_table->required_devs[i] = required_devs[i];
 	}
 
+	opp_table->required_devs[index] = required_dev;
 	return 0;
 }
 
-static void _opp_put_required_devs(struct opp_table *opp_table)
+static void _opp_put_required_dev(struct opp_table *opp_table,
+				  unsigned int index)
 {
-	int i;
-
-	for (i = 0; i < opp_table->required_opp_count; i++)
-		opp_table->required_devs[i] = NULL;
+	opp_table->required_devs[index] = NULL;
 }
 
 static void _opp_clear_config(struct opp_config_data *data)
 {
-	if (data->flags & OPP_CONFIG_REQUIRED_DEVS)
-		_opp_put_required_devs(data->opp_table);
+	if (data->flags & OPP_CONFIG_REQUIRED_DEV)
+		_opp_put_required_dev(data->opp_table,
+				      data->required_dev_index);
 	else if (data->flags & OPP_CONFIG_GENPD)
 		_opp_detach_genpd(data->opp_table);
 
@@ -2630,7 +2656,7 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Attach genpds */
 	if (config->genpd_names) {
-		if (config->required_devs) {
+		if (config->required_dev) {
 			ret = -EINVAL;
 			goto err;
 		}
@@ -2641,13 +2667,15 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 			goto err;
 
 		data->flags |= OPP_CONFIG_GENPD;
-	} else if (config->required_devs) {
-		ret = _opp_set_required_devs(opp_table, dev,
-					     config->required_devs);
+	} else if (config->required_dev) {
+		ret = _opp_set_required_dev(opp_table, dev,
+					    config->required_dev,
+					    config->required_dev_index);
 		if (ret)
 			goto err;
 
-		data->flags |= OPP_CONFIG_REQUIRED_DEVS;
+		data->required_dev_index = config->required_dev_index;
+		data->flags |= OPP_CONFIG_REQUIRED_DEV;
 	}
 
 	ret = xa_alloc(&opp_configs, &id, data, XA_LIMIT(1, INT_MAX),
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index cff1fabd1ae3..3eed6f145e90 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -35,12 +35,13 @@ extern struct list_head opp_tables;
 #define OPP_CONFIG_PROP_NAME		BIT(3)
 #define OPP_CONFIG_SUPPORTED_HW		BIT(4)
 #define OPP_CONFIG_GENPD		BIT(5)
-#define OPP_CONFIG_REQUIRED_DEVS	BIT(6)
+#define OPP_CONFIG_REQUIRED_DEV		BIT(6)
 
 /**
  * struct opp_config_data - data for set config operations
  * @opp_table: OPP table
  * @flags: OPP config flags
+ * @required_dev_index: The position in the array of required_devs
  *
  * This structure stores the OPP config information for each OPP table
  * configuration by the callers.
@@ -48,6 +49,7 @@ extern struct list_head opp_tables;
 struct opp_config_data {
 	struct opp_table *opp_table;
 	unsigned int flags;
+	unsigned int required_dev_index;
 };
 
 /**
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 6424692c30b7..bc74bc69107a 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -63,10 +63,11 @@ typedef int (*config_clks_t)(struct device *dev, struct opp_table *opp_table,
  * @supported_hw_count: Number of elements in the array.
  * @regulator_names: Array of pointers to the names of the regulator, NULL terminated.
  * @genpd_names: Null terminated array of pointers containing names of genpd to
- *		attach. Mutually exclusive with required_devs.
+ *		attach. Mutually exclusive with required_dev.
  * @virt_devs: Pointer to return the array of genpd virtual devices. Mutually
- *		exclusive with required_devs.
- * @required_devs: Required OPP devices. Mutually exclusive with genpd_names/virt_devs.
+ *		exclusive with required_dev.
+ * @required_dev: Required OPP device. Mutually exclusive with genpd_names/virt_devs.
+ * @required_dev_index: The index of the required OPP for the @required_dev.
  *
  * This structure contains platform specific OPP configurations for the device.
  */
@@ -81,7 +82,8 @@ struct dev_pm_opp_config {
 	const char * const *regulator_names;
 	const char * const *genpd_names;
 	struct device ***virt_devs;
-	struct device **required_devs;
+	struct device *required_dev;
+	unsigned int required_dev_index;
 };
 
 #define OPP_LEVEL_UNSET			U32_MAX
-- 
cgit v1.2.3


From 98d277a79126a2df8a2617215846d01f823cfffa Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 2 Oct 2024 14:22:25 +0200
Subject: PM: domains: Support required OPPs in dev_pm_domain_attach_list()

In the multiple PM domain case we need platform code to specify the index
of the corresponding required OPP in DT for a device, which is what
*_opp_attach_genpd() is there to help us with.

However, attaching a device to its PM domains is in general better done
with dev_pm_domain_attach_list(). To avoid having two different ways to
manage this and to prepare for the removal of *_opp_attach_genpd(), let's
extend dev_pm_domain_attach|detach_list() to manage the required OPPs too.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lore.kernel.org/r/20241002122232.194245-5-ulf.hansson@linaro.org
---
 drivers/base/power/common.c | 21 ++++++++++++++++++++-
 include/linux/pm_domain.h   |  8 ++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c
index cca2fd0a1aed..781968a128ff 100644
--- a/drivers/base/power/common.c
+++ b/drivers/base/power/common.c
@@ -11,6 +11,7 @@
 #include <linux/pm_clock.h>
 #include <linux/acpi.h>
 #include <linux/pm_domain.h>
+#include <linux/pm_opp.h>
 
 #include "power.h"
 
@@ -222,13 +223,15 @@ int dev_pm_domain_attach_list(struct device *dev,
 	if (!pds)
 		return -ENOMEM;
 
-	size = sizeof(*pds->pd_devs) + sizeof(*pds->pd_links);
+	size = sizeof(*pds->pd_devs) + sizeof(*pds->pd_links) +
+	       sizeof(*pds->opp_tokens);
 	pds->pd_devs = kcalloc(num_pds, size, GFP_KERNEL);
 	if (!pds->pd_devs) {
 		ret = -ENOMEM;
 		goto free_pds;
 	}
 	pds->pd_links = (void *)(pds->pd_devs + num_pds);
+	pds->opp_tokens = (void *)(pds->pd_links + num_pds);
 
 	if (link_flags && pd_flags & PD_FLAG_DEV_LINK_ON)
 		link_flags |= DL_FLAG_RPM_ACTIVE;
@@ -244,6 +247,19 @@ int dev_pm_domain_attach_list(struct device *dev,
 			goto err_attach;
 		}
 
+		if (pd_flags & PD_FLAG_REQUIRED_OPP) {
+			struct dev_pm_opp_config config = {
+				.required_dev = pd_dev,
+				.required_dev_index = i,
+			};
+
+			ret = dev_pm_opp_set_config(dev, &config);
+			if (ret < 0)
+				goto err_link;
+
+			pds->opp_tokens[i] = ret;
+		}
+
 		if (link_flags) {
 			struct device_link *link;
 
@@ -264,9 +280,11 @@ int dev_pm_domain_attach_list(struct device *dev,
 	return num_pds;
 
 err_link:
+	dev_pm_opp_clear_config(pds->opp_tokens[i]);
 	dev_pm_domain_detach(pd_dev, true);
 err_attach:
 	while (--i >= 0) {
+		dev_pm_opp_clear_config(pds->opp_tokens[i]);
 		if (pds->pd_links[i])
 			device_link_del(pds->pd_links[i]);
 		dev_pm_domain_detach(pds->pd_devs[i], true);
@@ -361,6 +379,7 @@ void dev_pm_domain_detach_list(struct dev_pm_domain_list *list)
 		return;
 
 	for (i = 0; i < list->num_pds; i++) {
+		dev_pm_opp_clear_config(list->opp_tokens[i]);
 		if (list->pd_links[i])
 			device_link_del(list->pd_links[i]);
 		dev_pm_domain_detach(list->pd_devs[i], true);
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index b637ec14025f..92f9d56f623d 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -30,9 +30,16 @@
  *				supplier and its PM domain when creating the
  *				device-links.
  *
+ * PD_FLAG_REQUIRED_OPP:	Assign required_devs for the required OPPs. The
+ *				index of the required OPP must correspond to the
+ *				index in the array of the pd_names. If pd_names
+ *				isn't specified, the index just follows the
+ *				index for the attached PM domain.
+ *
  */
 #define PD_FLAG_NO_DEV_LINK		BIT(0)
 #define PD_FLAG_DEV_LINK_ON		BIT(1)
+#define PD_FLAG_REQUIRED_OPP		BIT(2)
 
 struct dev_pm_domain_attach_data {
 	const char * const *pd_names;
@@ -43,6 +50,7 @@ struct dev_pm_domain_attach_data {
 struct dev_pm_domain_list {
 	struct device **pd_devs;
 	struct device_link **pd_links;
+	u32 *opp_tokens;
 	u32 num_pds;
 };
 
-- 
cgit v1.2.3


From e130ca9d4873f8d01e3e7b8137e0c14196a3cfb4 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 2 Oct 2024 14:22:27 +0200
Subject: pmdomain: core: Set the required dev for a required OPP during genpd
 attach

In the single PM domain case there is no need for platform code to specify
the index of the corresponding required OPP in DT, as the index must be
zero. This allows us to assign a required dev for the required OPP from
genpd, while attaching a device to its PM domain.

In this way, we can remove some of the genpd specific code in the OPP core
for the single PM domain case. Although, this cleanup is made from a
subsequent change.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lore.kernel.org/r/20241002122232.194245-7-ulf.hansson@linaro.org
---
 drivers/pmdomain/core.c   | 42 +++++++++++++++++++++++++++++++++++++++---
 include/linux/pm_domain.h |  1 +
 2 files changed, 40 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index 259abd338f4c..76490f0bf1e2 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -1722,6 +1722,7 @@ static void genpd_free_dev_data(struct device *dev,
 
 	spin_unlock_irq(&dev->power.lock);
 
+	dev_pm_opp_clear_config(gpd_data->opp_token);
 	kfree(gpd_data->td);
 	kfree(gpd_data);
 	dev_pm_put_subsys_data(dev);
@@ -2884,6 +2885,29 @@ static void genpd_dev_pm_sync(struct device *dev)
 	genpd_queue_power_off_work(pd);
 }
 
+static int genpd_set_required_opp_dev(struct device *dev,
+				      struct device *base_dev)
+{
+	struct dev_pm_opp_config config = {
+		.required_dev = dev,
+	};
+	int ret;
+
+	/* Limit support to non-providers for now. */
+	if (of_property_present(base_dev->of_node, "#power-domain-cells"))
+		return 0;
+
+	if (!dev_pm_opp_of_has_required_opp(base_dev))
+		return 0;
+
+	ret = dev_pm_opp_set_config(base_dev, &config);
+	if (ret < 0)
+		return ret;
+
+	dev_gpd_data(dev)->opp_token = ret;
+	return 0;
+}
+
 static int genpd_set_required_opp(struct device *dev, unsigned int index)
 {
 	int ret, pstate;
@@ -2908,7 +2932,8 @@ err:
 }
 
 static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev,
-				 unsigned int index, bool power_on)
+				 unsigned int index, unsigned int num_domains,
+				 bool power_on)
 {
 	struct of_phandle_args pd_args;
 	struct generic_pm_domain *pd;
@@ -2940,6 +2965,17 @@ static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev,
 	dev->pm_domain->detach = genpd_dev_pm_detach;
 	dev->pm_domain->sync = genpd_dev_pm_sync;
 
+	/*
+	 * For a single PM domain the index of the required OPP must be zero, so
+	 * let's try to assign a required dev in that case. In the multiple PM
+	 * domains case, we need platform code to specify the index.
+	 */
+	if (num_domains == 1) {
+		ret = genpd_set_required_opp_dev(dev, base_dev);
+		if (ret)
+			goto err;
+	}
+
 	ret = genpd_set_required_opp(dev, index);
 	if (ret)
 		goto err;
@@ -2994,7 +3030,7 @@ int genpd_dev_pm_attach(struct device *dev)
 				       "#power-domain-cells") != 1)
 		return 0;
 
-	return __genpd_dev_pm_attach(dev, dev, 0, true);
+	return __genpd_dev_pm_attach(dev, dev, 0, 1, true);
 }
 EXPORT_SYMBOL_GPL(genpd_dev_pm_attach);
 
@@ -3047,7 +3083,7 @@ struct device *genpd_dev_pm_attach_by_id(struct device *dev,
 	}
 
 	/* Try to attach the device to the PM domain at the specified index. */
-	ret = __genpd_dev_pm_attach(virt_dev, dev, index, false);
+	ret = __genpd_dev_pm_attach(virt_dev, dev, index, num_domains, false);
 	if (ret < 1) {
 		device_unregister(virt_dev);
 		return ret ? ERR_PTR(ret) : NULL;
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 92f9d56f623d..76775ab38898 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -252,6 +252,7 @@ struct generic_pm_domain_data {
 	unsigned int performance_state;
 	unsigned int default_pstate;
 	unsigned int rpm_pstate;
+	unsigned int opp_token;
 	bool hw_mode;
 	void *data;
 };
-- 
cgit v1.2.3


From d6caca30a548764f8cfd78393ea09fefdf285212 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 2 Oct 2024 14:22:32 +0200
Subject: OPP: Drop redundant *_opp_attach|detach_genpd()

All users of *_opp_attach|detach_genpd(), have been converted to use
dev|devm_pm_domain_attach|detach_list(), hence let's drop it along with its
corresponding exported functions.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20241002122232.194245-12-ulf.hansson@linaro.org
---
 drivers/opp/core.c     | 131 +------------------------------------------------
 drivers/opp/opp.h      |   3 +-
 include/linux/pm_opp.h |  38 +-------------
 3 files changed, 3 insertions(+), 169 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 40e5f29a2670..0311b18319a4 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2360,119 +2360,6 @@ static void _opp_put_config_regulators_helper(struct opp_table *opp_table)
 		opp_table->config_regulators = NULL;
 }
 
-static void _opp_detach_genpd(struct opp_table *opp_table)
-{
-	int index;
-
-	for (index = 0; index < opp_table->required_opp_count; index++) {
-		if (!opp_table->required_devs[index])
-			continue;
-
-		dev_pm_domain_detach(opp_table->required_devs[index], false);
-		opp_table->required_devs[index] = NULL;
-	}
-}
-
-/*
- * Multiple generic power domains for a device are supported with the help of
- * virtual genpd devices, which are created for each consumer device - genpd
- * pair. These are the device structures which are attached to the power domain
- * and are required by the OPP core to set the performance state of the genpd.
- * The same API also works for the case where single genpd is available and so
- * we don't need to support that separately.
- *
- * This helper will normally be called by the consumer driver of the device
- * "dev", as only that has details of the genpd names.
- *
- * This helper needs to be called once with a list of all genpd to attach.
- * Otherwise the original device structure will be used instead by the OPP core.
- *
- * The order of entries in the names array must match the order in which
- * "required-opps" are added in DT.
- */
-static int _opp_attach_genpd(struct opp_table *opp_table, struct device *dev,
-			const char * const *names, struct device ***virt_devs)
-{
-	struct device *virt_dev, *gdev;
-	struct opp_table *genpd_table;
-	int index = 0, ret = -EINVAL;
-	const char * const *name = names;
-
-	if (!opp_table->required_devs) {
-		dev_err(dev, "Required OPPs not available, can't attach genpd\n");
-		return -EINVAL;
-	}
-
-	/* Genpd core takes care of propagation to parent genpd */
-	if (opp_table->is_genpd) {
-		dev_err(dev, "%s: Operation not supported for genpds\n", __func__);
-		return -EOPNOTSUPP;
-	}
-
-	/* Checking only the first one is enough ? */
-	if (opp_table->required_devs[0])
-		return 0;
-
-	while (*name) {
-		if (index >= opp_table->required_opp_count) {
-			dev_err(dev, "Index can't be greater than required-opp-count - 1, %s (%d : %d)\n",
-				*name, opp_table->required_opp_count, index);
-			goto err;
-		}
-
-		virt_dev = dev_pm_domain_attach_by_name(dev, *name);
-		if (IS_ERR_OR_NULL(virt_dev)) {
-			ret = virt_dev ? PTR_ERR(virt_dev) : -ENODEV;
-			dev_err(dev, "Couldn't attach to pm_domain: %d\n", ret);
-			goto err;
-		}
-
-		/*
-		 * The required_opp_tables parsing is not perfect, as the OPP
-		 * core does the parsing solely based on the DT node pointers.
-		 * The core sets the required_opp_tables entry to the first OPP
-		 * table in the "opp_tables" list, that matches with the node
-		 * pointer.
-		 *
-		 * If the target DT OPP table is used by multiple devices and
-		 * they all create separate instances of 'struct opp_table' from
-		 * it, then it is possible that the required_opp_tables entry
-		 * may be set to the incorrect sibling device.
-		 *
-		 * Cross check it again and fix if required.
-		 */
-		gdev = dev_to_genpd_dev(virt_dev);
-		if (IS_ERR(gdev)) {
-			ret = PTR_ERR(gdev);
-			goto err;
-		}
-
-		genpd_table = _find_opp_table(gdev);
-		if (!IS_ERR(genpd_table)) {
-			if (genpd_table != opp_table->required_opp_tables[index]) {
-				dev_pm_opp_put_opp_table(opp_table->required_opp_tables[index]);
-				opp_table->required_opp_tables[index] = genpd_table;
-			} else {
-				dev_pm_opp_put_opp_table(genpd_table);
-			}
-		}
-
-		opp_table->required_devs[index] = virt_dev;
-		index++;
-		name++;
-	}
-
-	if (virt_devs)
-		*virt_devs = opp_table->required_devs;
-
-	return 0;
-
-err:
-	_opp_detach_genpd(opp_table);
-	return ret;
-
-}
-
 static int _opp_set_required_dev(struct opp_table *opp_table,
 				 struct device *dev,
 				 struct device *required_dev,
@@ -2540,9 +2427,6 @@ static void _opp_clear_config(struct opp_config_data *data)
 	if (data->flags & OPP_CONFIG_REQUIRED_DEV)
 		_opp_put_required_dev(data->opp_table,
 				      data->required_dev_index);
-	else if (data->flags & OPP_CONFIG_GENPD)
-		_opp_detach_genpd(data->opp_table);
-
 	if (data->flags & OPP_CONFIG_REGULATOR)
 		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
@@ -2654,20 +2538,7 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 		data->flags |= OPP_CONFIG_REGULATOR;
 	}
 
-	/* Attach genpds */
-	if (config->genpd_names) {
-		if (config->required_dev) {
-			ret = -EINVAL;
-			goto err;
-		}
-
-		ret = _opp_attach_genpd(opp_table, dev, config->genpd_names,
-					config->virt_devs);
-		if (ret)
-			goto err;
-
-		data->flags |= OPP_CONFIG_GENPD;
-	} else if (config->required_dev) {
+	if (config->required_dev) {
 		ret = _opp_set_required_dev(opp_table, dev,
 					    config->required_dev,
 					    config->required_dev_index);
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 3eed6f145e90..4bdb79ffa101 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -34,8 +34,7 @@ extern struct list_head opp_tables;
 #define OPP_CONFIG_REGULATOR_HELPER	BIT(2)
 #define OPP_CONFIG_PROP_NAME		BIT(3)
 #define OPP_CONFIG_SUPPORTED_HW		BIT(4)
-#define OPP_CONFIG_GENPD		BIT(5)
-#define OPP_CONFIG_REQUIRED_DEV		BIT(6)
+#define OPP_CONFIG_REQUIRED_DEV		BIT(5)
 
 /**
  * struct opp_config_data - data for set config operations
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index bc74bc69107a..568183e3e641 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -62,11 +62,7 @@ typedef int (*config_clks_t)(struct device *dev, struct opp_table *opp_table,
  * @supported_hw: Array of hierarchy of versions to match.
  * @supported_hw_count: Number of elements in the array.
  * @regulator_names: Array of pointers to the names of the regulator, NULL terminated.
- * @genpd_names: Null terminated array of pointers containing names of genpd to
- *		attach. Mutually exclusive with required_dev.
- * @virt_devs: Pointer to return the array of genpd virtual devices. Mutually
- *		exclusive with required_dev.
- * @required_dev: Required OPP device. Mutually exclusive with genpd_names/virt_devs.
+ * @required_dev: The required OPP device.
  * @required_dev_index: The index of the required OPP for the @required_dev.
  *
  * This structure contains platform specific OPP configurations for the device.
@@ -80,8 +76,6 @@ struct dev_pm_opp_config {
 	const unsigned int *supported_hw;
 	unsigned int supported_hw_count;
 	const char * const *regulator_names;
-	const char * const *genpd_names;
-	struct device ***virt_devs;
 	struct device *required_dev;
 	unsigned int required_dev_index;
 };
@@ -677,36 +671,6 @@ static inline void dev_pm_opp_put_config_regulators(int token)
 	dev_pm_opp_clear_config(token);
 }
 
-/* genpd helpers */
-static inline int dev_pm_opp_attach_genpd(struct device *dev,
-					  const char * const *names,
-					  struct device ***virt_devs)
-{
-	struct dev_pm_opp_config config = {
-		.genpd_names = names,
-		.virt_devs = virt_devs,
-	};
-
-	return dev_pm_opp_set_config(dev, &config);
-}
-
-static inline void dev_pm_opp_detach_genpd(int token)
-{
-	dev_pm_opp_clear_config(token);
-}
-
-static inline int devm_pm_opp_attach_genpd(struct device *dev,
-					   const char * const *names,
-					   struct device ***virt_devs)
-{
-	struct dev_pm_opp_config config = {
-		.genpd_names = names,
-		.virt_devs = virt_devs,
-	};
-
-	return devm_pm_opp_set_config(dev, &config);
-}
-
 /* prop-name helpers */
 static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 {
-- 
cgit v1.2.3


From 13d68a16430312fc21990f48326366eb73891202 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 9 Oct 2024 10:09:47 +0200
Subject: genetlink: extend info user-storage to match NL cb ctx

This allows a more uniform implementation of non-dump and dump
operations, and will be used later in the series to avoid some
per-operation allocation.

Additionally rename the NL_ASSERT_DUMP_CTX_FITS macro, to
fit a more extended usage.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/1130cc2896626b84587a2a5f96a5c6829638f4da.1728460186.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/vxlan/vxlan_mdb.c        | 2 +-
 include/linux/netlink.h              | 5 +++--
 include/net/genetlink.h              | 8 ++++++--
 net/core/netdev-genl.c               | 2 +-
 net/core/rtnetlink.c                 | 2 +-
 net/devlink/devl_internal.h          | 2 +-
 net/ethtool/rss.c                    | 2 +-
 net/netfilter/nf_conntrack_netlink.c | 2 +-
 net/netlink/genetlink.c              | 4 ++--
 9 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan/vxlan_mdb.c b/drivers/net/vxlan/vxlan_mdb.c
index 60eb95a06d55..ebed05a2804c 100644
--- a/drivers/net/vxlan/vxlan_mdb.c
+++ b/drivers/net/vxlan/vxlan_mdb.c
@@ -284,7 +284,7 @@ int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb,
 
 	ASSERT_RTNL();
 
-	NL_ASSERT_DUMP_CTX_FITS(struct vxlan_mdb_dump_ctx);
+	NL_ASSERT_CTX_FITS(struct vxlan_mdb_dump_ctx);
 
 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
 			cb->nlh->nlmsg_seq, RTM_NEWMDB, sizeof(*bpm),
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index b332c2048c75..a3ca198a3a9e 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -34,6 +34,7 @@ struct netlink_skb_parms {
 
 #define NETLINK_CB(skb)		(*(struct netlink_skb_parms*)&((skb)->cb))
 #define NETLINK_CREDS(skb)	(&NETLINK_CB((skb)).creds)
+#define NETLINK_CTX_SIZE	48
 
 
 void netlink_table_grab(void);
@@ -293,7 +294,7 @@ struct netlink_callback {
 	int			flags;
 	bool			strict_check;
 	union {
-		u8		ctx[48];
+		u8		ctx[NETLINK_CTX_SIZE];
 
 		/* args is deprecated. Cast a struct over ctx instead
 		 * for proper type safety.
@@ -302,7 +303,7 @@ struct netlink_callback {
 	};
 };
 
-#define NL_ASSERT_DUMP_CTX_FITS(type_name)				\
+#define NL_ASSERT_CTX_FITS(type_name)					\
 	BUILD_BUG_ON(sizeof(type_name) >				\
 		     sizeof_field(struct netlink_callback, ctx))
 
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 9ab49bfeae78..9d3726e8f90e 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -124,7 +124,8 @@ struct genl_family {
  * @genlhdr: generic netlink message header
  * @attrs: netlink attributes
  * @_net: network namespace
- * @user_ptr: user pointers
+ * @ctx: storage space for the use by the family
+ * @user_ptr: user pointers (deprecated, use ctx instead)
  * @extack: extended ACK report struct
  */
 struct genl_info {
@@ -135,7 +136,10 @@ struct genl_info {
 	struct genlmsghdr *	genlhdr;
 	struct nlattr **	attrs;
 	possible_net_t		_net;
-	void *			user_ptr[2];
+	union {
+		u8		ctx[NETLINK_CTX_SIZE];
+		void *		user_ptr[2];
+	};
 	struct netlink_ext_ack *extack;
 };
 
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 1cb954f2d39e..358cba248796 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -24,7 +24,7 @@ struct netdev_nl_dump_ctx {
 
 static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb)
 {
-	NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx);
+	NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx);
 
 	return (struct netdev_nl_dump_ctx *)cb->ctx;
 }
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 6d68247aea70..a9b81b7d9746 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -6243,7 +6243,7 @@ static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	int idx, s_idx;
 	int err;
 
-	NL_ASSERT_DUMP_CTX_FITS(struct rtnl_mdb_dump_ctx);
+	NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx);
 
 	if (cb->strict_check) {
 		err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack);
diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h
index c7a8e13f917c..a9f064ab9ed9 100644
--- a/net/devlink/devl_internal.h
+++ b/net/devlink/devl_internal.h
@@ -166,7 +166,7 @@ int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb,
 static inline struct devlink_nl_dump_state *
 devlink_dump_state(struct netlink_callback *cb)
 {
-	NL_ASSERT_DUMP_CTX_FITS(struct devlink_nl_dump_state);
+	NL_ASSERT_CTX_FITS(struct devlink_nl_dump_state);
 
 	return (struct devlink_nl_dump_state *)cb->ctx;
 }
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index e07386275e14..7cb106b590ab 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -224,7 +224,7 @@ struct rss_nl_dump_ctx {
 
 static struct rss_nl_dump_ctx *rss_dump_ctx(struct netlink_callback *cb)
 {
-	NL_ASSERT_DUMP_CTX_FITS(struct rss_nl_dump_ctx);
+	NL_ASSERT_CTX_FITS(struct rss_nl_dump_ctx);
 
 	return (struct rss_nl_dump_ctx *)cb->ctx;
 }
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 6a1239433830..36168f8b6efa 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -3870,7 +3870,7 @@ static int __init ctnetlink_init(void)
 {
 	int ret;
 
-	NL_ASSERT_DUMP_CTX_FITS(struct ctnetlink_list_dump_ctx);
+	NL_ASSERT_CTX_FITS(struct ctnetlink_list_dump_ctx);
 
 	ret = nfnetlink_subsys_register(&ctnl_subsys);
 	if (ret < 0) {
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index feb54c63a116..29387b605f3e 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -997,7 +997,7 @@ static int genl_start(struct netlink_callback *cb)
 	info->info.attrs	= attrs;
 	genl_info_net_set(&info->info, sock_net(cb->skb->sk));
 	info->info.extack	= cb->extack;
-	memset(&info->info.user_ptr, 0, sizeof(info->info.user_ptr));
+	memset(&info->info.ctx, 0, sizeof(info->info.ctx));
 
 	cb->data = info;
 	if (ops->start) {
@@ -1104,7 +1104,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family,
 	info.attrs = attrbuf;
 	info.extack = extack;
 	genl_info_net_set(&info, net);
-	memset(&info.user_ptr, 0, sizeof(info.user_ptr));
+	memset(&info.ctx, 0, sizeof(info.ctx));
 
 	if (ops->pre_doit) {
 		err = ops->pre_doit(ops, skb, &info);
-- 
cgit v1.2.3


From 4b623f9f0f59652ea71fcb27d60b4c3b65126dbb Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 9 Oct 2024 10:09:49 +0200
Subject: net-shapers: implement NL get operation

Introduce the basic infrastructure to implement the net-shaper
core functionality. Each network devices carries a net-shaper cache,
the NL get() operation fetches the data from such cache.

The cache is initially empty, will be fill by the set()/group()
operation implemented later and is destroyed at device cleanup time.

The net_shaper_fill_handle(), net_shaper_ctx_init(), and
net_shaper_generic_pre() implementations handle generic index type
attributes, despite the current caller always pass a constant value
to avoid more noise in later patches using them with different
attributes.

Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/ddd10fd645a9367803ad02fca4a5664ea5ace170.1728460186.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/kapi.rst |   3 +
 include/linux/netdevice.h         |  21 +++
 include/net/net_shaper.h          | 120 ++++++++++++++
 net/core/dev.c                    |   6 +
 net/core/dev.h                    |   6 +
 net/shaper/shaper.c               | 335 +++++++++++++++++++++++++++++++++++++-
 6 files changed, 484 insertions(+), 7 deletions(-)
 create mode 100644 include/net/net_shaper.h

(limited to 'include/linux')

diff --git a/Documentation/networking/kapi.rst b/Documentation/networking/kapi.rst
index ea55f462cefa..98682b9a13ee 100644
--- a/Documentation/networking/kapi.rst
+++ b/Documentation/networking/kapi.rst
@@ -104,6 +104,9 @@ Driver Support
 .. kernel-doc:: include/linux/netdevice.h
    :internal:
 
+.. kernel-doc:: include/net/net_shaper.h
+   :internal:
+
 PHY Support
 -----------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3baf8e539b6f..e6b93d01e631 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1603,6 +1603,14 @@ struct net_device_ops {
 	int			(*ndo_hwtstamp_set)(struct net_device *dev,
 						    struct kernel_hwtstamp_config *kernel_config,
 						    struct netlink_ext_ack *extack);
+
+#if IS_ENABLED(CONFIG_NET_SHAPER)
+	/**
+	 * @net_shaper_ops: Device shaping offload operations
+	 * see include/net/net_shapers.h
+	 */
+	const struct net_shaper_ops *net_shaper_ops;
+#endif
 };
 
 /**
@@ -2406,6 +2414,19 @@ struct net_device {
 
 	u64			max_pacing_offload_horizon;
 
+	/**
+	 * @lock: protects @net_shaper_hierarchy, feel free to use for other
+	 * netdev-scope protection. Ordering: take after rtnl_lock.
+	 */
+	struct mutex		lock;
+
+#if IS_ENABLED(CONFIG_NET_SHAPER)
+	/**
+	 * @net_shaper_hierarchy: data tracking the current shaper status
+	 *  see include/net/net_shapers.h
+	 */
+	struct net_shaper_hierarchy *net_shaper_hierarchy;
+#endif
 	u8			priv[] ____cacheline_aligned
 				       __counted_by(priv_len);
 } ____cacheline_aligned;
diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h
new file mode 100644
index 000000000000..5c3f49b52fe9
--- /dev/null
+++ b/include/net/net_shaper.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _NET_SHAPER_H_
+#define _NET_SHAPER_H_
+
+#include <linux/types.h>
+
+#include <uapi/linux/net_shaper.h>
+
+struct net_device;
+struct devlink;
+struct netlink_ext_ack;
+
+enum net_shaper_binding_type {
+	NET_SHAPER_BINDING_TYPE_NETDEV,
+	/* NET_SHAPER_BINDING_TYPE_DEVLINK_PORT */
+};
+
+struct net_shaper_binding {
+	enum net_shaper_binding_type type;
+	union {
+		struct net_device *netdev;
+		struct devlink *devlink;
+	};
+};
+
+struct net_shaper_handle {
+	enum net_shaper_scope scope;
+	u32 id;
+};
+
+/**
+ * struct net_shaper - represents a shaping node on the NIC H/W
+ * zeroed field are considered not set.
+ * @parent: Unique identifier for the shaper parent, usually implied
+ * @handle: Unique identifier for this shaper
+ * @metric: Specify if the rate limits refers to PPS or BPS
+ * @bw_min: Minimum guaranteed rate for this shaper
+ * @bw_max: Maximum peak rate allowed for this shaper
+ * @burst: Maximum burst for the peek rate of this shaper
+ * @priority: Scheduling priority for this shaper
+ * @weight: Scheduling weight for this shaper
+ */
+struct net_shaper {
+	struct net_shaper_handle parent;
+	struct net_shaper_handle handle;
+	enum net_shaper_metric metric;
+	u64 bw_min;
+	u64 bw_max;
+	u64 burst;
+	u32 priority;
+	u32 weight;
+
+	/* private: */
+	u32 leaves; /* accounted only for NODE scope */
+	struct rcu_head rcu;
+};
+
+/**
+ * struct net_shaper_ops - Operations on device H/W shapers
+ *
+ * The operations applies to either net_device and devlink objects.
+ * The initial shaping configuration at device initialization is empty:
+ * does not constraint the rate in any way.
+ * The network core keeps track of the applied user-configuration in
+ * the net_device or devlink structure.
+ * The operations are serialized via a per device lock.
+ *
+ * Device not supporting any kind of nesting should not provide the
+ * group operation.
+ *
+ * Each shaper is uniquely identified within the device with a 'handle'
+ * comprising the shaper scope and a scope-specific id.
+ */
+struct net_shaper_ops {
+	/**
+	 * @group: create the specified shapers scheduling group
+	 *
+	 * Nest the @leaves shapers identified under the * @node shaper.
+	 * All the shapers belong to the device specified by @binding.
+	 * The @leaves arrays size is specified by @leaves_count.
+	 * Create either the @leaves and the @node shaper; or if they already
+	 * exists, links them together in the desired way.
+	 * @leaves scope must be NET_SHAPER_SCOPE_QUEUE.
+	 */
+	int (*group)(struct net_shaper_binding *binding, int leaves_count,
+		     const struct net_shaper *leaves,
+		     const struct net_shaper *node,
+		     struct netlink_ext_ack *extack);
+
+	/**
+	 * @set: Updates the specified shaper
+	 *
+	 * Updates or creates the @shaper on the device specified by @binding.
+	 */
+	int (*set)(struct net_shaper_binding *binding,
+		   const struct net_shaper *shaper,
+		   struct netlink_ext_ack *extack);
+
+	/**
+	 * @delete: Removes the specified shaper
+	 *
+	 * Removes the shaper configuration as identified by the given @handle
+	 * on the device specified by @binding, restoring the default behavior.
+	 */
+	int (*delete)(struct net_shaper_binding *binding,
+		      const struct net_shaper_handle *handle,
+		      struct netlink_ext_ack *extack);
+
+	/**
+	 * @capabilities: get the shaper features supported by the device
+	 *
+	 * Fills the bitmask @cap with the supported capabilities for the
+	 * specified @scope and device specified by @binding.
+	 */
+	void (*capabilities)(struct net_shaper_binding *binding,
+			     enum net_shaper_scope scope, unsigned long *cap);
+};
+
+#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index ea5fbcd133ae..6e727c49a6f7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11147,6 +11147,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	hash_init(dev->qdisc_hash);
 #endif
 
+	mutex_init(&dev->lock);
+
 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 	setup(dev);
 
@@ -11217,6 +11219,8 @@ void free_netdev(struct net_device *dev)
 		return;
 	}
 
+	mutex_destroy(&dev->lock);
+
 	kfree(dev->ethtool);
 	netif_free_tx_queues(dev);
 	netif_free_rx_queues(dev);
@@ -11426,6 +11430,8 @@ void unregister_netdevice_many_notify(struct list_head *head,
 
 		mutex_destroy(&dev->ethtool->rss_lock);
 
+		net_shaper_flush_netdev(dev);
+
 		if (skb)
 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
 
diff --git a/net/core/dev.h b/net/core/dev.h
index 5654325c5b71..13c558874af3 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -35,6 +35,12 @@ void dev_addr_flush(struct net_device *dev);
 int dev_addr_init(struct net_device *dev);
 void dev_addr_check(struct net_device *dev);
 
+#if IS_ENABLED(CONFIG_NET_SHAPER)
+void net_shaper_flush_netdev(struct net_device *dev);
+#else
+static inline void net_shaper_flush_netdev(struct net_device *dev) {}
+#endif
+
 /* sysctls not referred to from outside net/core/ */
 extern int		netdev_unregister_timeout_secs;
 extern int		weight_p;
diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c
index a1b20888f502..22daf7dde999 100644
--- a/net/shaper/shaper.c
+++ b/net/shaper/shaper.c
@@ -1,30 +1,333 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <linux/bits.h>
+#include <linux/bitfield.h>
+#include <linux/idr.h>
 #include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
 #include <linux/skbuff.h>
+#include <linux/xarray.h>
+#include <net/devlink.h>
+#include <net/net_shaper.h>
 
 #include "shaper_nl_gen.h"
 
+#include "../core/dev.h"
+
+#define NET_SHAPER_SCOPE_SHIFT	26
+#define NET_SHAPER_ID_MASK	GENMASK(NET_SHAPER_SCOPE_SHIFT - 1, 0)
+#define NET_SHAPER_SCOPE_MASK	GENMASK(31, NET_SHAPER_SCOPE_SHIFT)
+
+#define NET_SHAPER_ID_UNSPEC NET_SHAPER_ID_MASK
+
+struct net_shaper_hierarchy {
+	struct xarray shapers;
+};
+
+struct net_shaper_nl_ctx {
+	struct net_shaper_binding binding;
+	netdevice_tracker dev_tracker;
+	unsigned long start_index;
+};
+
+static struct net_shaper_binding *net_shaper_binding_from_ctx(void *ctx)
+{
+	return &((struct net_shaper_nl_ctx *)ctx)->binding;
+}
+
+static struct net_shaper_hierarchy *
+net_shaper_hierarchy(struct net_shaper_binding *binding)
+{
+	/* Pairs with WRITE_ONCE() in net_shaper_hierarchy_setup. */
+	if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV)
+		return READ_ONCE(binding->netdev->net_shaper_hierarchy);
+
+	/* No other type supported yet. */
+	return NULL;
+}
+
+static int net_shaper_fill_binding(struct sk_buff *msg,
+				   const struct net_shaper_binding *binding,
+				   u32 type)
+{
+	/* Should never happen, as currently only NETDEV is supported. */
+	if (WARN_ON_ONCE(binding->type != NET_SHAPER_BINDING_TYPE_NETDEV))
+		return -EINVAL;
+
+	if (nla_put_u32(msg, type, binding->netdev->ifindex))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int net_shaper_fill_handle(struct sk_buff *msg,
+				  const struct net_shaper_handle *handle,
+				  u32 type)
+{
+	struct nlattr *handle_attr;
+
+	if (handle->scope == NET_SHAPER_SCOPE_UNSPEC)
+		return 0;
+
+	handle_attr = nla_nest_start(msg, type);
+	if (!handle_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(msg, NET_SHAPER_A_HANDLE_SCOPE, handle->scope) ||
+	    (handle->scope >= NET_SHAPER_SCOPE_QUEUE &&
+	     nla_put_u32(msg, NET_SHAPER_A_HANDLE_ID, handle->id)))
+		goto handle_nest_cancel;
+
+	nla_nest_end(msg, handle_attr);
+	return 0;
+
+handle_nest_cancel:
+	nla_nest_cancel(msg, handle_attr);
+	return -EMSGSIZE;
+}
+
+static int
+net_shaper_fill_one(struct sk_buff *msg,
+		    const struct net_shaper_binding *binding,
+		    const struct net_shaper *shaper,
+		    const struct genl_info *info)
+{
+	void *hdr;
+
+	hdr = genlmsg_iput(msg, info);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) ||
+	    net_shaper_fill_handle(msg, &shaper->parent,
+				   NET_SHAPER_A_PARENT) ||
+	    net_shaper_fill_handle(msg, &shaper->handle,
+				   NET_SHAPER_A_HANDLE) ||
+	    ((shaper->bw_min || shaper->bw_max || shaper->burst) &&
+	     nla_put_u32(msg, NET_SHAPER_A_METRIC, shaper->metric)) ||
+	    (shaper->bw_min &&
+	     nla_put_uint(msg, NET_SHAPER_A_BW_MIN, shaper->bw_min)) ||
+	    (shaper->bw_max &&
+	     nla_put_uint(msg, NET_SHAPER_A_BW_MAX, shaper->bw_max)) ||
+	    (shaper->burst &&
+	     nla_put_uint(msg, NET_SHAPER_A_BURST, shaper->burst)) ||
+	    (shaper->priority &&
+	     nla_put_u32(msg, NET_SHAPER_A_PRIORITY, shaper->priority)) ||
+	    (shaper->weight &&
+	     nla_put_u32(msg, NET_SHAPER_A_WEIGHT, shaper->weight)))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/* Initialize the context fetching the relevant device and
+ * acquiring a reference to it.
+ */
+static int net_shaper_ctx_setup(const struct genl_info *info, int type,
+				struct net_shaper_nl_ctx *ctx)
+{
+	struct net *ns = genl_info_net(info);
+	struct net_device *dev;
+	int ifindex;
+
+	if (GENL_REQ_ATTR_CHECK(info, type))
+		return -EINVAL;
+
+	ifindex = nla_get_u32(info->attrs[type]);
+	dev = netdev_get_by_index(ns, ifindex, &ctx->dev_tracker, GFP_KERNEL);
+	if (!dev) {
+		NL_SET_BAD_ATTR(info->extack, info->attrs[type]);
+		return -ENOENT;
+	}
+
+	if (!dev->netdev_ops->net_shaper_ops) {
+		NL_SET_BAD_ATTR(info->extack, info->attrs[type]);
+		netdev_put(dev, &ctx->dev_tracker);
+		return -EOPNOTSUPP;
+	}
+
+	ctx->binding.type = NET_SHAPER_BINDING_TYPE_NETDEV;
+	ctx->binding.netdev = dev;
+	return 0;
+}
+
+static void net_shaper_ctx_cleanup(struct net_shaper_nl_ctx *ctx)
+{
+	if (ctx->binding.type == NET_SHAPER_BINDING_TYPE_NETDEV)
+		netdev_put(ctx->binding.netdev, &ctx->dev_tracker);
+}
+
+static u32 net_shaper_handle_to_index(const struct net_shaper_handle *handle)
+{
+	return FIELD_PREP(NET_SHAPER_SCOPE_MASK, handle->scope) |
+		FIELD_PREP(NET_SHAPER_ID_MASK, handle->id);
+}
+
+static struct net_shaper *
+net_shaper_lookup(struct net_shaper_binding *binding,
+		  const struct net_shaper_handle *handle)
+{
+	struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding);
+	u32 index = net_shaper_handle_to_index(handle);
+
+	return hierarchy ? xa_load(&hierarchy->shapers, index) : NULL;
+}
+
+static int net_shaper_parse_handle(const struct nlattr *attr,
+				   const struct genl_info *info,
+				   struct net_shaper_handle *handle)
+{
+	struct nlattr *tb[NET_SHAPER_A_HANDLE_MAX + 1];
+	struct nlattr *id_attr;
+	u32 id = 0;
+	int ret;
+
+	ret = nla_parse_nested(tb, NET_SHAPER_A_HANDLE_MAX, attr,
+			       net_shaper_handle_nl_policy, info->extack);
+	if (ret < 0)
+		return ret;
+
+	if (NL_REQ_ATTR_CHECK(info->extack, attr, tb,
+			      NET_SHAPER_A_HANDLE_SCOPE))
+		return -EINVAL;
+
+	handle->scope = nla_get_u32(tb[NET_SHAPER_A_HANDLE_SCOPE]);
+
+	/* The default id for NODE scope shapers is an invalid one
+	 * to help the 'group' operation discriminate between new
+	 * NODE shaper creation (ID_UNSPEC) and reuse of existing
+	 * shaper (any other value).
+	 */
+	id_attr = tb[NET_SHAPER_A_HANDLE_ID];
+	if (id_attr)
+		id = nla_get_u32(id_attr);
+	else if (handle->scope == NET_SHAPER_SCOPE_NODE)
+		id = NET_SHAPER_ID_UNSPEC;
+
+	handle->id = id;
+	return 0;
+}
+
+static int net_shaper_generic_pre(struct genl_info *info, int type)
+{
+	struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)info->ctx;
+
+	BUILD_BUG_ON(sizeof(*ctx) > sizeof(info->ctx));
+
+	return net_shaper_ctx_setup(info, type, ctx);
+}
+
 int net_shaper_nl_pre_doit(const struct genl_split_ops *ops,
 			   struct sk_buff *skb, struct genl_info *info)
 {
-	return -EOPNOTSUPP;
+	return net_shaper_generic_pre(info, NET_SHAPER_A_IFINDEX);
+}
+
+static void net_shaper_generic_post(struct genl_info *info)
+{
+	net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)info->ctx);
 }
 
 void net_shaper_nl_post_doit(const struct genl_split_ops *ops,
 			     struct sk_buff *skb, struct genl_info *info)
 {
+	net_shaper_generic_post(info);
+}
+
+int net_shaper_nl_pre_dumpit(struct netlink_callback *cb)
+{
+	struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx;
+	const struct genl_info *info = genl_info_dump(cb);
+
+	return net_shaper_ctx_setup(info, NET_SHAPER_A_IFINDEX, ctx);
+}
+
+int net_shaper_nl_post_dumpit(struct netlink_callback *cb)
+{
+	net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)cb->ctx);
+	return 0;
 }
 
 int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info)
 {
-	return -EOPNOTSUPP;
+	struct net_shaper_binding *binding;
+	struct net_shaper_handle handle;
+	struct net_shaper *shaper;
+	struct sk_buff *msg;
+	int ret;
+
+	if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_HANDLE))
+		return -EINVAL;
+
+	binding = net_shaper_binding_from_ctx(info->ctx);
+	ret = net_shaper_parse_handle(info->attrs[NET_SHAPER_A_HANDLE], info,
+				      &handle);
+	if (ret < 0)
+		return ret;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	shaper = net_shaper_lookup(binding, &handle);
+	if (!shaper) {
+		NL_SET_BAD_ATTR(info->extack,
+				info->attrs[NET_SHAPER_A_HANDLE]);
+		rcu_read_unlock();
+		ret = -ENOENT;
+		goto free_msg;
+	}
+
+	ret = net_shaper_fill_one(msg, binding, shaper, info);
+	rcu_read_unlock();
+	if (ret)
+		goto free_msg;
+
+	ret = genlmsg_reply(msg, info);
+	if (ret)
+		goto free_msg;
+
+	return 0;
+
+free_msg:
+	nlmsg_free(msg);
+	return ret;
 }
 
 int net_shaper_nl_get_dumpit(struct sk_buff *skb,
 			     struct netlink_callback *cb)
 {
-	return -EOPNOTSUPP;
+	struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx;
+	const struct genl_info *info = genl_info_dump(cb);
+	struct net_shaper_hierarchy *hierarchy;
+	struct net_shaper_binding *binding;
+	struct net_shaper *shaper;
+	int ret = 0;
+
+	/* Don't error out dumps performed before any set operation. */
+	binding = net_shaper_binding_from_ctx(ctx);
+	hierarchy = net_shaper_hierarchy(binding);
+	if (!hierarchy)
+		return 0;
+
+	rcu_read_lock();
+	for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index,
+				 U32_MAX, XA_PRESENT)); ctx->start_index++) {
+		ret = net_shaper_fill_one(skb, binding, shaper, info);
+		if (ret)
+			break;
+	}
+	rcu_read_unlock();
+
+	return ret;
 }
 
 int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info)
@@ -37,14 +340,32 @@ int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info)
 	return -EOPNOTSUPP;
 }
 
-int net_shaper_nl_pre_dumpit(struct netlink_callback *cb)
+static void net_shaper_flush(struct net_shaper_binding *binding)
 {
-	return -EOPNOTSUPP;
+	struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding);
+	struct net_shaper *cur;
+	unsigned long index;
+
+	if (!hierarchy)
+		return;
+
+	xa_lock(&hierarchy->shapers);
+	xa_for_each(&hierarchy->shapers, index, cur) {
+		__xa_erase(&hierarchy->shapers, index);
+		kfree(cur);
+	}
+	xa_unlock(&hierarchy->shapers);
+	kfree(hierarchy);
 }
 
-int net_shaper_nl_post_dumpit(struct netlink_callback *cb)
+void net_shaper_flush_netdev(struct net_device *dev)
 {
-	return -EOPNOTSUPP;
+	struct net_shaper_binding binding = {
+		.type = NET_SHAPER_BINDING_TYPE_NETDEV,
+		.netdev = dev,
+	};
+
+	net_shaper_flush(&binding);
 }
 
 static int __init shaper_init(void)
-- 
cgit v1.2.3


From 608a5c05c39b75fa2539ce9e521d289c5a5326f7 Mon Sep 17 00:00:00 2001
From: Wenjun Wu <wenjun1.wu@intel.com>
Date: Wed, 9 Oct 2024 10:09:58 +0200
Subject: virtchnl: support queue rate limit and quanta size configuration

This patch adds new virtchnl opcodes and structures for rate limit
and quanta size configuration, which include:
1. VIRTCHNL_OP_CONFIG_QUEUE_BW, to configure max bandwidth for each
VF per queue.
2. VIRTCHNL_OP_CONFIG_QUANTA, to configure quanta size per queue.
3. VIRTCHNL_OP_GET_QOS_CAPS, VF queries current QoS configuration, such
as enabled TCs, arbiter type, up2tc and bandwidth of VSI node. The
configuration is previously set by DCB and PF, and now is the potential
QoS capability of VF. VF can take it as reference to configure queue TC
mapping.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Wenjun Wu <wenjun1.wu@intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/839002f7bd6f63b985a060a51b079f6e6dbbe237.1728460186.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/avf/virtchnl.h | 119 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index f41395264dca..223e433c39fe 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -89,6 +89,9 @@ enum virtchnl_rx_hsplit {
 	VIRTCHNL_RX_HSPLIT_SPLIT_SCTP    = 8,
 };
 
+enum virtchnl_bw_limit_type {
+	VIRTCHNL_BW_SHAPER = 0,
+};
 /* END GENERIC DEFINES */
 
 /* Opcodes for VF-PF communication. These are placed in the v_opcode field
@@ -151,6 +154,11 @@ enum virtchnl_ops {
 	VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 = 55,
 	VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 = 56,
 	VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 = 57,
+	/* opcode 57 - 65 are reserved */
+	VIRTCHNL_OP_GET_QOS_CAPS = 66,
+	/* opcode 68 through 111 are reserved */
+	VIRTCHNL_OP_CONFIG_QUEUE_BW = 112,
+	VIRTCHNL_OP_CONFIG_QUANTA = 113,
 	VIRTCHNL_OP_MAX,
 };
 
@@ -261,6 +269,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC	BIT(26)
 #define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF		BIT(27)
 #define VIRTCHNL_VF_OFFLOAD_FDIR_PF		BIT(28)
+#define VIRTCHNL_VF_OFFLOAD_QOS			BIT(29)
 
 #define VF_BASE_MODE_OFFLOADS (VIRTCHNL_VF_OFFLOAD_L2 | \
 			       VIRTCHNL_VF_OFFLOAD_VLAN | \
@@ -1416,6 +1425,85 @@ struct virtchnl_fdir_del {
 
 VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_fdir_del);
 
+struct virtchnl_shaper_bw {
+	/* Unit is Kbps */
+	u32 committed;
+	u32 peak;
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_shaper_bw);
+
+/* VIRTCHNL_OP_GET_QOS_CAPS
+ * VF sends this message to get its QoS Caps, such as
+ * TC number, Arbiter and Bandwidth.
+ */
+struct virtchnl_qos_cap_elem {
+	u8 tc_num;
+	u8 tc_prio;
+#define VIRTCHNL_ABITER_STRICT      0
+#define VIRTCHNL_ABITER_ETS         2
+	u8 arbiter;
+#define VIRTCHNL_STRICT_WEIGHT      1
+	u8 weight;
+	enum virtchnl_bw_limit_type type;
+	union {
+		struct virtchnl_shaper_bw shaper;
+		u8 pad2[32];
+	};
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_qos_cap_elem);
+
+struct virtchnl_qos_cap_list {
+	u16 vsi_id;
+	u16 num_elem;
+	struct virtchnl_qos_cap_elem cap[];
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(4, virtchnl_qos_cap_list);
+#define virtchnl_qos_cap_list_LEGACY_SIZEOF	44
+
+/* VIRTCHNL_OP_CONFIG_QUEUE_BW */
+struct virtchnl_queue_bw {
+	u16 queue_id;
+	u8 tc;
+	u8 pad;
+	struct virtchnl_shaper_bw shaper;
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_queue_bw);
+
+struct virtchnl_queues_bw_cfg {
+	u16 vsi_id;
+	u16 num_queues;
+	struct virtchnl_queue_bw cfg[];
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(4, virtchnl_queues_bw_cfg);
+#define virtchnl_queues_bw_cfg_LEGACY_SIZEOF	16
+
+enum virtchnl_queue_type {
+	VIRTCHNL_QUEUE_TYPE_TX			= 0,
+	VIRTCHNL_QUEUE_TYPE_RX			= 1,
+};
+
+/* structure to specify a chunk of contiguous queues */
+struct virtchnl_queue_chunk {
+	/* see enum virtchnl_queue_type */
+	s32 type;
+	u16 start_queue_id;
+	u16 num_queues;
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_queue_chunk);
+
+struct virtchnl_quanta_cfg {
+	u16 quanta_size;
+	struct virtchnl_queue_chunk queue_select;
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_quanta_cfg);
+
 #define __vss_byone(p, member, count, old)				      \
 	(struct_size(p, member, count) + (old - 1 - struct_size(p, member, 0)))
 
@@ -1438,6 +1526,8 @@ VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_fdir_del);
 		 __vss(virtchnl_vlan_filter_list_v2, __vss_byelem, p, m, c),  \
 		 __vss(virtchnl_tc_info, __vss_byelem, p, m, c),	      \
 		 __vss(virtchnl_rdma_qvlist_info, __vss_byelem, p, m, c),     \
+		 __vss(virtchnl_qos_cap_list, __vss_byelem, p, m, c),	      \
+		 __vss(virtchnl_queues_bw_cfg, __vss_byelem, p, m, c),	      \
 		 __vss(virtchnl_rss_key, __vss_byone, p, m, c),		      \
 		 __vss(virtchnl_rss_lut, __vss_byone, p, m, c))
 
@@ -1637,6 +1727,35 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode,
 	case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2:
 		valid_len = sizeof(struct virtchnl_vlan_setting);
 		break;
+	case VIRTCHNL_OP_GET_QOS_CAPS:
+		break;
+	case VIRTCHNL_OP_CONFIG_QUEUE_BW:
+		valid_len = virtchnl_queues_bw_cfg_LEGACY_SIZEOF;
+		if (msglen >= valid_len) {
+			struct virtchnl_queues_bw_cfg *q_bw =
+				(struct virtchnl_queues_bw_cfg *)msg;
+
+			valid_len = virtchnl_struct_size(q_bw, cfg,
+							 q_bw->num_queues);
+			if (q_bw->num_queues == 0) {
+				err_msg_format = true;
+				break;
+			}
+		}
+		break;
+	case VIRTCHNL_OP_CONFIG_QUANTA:
+		valid_len = sizeof(struct virtchnl_quanta_cfg);
+		if (msglen >= valid_len) {
+			struct virtchnl_quanta_cfg *q_quanta =
+				(struct virtchnl_quanta_cfg *)msg;
+
+			if (q_quanta->quanta_size == 0 ||
+			    q_quanta->queue_select.num_queues == 0) {
+				err_msg_format = true;
+				break;
+			}
+		}
+		break;
 	/* These are always errors coming from the VF. */
 	case VIRTCHNL_OP_EVENT:
 	case VIRTCHNL_OP_UNKNOWN:
-- 
cgit v1.2.3


From 0a6c61bc9c636e9a32d9f5a4d6d3b031d08763ab Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Thu, 10 Oct 2024 23:59:09 +0900
Subject: fgraph: Simplify return address printing in function graph tracer

Simplify return address printing in the function graph tracer by removing
fgraph_extras. Since this feature is only used by the function graph
tracer and the feature flags can directly accessible from the function
graph tracer, fgraph_extras can be removed from the fgraph callback.

Cc: Donglin Peng <dolinux.peng@gmail.com>
Link: https://lore.kernel.org/172857234900.270774.15378354017601069781.stgit@devnote2
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h               | 16 +++----------
 kernel/trace/fgraph.c                | 45 +++++++++++++++++++++++-------------
 kernel/trace/ftrace.c                |  3 +--
 kernel/trace/trace.h                 |  3 +--
 kernel/trace/trace_functions_graph.c | 18 +++++++--------
 kernel/trace/trace_irqsoff.c         |  3 +--
 kernel/trace/trace_sched_wakeup.c    |  3 +--
 kernel/trace/trace_selftest.c        |  8 +++----
 8 files changed, 48 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 2ac3b3b53cd0..4c7dd5e58c9f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1068,29 +1068,20 @@ struct ftrace_graph_ret {
 	unsigned long long rettime;
 } __packed;
 
-struct fgraph_extras;
 struct fgraph_ops;
 
 /* Type of the callback handlers for tracing function graph*/
 typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *,
 				       struct fgraph_ops *); /* return */
 typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *,
-				      struct fgraph_ops *,
-				      struct fgraph_extras *); /* entry */
+				      struct fgraph_ops *); /* entry */
 
 extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
-				   struct fgraph_ops *gops,
-				   struct fgraph_extras *extras);
+				   struct fgraph_ops *gops);
 bool ftrace_pids_enabled(struct ftrace_ops *ops);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-/* Used to convey some extra datas when creating a graph entry */
-struct fgraph_extras {
-	u32 flags;
-	unsigned long retaddr;
-};
-
 struct fgraph_ops {
 	trace_func_graph_ent_t		entryfunc;
 	trace_func_graph_ret_t		retfunc;
@@ -1131,13 +1122,12 @@ function_graph_enter(unsigned long ret, unsigned long func,
 
 struct ftrace_ret_stack *
 ftrace_graph_get_ret_stack(struct task_struct *task, int skip);
+unsigned long ftrace_graph_top_ret_addr(struct task_struct *task);
 
 unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 				    unsigned long ret, unsigned long *retp);
 unsigned long *fgraph_get_task_var(struct fgraph_ops *gops);
 
-u32 graph_tracer_flags_get(u32 flags);
-
 /*
  * Sometimes we don't want to trace a function with the function
  * graph tracer but we want them to keep traced by the usual function
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 27e523f01ed2..ee829d65f301 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -290,8 +290,7 @@ static inline unsigned long make_data_type_val(int idx, int size, int offset)
 }
 
 /* ftrace_graph_entry set to this to tell some archs to run function graph */
-static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops,
-		     struct fgraph_extras *extras)
+static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops)
 {
 	return 0;
 }
@@ -519,8 +518,7 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
 #endif
 
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
-			    struct fgraph_ops *gops,
-			    struct fgraph_extras *extras)
+			    struct fgraph_ops *gops)
 {
 	return 0;
 }
@@ -648,20 +646,13 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 			 unsigned long frame_pointer, unsigned long *retp)
 {
 	struct ftrace_graph_ent trace;
-	struct fgraph_extras extras;
 	unsigned long bitmap = 0;
 	int offset;
 	int i;
-	int idx = 0;
 
 	trace.func = func;
 	trace.depth = ++current->curr_ret_depth;
 
-	extras.flags = graph_tracer_flags_get(TRACE_GRAPH_PRINT_RETADDR);
-	if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR)
-		&& extras.flags & TRACE_GRAPH_PRINT_RETADDR)
-		extras.retaddr = ftrace_graph_ret_addr(current, &idx, ret, retp);
-
 	offset = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0);
 	if (offset < 0)
 		goto out;
@@ -670,7 +661,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 	if (static_branch_likely(&fgraph_do_direct)) {
 		int save_curr_ret_stack = current->curr_ret_stack;
 
-		if (static_call(fgraph_func)(&trace, fgraph_direct_gops, &extras))
+		if (static_call(fgraph_func)(&trace, fgraph_direct_gops))
 			bitmap |= BIT(fgraph_direct_gops->idx);
 		else
 			/* Clear out any saved storage */
@@ -688,7 +679,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 
 			save_curr_ret_stack = current->curr_ret_stack;
 			if (ftrace_ops_test(&gops->ops, func, NULL) &&
-			    gops->entryfunc(&trace, gops, &extras))
+			    gops->entryfunc(&trace, gops))
 				bitmap |= BIT(i);
 			else
 				/* Clear out any saved storage */
@@ -905,6 +896,29 @@ ftrace_graph_get_ret_stack(struct task_struct *task, int idx)
 	return ret_stack;
 }
 
+/**
+ * ftrace_graph_top_ret_addr - return the top return address in the shadow stack
+ * @task: The task to read the shadow stack from.
+ *
+ * Return the first return address on the shadow stack of the @task, which is
+ * not the fgraph's return_to_handler.
+ */
+unsigned long ftrace_graph_top_ret_addr(struct task_struct *task)
+{
+	unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler);
+	struct ftrace_ret_stack *ret_stack = NULL;
+	int offset = task->curr_ret_stack;
+
+	if (offset < 0)
+		return 0;
+
+	do {
+		ret_stack = get_ret_stack(task, offset, &offset);
+	} while (ret_stack && ret_stack->ret == return_handler);
+
+	return ret_stack ? ret_stack->ret : 0;
+}
+
 /**
  * ftrace_graph_ret_addr - return the original value of the return address
  * @task: The task the unwinder is being executed on
@@ -1145,8 +1159,7 @@ void ftrace_graph_exit_task(struct task_struct *t)
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 static int fgraph_pid_func(struct ftrace_graph_ent *trace,
-			   struct fgraph_ops *gops,
-			   struct fgraph_extras *extras)
+			   struct fgraph_ops *gops)
 {
 	struct trace_array *tr = gops->ops.private;
 	int pid;
@@ -1160,7 +1173,7 @@ static int fgraph_pid_func(struct ftrace_graph_ent *trace,
 			return 0;
 	}
 
-	return gops->saved_func(trace, gops, NULL);
+	return gops->saved_func(trace, gops);
 }
 
 void fgraph_update_pid_func(void)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5d87dac83b80..cae388122ca8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -827,8 +827,7 @@ struct profile_fgraph_data {
 };
 
 static int profile_graph_entry(struct ftrace_graph_ent *trace,
-			       struct fgraph_ops *gops,
-			       struct fgraph_extras *extras)
+			       struct fgraph_ops *gops)
 {
 	struct profile_fgraph_data *profile_data;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 13f08f257c0b..6adf48ef4312 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -695,8 +695,7 @@ void trace_default_header(struct seq_file *m);
 void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
 
 void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
-int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
-		      struct fgraph_extras *extras);
+int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3dd63ae2afe8..20d0c579d3b5 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -90,9 +90,9 @@ static struct tracer_flags tracer_flags = {
 	.opts = trace_opts
 };
 
-u32 graph_tracer_flags_get(u32 flags)
+static bool tracer_flags_is_set(u32 flags)
 {
-	return tracer_flags.val & flags;
+	return (tracer_flags.val & flags) == flags;
 }
 
 /*
@@ -179,8 +179,7 @@ struct fgraph_times {
 };
 
 int trace_graph_entry(struct ftrace_graph_ent *trace,
-		      struct fgraph_ops *gops,
-		      struct fgraph_extras *extras)
+		      struct fgraph_ops *gops)
 {
 	unsigned long *task_var = fgraph_get_task_var(gops);
 	struct trace_array *tr = gops->private;
@@ -246,11 +245,12 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		trace_ctx = tracing_gen_ctx_flags(flags);
-		if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && extras
-				&& (extras->flags & TRACE_GRAPH_PRINT_RETADDR)))
-			ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx,
-							  extras->retaddr);
-		else
+		if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
+			tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR))) {
+			unsigned long retaddr = ftrace_graph_top_ret_addr(current);
+
+			ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
+		} else
 			ret = __trace_graph_entry(tr, trace, trace_ctx);
 	} else {
 		ret = 0;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index eb3aa36cf10f..fce064e20570 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -176,8 +176,7 @@ static int irqsoff_display_graph(struct trace_array *tr, int set)
 }
 
 static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
-			       struct fgraph_ops *gops,
-			       struct fgraph_extras *extras)
+			       struct fgraph_ops *gops)
 {
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 155de2551507..ae2ace5e515a 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -113,8 +113,7 @@ static int wakeup_display_graph(struct trace_array *tr, int set)
 }
 
 static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
-			      struct fgraph_ops *gops,
-			      struct fgraph_extras *extras)
+			      struct fgraph_ops *gops)
 {
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index fbb99f8c8062..d3a14ae47e26 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -774,8 +774,7 @@ struct fgraph_fixture {
 };
 
 static __init int store_entry(struct ftrace_graph_ent *trace,
-			      struct fgraph_ops *gops,
-			      struct fgraph_extras *extras)
+			      struct fgraph_ops *gops)
 {
 	struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
 	const char *type = fixture->store_type_name;
@@ -1026,8 +1025,7 @@ static unsigned int graph_hang_thresh;
 
 /* Wrap the real function entry probe to avoid possible hanging */
 static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
-				      struct fgraph_ops *gops,
-				      struct fgraph_extras *extras)
+				      struct fgraph_ops *gops)
 {
 	/* This is harmlessly racy, we want to approximately detect a hang */
 	if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
@@ -1041,7 +1039,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
 		return 0;
 	}
 
-	return trace_graph_entry(trace, gops, NULL);
+	return trace_graph_entry(trace, gops);
 }
 
 static struct fgraph_ops fgraph_ops __initdata  = {
-- 
cgit v1.2.3


From bafffd56c608106d11e7aec851f114dcd66b2091 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Thu, 10 Oct 2024 14:54:46 +0100
Subject: clocksource: Remove unused clocksource_change_rating

clocksource_change_rating() has been unused since 2017's commit
63ed4e0c67df ("Drivers: hv: vmbus: Consolidate all Hyper-V specific clocksource code")

Remove it.

__clocksource_change_rating now only has one use which is ifdef'd.
Move it into the ifdef'd section.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20241010135446.213098-1-linux@treblig.org
---
 include/linux/clocksource.h |  1 -
 kernel/time/clocksource.c   | 40 ++++++++++------------------------------
 2 files changed, 10 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index d35b677b08fe..ef1b16da6ad5 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -215,7 +215,6 @@ static inline s64 clocksource_cyc2ns(u64 cycles, u32 mult, u32 shift)
 
 extern int clocksource_unregister(struct clocksource*);
 extern void clocksource_touch_watchdog(void);
-extern void clocksource_change_rating(struct clocksource *cs, int rating);
 extern void clocksource_suspend(void);
 extern void clocksource_resume(void);
 extern struct clocksource * __init clocksource_default_clock(void);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 23336eecb4f4..aab6472853fa 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -20,6 +20,8 @@
 #include "tick-internal.h"
 #include "timekeeping_internal.h"
 
+static void clocksource_enqueue(struct clocksource *cs);
+
 static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
 {
 	u64 delta = clocksource_delta(end, start, cs->mask);
@@ -171,7 +173,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
 }
 
 static int clocksource_watchdog_kthread(void *data);
-static void __clocksource_change_rating(struct clocksource *cs, int rating);
 
 static void clocksource_watchdog_work(struct work_struct *work)
 {
@@ -191,6 +192,13 @@ static void clocksource_watchdog_work(struct work_struct *work)
 	kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
 }
 
+static void clocksource_change_rating(struct clocksource *cs, int rating)
+{
+	list_del(&cs->list);
+	cs->rating = rating;
+	clocksource_enqueue(cs);
+}
+
 static void __clocksource_unstable(struct clocksource *cs)
 {
 	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
@@ -697,7 +705,7 @@ static int __clocksource_watchdog_kthread(void)
 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
 			list_del_init(&cs->wd_list);
-			__clocksource_change_rating(cs, 0);
+			clocksource_change_rating(cs, 0);
 			select = 1;
 		}
 		if (cs->flags & CLOCK_SOURCE_RESELECT) {
@@ -1255,34 +1263,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 }
 EXPORT_SYMBOL_GPL(__clocksource_register_scale);
 
-static void __clocksource_change_rating(struct clocksource *cs, int rating)
-{
-	list_del(&cs->list);
-	cs->rating = rating;
-	clocksource_enqueue(cs);
-}
-
-/**
- * clocksource_change_rating - Change the rating of a registered clocksource
- * @cs:		clocksource to be changed
- * @rating:	new rating
- */
-void clocksource_change_rating(struct clocksource *cs, int rating)
-{
-	unsigned long flags;
-
-	mutex_lock(&clocksource_mutex);
-	clocksource_watchdog_lock(&flags);
-	__clocksource_change_rating(cs, rating);
-	clocksource_watchdog_unlock(&flags);
-
-	clocksource_select();
-	clocksource_select_watchdog(false);
-	clocksource_suspend_select(false);
-	mutex_unlock(&clocksource_mutex);
-}
-EXPORT_SYMBOL(clocksource_change_rating);
-
 /*
  * Unbind clocksource @cs. Called with clocksource_mutex held
  */
-- 
cgit v1.2.3


From 445936f9e258eca624c8239056bd8cd6e853b3fd Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Mon, 23 Sep 2024 11:59:57 +0200
Subject: thermal: core: Add user thresholds support

The user thresholds mechanism is a way to have the userspace to tell
the thermal framework to send a notification when a temperature limit
is crossed. There is no id, no hysteresis, just the temperature and
the direction of the limit crossing. That means we can be notified
when a threshold is crossed the way up only, or the way down only or
both ways. That allows to create hysteresis values if it is needed.

A threshold can be added, deleted or flushed. The latter means all
thresholds belonging to a thermal zone will be deleted.

When a threshold is added:

 - if the same threshold (temperature and direction) exists, an error
   is returned

 - if a threshold is specified with the same temperature but a
   different direction, the specified direction is added

 - if there is no threshold with the same temperature then it is
   created

When a threshold is deleted:

 - if the same threshold (temperature and direction) exists, it is
   deleted

 - if a threshold is specified with the same temperature but a
   different direction, the specified direction is removed

 - if there is no threshold with the same temperature, then an error
   is returned

When the threshold are flushed:

 - All thresholds related to a thermal zone are deleted

When a threshold is crossed:

 - the userspace does not need to know which threshold(s) have been
   crossed, it will be notified with the current temperature and the
   previous temperature

 - if multiple thresholds have been crossed between two updates only
   one notification will be send to the userspace, it is pointless to
   send a notification per thresholds crossed as the userspace can
   handle that easily when it has the temperature delta information

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20240923100005.2532430-2-daniel.lezcano@linaro.org
[ rjw: Subject edit, use BIT(0) and BIT(1) in symbol definitions ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/Makefile             |   1 +
 drivers/thermal/thermal_core.h       |   2 +
 drivers/thermal/thermal_thresholds.c | 229 +++++++++++++++++++++++++++++++++++
 drivers/thermal/thermal_thresholds.h |  19 +++
 include/linux/thermal.h              |   3 +
 include/uapi/linux/thermal.h         |   2 +
 6 files changed, 256 insertions(+)
 create mode 100644 drivers/thermal/thermal_thresholds.c
 create mode 100644 drivers/thermal/thermal_thresholds.h

(limited to 'include/linux')

diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 41c4d56beb40..1e1559bb971e 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -6,6 +6,7 @@ CFLAGS_thermal_core.o		:= -I$(src)
 obj-$(CONFIG_THERMAL)		+= thermal_sys.o
 thermal_sys-y			+= thermal_core.o thermal_sysfs.o
 thermal_sys-y			+= thermal_trip.o thermal_helpers.o
+thermal_sys-y			+= thermal_thresholds.o
 
 # netlink interface to manage the thermal framework
 thermal_sys-$(CONFIG_THERMAL_NETLINK)		+= thermal_netlink.o
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index a64d39b1c86b..1ea91d59498b 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -13,6 +13,7 @@
 #include <linux/thermal.h>
 
 #include "thermal_netlink.h"
+#include "thermal_thresholds.h"
 #include "thermal_debugfs.h"
 
 struct thermal_attr {
@@ -139,6 +140,7 @@ struct thermal_zone_device {
 #ifdef CONFIG_THERMAL_DEBUGFS
 	struct thermal_debugfs *debugfs;
 #endif
+	struct list_head user_thresholds;
 	struct thermal_trip_desc trips[] __counted_by(num_trips);
 };
 
diff --git a/drivers/thermal/thermal_thresholds.c b/drivers/thermal/thermal_thresholds.c
new file mode 100644
index 000000000000..f33b6d5474d8
--- /dev/null
+++ b/drivers/thermal/thermal_thresholds.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2024 Linaro Limited
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * Thermal thresholds
+ */
+#include <linux/list.h>
+#include <linux/list_sort.h>
+#include <linux/slab.h>
+
+#include "thermal_core.h"
+#include "thermal_thresholds.h"
+
+int thermal_thresholds_init(struct thermal_zone_device *tz)
+{
+	INIT_LIST_HEAD(&tz->user_thresholds);
+
+	return 0;
+}
+
+void thermal_thresholds_flush(struct thermal_zone_device *tz)
+{
+	struct list_head *thresholds = &tz->user_thresholds;
+	struct user_threshold *entry, *tmp;
+
+	lockdep_assert_held(&tz->lock);
+
+	list_for_each_entry_safe(entry, tmp, thresholds, list_node) {
+		list_del(&entry->list_node);
+		kfree(entry);
+	}
+
+	__thermal_zone_device_update(tz, THERMAL_TZ_FLUSH_THRESHOLDS);
+}
+
+void thermal_thresholds_exit(struct thermal_zone_device *tz)
+{
+	thermal_thresholds_flush(tz);
+}
+
+static int __thermal_thresholds_cmp(void *data,
+				    const struct list_head *l1,
+				    const struct list_head *l2)
+{
+	struct user_threshold *t1 = container_of(l1, struct user_threshold, list_node);
+	struct user_threshold *t2 = container_of(l2, struct user_threshold, list_node);
+
+	return t1->temperature - t2->temperature;
+}
+
+static struct user_threshold *__thermal_thresholds_find(const struct list_head *thresholds,
+							int temperature)
+{
+	struct user_threshold *t;
+
+	list_for_each_entry(t, thresholds, list_node)
+		if (t->temperature == temperature)
+			return t;
+
+	return NULL;
+}
+
+static bool __thermal_threshold_is_crossed(struct user_threshold *threshold, int temperature,
+					   int last_temperature, int direction,
+					   int *low, int *high)
+{
+
+	if (temperature >= threshold->temperature) {
+		if (threshold->temperature > *low &&
+		    THERMAL_THRESHOLD_WAY_DOWN & threshold->direction)
+			*low = threshold->temperature;
+
+		if (last_temperature < threshold->temperature &&
+		    threshold->direction & direction)
+			return true;
+	} else {
+		if (threshold->temperature < *high && THERMAL_THRESHOLD_WAY_UP
+		    & threshold->direction)
+			*high = threshold->temperature;
+
+		if (last_temperature >= threshold->temperature &&
+		    threshold->direction & direction)
+			return true;
+	}
+
+	return false;
+}
+
+static bool thermal_thresholds_handle_raising(struct list_head *thresholds, int temperature,
+					      int last_temperature, int *low, int *high)
+{
+	struct user_threshold *t;
+
+	list_for_each_entry(t, thresholds, list_node) {
+		if (__thermal_threshold_is_crossed(t, temperature, last_temperature,
+						   THERMAL_THRESHOLD_WAY_UP, low, high))
+			return true;
+	}
+
+	return false;
+}
+
+static bool thermal_thresholds_handle_dropping(struct list_head *thresholds, int temperature,
+					       int last_temperature, int *low, int *high)
+{
+	struct user_threshold *t;
+
+	list_for_each_entry_reverse(t, thresholds, list_node) {
+		if (__thermal_threshold_is_crossed(t, temperature, last_temperature,
+						   THERMAL_THRESHOLD_WAY_DOWN, low, high))
+			return true;
+	}
+
+	return false;
+}
+
+void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *high)
+{
+	struct list_head *thresholds = &tz->user_thresholds;
+
+	int temperature = tz->temperature;
+	int last_temperature = tz->last_temperature;
+	bool notify;
+
+	lockdep_assert_held(&tz->lock);
+
+	/*
+	 * We need a second update in order to detect a threshold being crossed
+	 */
+	if (last_temperature == THERMAL_TEMP_INVALID)
+		return;
+
+	/*
+	 * The temperature is stable, so obviously we can not have
+	 * crossed a threshold.
+	 */
+	if (last_temperature == temperature)
+		return;
+
+	/*
+	 * Since last update the temperature:
+	 * - increased : thresholds are crossed the way up
+	 * - decreased : thresholds are crossed the way down
+	 */
+	if (temperature > last_temperature)
+		notify = thermal_thresholds_handle_raising(thresholds, temperature,
+							   last_temperature, low, high);
+	else
+		notify = thermal_thresholds_handle_dropping(thresholds, temperature,
+							    last_temperature, low, high);
+
+	if (notify)
+		pr_debug("A threshold has been crossed the way %s, with a temperature=%d, last_temperature=%d\n",
+			 temperature > last_temperature ? "up" : "down", temperature, last_temperature);
+}
+
+int thermal_thresholds_add(struct thermal_zone_device *tz, int temperature, int direction)
+{
+	struct list_head *thresholds = &tz->user_thresholds;
+	struct user_threshold *t;
+
+	lockdep_assert_held(&tz->lock);
+
+	t = __thermal_thresholds_find(thresholds, temperature);
+	if (t) {
+		if (t->direction == direction)
+			return -EEXIST;
+
+		t->direction |= direction;
+	} else {
+
+		t = kmalloc(sizeof(*t), GFP_KERNEL);
+		if (!t)
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(&t->list_node);
+		t->temperature = temperature;
+		t->direction = direction;
+		list_add(&t->list_node, thresholds);
+		list_sort(NULL, thresholds, __thermal_thresholds_cmp);
+	}
+
+	__thermal_zone_device_update(tz, THERMAL_TZ_ADD_THRESHOLD);
+
+	return 0;
+}
+
+int thermal_thresholds_delete(struct thermal_zone_device *tz, int temperature, int direction)
+{
+	struct list_head *thresholds = &tz->user_thresholds;
+	struct user_threshold *t;
+
+	lockdep_assert_held(&tz->lock);
+
+	t = __thermal_thresholds_find(thresholds, temperature);
+	if (!t)
+		return -ENOENT;
+
+	if (t->direction == direction) {
+		list_del(&t->list_node);
+		kfree(t);
+	} else {
+		t->direction &= ~direction;
+	}
+
+	__thermal_zone_device_update(tz, THERMAL_TZ_DEL_THRESHOLD);
+
+	return 0;
+}
+
+int thermal_thresholds_for_each(struct thermal_zone_device *tz,
+				int (*cb)(struct user_threshold *, void *arg), void *arg)
+{
+	struct list_head *thresholds = &tz->user_thresholds;
+	struct user_threshold *entry;
+	int ret;
+
+	lockdep_assert_held(&tz->lock);
+
+	list_for_each_entry(entry, thresholds, list_node) {
+		ret = cb(entry, arg);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
diff --git a/drivers/thermal/thermal_thresholds.h b/drivers/thermal/thermal_thresholds.h
new file mode 100644
index 000000000000..232f4e8089af
--- /dev/null
+++ b/drivers/thermal/thermal_thresholds.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __THERMAL_THRESHOLDS_H__
+#define __THERMAL_THRESHOLDS_H__
+
+struct user_threshold {
+	struct list_head list_node;
+	int temperature;
+	int direction;
+};
+
+int thermal_thresholds_init(struct thermal_zone_device *tz);
+void thermal_thresholds_exit(struct thermal_zone_device *tz);
+void thermal_thresholds_flush(struct thermal_zone_device *tz);
+void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *high);
+int thermal_thresholds_add(struct thermal_zone_device *tz, int temperature, int direction);
+int thermal_thresholds_delete(struct thermal_zone_device *tz, int temperature, int direction);
+int thermal_thresholds_for_each(struct thermal_zone_device *tz,
+				int (*cb)(struct user_threshold *, void *arg), void *arg);
+#endif
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 25ea8fe2313e..bcaa92732e14 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -56,6 +56,9 @@ enum thermal_notify_event {
 	THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */
 	THERMAL_INSTANCE_WEIGHT_CHANGED, /* Thermal instance weight changed */
 	THERMAL_TZ_RESUME, /* Thermal zone is resuming after system sleep */
+	THERMAL_TZ_ADD_THRESHOLD, /* Threshold added */
+	THERMAL_TZ_DEL_THRESHOLD, /* Threshold deleted */
+	THERMAL_TZ_FLUSH_THRESHOLDS, /* All thresholds deleted */
 };
 
 /**
diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h
index fc78bf3aead7..2e6f60a36173 100644
--- a/include/uapi/linux/thermal.h
+++ b/include/uapi/linux/thermal.h
@@ -3,6 +3,8 @@
 #define _UAPI_LINUX_THERMAL_H
 
 #define THERMAL_NAME_LENGTH	20
+#define THERMAL_THRESHOLD_WAY_UP	BIT(0)
+#define THERMAL_THRESHOLD_WAY_DOWN	BIT(1)
 
 enum thermal_device_mode {
 	THERMAL_DEVICE_DISABLED = 0,
-- 
cgit v1.2.3


From 9fb6fef0fb49124291837af1da5028f79d53f98e Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 14 Jun 2024 13:06:03 +0300
Subject: resource: Add resource set range and size helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Setting the end address for a resource with a given size lacks a helper and
is therefore coded manually unlike the getter side which has a helper for
resource size calculation. Also, almost all callsites that calculate the
end address for a resource also set the start address right before it like
this:

  res->start = start_addr;
  res->end = res->start + size - 1;

Add resource_set_range(res, start_addr, size) that sets the start address
and calculates the end address to simplify this often repeated fragment.

Also add resource_set_size() for the cases where setting the start address
of the resource is not necessary but mention in its kerneldoc that
resource_set_range() is preferred when setting both addresses.

Link: https://lore.kernel.org/r/20240614100606.15830-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/ioport.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 6e9fb667a1c5..5385349f0b8a 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -249,6 +249,38 @@ struct resource *lookup_resource(struct resource *root, resource_size_t start);
 int adjust_resource(struct resource *res, resource_size_t start,
 		    resource_size_t size);
 resource_size_t resource_alignment(struct resource *res);
+
+/**
+ * resource_set_size - Calculate resource end address from size and start
+ * @res: Resource descriptor
+ * @size: Size of the resource
+ *
+ * Calculate the end address for @res based on @size.
+ *
+ * Note: The start address of @res must be set when calling this function.
+ * Prefer resource_set_range() if setting both the start address and @size.
+ */
+static inline void resource_set_size(struct resource *res, resource_size_t size)
+{
+	res->end = res->start + size - 1;
+}
+
+/**
+ * resource_set_range - Set resource start and end addresses
+ * @res: Resource descriptor
+ * @start: Start address for the resource
+ * @size: Size of the resource
+ *
+ * Set @res start address and calculate the end address based on @size.
+ */
+static inline void resource_set_range(struct resource *res,
+				      resource_size_t start,
+				      resource_size_t size)
+{
+	res->start = start;
+	resource_set_size(res, size);
+}
+
 static inline resource_size_t resource_size(const struct resource *res)
 {
 	return res->end - res->start + 1;
-- 
cgit v1.2.3


From 7888af4166d4ab07ba51234be6ba332b7807e901 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 8 Oct 2024 19:05:28 -0400
Subject: ftrace: Make ftrace_regs abstract from direct use

ftrace_regs was created to hold registers that store information to save
function parameters, return value and stack. Since it is a subset of
pt_regs, it should only be used by its accessor functions. But because
pt_regs can easily be taken from ftrace_regs (on most archs), it is
tempting to use it directly. But when running on other architectures, it
may fail to build or worse, build but crash the kernel!

Instead, make struct ftrace_regs an empty structure and have the
architectures define __arch_ftrace_regs and all the accessor functions
will typecast to it to get to the actual fields. This will help avoid
usage of ftrace_regs directly.

Link: https://lore.kernel.org/all/20241007171027.629bdafd@gandalf.local.home/

Cc: "linux-arch@vger.kernel.org" <linux-arch@vger.kernel.org>
Cc: "x86@kernel.org" <x86@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Naveen N Rao <naveen@kernel.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Paul  Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas  Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav  Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lore.kernel.org/20241008230628.958778821@goodmis.org
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Acked-by: Heiko Carstens <hca@linux.ibm.com> # s390
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 arch/arm64/include/asm/ftrace.h          | 20 +++++++++++---------
 arch/arm64/kernel/asm-offsets.c          | 22 +++++++++++-----------
 arch/arm64/kernel/ftrace.c               | 10 +++++-----
 arch/loongarch/include/asm/ftrace.h      | 22 ++++++++++++----------
 arch/loongarch/kernel/ftrace_dyn.c       |  2 +-
 arch/powerpc/include/asm/ftrace.h        | 21 ++++++++++++---------
 arch/powerpc/kernel/trace/ftrace.c       |  4 ++--
 arch/powerpc/kernel/trace/ftrace_64_pg.c |  2 +-
 arch/riscv/include/asm/ftrace.h          | 21 ++++++++++++---------
 arch/riscv/kernel/asm-offsets.c          | 28 ++++++++++++++--------------
 arch/riscv/kernel/ftrace.c               |  2 +-
 arch/s390/include/asm/ftrace.h           | 23 +++++++++++++----------
 arch/s390/kernel/asm-offsets.c           |  4 ++--
 arch/s390/kernel/ftrace.c                |  2 +-
 arch/s390/lib/test_unwind.c              |  4 ++--
 arch/x86/include/asm/ftrace.h            | 25 ++++++++++++++-----------
 arch/x86/kernel/ftrace.c                 |  2 +-
 include/linux/ftrace.h                   | 21 ++++++++++++++++++---
 kernel/trace/ftrace.c                    |  2 +-
 19 files changed, 134 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index dc9cf0bd2a4c..bbb69c7751b9 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -56,6 +56,8 @@ unsigned long ftrace_call_adjust(unsigned long addr);
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
 struct dyn_ftrace;
 struct ftrace_ops;
+struct ftrace_regs;
+#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
 
 #define arch_ftrace_get_regs(regs) NULL
 
@@ -63,7 +65,7 @@ struct ftrace_ops;
  * Note: sizeof(struct ftrace_regs) must be a multiple of 16 to ensure correct
  * stack alignment
  */
-struct ftrace_regs {
+struct __arch_ftrace_regs {
 	/* x0 - x8 */
 	unsigned long regs[9];
 
@@ -83,47 +85,47 @@ struct ftrace_regs {
 static __always_inline unsigned long
 ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs)
 {
-	return fregs->pc;
+	return arch_ftrace_regs(fregs)->pc;
 }
 
 static __always_inline void
 ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
 				    unsigned long pc)
 {
-	fregs->pc = pc;
+	arch_ftrace_regs(fregs)->pc = pc;
 }
 
 static __always_inline unsigned long
 ftrace_regs_get_stack_pointer(const struct ftrace_regs *fregs)
 {
-	return fregs->sp;
+	return arch_ftrace_regs(fregs)->sp;
 }
 
 static __always_inline unsigned long
 ftrace_regs_get_argument(struct ftrace_regs *fregs, unsigned int n)
 {
 	if (n < 8)
-		return fregs->regs[n];
+		return arch_ftrace_regs(fregs)->regs[n];
 	return 0;
 }
 
 static __always_inline unsigned long
 ftrace_regs_get_return_value(const struct ftrace_regs *fregs)
 {
-	return fregs->regs[0];
+	return arch_ftrace_regs(fregs)->regs[0];
 }
 
 static __always_inline void
 ftrace_regs_set_return_value(struct ftrace_regs *fregs,
 			     unsigned long ret)
 {
-	fregs->regs[0] = ret;
+	arch_ftrace_regs(fregs)->regs[0] = ret;
 }
 
 static __always_inline void
 ftrace_override_function_with_return(struct ftrace_regs *fregs)
 {
-	fregs->pc = fregs->lr;
+	arch_ftrace_regs(fregs)->pc = arch_ftrace_regs(fregs)->lr;
 }
 
 int ftrace_regs_query_register_offset(const char *name);
@@ -143,7 +145,7 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs,
 	 * The ftrace trampoline will return to this address instead of the
 	 * instrumented function.
 	 */
-	fregs->direct_tramp = addr;
+	arch_ftrace_regs(fregs)->direct_tramp = addr;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 27de1dddb0ab..a5de57f68219 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -84,19 +84,19 @@ int main(void)
   DEFINE(PT_REGS_SIZE,		sizeof(struct pt_regs));
   BLANK();
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
-  DEFINE(FREGS_X0,		offsetof(struct ftrace_regs, regs[0]));
-  DEFINE(FREGS_X2,		offsetof(struct ftrace_regs, regs[2]));
-  DEFINE(FREGS_X4,		offsetof(struct ftrace_regs, regs[4]));
-  DEFINE(FREGS_X6,		offsetof(struct ftrace_regs, regs[6]));
-  DEFINE(FREGS_X8,		offsetof(struct ftrace_regs, regs[8]));
-  DEFINE(FREGS_FP,		offsetof(struct ftrace_regs, fp));
-  DEFINE(FREGS_LR,		offsetof(struct ftrace_regs, lr));
-  DEFINE(FREGS_SP,		offsetof(struct ftrace_regs, sp));
-  DEFINE(FREGS_PC,		offsetof(struct ftrace_regs, pc));
+  DEFINE(FREGS_X0,		offsetof(struct __arch_ftrace_regs, regs[0]));
+  DEFINE(FREGS_X2,		offsetof(struct __arch_ftrace_regs, regs[2]));
+  DEFINE(FREGS_X4,		offsetof(struct __arch_ftrace_regs, regs[4]));
+  DEFINE(FREGS_X6,		offsetof(struct __arch_ftrace_regs, regs[6]));
+  DEFINE(FREGS_X8,		offsetof(struct __arch_ftrace_regs, regs[8]));
+  DEFINE(FREGS_FP,		offsetof(struct __arch_ftrace_regs, fp));
+  DEFINE(FREGS_LR,		offsetof(struct __arch_ftrace_regs, lr));
+  DEFINE(FREGS_SP,		offsetof(struct __arch_ftrace_regs, sp));
+  DEFINE(FREGS_PC,		offsetof(struct __arch_ftrace_regs, pc));
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-  DEFINE(FREGS_DIRECT_TRAMP,	offsetof(struct ftrace_regs, direct_tramp));
+  DEFINE(FREGS_DIRECT_TRAMP,	offsetof(struct __arch_ftrace_regs, direct_tramp));
 #endif
-  DEFINE(FREGS_SIZE,		sizeof(struct ftrace_regs));
+  DEFINE(FREGS_SIZE,		sizeof(struct __arch_ftrace_regs));
   BLANK();
 #endif
 #ifdef CONFIG_COMPAT
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index a650f5e11fc5..b2d947175cbe 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -23,10 +23,10 @@ struct fregs_offset {
 	int offset;
 };
 
-#define FREGS_OFFSET(n, field)				\
-{							\
-	.name = n,					\
-	.offset = offsetof(struct ftrace_regs, field),	\
+#define FREGS_OFFSET(n, field)					\
+{								\
+	.name = n,						\
+	.offset = offsetof(struct __arch_ftrace_regs, field),	\
 }
 
 static const struct fregs_offset fregs_offsets[] = {
@@ -481,7 +481,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-	prepare_ftrace_return(ip, &fregs->lr, fregs->fp);
+	prepare_ftrace_return(ip, &arch_ftrace_regs(fregs)->lr, arch_ftrace_regs(fregs)->fp);
 }
 #else
 /*
diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h
index c0a682808e07..0e15d36ce251 100644
--- a/arch/loongarch/include/asm/ftrace.h
+++ b/arch/loongarch/include/asm/ftrace.h
@@ -43,38 +43,40 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent);
 
 #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
 struct ftrace_ops;
+struct ftrace_regs;
+#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
 
-struct ftrace_regs {
+struct __arch_ftrace_regs {
 	struct pt_regs regs;
 };
 
 static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
 {
-	return &fregs->regs;
+	return &arch_ftrace_regs(fregs)->regs;
 }
 
 static __always_inline unsigned long
 ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs)
 {
-	return instruction_pointer(&fregs->regs);
+	return instruction_pointer(&arch_ftrace_regs(fregs)->regs);
 }
 
 static __always_inline void
 ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip)
 {
-	instruction_pointer_set(&fregs->regs, ip);
+	instruction_pointer_set(&arch_ftrace_regs(fregs)->regs, ip);
 }
 
 #define ftrace_regs_get_argument(fregs, n) \
-	regs_get_kernel_argument(&(fregs)->regs, n)
+	regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n)
 #define ftrace_regs_get_stack_pointer(fregs) \
-	kernel_stack_pointer(&(fregs)->regs)
+	kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_return_value(fregs) \
-	regs_return_value(&(fregs)->regs)
+	regs_return_value(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_set_return_value(fregs, ret) \
-	regs_set_return_value(&(fregs)->regs, ret)
+	regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret)
 #define ftrace_override_function_with_return(fregs) \
-	override_function_with_return(&(fregs)->regs)
+	override_function_with_return(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_query_register_offset(name) \
 	regs_query_register_offset(name)
 
@@ -90,7 +92,7 @@ __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
 }
 
 #define arch_ftrace_set_direct_caller(fregs, addr) \
-	__arch_ftrace_set_direct_caller(&(fregs)->regs, addr)
+	__arch_ftrace_set_direct_caller(&arch_ftrace_regs(fregs)->regs, addr)
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
 #endif
diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c
index bff058317062..18056229e22e 100644
--- a/arch/loongarch/kernel/ftrace_dyn.c
+++ b/arch/loongarch/kernel/ftrace_dyn.c
@@ -241,7 +241,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent)
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-	struct pt_regs *regs = &fregs->regs;
+	struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs;
 	unsigned long *parent = (unsigned long *)&regs->regs[1];
 
 	prepare_ftrace_return(ip, (unsigned long *)parent);
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index 559560286e6d..e299fd47d201 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -32,39 +32,42 @@ struct dyn_arch_ftrace {
 int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
 #define ftrace_init_nop ftrace_init_nop
 
-struct ftrace_regs {
+struct ftrace_regs;
+#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
+
+struct __arch_ftrace_regs {
 	struct pt_regs regs;
 };
 
 static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
 {
 	/* We clear regs.msr in ftrace_call */
-	return fregs->regs.msr ? &fregs->regs : NULL;
+	return arch_ftrace_regs(fregs)->regs.msr ? &arch_ftrace_regs(fregs)->regs : NULL;
 }
 
 static __always_inline void
 ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
 				    unsigned long ip)
 {
-	regs_set_return_ip(&fregs->regs, ip);
+	regs_set_return_ip(&arch_ftrace_regs(fregs)->regs, ip);
 }
 
 static __always_inline unsigned long
 ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs)
 {
-	return instruction_pointer(&fregs->regs);
+	return instruction_pointer(&arch_ftrace_regs(fregs)->regs);
 }
 
 #define ftrace_regs_get_argument(fregs, n) \
-	regs_get_kernel_argument(&(fregs)->regs, n)
+	regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n)
 #define ftrace_regs_get_stack_pointer(fregs) \
-	kernel_stack_pointer(&(fregs)->regs)
+	kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_return_value(fregs) \
-	regs_return_value(&(fregs)->regs)
+	regs_return_value(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_set_return_value(fregs, ret) \
-	regs_set_return_value(&(fregs)->regs, ret)
+	regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret)
 #define ftrace_override_function_with_return(fregs) \
-	override_function_with_return(&(fregs)->regs)
+	override_function_with_return(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_query_register_offset(name) \
 	regs_query_register_offset(name)
 
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index d8d6b4fd9a14..df41f4a7c738 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -421,7 +421,7 @@ int __init ftrace_dyn_arch_init(void)
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-	unsigned long sp = fregs->regs.gpr[1];
+	unsigned long sp = arch_ftrace_regs(fregs)->regs.gpr[1];
 	int bit;
 
 	if (unlikely(ftrace_graph_is_dead()))
@@ -439,6 +439,6 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 
 	ftrace_test_recursion_unlock(bit);
 out:
-	fregs->regs.link = parent_ip;
+	arch_ftrace_regs(fregs)->regs.link = parent_ip;
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c
index 12fab1803bcf..d3c5552e4984 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.c
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c
@@ -829,7 +829,7 @@ out:
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-	fregs->regs.link = __prepare_ftrace_return(parent_ip, ip, fregs->regs.gpr[1]);
+	arch_ftrace_regs(fregs)->regs.link = __prepare_ftrace_return(parent_ip, ip, arch_ftrace_regs(fregs)->regs.gpr[1]);
 }
 #else
 unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index 2cddd79ff21b..c6bcdff105b5 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -126,7 +126,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
 #define arch_ftrace_get_regs(regs) NULL
 struct ftrace_ops;
-struct ftrace_regs {
+struct ftrace_regs;
+#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
+
+struct __arch_ftrace_regs {
 	unsigned long epc;
 	unsigned long ra;
 	unsigned long sp;
@@ -150,42 +153,42 @@ struct ftrace_regs {
 static __always_inline unsigned long ftrace_regs_get_instruction_pointer(const struct ftrace_regs
 									 *fregs)
 {
-	return fregs->epc;
+	return arch_ftrace_regs(fregs)->epc;
 }
 
 static __always_inline void ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
 								unsigned long pc)
 {
-	fregs->epc = pc;
+	arch_ftrace_regs(fregs)->epc = pc;
 }
 
 static __always_inline unsigned long ftrace_regs_get_stack_pointer(const struct ftrace_regs *fregs)
 {
-	return fregs->sp;
+	return arch_ftrace_regs(fregs)->sp;
 }
 
 static __always_inline unsigned long ftrace_regs_get_argument(struct ftrace_regs *fregs,
 							      unsigned int n)
 {
 	if (n < 8)
-		return fregs->args[n];
+		return arch_ftrace_regs(fregs)->args[n];
 	return 0;
 }
 
 static __always_inline unsigned long ftrace_regs_get_return_value(const struct ftrace_regs *fregs)
 {
-	return fregs->a0;
+	return arch_ftrace_regs(fregs)->a0;
 }
 
 static __always_inline void ftrace_regs_set_return_value(struct ftrace_regs *fregs,
 							 unsigned long ret)
 {
-	fregs->a0 = ret;
+	arch_ftrace_regs(fregs)->a0 = ret;
 }
 
 static __always_inline void ftrace_override_function_with_return(struct ftrace_regs *fregs)
 {
-	fregs->epc = fregs->ra;
+	arch_ftrace_regs(fregs)->epc = arch_ftrace_regs(fregs)->ra;
 }
 
 int ftrace_regs_query_register_offset(const char *name);
@@ -196,7 +199,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 
 static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr)
 {
-	fregs->t1 = addr;
+	arch_ftrace_regs(fregs)->t1 = addr;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
 
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index e94180ba432f..f6f5a277ba9d 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -498,19 +498,19 @@ void asm_offsets(void)
 	OFFSET(STACKFRAME_RA, stackframe, ra);
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
-	DEFINE(FREGS_SIZE_ON_STACK, ALIGN(sizeof(struct ftrace_regs), STACK_ALIGN));
-	DEFINE(FREGS_EPC,	    offsetof(struct ftrace_regs, epc));
-	DEFINE(FREGS_RA,	    offsetof(struct ftrace_regs, ra));
-	DEFINE(FREGS_SP,	    offsetof(struct ftrace_regs, sp));
-	DEFINE(FREGS_S0,	    offsetof(struct ftrace_regs, s0));
-	DEFINE(FREGS_T1,	    offsetof(struct ftrace_regs, t1));
-	DEFINE(FREGS_A0,	    offsetof(struct ftrace_regs, a0));
-	DEFINE(FREGS_A1,	    offsetof(struct ftrace_regs, a1));
-	DEFINE(FREGS_A2,	    offsetof(struct ftrace_regs, a2));
-	DEFINE(FREGS_A3,	    offsetof(struct ftrace_regs, a3));
-	DEFINE(FREGS_A4,	    offsetof(struct ftrace_regs, a4));
-	DEFINE(FREGS_A5,	    offsetof(struct ftrace_regs, a5));
-	DEFINE(FREGS_A6,	    offsetof(struct ftrace_regs, a6));
-	DEFINE(FREGS_A7,	    offsetof(struct ftrace_regs, a7));
+	DEFINE(FREGS_SIZE_ON_STACK, ALIGN(sizeof(struct __arch_ftrace_regs), STACK_ALIGN));
+	DEFINE(FREGS_EPC,	    offsetof(struct __arch_ftrace_regs, epc));
+	DEFINE(FREGS_RA,	    offsetof(struct __arch_ftrace_regs, ra));
+	DEFINE(FREGS_SP,	    offsetof(struct __arch_ftrace_regs, sp));
+	DEFINE(FREGS_S0,	    offsetof(struct __arch_ftrace_regs, s0));
+	DEFINE(FREGS_T1,	    offsetof(struct __arch_ftrace_regs, t1));
+	DEFINE(FREGS_A0,	    offsetof(struct __arch_ftrace_regs, a0));
+	DEFINE(FREGS_A1,	    offsetof(struct __arch_ftrace_regs, a1));
+	DEFINE(FREGS_A2,	    offsetof(struct __arch_ftrace_regs, a2));
+	DEFINE(FREGS_A3,	    offsetof(struct __arch_ftrace_regs, a3));
+	DEFINE(FREGS_A4,	    offsetof(struct __arch_ftrace_regs, a4));
+	DEFINE(FREGS_A5,	    offsetof(struct __arch_ftrace_regs, a5));
+	DEFINE(FREGS_A6,	    offsetof(struct __arch_ftrace_regs, a6));
+	DEFINE(FREGS_A7,	    offsetof(struct __arch_ftrace_regs, a7));
 #endif
 }
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 4b95c574fd04..5081ad886841 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -214,7 +214,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-	prepare_ftrace_return(&fregs->ra, ip, fregs->s0);
+	prepare_ftrace_return(&arch_ftrace_regs(fregs)->ra, ip, arch_ftrace_regs(fregs)->s0);
 }
 #else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
 extern void ftrace_graph_call(void);
diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
index 406746666eb7..1498d0a9c762 100644
--- a/arch/s390/include/asm/ftrace.h
+++ b/arch/s390/include/asm/ftrace.h
@@ -51,13 +51,16 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 	return addr;
 }
 
-struct ftrace_regs {
+struct ftrace_regs;
+#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
+
+struct __arch_ftrace_regs {
 	struct pt_regs regs;
 };
 
 static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
 {
-	struct pt_regs *regs = &fregs->regs;
+	struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs;
 
 	if (test_pt_regs_flag(regs, PIF_FTRACE_FULL_REGS))
 		return regs;
@@ -84,26 +87,26 @@ static __always_inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph
 static __always_inline unsigned long
 ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs)
 {
-	return fregs->regs.psw.addr;
+	return arch_ftrace_regs(fregs)->regs.psw.addr;
 }
 
 static __always_inline void
 ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
 				    unsigned long ip)
 {
-	fregs->regs.psw.addr = ip;
+	arch_ftrace_regs(fregs)->regs.psw.addr = ip;
 }
 
 #define ftrace_regs_get_argument(fregs, n) \
-	regs_get_kernel_argument(&(fregs)->regs, n)
+	regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n)
 #define ftrace_regs_get_stack_pointer(fregs) \
-	kernel_stack_pointer(&(fregs)->regs)
+	kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_return_value(fregs) \
-	regs_return_value(&(fregs)->regs)
+	regs_return_value(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_set_return_value(fregs, ret) \
-	regs_set_return_value(&(fregs)->regs, ret)
+	regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret)
 #define ftrace_override_function_with_return(fregs) \
-	override_function_with_return(&(fregs)->regs)
+	override_function_with_return(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_query_register_offset(name) \
 	regs_query_register_offset(name)
 
@@ -117,7 +120,7 @@ ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
  */
 static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr)
 {
-	struct pt_regs *regs = &fregs->regs;
+	struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs;
 	regs->orig_gpr2 = addr;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 5529248d84fb..db9659980175 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -184,8 +184,8 @@ int main(void)
 	OFFSET(__FGRAPH_RET_FP, fgraph_ret_regs, fp);
 	DEFINE(__FGRAPH_RET_SIZE, sizeof(struct fgraph_ret_regs));
 #endif
-	OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs);
-	DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs));
+	OFFSET(__FTRACE_REGS_PT_REGS, __arch_ftrace_regs, regs);
+	DEFINE(__FTRACE_REGS_SIZE, sizeof(struct __arch_ftrace_regs));
 
 	OFFSET(__PCPU_FLAGS, pcpu, flags);
 	return 0;
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 0b6e62d1d8b8..51439a71e392 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -318,7 +318,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	if (bit < 0)
 		return;
 
-	kmsan_unpoison_memory(fregs, sizeof(*fregs));
+	kmsan_unpoison_memory(fregs, ftrace_regs_size());
 	regs = ftrace_get_regs(fregs);
 	p = get_kprobe((kprobe_opcode_t *)ip);
 	if (!regs || unlikely(!p) || kprobe_disabled(p))
diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c
index 8b7f981e6f34..6e42100875e7 100644
--- a/arch/s390/lib/test_unwind.c
+++ b/arch/s390/lib/test_unwind.c
@@ -270,9 +270,9 @@ static void notrace __used test_unwind_ftrace_handler(unsigned long ip,
 						      struct ftrace_ops *fops,
 						      struct ftrace_regs *fregs)
 {
-	struct unwindme *u = (struct unwindme *)fregs->regs.gprs[2];
+	struct unwindme *u = (struct unwindme *)arch_ftrace_regs(fregs)->regs.gprs[2];
 
-	u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? &fregs->regs : NULL,
+	u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? &arch_ftrace_regs(fregs)->regs : NULL,
 			     (u->flags & UWM_SP) ? u->sp : 0);
 }
 
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 0152a81d9b4a..87943f7a299b 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -33,7 +33,10 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 }
 
 #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
-struct ftrace_regs {
+struct ftrace_regs;
+#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
+
+struct __arch_ftrace_regs {
 	struct pt_regs		regs;
 };
 
@@ -41,27 +44,27 @@ static __always_inline struct pt_regs *
 arch_ftrace_get_regs(struct ftrace_regs *fregs)
 {
 	/* Only when FL_SAVE_REGS is set, cs will be non zero */
-	if (!fregs->regs.cs)
+	if (!arch_ftrace_regs(fregs)->regs.cs)
 		return NULL;
-	return &fregs->regs;
+	return &arch_ftrace_regs(fregs)->regs;
 }
 
 #define ftrace_regs_set_instruction_pointer(fregs, _ip)	\
-	do { (fregs)->regs.ip = (_ip); } while (0)
+	do { arch_ftrace_regs(fregs)->regs.ip = (_ip); } while (0)
 
 #define ftrace_regs_get_instruction_pointer(fregs) \
-	((fregs)->regs.ip)
+	arch_ftrace_regs(fregs)->regs.ip)
 
 #define ftrace_regs_get_argument(fregs, n) \
-	regs_get_kernel_argument(&(fregs)->regs, n)
+	regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n)
 #define ftrace_regs_get_stack_pointer(fregs) \
-	kernel_stack_pointer(&(fregs)->regs)
+	kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_return_value(fregs) \
-	regs_return_value(&(fregs)->regs)
+	regs_return_value(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_set_return_value(fregs, ret) \
-	regs_set_return_value(&(fregs)->regs, ret)
+	regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret)
 #define ftrace_override_function_with_return(fregs) \
-	override_function_with_return(&(fregs)->regs)
+	override_function_with_return(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_query_register_offset(name) \
 	regs_query_register_offset(name)
 
@@ -88,7 +91,7 @@ __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
 	regs->orig_ax = addr;
 }
 #define arch_ftrace_set_direct_caller(fregs, addr) \
-	__arch_ftrace_set_direct_caller(&(fregs)->regs, addr)
+	__arch_ftrace_set_direct_caller(&arch_ftrace_regs(fregs)->regs, addr)
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8da0e66ca22d..adb09f78edb2 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -647,7 +647,7 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-	struct pt_regs *regs = &fregs->regs;
+	struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs;
 	unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
 
 	prepare_ftrace_return(ip, (unsigned long *)stack, 0);
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 4c7dd5e58c9f..66f10291a0b2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -115,8 +115,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val
 
 extern int ftrace_enabled;
 
-#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
-
 /**
  * ftrace_regs - ftrace partial/optimal register set
  *
@@ -142,11 +140,28 @@ extern int ftrace_enabled;
  *
  * NOTE: user *must not* access regs directly, only do it via APIs, because
  * the member can be changed according to the architecture.
+ * This is why the structure is empty here, so that nothing accesses
+ * the ftrace_regs directly.
  */
 struct ftrace_regs {
+	/* Nothing to see here, use the accessor functions! */
+};
+
+#define ftrace_regs_size()	sizeof(struct __arch_ftrace_regs)
+
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+
+struct __arch_ftrace_regs {
 	struct pt_regs		regs;
 };
-#define arch_ftrace_get_regs(fregs) (&(fregs)->regs)
+
+struct ftrace_regs;
+#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
+
+static inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
+{
+	return &arch_ftrace_regs(fregs)->regs;
+}
 
 /*
  * ftrace_regs_set_instruction_pointer() is to be defined by the architecture
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cae388122ca8..e9fd4fb2769e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7943,7 +7943,7 @@ out:
 void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
 			       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-	kmsan_unpoison_memory(fregs, sizeof(*fregs));
+	kmsan_unpoison_memory(fregs, ftrace_regs_size());
 	__ftrace_ops_list_func(ip, parent_ip, NULL, fregs);
 }
 #else
-- 
cgit v1.2.3


From 483c5c2bc6b1c0d17214e2672032def605ff2ba9 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng@cqsoftware.com.cn>
Date: Fri, 20 Sep 2024 13:34:23 +0800
Subject: serial: clean up uart_info

Since commit ebd2c8f6d2ec ("serial: kill off uart_info") has
removed uart_info, the uart_info declaration looks lonely,
let it go.

Signed-off-by: Yanteng Si <siyanteng@cqsoftware.com.cn>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20240920053423.1373354-1-siyanteng@cqsoftware.com.cn
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/platform_data/sa11x0-serial.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/sa11x0-serial.h b/include/linux/platform_data/sa11x0-serial.h
index 8b79ab08af45..a88096bc74e4 100644
--- a/include/linux/platform_data/sa11x0-serial.h
+++ b/include/linux/platform_data/sa11x0-serial.h
@@ -10,7 +10,6 @@
 #define SA11X0_SERIAL_H
 
 struct uart_port;
-struct uart_info;
 
 /*
  * This is a temporary structure for registering these
-- 
cgit v1.2.3


From 7738a7ab9d12c5371ed97114ee2132d4512e9fd5 Mon Sep 17 00:00:00 2001
From: Parker Newman <pnewman@connecttech.com>
Date: Wed, 2 Oct 2024 11:12:33 -0400
Subject: misc: eeprom: eeprom_93cx6: Add quirk for extra read clock cycle

Add a quirk similar to eeprom_93xx46 to add an extra clock cycle before
reading data from the EEPROM.

The 93Cx6 family of EEPROMs output a "dummy 0 bit" between the writing
of the op-code/address from the host to the EEPROM and the reading of
the actual data from the EEPROM.

More info can be found on page 6 of the AT93C46 datasheet (linked below).
Similar notes are found in other 93xx6 datasheets.

In summary the read operation for a 93Cx6 EEPROM is:
Write to EEPROM:	110[A5-A0]	(9 bits)
Read from EEPROM:	0[D15-D0]	(17 bits)

Where:
	110 is the start bit and READ OpCode
	[A5-A0] is the address to read from
	0 is a "dummy bit" preceding the actual data
	[D15-D0] is the actual data.

Looking at the READ timing diagrams in the 93Cx6 datasheets the dummy
bit should be clocked out on the last address bit clock cycle meaning it
should be discarded naturally.

However, depending on the hardware configuration sometimes this dummy
bit is not discarded. This is the case with Exar PCI UARTs which require
an extra clock cycle between sending the address and reading the data.

Datasheet: https://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-5193-SEEPROM-AT93C46D-Datasheet.pdf
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Parker Newman <pnewman@connecttech.com>
Link: https://lore.kernel.org/r/0f23973efefccd2544705a0480b4ad4c2353e407.1727880931.git.pnewman@connecttech.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/eeprom/eeprom_93cx6.c | 10 ++++++++++
 include/linux/eeprom_93cx6.h       | 11 +++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/misc/eeprom/eeprom_93cx6.c b/drivers/misc/eeprom/eeprom_93cx6.c
index 9627294fe3e9..4c9827fe9217 100644
--- a/drivers/misc/eeprom/eeprom_93cx6.c
+++ b/drivers/misc/eeprom/eeprom_93cx6.c
@@ -186,6 +186,11 @@ void eeprom_93cx6_read(struct eeprom_93cx6 *eeprom, const u8 word,
 	eeprom_93cx6_write_bits(eeprom, command,
 		PCI_EEPROM_WIDTH_OPCODE + eeprom->width);
 
+	if (has_quirk_extra_read_cycle(eeprom)) {
+		eeprom_93cx6_pulse_high(eeprom);
+		eeprom_93cx6_pulse_low(eeprom);
+	}
+
 	/*
 	 * Read the requested 16 bits.
 	 */
@@ -252,6 +257,11 @@ void eeprom_93cx6_readb(struct eeprom_93cx6 *eeprom, const u8 byte,
 	eeprom_93cx6_write_bits(eeprom, command,
 		PCI_EEPROM_WIDTH_OPCODE + eeprom->width + 1);
 
+	if (has_quirk_extra_read_cycle(eeprom)) {
+		eeprom_93cx6_pulse_high(eeprom);
+		eeprom_93cx6_pulse_low(eeprom);
+	}
+
 	/*
 	 * Read the requested 8 bits.
 	 */
diff --git a/include/linux/eeprom_93cx6.h b/include/linux/eeprom_93cx6.h
index c860c72a921d..3a485cc0e0fa 100644
--- a/include/linux/eeprom_93cx6.h
+++ b/include/linux/eeprom_93cx6.h
@@ -11,6 +11,8 @@
 	Supported chipsets: 93c46, 93c56 and 93c66.
  */
 
+#include <linux/bits.h>
+
 /*
  * EEPROM operation defines.
  */
@@ -34,6 +36,7 @@
  * @register_write(struct eeprom_93cx6 *eeprom): handler to
  * write to the eeprom register by using all reg_* fields.
  * @width: eeprom width, should be one of the PCI_EEPROM_WIDTH_* defines
+ * @quirks: eeprom or controller quirks
  * @drive_data: Set if we're driving the data line.
  * @reg_data_in: register field to indicate data input
  * @reg_data_out: register field to indicate data output
@@ -50,6 +53,9 @@ struct eeprom_93cx6 {
 	void (*register_write)(struct eeprom_93cx6 *eeprom);
 
 	int width;
+	unsigned int quirks;
+/* Some EEPROMs require an extra clock cycle before reading */
+#define PCI_EEPROM_QUIRK_EXTRA_READ_CYCLE	BIT(0)
 
 	char drive_data;
 	char reg_data_in;
@@ -71,3 +77,8 @@ extern void eeprom_93cx6_wren(struct eeprom_93cx6 *eeprom, bool enable);
 
 extern void eeprom_93cx6_write(struct eeprom_93cx6 *eeprom,
 			       u8 addr, u16 data);
+
+static inline bool has_quirk_extra_read_cycle(struct eeprom_93cx6 *eeprom)
+{
+	return eeprom->quirks & PCI_EEPROM_QUIRK_EXTRA_READ_CYCLE;
+}
-- 
cgit v1.2.3


From 11dad94b50263bbe87d015041c77be61c8c44161 Mon Sep 17 00:00:00 2001
From: Andrew Kreimer <algonell@gmail.com>
Date: Thu, 10 Oct 2024 12:13:55 +0300
Subject: phy: sun4i-usb: Fix a typo

Fix a typo in comments: wether -> whether.

Signed-off-by: Andrew Kreimer <algonell@gmail.com>
Acked-by: Chen-Yu Tsai <wens@csie.org>
Link: https://lore.kernel.org/r/20241010091355.8271-1-algonell@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/phy/phy-sun4i-usb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/phy/phy-sun4i-usb.h b/include/linux/phy/phy-sun4i-usb.h
index 91eb755ee73b..f3e7b13608e4 100644
--- a/include/linux/phy/phy-sun4i-usb.h
+++ b/include/linux/phy/phy-sun4i-usb.h
@@ -11,7 +11,7 @@
 /**
  * sun4i_usb_phy_set_squelch_detect() - Enable/disable squelch detect
  * @phy: reference to a sun4i usb phy
- * @enabled: wether to enable or disable squelch detect
+ * @enabled: whether to enable or disable squelch detect
  */
 void sun4i_usb_phy_set_squelch_detect(struct phy *phy, bool enabled);
 
-- 
cgit v1.2.3


From ed870e35db660724ff0d815d9a3ef9a6247ffbab Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 9 Oct 2024 10:32:09 -0700
Subject: lsm: add the lsm_prop data structure

When more than one security module is exporting data to audit and
networking sub-systems a single 32 bit integer is no longer
sufficient to represent the data. Add a structure to be used instead.

The lsm_prop structure definition is intended to keep the LSM
specific information private to the individual security modules.
The module specific information is included in a new set of
header files under include/lsm. Each security module is allowed
to define the information included for its use in the lsm_prop.
SELinux includes a u32 secid. Smack includes a pointer into its
global label list. The conditional compilation based on feature
inclusion is contained in the include/lsm files.

Cc: apparmor@lists.ubuntu.com
Cc: bpf@vger.kernel.org
Cc: selinux@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Suggested-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: John Johansen <john.johansen@canonical.com>
[PM: added include/linux/lsm/ to MAINTAINERS, subj tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 MAINTAINERS                  |  1 +
 include/linux/lsm/apparmor.h | 17 +++++++++++++++++
 include/linux/lsm/bpf.h      | 16 ++++++++++++++++
 include/linux/lsm/selinux.h  | 16 ++++++++++++++++
 include/linux/lsm/smack.h    | 17 +++++++++++++++++
 include/linux/security.h     | 20 ++++++++++++++++++++
 6 files changed, 87 insertions(+)
 create mode 100644 include/linux/lsm/apparmor.h
 create mode 100644 include/linux/lsm/bpf.h
 create mode 100644 include/linux/lsm/selinux.h
 create mode 100644 include/linux/lsm/smack.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index c27f3190737f..f870842fad9c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20846,6 +20846,7 @@ Q:	https://patchwork.kernel.org/project/linux-security-module/list
 B:	mailto:linux-security-module@vger.kernel.org
 P:	https://github.com/LinuxSecurityModule/kernel/blob/main/README.md
 T:	git https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git
+F:	include/linux/lsm/
 F:	include/linux/lsm_audit.h
 F:	include/linux/lsm_hook_defs.h
 F:	include/linux/lsm_hooks.h
diff --git a/include/linux/lsm/apparmor.h b/include/linux/lsm/apparmor.h
new file mode 100644
index 000000000000..612cbfacb072
--- /dev/null
+++ b/include/linux/lsm/apparmor.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linux Security Module interface to other subsystems.
+ * AppArmor presents single pointer to an aa_label structure.
+ */
+#ifndef __LINUX_LSM_APPARMOR_H
+#define __LINUX_LSM_APPARMOR_H
+
+struct aa_label;
+
+struct lsm_prop_apparmor {
+#ifdef CONFIG_SECURITY_APPARMOR
+	struct aa_label *label;
+#endif
+};
+
+#endif /* ! __LINUX_LSM_APPARMOR_H */
diff --git a/include/linux/lsm/bpf.h b/include/linux/lsm/bpf.h
new file mode 100644
index 000000000000..8106e206fcef
--- /dev/null
+++ b/include/linux/lsm/bpf.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linux Security Module interface to other subsystems.
+ * BPF may present a single u32 value.
+ */
+#ifndef __LINUX_LSM_BPF_H
+#define __LINUX_LSM_BPF_H
+#include <linux/types.h>
+
+struct lsm_prop_bpf {
+#ifdef CONFIG_BPF_LSM
+	u32 secid;
+#endif
+};
+
+#endif /* ! __LINUX_LSM_BPF_H */
diff --git a/include/linux/lsm/selinux.h b/include/linux/lsm/selinux.h
new file mode 100644
index 000000000000..9455a6b5b910
--- /dev/null
+++ b/include/linux/lsm/selinux.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linux Security Module interface to other subsystems.
+ * SELinux presents a single u32 value which is known as a secid.
+ */
+#ifndef __LINUX_LSM_SELINUX_H
+#define __LINUX_LSM_SELINUX_H
+#include <linux/types.h>
+
+struct lsm_prop_selinux {
+#ifdef CONFIG_SECURITY_SELINUX
+	u32 secid;
+#endif
+};
+
+#endif /* ! __LINUX_LSM_SELINUX_H */
diff --git a/include/linux/lsm/smack.h b/include/linux/lsm/smack.h
new file mode 100644
index 000000000000..ff730dd7a734
--- /dev/null
+++ b/include/linux/lsm/smack.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linux Security Module interface to other subsystems.
+ * Smack presents a pointer into the global Smack label list.
+ */
+#ifndef __LINUX_LSM_SMACK_H
+#define __LINUX_LSM_SMACK_H
+
+struct smack_known;
+
+struct lsm_prop_smack {
+#ifdef CONFIG_SECURITY_SMACK
+	struct smack_known *skp;
+#endif
+};
+
+#endif /* ! __LINUX_LSM_SMACK_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index b86ec2afc691..555249a8d121 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -34,6 +34,10 @@
 #include <linux/sockptr.h>
 #include <linux/bpf.h>
 #include <uapi/linux/lsm.h>
+#include <linux/lsm/selinux.h>
+#include <linux/lsm/smack.h>
+#include <linux/lsm/apparmor.h>
+#include <linux/lsm/bpf.h>
 
 struct linux_binprm;
 struct cred;
@@ -152,6 +156,22 @@ enum lockdown_reason {
 	LOCKDOWN_CONFIDENTIALITY_MAX,
 };
 
+/* scaffolding */
+struct lsm_prop_scaffold {
+	u32 secid;
+};
+
+/*
+ * Data exported by the security modules
+ */
+struct lsm_prop {
+	struct lsm_prop_selinux selinux;
+	struct lsm_prop_smack smack;
+	struct lsm_prop_apparmor apparmor;
+	struct lsm_prop_bpf bpf;
+	struct lsm_prop_scaffold scaffold;
+};
+
 extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1];
 extern u32 lsm_active_cnt;
 extern const struct lsm_id *lsm_idlist[];
-- 
cgit v1.2.3


From 870b7fdc660b38c4e1bd8bf48e62aa352ddf8f42 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 9 Oct 2024 10:32:10 -0700
Subject: lsm: use lsm_prop in security_audit_rule_match

Change the secid parameter of security_audit_rule_match
to a lsm_prop structure pointer. Pass the entry from the
lsm_prop structure for the approprite slot to the LSM hook.

Change the users of security_audit_rule_match to use the
lsm_prop instead of a u32. The scaffolding function lsmprop_init()
fills the structure with the value of the old secid, ensuring that
it is available to the appropriate module hook. The sources of
the secid, security_task_getsecid() and security_inode_getsecid(),
will be converted to use the lsm_prop structure later in the series.
At that point the use of lsmprop_init() is dropped.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
[PM: subject line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h       |  3 ++-
 include/linux/security.h            |  7 ++++---
 kernel/auditfilter.c                | 11 +++++++----
 kernel/auditsc.c                    | 18 ++++++++++++++----
 security/apparmor/audit.c           |  8 ++++++--
 security/apparmor/include/audit.h   |  2 +-
 security/integrity/ima/ima.h        |  2 +-
 security/integrity/ima/ima_policy.c | 11 +++++++----
 security/security.c                 |  7 ++++---
 security/selinux/include/audit.h    |  4 ++--
 security/selinux/ss/services.c      | 10 +++++++---
 security/smack/smack_lsm.c          | 11 ++++++++---
 12 files changed, 63 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 9eca013aa5e1..ea7f17e37756 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -416,7 +416,8 @@ LSM_HOOK(void, LSM_RET_VOID, key_post_create_or_update, struct key *keyring,
 LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr,
 	 void **lsmrule, gfp_t gfp)
 LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule)
-LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule)
+LSM_HOOK(int, 0, audit_rule_match, struct lsm_prop *prop, u32 field, u32 op,
+	 void *lsmrule)
 LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
 #endif /* CONFIG_AUDIT */
 
diff --git a/include/linux/security.h b/include/linux/security.h
index 555249a8d121..a4f020491e7c 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2115,7 +2115,8 @@ static inline void security_key_post_create_or_update(struct key *keyring,
 int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule,
 			     gfp_t gfp);
 int security_audit_rule_known(struct audit_krule *krule);
-int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule);
+int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
+			      void *lsmrule);
 void security_audit_rule_free(void *lsmrule);
 
 #else
@@ -2131,8 +2132,8 @@ static inline int security_audit_rule_known(struct audit_krule *krule)
 	return 0;
 }
 
-static inline int security_audit_rule_match(u32 secid, u32 field, u32 op,
-					    void *lsmrule)
+static inline int security_audit_rule_match(struct lsm_prop *prop, u32 field,
+					    u32 op, void *lsmrule)
 {
 	return 0;
 }
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 470041c49a44..288a2092fd0d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1339,8 +1339,8 @@ int audit_filter(int msgtype, unsigned int listtype)
 
 		for (i = 0; i < e->rule.field_count; i++) {
 			struct audit_field *f = &e->rule.fields[i];
+			struct lsm_prop prop = { };
 			pid_t pid;
-			u32 sid;
 
 			switch (f->type) {
 			case AUDIT_PID:
@@ -1370,9 +1370,12 @@ int audit_filter(int msgtype, unsigned int listtype)
 			case AUDIT_SUBJ_SEN:
 			case AUDIT_SUBJ_CLR:
 				if (f->lsm_rule) {
-					security_current_getsecid_subj(&sid);
-					result = security_audit_rule_match(sid,
-						   f->type, f->op, f->lsm_rule);
+					/* scaffolding */
+					security_current_getsecid_subj(
+							&prop.scaffold.secid);
+					result = security_audit_rule_match(
+						   &prop, f->type, f->op,
+						   f->lsm_rule);
 				}
 				break;
 			case AUDIT_EXE:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cd57053b4a69..aaf672a962d6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -471,6 +471,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 	const struct cred *cred;
 	int i, need_sid = 1;
 	u32 sid;
+	struct lsm_prop prop = { };
 	unsigned int sessionid;
 
 	if (ctx && rule->prio <= ctx->prio)
@@ -681,7 +682,10 @@ static int audit_filter_rules(struct task_struct *tsk,
 					security_current_getsecid_subj(&sid);
 					need_sid = 0;
 				}
-				result = security_audit_rule_match(sid, f->type,
+				/* scaffolding */
+				prop.scaffold.secid = sid;
+				result = security_audit_rule_match(&prop,
+								   f->type,
 								   f->op,
 								   f->lsm_rule);
 			}
@@ -696,15 +700,19 @@ static int audit_filter_rules(struct task_struct *tsk,
 			if (f->lsm_rule) {
 				/* Find files that match */
 				if (name) {
+					/* scaffolding */
+					prop.scaffold.secid = name->osid;
 					result = security_audit_rule_match(
-								name->osid,
+								&prop,
 								f->type,
 								f->op,
 								f->lsm_rule);
 				} else if (ctx) {
 					list_for_each_entry(n, &ctx->names_list, list) {
+						/* scaffolding */
+						prop.scaffold.secid = n->osid;
 						if (security_audit_rule_match(
-								n->osid,
+								&prop,
 								f->type,
 								f->op,
 								f->lsm_rule)) {
@@ -716,7 +724,9 @@ static int audit_filter_rules(struct task_struct *tsk,
 				/* Find ipc objects that match */
 				if (!ctx || ctx->type != AUDIT_IPC)
 					break;
-				if (security_audit_rule_match(ctx->ipc.osid,
+				/* scaffolding */
+				prop.scaffold.secid = ctx->ipc.osid;
+				if (security_audit_rule_match(&prop,
 							      f->type, f->op,
 							      f->lsm_rule))
 					++result;
diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c
index 6b5181c668b5..87df6fa2a48d 100644
--- a/security/apparmor/audit.c
+++ b/security/apparmor/audit.c
@@ -264,13 +264,17 @@ int aa_audit_rule_known(struct audit_krule *rule)
 	return 0;
 }
 
-int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
+int aa_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule)
 {
 	struct aa_audit_rule *rule = vrule;
 	struct aa_label *label;
 	int found = 0;
 
-	label = aa_secid_to_label(sid);
+	/* scaffolding */
+	if (!prop->apparmor.label && prop->scaffold.secid)
+		label = aa_secid_to_label(prop->scaffold.secid);
+	else
+		label = prop->apparmor.label;
 
 	if (!label)
 		return -ENOENT;
diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h
index 0c8cc86b417b..e27229349abb 100644
--- a/security/apparmor/include/audit.h
+++ b/security/apparmor/include/audit.h
@@ -202,6 +202,6 @@ static inline int complain_error(int error)
 void aa_audit_rule_free(void *vrule);
 int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp);
 int aa_audit_rule_known(struct audit_krule *rule);
-int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule);
+int aa_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule);
 
 #endif /* __AA_AUDIT_H */
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 3c323ca213d4..cdfe8c8c7bac 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -555,7 +555,7 @@ static inline void ima_filter_rule_free(void *lsmrule)
 {
 }
 
-static inline int ima_filter_rule_match(u32 secid, u32 field, u32 op,
+static inline int ima_filter_rule_match(struct lsm_prop *prop, u32 field, u32 op,
 					void *lsmrule)
 {
 	return -EINVAL;
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 09da8e639239..22a62e675ebc 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -635,7 +635,7 @@ static bool ima_match_rules(struct ima_rule_entry *rule,
 		return false;
 	for (i = 0; i < MAX_LSM_RULES; i++) {
 		int rc = 0;
-		u32 osid;
+		struct lsm_prop prop = { };
 
 		if (!lsm_rule->lsm[i].rule) {
 			if (!lsm_rule->lsm[i].args_p)
@@ -649,15 +649,18 @@ retry:
 		case LSM_OBJ_USER:
 		case LSM_OBJ_ROLE:
 		case LSM_OBJ_TYPE:
-			security_inode_getsecid(inode, &osid);
-			rc = ima_filter_rule_match(osid, lsm_rule->lsm[i].type,
+			/* scaffolding */
+			security_inode_getsecid(inode, &prop.scaffold.secid);
+			rc = ima_filter_rule_match(&prop, lsm_rule->lsm[i].type,
 						   Audit_equal,
 						   lsm_rule->lsm[i].rule);
 			break;
 		case LSM_SUBJ_USER:
 		case LSM_SUBJ_ROLE:
 		case LSM_SUBJ_TYPE:
-			rc = ima_filter_rule_match(secid, lsm_rule->lsm[i].type,
+			/* scaffolding */
+			prop.scaffold.secid = secid;
+			rc = ima_filter_rule_match(&prop, lsm_rule->lsm[i].type,
 						   Audit_equal,
 						   lsm_rule->lsm[i].rule);
 			break;
diff --git a/security/security.c b/security/security.c
index 6875eb4a59fc..deab7f912e12 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5570,7 +5570,7 @@ void security_audit_rule_free(void *lsmrule)
 
 /**
  * security_audit_rule_match() - Check if a label matches an audit rule
- * @secid: security label
+ * @prop: security label
  * @field: LSM audit field
  * @op: matching operator
  * @lsmrule: audit rule
@@ -5581,9 +5581,10 @@ void security_audit_rule_free(void *lsmrule)
  * Return: Returns 1 if secid matches the rule, 0 if it does not, -ERRNO on
  *         failure.
  */
-int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
+int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
+			      void *lsmrule)
 {
-	return call_int_hook(audit_rule_match, secid, field, op, lsmrule);
+	return call_int_hook(audit_rule_match, prop, field, op, lsmrule);
 }
 #endif /* CONFIG_AUDIT */
 
diff --git a/security/selinux/include/audit.h b/security/selinux/include/audit.h
index 168d17be7df3..c745ea2a993d 100644
--- a/security/selinux/include/audit.h
+++ b/security/selinux/include/audit.h
@@ -41,7 +41,7 @@ void selinux_audit_rule_free(void *rule);
 
 /**
  * selinux_audit_rule_match - determine if a context ID matches a rule.
- * @sid: the context ID to check
+ * @prop: includes the context ID to check
  * @field: the field this rule refers to
  * @op: the operator the rule uses
  * @rule: pointer to the audit rule to check against
@@ -49,7 +49,7 @@ void selinux_audit_rule_free(void *rule);
  * Returns 1 if the context id matches the rule, 0 if it does not, and
  * -errno on failure.
  */
-int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *rule);
+int selinux_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *rule);
 
 /**
  * selinux_audit_rule_known - check to see if rule contains selinux fields.
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index a9830fbfc5c6..e0c14773a7b7 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -3635,7 +3635,7 @@ int selinux_audit_rule_known(struct audit_krule *rule)
 	return 0;
 }
 
-int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
+int selinux_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule)
 {
 	struct selinux_state *state = &selinux_state;
 	struct selinux_policy *policy;
@@ -3661,10 +3661,14 @@ int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
 		goto out;
 	}
 
-	ctxt = sidtab_search(policy->sidtab, sid);
+	/* scaffolding */
+	if (!prop->selinux.secid && prop->scaffold.secid)
+		prop->selinux.secid = prop->scaffold.secid;
+
+	ctxt = sidtab_search(policy->sidtab, prop->selinux.secid);
 	if (unlikely(!ctxt)) {
 		WARN_ONCE(1, "selinux_audit_rule_match: unrecognized SID %d\n",
-			  sid);
+			  prop->selinux.secid);
 		match = -ENOENT;
 		goto out;
 	}
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 370fd594da12..535233ad7203 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4757,7 +4757,7 @@ static int smack_audit_rule_known(struct audit_krule *krule)
 
 /**
  * smack_audit_rule_match - Audit given object ?
- * @secid: security id for identifying the object to test
+ * @prop: security id for identifying the object to test
  * @field: audit rule flags given from user-space
  * @op: required testing operator
  * @vrule: smack internal rule presentation
@@ -4765,7 +4765,8 @@ static int smack_audit_rule_known(struct audit_krule *krule)
  * The core Audit hook. It's used to take the decision of
  * whether to audit or not to audit a given object.
  */
-static int smack_audit_rule_match(u32 secid, u32 field, u32 op, void *vrule)
+static int smack_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
+				  void *vrule)
 {
 	struct smack_known *skp;
 	char *rule = vrule;
@@ -4778,7 +4779,11 @@ static int smack_audit_rule_match(u32 secid, u32 field, u32 op, void *vrule)
 	if (field != AUDIT_SUBJ_USER && field != AUDIT_OBJ_USER)
 		return 0;
 
-	skp = smack_from_secid(secid);
+	/* scaffolding */
+	if (!prop->smack.skp && prop->scaffold.secid)
+		skp = smack_from_secid(prop->scaffold.secid);
+	else
+		skp = prop->smack.skp;
 
 	/*
 	 * No need to do string comparisons. If a match occurs,
-- 
cgit v1.2.3


From 6f2f724f0e116d9ea960ff3dd645add12e60e176 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 9 Oct 2024 10:32:11 -0700
Subject: lsm: add lsmprop_to_secctx hook

Add a new hook security_lsmprop_to_secctx() and its LSM specific
implementations. The LSM specific code will use the lsm_prop element
allocated for that module. This allows for the possibility that more
than one module may be called upon to translate a secid to a string,
as can occur in the audit code.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
[PM: subject line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h     |  2 ++
 include/linux/security.h          | 10 +++++++++-
 security/apparmor/include/secid.h |  2 ++
 security/apparmor/lsm.c           |  1 +
 security/apparmor/secid.c         | 25 +++++++++++++++++++++++--
 security/security.c               | 21 +++++++++++++++++++++
 security/selinux/hooks.c          | 16 ++++++++++++++--
 security/selinux/include/audit.h  |  3 ++-
 security/smack/smack_lsm.c        | 31 ++++++++++++++++++++++++++-----
 9 files changed, 100 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ea7f17e37756..ed6ea0b1ec57 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -294,6 +294,8 @@ LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size)
 LSM_HOOK(int, 0, ismaclabel, const char *name)
 LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata,
 	 u32 *seclen)
+LSM_HOOK(int, -EOPNOTSUPP, lsmprop_to_secctx, struct lsm_prop *prop,
+	 char **secdata, u32 *seclen)
 LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid)
 LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen)
 LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode)
diff --git a/include/linux/security.h b/include/linux/security.h
index a4f020491e7c..f1c68e38b15d 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -535,6 +535,7 @@ int security_setprocattr(int lsmid, const char *name, void *value, size_t size);
 int security_netlink_send(struct sock *sk, struct sk_buff *skb);
 int security_ismaclabel(const char *name);
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
+int security_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata, u32 *seclen);
 int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid);
 void security_release_secctx(char *secdata, u32 seclen);
 void security_inode_invalidate_secctx(struct inode *inode);
@@ -1488,7 +1489,14 @@ static inline int security_ismaclabel(const char *name)
 	return 0;
 }
 
-static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
+static inline int security_secid_to_secctx(u32 secid, char **secdata,
+					   u32 *seclen)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int security_lsmprop_to_secctx(struct lsm_prop *prop,
+					     char **secdata, u32 *seclen)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/security/apparmor/include/secid.h b/security/apparmor/include/secid.h
index a912a5d5d04f..cc6d1c9f4a47 100644
--- a/security/apparmor/include/secid.h
+++ b/security/apparmor/include/secid.h
@@ -26,6 +26,8 @@ extern int apparmor_display_secid_mode;
 
 struct aa_label *aa_secid_to_label(u32 secid);
 int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
+int apparmor_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
+			       u32 *seclen);
 int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid);
 void apparmor_release_secctx(char *secdata, u32 seclen);
 
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index f5d05297d59e..a58b72ed246c 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -1517,6 +1517,7 @@ static struct security_hook_list apparmor_hooks[] __ro_after_init = {
 #endif
 
 	LSM_HOOK_INIT(secid_to_secctx, apparmor_secid_to_secctx),
+	LSM_HOOK_INIT(lsmprop_to_secctx, apparmor_lsmprop_to_secctx),
 	LSM_HOOK_INIT(secctx_to_secid, apparmor_secctx_to_secid),
 	LSM_HOOK_INIT(release_secctx, apparmor_release_secctx),
 
diff --git a/security/apparmor/secid.c b/security/apparmor/secid.c
index 83d3d1e6d9dc..34610888559f 100644
--- a/security/apparmor/secid.c
+++ b/security/apparmor/secid.c
@@ -61,10 +61,10 @@ struct aa_label *aa_secid_to_label(u32 secid)
 	return xa_load(&aa_secids, secid);
 }
 
-int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
+static int apparmor_label_to_secctx(struct aa_label *label, char **secdata,
+				    u32 *seclen)
 {
 	/* TODO: cache secctx and ref count so we don't have to recreate */
-	struct aa_label *label = aa_secid_to_label(secid);
 	int flags = FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED | FLAG_ABS_ROOT;
 	int len;
 
@@ -90,6 +90,27 @@ int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 	return 0;
 }
 
+int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
+{
+	struct aa_label *label = aa_secid_to_label(secid);
+
+	return apparmor_label_to_secctx(label, secdata, seclen);
+}
+
+int apparmor_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
+			       u32 *seclen)
+{
+	struct aa_label *label;
+
+	/* scaffolding */
+	if (!prop->apparmor.label && prop->scaffold.secid)
+		label = aa_secid_to_label(prop->scaffold.secid);
+	else
+		label = prop->apparmor.label;
+
+	return apparmor_label_to_secctx(label, secdata, seclen);
+}
+
 int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
 {
 	struct aa_label *label;
diff --git a/security/security.c b/security/security.c
index deab7f912e12..1842f1325e77 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4311,6 +4311,27 @@ int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 }
 EXPORT_SYMBOL(security_secid_to_secctx);
 
+/**
+ * security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx
+ * @prop: lsm specific information
+ * @secdata: secctx
+ * @seclen: secctx length
+ *
+ * Convert a @prop entry to security context.  If @secdata is NULL the
+ * length of the result will be returned in @seclen, but no @secdata
+ * will be returned.  This does mean that the length could change between
+ * calls to check the length and the next call which actually allocates
+ * and returns the @secdata.
+ *
+ * Return: Return 0 on success, error on failure.
+ */
+int security_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
+			       u32 *seclen)
+{
+	return call_int_hook(lsmprop_to_secctx, prop, secdata, seclen);
+}
+EXPORT_SYMBOL(security_lsmprop_to_secctx);
+
 /**
  * security_secctx_to_secid() - Convert a secctx to a secid
  * @secdata: secctx
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index fc926d3cac6e..6e88faf3c6e5 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6601,8 +6601,19 @@ static int selinux_ismaclabel(const char *name)
 
 static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 {
-	return security_sid_to_context(secid,
-				       secdata, seclen);
+	return security_sid_to_context(secid, secdata, seclen);
+}
+
+static int selinux_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
+				     u32 *seclen)
+{
+	u32 secid = prop->selinux.secid;
+
+	/* scaffolding */
+	if (!secid)
+		secid = prop->scaffold.secid;
+
+	return selinux_secid_to_secctx(secid, secdata, seclen);
 }
 
 static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
@@ -7347,6 +7358,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(inode_alloc_security, selinux_inode_alloc_security),
 	LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security),
 	LSM_HOOK_INIT(secid_to_secctx, selinux_secid_to_secctx),
+	LSM_HOOK_INIT(lsmprop_to_secctx, selinux_lsmprop_to_secctx),
 	LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx),
 	LSM_HOOK_INIT(sk_alloc_security, selinux_sk_alloc_security),
 	LSM_HOOK_INIT(tun_dev_alloc_security, selinux_tun_dev_alloc_security),
diff --git a/security/selinux/include/audit.h b/security/selinux/include/audit.h
index c745ea2a993d..d5b0425055e4 100644
--- a/security/selinux/include/audit.h
+++ b/security/selinux/include/audit.h
@@ -49,7 +49,8 @@ void selinux_audit_rule_free(void *rule);
  * Returns 1 if the context id matches the rule, 0 if it does not, and
  * -errno on failure.
  */
-int selinux_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *rule);
+int selinux_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
+			     void *rule);
 
 /**
  * selinux_audit_rule_known - check to see if rule contains selinux fields.
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 535233ad7203..66da7cbcc0b7 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4768,7 +4768,7 @@ static int smack_audit_rule_known(struct audit_krule *krule)
 static int smack_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
 				  void *vrule)
 {
-	struct smack_known *skp;
+	struct smack_known *skp = prop->smack.skp;
 	char *rule = vrule;
 
 	if (unlikely(!rule)) {
@@ -4780,10 +4780,8 @@ static int smack_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
 		return 0;
 
 	/* scaffolding */
-	if (!prop->smack.skp && prop->scaffold.secid)
+	if (!skp && prop->scaffold.secid)
 		skp = smack_from_secid(prop->scaffold.secid);
-	else
-		skp = prop->smack.skp;
 
 	/*
 	 * No need to do string comparisons. If a match occurs,
@@ -4814,7 +4812,6 @@ static int smack_ismaclabel(const char *name)
 	return (strcmp(name, XATTR_SMACK_SUFFIX) == 0);
 }
 
-
 /**
  * smack_secid_to_secctx - return the smack label for a secid
  * @secid: incoming integer
@@ -4833,6 +4830,29 @@ static int smack_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 	return 0;
 }
 
+/**
+ * smack_lsmprop_to_secctx - return the smack label
+ * @prop: includes incoming Smack data
+ * @secdata: destination
+ * @seclen: how long it is
+ *
+ * Exists for audit code.
+ */
+static int smack_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
+				   u32 *seclen)
+{
+	struct smack_known *skp = prop->smack.skp;
+
+	/* scaffolding */
+	if (!skp && prop->scaffold.secid)
+		skp = smack_from_secid(prop->scaffold.secid);
+
+	if (secdata)
+		*secdata = skp->smk_known;
+	*seclen = strlen(skp->smk_known);
+	return 0;
+}
+
 /**
  * smack_secctx_to_secid - return the secid for a smack label
  * @secdata: smack label
@@ -5192,6 +5212,7 @@ static struct security_hook_list smack_hooks[] __ro_after_init = {
 
 	LSM_HOOK_INIT(ismaclabel, smack_ismaclabel),
 	LSM_HOOK_INIT(secid_to_secctx, smack_secid_to_secctx),
+	LSM_HOOK_INIT(lsmprop_to_secctx, smack_lsmprop_to_secctx),
 	LSM_HOOK_INIT(secctx_to_secid, smack_secctx_to_secid),
 	LSM_HOOK_INIT(inode_notifysecctx, smack_inode_notifysecctx),
 	LSM_HOOK_INIT(inode_setsecctx, smack_inode_setsecctx),
-- 
cgit v1.2.3


From 7183abccd8ac2c486363e267b5d84032818eb725 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 9 Oct 2024 10:32:12 -0700
Subject: audit: maintain an lsm_prop in audit_context

Replace the secid value stored in struct audit_context with a struct
lsm_prop. Change the code that uses this value to accommodate the
change. security_audit_rule_match() expects a lsm_prop, so existing
scaffolding can be removed. A call to security_secid_to_secctx()
is changed to security_lsmprop_to_secctx().  The call to
security_ipc_getsecid() is scaffolded.

A new function lsmprop_is_set() is introduced to identify whether
an lsm_prop contains a non-zero value.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
[PM: subject line tweak, fix lsmprop_is_set() typo]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 24 ++++++++++++++++++++++++
 kernel/audit.h           |  3 ++-
 kernel/auditsc.c         | 19 ++++++++-----------
 3 files changed, 34 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index f1c68e38b15d..c029bfe2c5bb 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -291,6 +291,19 @@ static inline const char *kernel_load_data_id_str(enum kernel_load_data_id id)
 
 #ifdef CONFIG_SECURITY
 
+/**
+ * lsmprop_is_set - report if there is a value in the lsm_prop
+ * @prop: Pointer to the exported LSM data
+ *
+ * Returns true if there is a value set, false otherwise
+ */
+static inline bool lsmprop_is_set(struct lsm_prop *prop)
+{
+	const struct lsm_prop empty = {};
+
+	return !!memcmp(prop, &empty, sizeof(*prop));
+}
+
 int call_blocking_lsm_notifier(enum lsm_event event, void *data);
 int register_blocking_lsm_notifier(struct notifier_block *nb);
 int unregister_blocking_lsm_notifier(struct notifier_block *nb);
@@ -552,6 +565,17 @@ int security_bdev_setintegrity(struct block_device *bdev,
 			       size_t size);
 #else /* CONFIG_SECURITY */
 
+/**
+ * lsmprop_is_set - report if there is a value in the lsm_prop
+ * @prop: Pointer to the exported LSM data
+ *
+ * Returns true if there is a value set, false otherwise
+ */
+static inline bool lsmprop_is_set(struct lsm_prop *prop)
+{
+	return false;
+}
+
 static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data)
 {
 	return 0;
diff --git a/kernel/audit.h b/kernel/audit.h
index a60d2840559e..d14924a887c9 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -11,6 +11,7 @@
 
 #include <linux/fs.h>
 #include <linux/audit.h>
+#include <linux/security.h>
 #include <linux/skbuff.h>
 #include <uapi/linux/mqueue.h>
 #include <linux/tty.h>
@@ -160,7 +161,7 @@ struct audit_context {
 			kuid_t			uid;
 			kgid_t			gid;
 			umode_t			mode;
-			u32			osid;
+			struct lsm_prop		oprop;
 			int			has_perm;
 			uid_t			perm_uid;
 			gid_t			perm_gid;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index aaf672a962d6..d92326215f65 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -724,9 +724,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 				/* Find ipc objects that match */
 				if (!ctx || ctx->type != AUDIT_IPC)
 					break;
-				/* scaffolding */
-				prop.scaffold.secid = ctx->ipc.osid;
-				if (security_audit_rule_match(&prop,
+				if (security_audit_rule_match(&ctx->ipc.oprop,
 							      f->type, f->op,
 							      f->lsm_rule))
 					++result;
@@ -1394,19 +1392,17 @@ static void show_special(struct audit_context *context, int *call_panic)
 			audit_log_format(ab, " a%d=%lx", i,
 				context->socketcall.args[i]);
 		break; }
-	case AUDIT_IPC: {
-		u32 osid = context->ipc.osid;
-
+	case AUDIT_IPC:
 		audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
 				 from_kuid(&init_user_ns, context->ipc.uid),
 				 from_kgid(&init_user_ns, context->ipc.gid),
 				 context->ipc.mode);
-		if (osid) {
+		if (lsmprop_is_set(&context->ipc.oprop)) {
 			char *ctx = NULL;
 			u32 len;
 
-			if (security_secid_to_secctx(osid, &ctx, &len)) {
-				audit_log_format(ab, " osid=%u", osid);
+			if (security_lsmprop_to_secctx(&context->ipc.oprop,
+						       &ctx, &len)) {
 				*call_panic = 1;
 			} else {
 				audit_log_format(ab, " obj=%s", ctx);
@@ -1426,7 +1422,7 @@ static void show_special(struct audit_context *context, int *call_panic)
 				context->ipc.perm_gid,
 				context->ipc.perm_mode);
 		}
-		break; }
+		break;
 	case AUDIT_MQ_OPEN:
 		audit_log_format(ab,
 			"oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
@@ -2642,7 +2638,8 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
 	context->ipc.gid = ipcp->gid;
 	context->ipc.mode = ipcp->mode;
 	context->ipc.has_perm = 0;
-	security_ipc_getsecid(ipcp, &context->ipc.osid);
+	/* scaffolding */
+	security_ipc_getsecid(ipcp, &context->ipc.oprop.scaffold.secid);
 	context->type = AUDIT_IPC;
 }
 
-- 
cgit v1.2.3


From f4602f163c98bc93c118e196466c1c98186adb67 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 9 Oct 2024 10:32:13 -0700
Subject: lsm: use lsm_prop in security_ipc_getsecid

There may be more than one LSM that provides IPC data for auditing.
Change security_ipc_getsecid() to fill in a lsm_prop structure instead
of the u32 secid.  Change the name to security_ipc_getlsmprop() to
reflect the change.

Cc: audit@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
[PM: subject line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h |  4 ++--
 include/linux/security.h      | 18 +++++++++++++++---
 kernel/auditsc.c              |  3 +--
 security/security.c           | 14 +++++++-------
 security/selinux/hooks.c      |  9 ++++++---
 security/smack/smack_lsm.c    | 16 +++++++++-------
 6 files changed, 40 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ed6ea0b1ec57..6ef2a345ea03 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -256,8 +256,8 @@ LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p,
 	 struct inode *inode)
 LSM_HOOK(int, 0, userns_create, const struct cred *cred)
 LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag)
-LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp,
-	 u32 *secid)
+LSM_HOOK(void, LSM_RET_VOID, ipc_getlsmprop, struct kern_ipc_perm *ipcp,
+	 struct lsm_prop *prop)
 LSM_HOOK(int, 0, msg_msg_alloc_security, struct msg_msg *msg)
 LSM_HOOK(void, LSM_RET_VOID, msg_msg_free_security, struct msg_msg *msg)
 LSM_HOOK(int, 0, msg_queue_alloc_security, struct kern_ipc_perm *perm)
diff --git a/include/linux/security.h b/include/linux/security.h
index c029bfe2c5bb..15aef5f68e77 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -289,6 +289,17 @@ static inline const char *kernel_load_data_id_str(enum kernel_load_data_id id)
 	return kernel_load_data_str[id];
 }
 
+/**
+ * lsmprop_init - initialize a lsm_prop structure
+ * @prop: Pointer to the data to initialize
+ *
+ * Set all secid for all modules to the specified value.
+ */
+static inline void lsmprop_init(struct lsm_prop *prop)
+{
+	memset(prop, 0, sizeof(*prop));
+}
+
 #ifdef CONFIG_SECURITY
 
 /**
@@ -515,7 +526,7 @@ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 void security_task_to_inode(struct task_struct *p, struct inode *inode);
 int security_create_user_ns(const struct cred *cred);
 int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag);
-void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid);
+void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp, struct lsm_prop *prop);
 int security_msg_msg_alloc(struct msg_msg *msg);
 void security_msg_msg_free(struct msg_msg *msg);
 int security_msg_queue_alloc(struct kern_ipc_perm *msq);
@@ -1377,9 +1388,10 @@ static inline int security_ipc_permission(struct kern_ipc_perm *ipcp,
 	return 0;
 }
 
-static inline void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
+static inline void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp,
+					   struct lsm_prop *prop)
 {
-	*secid = 0;
+	lsmprop_init(prop);
 }
 
 static inline int security_msg_msg_alloc(struct msg_msg *msg)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d92326215f65..5019eb32a97f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2638,8 +2638,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
 	context->ipc.gid = ipcp->gid;
 	context->ipc.mode = ipcp->mode;
 	context->ipc.has_perm = 0;
-	/* scaffolding */
-	security_ipc_getsecid(ipcp, &context->ipc.oprop.scaffold.secid);
+	security_ipc_getlsmprop(ipcp, &context->ipc.oprop);
 	context->type = AUDIT_IPC;
 }
 
diff --git a/security/security.c b/security/security.c
index 1842f1325e77..f269421c2d72 100644
--- a/security/security.c
+++ b/security/security.c
@@ -3730,17 +3730,17 @@ int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
 }
 
 /**
- * security_ipc_getsecid() - Get the sysv ipc object's secid
+ * security_ipc_getlsmprop() - Get the sysv ipc object LSM data
  * @ipcp: ipc permission structure
- * @secid: secid pointer
+ * @prop: pointer to lsm information
  *
- * Get the secid associated with the ipc object.  In case of failure, @secid
- * will be set to zero.
+ * Get the lsm information associated with the ipc object.
  */
-void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
+
+void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp, struct lsm_prop *prop)
 {
-	*secid = 0;
-	call_void_hook(ipc_getsecid, ipcp, secid);
+	lsmprop_init(prop);
+	call_void_hook(ipc_getlsmprop, ipcp, prop);
 }
 
 /**
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 6e88faf3c6e5..1d43367009ed 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6319,10 +6319,13 @@ static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
 	return ipc_has_perm(ipcp, av);
 }
 
-static void selinux_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
+static void selinux_ipc_getlsmprop(struct kern_ipc_perm *ipcp,
+				   struct lsm_prop *prop)
 {
 	struct ipc_security_struct *isec = selinux_ipc(ipcp);
-	*secid = isec->sid;
+	prop->selinux.secid = isec->sid;
+	/* scaffolding */
+	prop->scaffold.secid = isec->sid;
 }
 
 static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode)
@@ -7215,7 +7218,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(userns_create, selinux_userns_create),
 
 	LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission),
-	LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid),
+	LSM_HOOK_INIT(ipc_getlsmprop, selinux_ipc_getlsmprop),
 
 	LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate),
 	LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 66da7cbcc0b7..fed44b4fc73d 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -3435,16 +3435,18 @@ static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
 }
 
 /**
- * smack_ipc_getsecid - Extract smack security id
+ * smack_ipc_getlsmprop - Extract smack security data
  * @ipp: the object permissions
- * @secid: where result will be saved
+ * @prop: where result will be saved
  */
-static void smack_ipc_getsecid(struct kern_ipc_perm *ipp, u32 *secid)
+static void smack_ipc_getlsmprop(struct kern_ipc_perm *ipp, struct lsm_prop *prop)
 {
-	struct smack_known **blob = smack_ipc(ipp);
-	struct smack_known *iskp = *blob;
+	struct smack_known **iskpp = smack_ipc(ipp);
+	struct smack_known *iskp = *iskpp;
 
-	*secid = iskp->smk_secid;
+	prop->smack.skp = iskp;
+	/* scaffolding */
+	prop->scaffold.secid = iskp->smk_secid;
 }
 
 /**
@@ -5140,7 +5142,7 @@ static struct security_hook_list smack_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(task_to_inode, smack_task_to_inode),
 
 	LSM_HOOK_INIT(ipc_permission, smack_ipc_permission),
-	LSM_HOOK_INIT(ipc_getsecid, smack_ipc_getsecid),
+	LSM_HOOK_INIT(ipc_getlsmprop, smack_ipc_getlsmprop),
 
 	LSM_HOOK_INIT(msg_msg_alloc_security, smack_msg_msg_alloc_security),
 
-- 
cgit v1.2.3


From 37f670aacd481128ad9a940ac2d3372aecd92824 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 9 Oct 2024 10:32:15 -0700
Subject: lsm: use lsm_prop in security_current_getsecid

Change the security_current_getsecid_subj() and
security_task_getsecid_obj() interfaces to fill in a lsm_prop structure
instead of a u32 secid.  Audit interfaces will need to collect all
possible security data for possible reporting.

Cc: linux-integrity@vger.kernel.org
Cc: audit@vger.kernel.org
Cc: selinux@vger.kernel.org
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
[PM: subject line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h         |  6 ++--
 include/linux/security.h              | 13 ++++----
 kernel/audit.c                        | 11 +++----
 kernel/auditfilter.c                  |  3 +-
 kernel/auditsc.c                      | 22 ++++++++-----
 net/netlabel/netlabel_unlabeled.c     |  5 ++-
 net/netlabel/netlabel_user.h          |  6 +++-
 security/apparmor/lsm.c               | 20 ++++++++----
 security/integrity/ima/ima.h          |  6 ++--
 security/integrity/ima/ima_api.c      |  6 ++--
 security/integrity/ima/ima_appraise.c |  6 ++--
 security/integrity/ima/ima_main.c     | 59 ++++++++++++++++++-----------------
 security/integrity/ima/ima_policy.c   | 14 ++++-----
 security/security.c                   | 28 ++++++++---------
 security/selinux/hooks.c              | 17 ++++++----
 security/smack/smack_lsm.c            | 25 +++++++++------
 16 files changed, 139 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 6ef2a345ea03..8a90fd9ff3c8 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -235,9 +235,9 @@ LSM_HOOK(int, 0, task_fix_setgroups, struct cred *new, const struct cred * old)
 LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid)
 LSM_HOOK(int, 0, task_getpgid, struct task_struct *p)
 LSM_HOOK(int, 0, task_getsid, struct task_struct *p)
-LSM_HOOK(void, LSM_RET_VOID, current_getsecid_subj, u32 *secid)
-LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj,
-	 struct task_struct *p, u32 *secid)
+LSM_HOOK(void, LSM_RET_VOID, current_getlsmprop_subj, struct lsm_prop *prop)
+LSM_HOOK(void, LSM_RET_VOID, task_getlsmprop_obj,
+	 struct task_struct *p, struct lsm_prop *prop)
 LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice)
 LSM_HOOK(int, 0, task_setioprio, struct task_struct *p, int ioprio)
 LSM_HOOK(int, 0, task_getioprio, struct task_struct *p)
diff --git a/include/linux/security.h b/include/linux/security.h
index 15aef5f68e77..9bc8153f4e8b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -507,8 +507,8 @@ int security_task_fix_setgroups(struct cred *new, const struct cred *old);
 int security_task_setpgid(struct task_struct *p, pid_t pgid);
 int security_task_getpgid(struct task_struct *p);
 int security_task_getsid(struct task_struct *p);
-void security_current_getsecid_subj(u32 *secid);
-void security_task_getsecid_obj(struct task_struct *p, u32 *secid);
+void security_current_getlsmprop_subj(struct lsm_prop *prop);
+void security_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop);
 int security_task_setnice(struct task_struct *p, int nice);
 int security_task_setioprio(struct task_struct *p, int ioprio);
 int security_task_getioprio(struct task_struct *p);
@@ -1305,14 +1305,15 @@ static inline int security_task_getsid(struct task_struct *p)
 	return 0;
 }
 
-static inline void security_current_getsecid_subj(u32 *secid)
+static inline void security_current_getlsmprop_subj(struct lsm_prop *prop)
 {
-	*secid = 0;
+	lsmprop_init(prop);
 }
 
-static inline void security_task_getsecid_obj(struct task_struct *p, u32 *secid)
+static inline void security_task_getlsmprop_obj(struct task_struct *p,
+						struct lsm_prop *prop)
 {
-	*secid = 0;
+	lsmprop_init(prop);
 }
 
 static inline int security_task_setnice(struct task_struct *p, int nice)
diff --git a/kernel/audit.c b/kernel/audit.c
index 47c41e6f9ea9..d2797e8fe182 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2179,16 +2179,16 @@ void audit_log_key(struct audit_buffer *ab, char *key)
 
 int audit_log_task_context(struct audit_buffer *ab)
 {
+	struct lsm_prop prop;
 	char *ctx = NULL;
 	unsigned len;
 	int error;
-	u32 sid;
 
-	security_current_getsecid_subj(&sid);
-	if (!sid)
+	security_current_getlsmprop_subj(&prop);
+	if (!lsmprop_is_set(&prop))
 		return 0;
 
-	error = security_secid_to_secctx(sid, &ctx, &len);
+	error = security_lsmprop_to_secctx(&prop, &ctx, &len);
 	if (error) {
 		if (error != -EINVAL)
 			goto error_path;
@@ -2405,8 +2405,7 @@ int audit_signal_info(int sig, struct task_struct *t)
 			audit_sig_uid = auid;
 		else
 			audit_sig_uid = uid;
-		/* scaffolding */
-		security_current_getsecid_subj(&audit_sig_lsm.scaffold.secid);
+		security_current_getlsmprop_subj(&audit_sig_lsm);
 	}
 
 	return audit_signal_info_syscall(t);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 288a2092fd0d..a7de3dabe6e1 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1371,8 +1371,7 @@ int audit_filter(int msgtype, unsigned int listtype)
 			case AUDIT_SUBJ_CLR:
 				if (f->lsm_rule) {
 					/* scaffolding */
-					security_current_getsecid_subj(
-							&prop.scaffold.secid);
+					security_current_getlsmprop_subj(&prop);
 					result = security_audit_rule_match(
 						   &prop, f->type, f->op,
 						   f->lsm_rule);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 5019eb32a97f..6b2b2a8d5647 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -470,7 +470,6 @@ static int audit_filter_rules(struct task_struct *tsk,
 {
 	const struct cred *cred;
 	int i, need_sid = 1;
-	u32 sid;
 	struct lsm_prop prop = { };
 	unsigned int sessionid;
 
@@ -675,15 +674,14 @@ static int audit_filter_rules(struct task_struct *tsk,
 					 * fork()/copy_process() in which case
 					 * the new @tsk creds are still a dup
 					 * of @current's creds so we can still
-					 * use security_current_getsecid_subj()
+					 * use
+					 * security_current_getlsmprop_subj()
 					 * here even though it always refs
 					 * @current's creds
 					 */
-					security_current_getsecid_subj(&sid);
+					security_current_getlsmprop_subj(&prop);
 					need_sid = 0;
 				}
-				/* scaffolding */
-				prop.scaffold.secid = sid;
 				result = security_audit_rule_match(&prop,
 								   f->type,
 								   f->op,
@@ -2730,12 +2728,15 @@ int __audit_sockaddr(int len, void *a)
 void __audit_ptrace(struct task_struct *t)
 {
 	struct audit_context *context = audit_context();
+	struct lsm_prop prop;
 
 	context->target_pid = task_tgid_nr(t);
 	context->target_auid = audit_get_loginuid(t);
 	context->target_uid = task_uid(t);
 	context->target_sessionid = audit_get_sessionid(t);
-	security_task_getsecid_obj(t, &context->target_sid);
+	security_task_getlsmprop_obj(t, &prop);
+	/* scaffolding */
+	context->target_sid = prop.scaffold.secid;
 	memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
 }
 
@@ -2751,6 +2752,7 @@ int audit_signal_info_syscall(struct task_struct *t)
 	struct audit_aux_data_pids *axp;
 	struct audit_context *ctx = audit_context();
 	kuid_t t_uid = task_uid(t);
+	struct lsm_prop prop;
 
 	if (!audit_signals || audit_dummy_context())
 		return 0;
@@ -2762,7 +2764,9 @@ int audit_signal_info_syscall(struct task_struct *t)
 		ctx->target_auid = audit_get_loginuid(t);
 		ctx->target_uid = t_uid;
 		ctx->target_sessionid = audit_get_sessionid(t);
-		security_task_getsecid_obj(t, &ctx->target_sid);
+		security_task_getlsmprop_obj(t, &prop);
+		/* scaffolding */
+		ctx->target_sid = prop.scaffold.secid;
 		memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
 		return 0;
 	}
@@ -2783,7 +2787,9 @@ int audit_signal_info_syscall(struct task_struct *t)
 	axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
 	axp->target_uid[axp->pid_count] = t_uid;
 	axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
-	security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]);
+	security_task_getlsmprop_obj(t, &prop);
+	/* scaffolding */
+	axp->target_sid[axp->pid_count] = prop.scaffold.secid;
 	memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
 	axp->pid_count++;
 
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 9996883bf2b7..5925f48a3ade 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1534,11 +1534,14 @@ int __init netlbl_unlabel_defconf(void)
 	int ret_val;
 	struct netlbl_dom_map *entry;
 	struct netlbl_audit audit_info;
+	struct lsm_prop prop;
 
 	/* Only the kernel is allowed to call this function and the only time
 	 * it is called is at bootup before the audit subsystem is reporting
 	 * messages so don't worry to much about these values. */
-	security_current_getsecid_subj(&audit_info.secid);
+	security_current_getlsmprop_subj(&prop);
+	/* scaffolding */
+	audit_info.secid = prop.scaffold.secid;
 	audit_info.loginuid = GLOBAL_ROOT_UID;
 	audit_info.sessionid = 0;
 
diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h
index d6c5b31eb4eb..39f4f6df5f51 100644
--- a/net/netlabel/netlabel_user.h
+++ b/net/netlabel/netlabel_user.h
@@ -32,7 +32,11 @@
  */
 static inline void netlbl_netlink_auditinfo(struct netlbl_audit *audit_info)
 {
-	security_current_getsecid_subj(&audit_info->secid);
+	struct lsm_prop prop;
+
+	security_current_getlsmprop_subj(&prop);
+	/* scaffolding */
+	audit_info->secid = prop.scaffold.secid;
 	audit_info->loginuid = audit_get_loginuid(current);
 	audit_info->sessionid = audit_get_sessionid(current);
 }
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index a58b72ed246c..6331bcb35ec0 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -982,17 +982,24 @@ static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm)
 	return;
 }
 
-static void apparmor_current_getsecid_subj(u32 *secid)
+static void apparmor_current_getlsmprop_subj(struct lsm_prop *prop)
 {
 	struct aa_label *label = __begin_current_label_crit_section();
-	*secid = label->secid;
+
+	prop->apparmor.label = label;
+	/* scaffolding */
+	prop->scaffold.secid = label->secid;
 	__end_current_label_crit_section(label);
 }
 
-static void apparmor_task_getsecid_obj(struct task_struct *p, u32 *secid)
+static void apparmor_task_getlsmprop_obj(struct task_struct *p,
+					  struct lsm_prop *prop)
 {
 	struct aa_label *label = aa_get_task_label(p);
-	*secid = label->secid;
+
+	prop->apparmor.label = label;
+	/* scaffolding */
+	prop->scaffold.secid = label->secid;
 	aa_put_label(label);
 }
 
@@ -1503,8 +1510,9 @@ static struct security_hook_list apparmor_hooks[] __ro_after_init = {
 
 	LSM_HOOK_INIT(task_free, apparmor_task_free),
 	LSM_HOOK_INIT(task_alloc, apparmor_task_alloc),
-	LSM_HOOK_INIT(current_getsecid_subj, apparmor_current_getsecid_subj),
-	LSM_HOOK_INIT(task_getsecid_obj, apparmor_task_getsecid_obj),
+	LSM_HOOK_INIT(current_getlsmprop_subj,
+		      apparmor_current_getlsmprop_subj),
+	LSM_HOOK_INIT(task_getlsmprop_obj, apparmor_task_getlsmprop_obj),
 	LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit),
 	LSM_HOOK_INIT(task_kill, apparmor_task_kill),
 	LSM_HOOK_INIT(userns_create, apparmor_userns_create),
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index cdfe8c8c7bac..c0d3b716d11f 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -369,7 +369,7 @@ static inline void ima_process_queued_keys(void) {}
 
 /* LIM API function definitions */
 int ima_get_action(struct mnt_idmap *idmap, struct inode *inode,
-		   const struct cred *cred, u32 secid, int mask,
+		   const struct cred *cred, struct lsm_prop *prop, int mask,
 		   enum ima_hooks func, int *pcr,
 		   struct ima_template_desc **template_desc,
 		   const char *func_data, unsigned int *allowed_algos);
@@ -400,8 +400,8 @@ const char *ima_d_path(const struct path *path, char **pathbuf, char *filename);
 
 /* IMA policy related functions */
 int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode,
-		     const struct cred *cred, u32 secid, enum ima_hooks func,
-		     int mask, int flags, int *pcr,
+		     const struct cred *cred, struct lsm_prop *prop,
+		     enum ima_hooks func, int mask, int flags, int *pcr,
 		     struct ima_template_desc **template_desc,
 		     const char *func_data, unsigned int *allowed_algos);
 void ima_init_policy(void);
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index 984e861f6e33..c35ea613c9f8 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -165,7 +165,7 @@ err_out:
  * @idmap: idmap of the mount the inode was found from
  * @inode: pointer to the inode associated with the object being validated
  * @cred: pointer to credentials structure to validate
- * @secid: secid of the task being validated
+ * @prop: properties of the task being validated
  * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC,
  *        MAY_APPEND)
  * @func: caller identifier
@@ -187,7 +187,7 @@ err_out:
  *
  */
 int ima_get_action(struct mnt_idmap *idmap, struct inode *inode,
-		   const struct cred *cred, u32 secid, int mask,
+		   const struct cred *cred, struct lsm_prop *prop, int mask,
 		   enum ima_hooks func, int *pcr,
 		   struct ima_template_desc **template_desc,
 		   const char *func_data, unsigned int *allowed_algos)
@@ -196,7 +196,7 @@ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode,
 
 	flags &= ima_policy_flag;
 
-	return ima_match_policy(idmap, inode, cred, secid, func, mask,
+	return ima_match_policy(idmap, inode, cred, prop, func, mask,
 				flags, pcr, template_desc, func_data,
 				allowed_algos);
 }
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 656c709b974f..884a3533f7af 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -73,13 +73,13 @@ bool is_ima_appraise_enabled(void)
 int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode,
 		      int mask, enum ima_hooks func)
 {
-	u32 secid;
+	struct lsm_prop prop;
 
 	if (!ima_appraise)
 		return 0;
 
-	security_current_getsecid_subj(&secid);
-	return ima_match_policy(idmap, inode, current_cred(), secid,
+	security_current_getlsmprop_subj(&prop);
+	return ima_match_policy(idmap, inode, current_cred(), &prop,
 				func, mask, IMA_APPRAISE | IMA_HASH, NULL,
 				NULL, NULL, NULL);
 }
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 06132cf47016..553a6f54a1e2 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -206,8 +206,8 @@ static void ima_file_free(struct file *file)
 }
 
 static int process_measurement(struct file *file, const struct cred *cred,
-			       u32 secid, char *buf, loff_t size, int mask,
-			       enum ima_hooks func)
+			       struct lsm_prop *prop, char *buf, loff_t size,
+			       int mask, enum ima_hooks func)
 {
 	struct inode *real_inode, *inode = file_inode(file);
 	struct ima_iint_cache *iint = NULL;
@@ -232,7 +232,7 @@ static int process_measurement(struct file *file, const struct cred *cred,
 	 * bitmask based on the appraise/audit/measurement policy.
 	 * Included is the appraise submask.
 	 */
-	action = ima_get_action(file_mnt_idmap(file), inode, cred, secid,
+	action = ima_get_action(file_mnt_idmap(file), inode, cred, prop,
 				mask, func, &pcr, &template_desc, NULL,
 				&allowed_algos);
 	violation_check = ((func == FILE_CHECK || func == MMAP_CHECK ||
@@ -443,23 +443,23 @@ out:
 static int ima_file_mmap(struct file *file, unsigned long reqprot,
 			 unsigned long prot, unsigned long flags)
 {
-	u32 secid;
+	struct lsm_prop prop;
 	int ret;
 
 	if (!file)
 		return 0;
 
-	security_current_getsecid_subj(&secid);
+	security_current_getlsmprop_subj(&prop);
 
 	if (reqprot & PROT_EXEC) {
-		ret = process_measurement(file, current_cred(), secid, NULL,
+		ret = process_measurement(file, current_cred(), &prop, NULL,
 					  0, MAY_EXEC, MMAP_CHECK_REQPROT);
 		if (ret)
 			return ret;
 	}
 
 	if (prot & PROT_EXEC)
-		return process_measurement(file, current_cred(), secid, NULL,
+		return process_measurement(file, current_cred(), &prop, NULL,
 					   0, MAY_EXEC, MMAP_CHECK);
 
 	return 0;
@@ -488,9 +488,9 @@ static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
 	char *pathbuf = NULL;
 	const char *pathname = NULL;
 	struct inode *inode;
+	struct lsm_prop prop;
 	int result = 0;
 	int action;
-	u32 secid;
 	int pcr;
 
 	/* Is mprotect making an mmap'ed file executable? */
@@ -498,13 +498,13 @@ static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
 	    !(prot & PROT_EXEC) || (vma->vm_flags & VM_EXEC))
 		return 0;
 
-	security_current_getsecid_subj(&secid);
+	security_current_getlsmprop_subj(&prop);
 	inode = file_inode(vma->vm_file);
 	action = ima_get_action(file_mnt_idmap(vma->vm_file), inode,
-				current_cred(), secid, MAY_EXEC, MMAP_CHECK,
+				current_cred(), &prop, MAY_EXEC, MMAP_CHECK,
 				&pcr, &template, NULL, NULL);
 	action |= ima_get_action(file_mnt_idmap(vma->vm_file), inode,
-				 current_cred(), secid, MAY_EXEC,
+				 current_cred(), &prop, MAY_EXEC,
 				 MMAP_CHECK_REQPROT, &pcr, &template, NULL,
 				 NULL);
 
@@ -542,15 +542,18 @@ static int ima_bprm_check(struct linux_binprm *bprm)
 {
 	int ret;
 	u32 secid;
+	struct lsm_prop prop = { };
 
-	security_current_getsecid_subj(&secid);
-	ret = process_measurement(bprm->file, current_cred(), secid, NULL, 0,
-				  MAY_EXEC, BPRM_CHECK);
+	security_current_getlsmprop_subj(&prop);
+	ret = process_measurement(bprm->file, current_cred(),
+				  &prop, NULL, 0, MAY_EXEC, BPRM_CHECK);
 	if (ret)
 		return ret;
 
 	security_cred_getsecid(bprm->cred, &secid);
-	return process_measurement(bprm->file, bprm->cred, secid, NULL, 0,
+	/* scaffolding */
+	prop.scaffold.secid = secid;
+	return process_measurement(bprm->file, bprm->cred, &prop, NULL, 0,
 				   MAY_EXEC, CREDS_CHECK);
 }
 
@@ -566,10 +569,10 @@ static int ima_bprm_check(struct linux_binprm *bprm)
  */
 static int ima_file_check(struct file *file, int mask)
 {
-	u32 secid;
+	struct lsm_prop prop;
 
-	security_current_getsecid_subj(&secid);
-	return process_measurement(file, current_cred(), secid, NULL, 0,
+	security_current_getlsmprop_subj(&prop);
+	return process_measurement(file, current_cred(), &prop, NULL, 0,
 				   mask & (MAY_READ | MAY_WRITE | MAY_EXEC |
 					   MAY_APPEND), FILE_CHECK);
 }
@@ -768,7 +771,7 @@ static int ima_read_file(struct file *file, enum kernel_read_file_id read_id,
 			 bool contents)
 {
 	enum ima_hooks func;
-	u32 secid;
+	struct lsm_prop prop;
 
 	/*
 	 * Do devices using pre-allocated memory run the risk of the
@@ -788,9 +791,9 @@ static int ima_read_file(struct file *file, enum kernel_read_file_id read_id,
 
 	/* Read entire file for all partial reads. */
 	func = read_idmap[read_id] ?: FILE_CHECK;
-	security_current_getsecid_subj(&secid);
-	return process_measurement(file, current_cred(), secid, NULL,
-				   0, MAY_READ, func);
+	security_current_getlsmprop_subj(&prop);
+	return process_measurement(file, current_cred(), &prop, NULL, 0,
+				   MAY_READ, func);
 }
 
 const int read_idmap[READING_MAX_ID] = {
@@ -818,7 +821,7 @@ static int ima_post_read_file(struct file *file, char *buf, loff_t size,
 			      enum kernel_read_file_id read_id)
 {
 	enum ima_hooks func;
-	u32 secid;
+	struct lsm_prop prop;
 
 	/* permit signed certs */
 	if (!file && read_id == READING_X509_CERTIFICATE)
@@ -831,8 +834,8 @@ static int ima_post_read_file(struct file *file, char *buf, loff_t size,
 	}
 
 	func = read_idmap[read_id] ?: FILE_CHECK;
-	security_current_getsecid_subj(&secid);
-	return process_measurement(file, current_cred(), secid, buf, size,
+	security_current_getlsmprop_subj(&prop);
+	return process_measurement(file, current_cred(), &prop, buf, size,
 				   MAY_READ, func);
 }
 
@@ -967,7 +970,7 @@ int process_buffer_measurement(struct mnt_idmap *idmap,
 	int digest_hash_len = hash_digest_size[ima_hash_algo];
 	int violation = 0;
 	int action = 0;
-	u32 secid;
+	struct lsm_prop prop;
 
 	if (digest && digest_len < digest_hash_len)
 		return -EINVAL;
@@ -990,9 +993,9 @@ int process_buffer_measurement(struct mnt_idmap *idmap,
 	 * buffer measurements.
 	 */
 	if (func) {
-		security_current_getsecid_subj(&secid);
+		security_current_getlsmprop_subj(&prop);
 		action = ima_get_action(idmap, inode, current_cred(),
-					secid, 0, func, &pcr, &template,
+					&prop, 0, func, &pcr, &template,
 					func_data, NULL);
 		if (!(action & IMA_MEASURE) && !digest)
 			return -ENOENT;
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 22a62e675ebc..a96dc3ff6aa0 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -557,7 +557,7 @@ static bool ima_match_rule_data(struct ima_rule_entry *rule,
  * @idmap: idmap of the mount the inode was found from
  * @inode: a pointer to an inode
  * @cred: a pointer to a credentials structure for user validation
- * @secid: the secid of the task to be validated
+ * @prop: LSM properties of the task to be validated
  * @func: LIM hook identifier
  * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
  * @func_data: func specific data, may be NULL
@@ -567,7 +567,7 @@ static bool ima_match_rule_data(struct ima_rule_entry *rule,
 static bool ima_match_rules(struct ima_rule_entry *rule,
 			    struct mnt_idmap *idmap,
 			    struct inode *inode, const struct cred *cred,
-			    u32 secid, enum ima_hooks func, int mask,
+			    struct lsm_prop *prop, enum ima_hooks func, int mask,
 			    const char *func_data)
 {
 	int i;
@@ -658,8 +658,6 @@ retry:
 		case LSM_SUBJ_USER:
 		case LSM_SUBJ_ROLE:
 		case LSM_SUBJ_TYPE:
-			/* scaffolding */
-			prop.scaffold.secid = secid;
 			rc = ima_filter_rule_match(&prop, lsm_rule->lsm[i].type,
 						   Audit_equal,
 						   lsm_rule->lsm[i].rule);
@@ -723,7 +721,7 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
  * @inode: pointer to an inode for which the policy decision is being made
  * @cred: pointer to a credentials structure for which the policy decision is
  *        being made
- * @secid: LSM secid of the task to be validated
+ * @prop: LSM properties of the task to be validated
  * @func: IMA hook identifier
  * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
  * @flags: IMA actions to consider (e.g. IMA_MEASURE | IMA_APPRAISE)
@@ -740,8 +738,8 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
  * than writes so ima_match_policy() is classical RCU candidate.
  */
 int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode,
-		     const struct cred *cred, u32 secid, enum ima_hooks func,
-		     int mask, int flags, int *pcr,
+		     const struct cred *cred, struct lsm_prop *prop,
+		     enum ima_hooks func, int mask, int flags, int *pcr,
 		     struct ima_template_desc **template_desc,
 		     const char *func_data, unsigned int *allowed_algos)
 {
@@ -759,7 +757,7 @@ int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode,
 		if (!(entry->action & actmask))
 			continue;
 
-		if (!ima_match_rules(entry, idmap, inode, cred, secid,
+		if (!ima_match_rules(entry, idmap, inode, cred, prop,
 				     func, mask, func_data))
 			continue;
 
diff --git a/security/security.c b/security/security.c
index f269421c2d72..5cdb5b171ff2 100644
--- a/security/security.c
+++ b/security/security.c
@@ -3492,33 +3492,33 @@ int security_task_getsid(struct task_struct *p)
 }
 
 /**
- * security_current_getsecid_subj() - Get the current task's subjective secid
- * @secid: secid value
+ * security_current_getlsmprop_subj() - Current task's subjective LSM data
+ * @prop: lsm specific information
  *
  * Retrieve the subjective security identifier of the current task and return
- * it in @secid.  In case of failure, @secid will be set to zero.
+ * it in @prop.
  */
-void security_current_getsecid_subj(u32 *secid)
+void security_current_getlsmprop_subj(struct lsm_prop *prop)
 {
-	*secid = 0;
-	call_void_hook(current_getsecid_subj, secid);
+	lsmprop_init(prop);
+	call_void_hook(current_getlsmprop_subj, prop);
 }
-EXPORT_SYMBOL(security_current_getsecid_subj);
+EXPORT_SYMBOL(security_current_getlsmprop_subj);
 
 /**
- * security_task_getsecid_obj() - Get a task's objective secid
+ * security_task_getlsmprop_obj() - Get a task's objective LSM data
  * @p: target task
- * @secid: secid value
+ * @prop: lsm specific information
  *
  * Retrieve the objective security identifier of the task_struct in @p and
- * return it in @secid. In case of failure, @secid will be set to zero.
+ * return it in @prop.
  */
-void security_task_getsecid_obj(struct task_struct *p, u32 *secid)
+void security_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop)
 {
-	*secid = 0;
-	call_void_hook(task_getsecid_obj, p, secid);
+	lsmprop_init(prop);
+	call_void_hook(task_getlsmprop_obj, p, prop);
 }
-EXPORT_SYMBOL(security_task_getsecid_obj);
+EXPORT_SYMBOL(security_task_getlsmprop_obj);
 
 /**
  * security_task_setnice() - Check if setting a task's nice value is allowed
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 1d43367009ed..7d6ffd3483a8 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4169,14 +4169,19 @@ static int selinux_task_getsid(struct task_struct *p)
 			    PROCESS__GETSESSION, NULL);
 }
 
-static void selinux_current_getsecid_subj(u32 *secid)
+static void selinux_current_getlsmprop_subj(struct lsm_prop *prop)
 {
-	*secid = current_sid();
+	prop->selinux.secid = current_sid();
+	/* scaffolding */
+	prop->scaffold.secid = prop->selinux.secid;
 }
 
-static void selinux_task_getsecid_obj(struct task_struct *p, u32 *secid)
+static void selinux_task_getlsmprop_obj(struct task_struct *p,
+					struct lsm_prop *prop)
 {
-	*secid = task_sid_obj(p);
+	prop->selinux.secid = task_sid_obj(p);
+	/* scaffolding */
+	prop->scaffold.secid = prop->selinux.secid;
 }
 
 static int selinux_task_setnice(struct task_struct *p, int nice)
@@ -7203,8 +7208,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid),
 	LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid),
 	LSM_HOOK_INIT(task_getsid, selinux_task_getsid),
-	LSM_HOOK_INIT(current_getsecid_subj, selinux_current_getsecid_subj),
-	LSM_HOOK_INIT(task_getsecid_obj, selinux_task_getsecid_obj),
+	LSM_HOOK_INIT(current_getlsmprop_subj, selinux_current_getlsmprop_subj),
+	LSM_HOOK_INIT(task_getlsmprop_obj, selinux_task_getlsmprop_obj),
 	LSM_HOOK_INIT(task_setnice, selinux_task_setnice),
 	LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio),
 	LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index fed44b4fc73d..e74e22c4232f 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -2239,30 +2239,35 @@ static int smack_task_getsid(struct task_struct *p)
 }
 
 /**
- * smack_current_getsecid_subj - get the subjective secid of the current task
- * @secid: where to put the result
+ * smack_current_getlsmprop_subj - get the subjective secid of the current task
+ * @prop: where to put the result
  *
  * Sets the secid to contain a u32 version of the task's subjective smack label.
  */
-static void smack_current_getsecid_subj(u32 *secid)
+static void smack_current_getlsmprop_subj(struct lsm_prop *prop)
 {
 	struct smack_known *skp = smk_of_current();
 
-	*secid = skp->smk_secid;
+	prop->smack.skp = skp;
+	/* scaffolding */
+	prop->scaffold.secid = skp->smk_secid;
 }
 
 /**
- * smack_task_getsecid_obj - get the objective secid of the task
+ * smack_task_getlsmprop_obj - get the objective data of the task
  * @p: the task
- * @secid: where to put the result
+ * @prop: where to put the result
  *
  * Sets the secid to contain a u32 version of the task's objective smack label.
  */
-static void smack_task_getsecid_obj(struct task_struct *p, u32 *secid)
+static void smack_task_getlsmprop_obj(struct task_struct *p,
+				      struct lsm_prop *prop)
 {
 	struct smack_known *skp = smk_of_task_struct_obj(p);
 
-	*secid = skp->smk_secid;
+	prop->smack.skp = skp;
+	/* scaffolding */
+	prop->scaffold.secid = skp->smk_secid;
 }
 
 /**
@@ -5130,8 +5135,8 @@ static struct security_hook_list smack_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(task_setpgid, smack_task_setpgid),
 	LSM_HOOK_INIT(task_getpgid, smack_task_getpgid),
 	LSM_HOOK_INIT(task_getsid, smack_task_getsid),
-	LSM_HOOK_INIT(current_getsecid_subj, smack_current_getsecid_subj),
-	LSM_HOOK_INIT(task_getsecid_obj, smack_task_getsecid_obj),
+	LSM_HOOK_INIT(current_getlsmprop_subj, smack_current_getlsmprop_subj),
+	LSM_HOOK_INIT(task_getlsmprop_obj, smack_task_getlsmprop_obj),
 	LSM_HOOK_INIT(task_setnice, smack_task_setnice),
 	LSM_HOOK_INIT(task_setioprio, smack_task_setioprio),
 	LSM_HOOK_INIT(task_getioprio, smack_task_getioprio),
-- 
cgit v1.2.3


From 07f9d2c1132c9b838538b606dfcdab2506cd2ae4 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 9 Oct 2024 10:32:16 -0700
Subject: lsm: use lsm_prop in security_inode_getsecid

Change the security_inode_getsecid() interface to fill in a
lsm_prop structure instead of a u32 secid. This allows for its
callers to gather data from all registered LSMs. Data is provided
for IMA and audit. Change the name to security_inode_getlsmprop().

Cc: linux-integrity@vger.kernel.org
Cc: selinux@vger.kernel.org
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
[PM: subj line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h       |  3 ++-
 include/linux/security.h            |  7 ++++---
 kernel/auditsc.c                    |  6 +++++-
 security/integrity/ima/ima_policy.c |  3 +--
 security/security.c                 | 11 +++++------
 security/selinux/hooks.c            | 15 +++++++++------
 security/smack/smack_lsm.c          | 12 +++++++-----
 7 files changed, 33 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 8a90fd9ff3c8..23ad7e4f8c67 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -176,7 +176,8 @@ LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode,
 	 const char *name, const void *value, size_t size, int flags)
 LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer,
 	 size_t buffer_size)
-LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid)
+LSM_HOOK(void, LSM_RET_VOID, inode_getlsmprop, struct inode *inode,
+	 struct lsm_prop *prop)
 LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new)
 LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, struct dentry *src,
 	 const char *name)
diff --git a/include/linux/security.h b/include/linux/security.h
index 9bc8153f4e8b..2b19ef5d799c 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -452,7 +452,7 @@ int security_inode_getsecurity(struct mnt_idmap *idmap,
 			       void **buffer, bool alloc);
 int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags);
 int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size);
-void security_inode_getsecid(struct inode *inode, u32 *secid);
+void security_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop);
 int security_inode_copy_up(struct dentry *src, struct cred **new);
 int security_inode_copy_up_xattr(struct dentry *src, const char *name);
 int security_inode_setintegrity(const struct inode *inode,
@@ -1076,9 +1076,10 @@ static inline int security_inode_listsecurity(struct inode *inode, char *buffer,
 	return 0;
 }
 
-static inline void security_inode_getsecid(struct inode *inode, u32 *secid)
+static inline void security_inode_getlsmprop(struct inode *inode,
+					     struct lsm_prop *prop)
 {
-	*secid = 0;
+	lsmprop_init(prop);
 }
 
 static inline int security_inode_copy_up(struct dentry *src, struct cred **new)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 6b2b2a8d5647..372302f0528b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2276,13 +2276,17 @@ static void audit_copy_inode(struct audit_names *name,
 			     const struct dentry *dentry,
 			     struct inode *inode, unsigned int flags)
 {
+	struct lsm_prop prop;
+
 	name->ino   = inode->i_ino;
 	name->dev   = inode->i_sb->s_dev;
 	name->mode  = inode->i_mode;
 	name->uid   = inode->i_uid;
 	name->gid   = inode->i_gid;
 	name->rdev  = inode->i_rdev;
-	security_inode_getsecid(inode, &name->osid);
+	security_inode_getlsmprop(inode, &prop);
+	/* scaffolding */
+	name->osid = prop.scaffold.secid;
 	if (flags & AUDIT_INODE_NOEVAL) {
 		name->fcap_ver = -1;
 		return;
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index a96dc3ff6aa0..dbfd554b4624 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -649,8 +649,7 @@ retry:
 		case LSM_OBJ_USER:
 		case LSM_OBJ_ROLE:
 		case LSM_OBJ_TYPE:
-			/* scaffolding */
-			security_inode_getsecid(inode, &prop.scaffold.secid);
+			security_inode_getlsmprop(inode, &prop);
 			rc = ima_filter_rule_match(&prop, lsm_rule->lsm[i].type,
 						   Audit_equal,
 						   lsm_rule->lsm[i].rule);
diff --git a/security/security.c b/security/security.c
index 5cdb5b171ff2..5e76e35dda09 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2724,16 +2724,15 @@ int security_inode_listsecurity(struct inode *inode,
 EXPORT_SYMBOL(security_inode_listsecurity);
 
 /**
- * security_inode_getsecid() - Get an inode's secid
+ * security_inode_getlsmprop() - Get an inode's LSM data
  * @inode: inode
- * @secid: secid to return
+ * @prop: lsm specific information to return
  *
- * Get the secid associated with the node.  In case of failure, @secid will be
- * set to zero.
+ * Get the lsm specific information associated with the node.
  */
-void security_inode_getsecid(struct inode *inode, u32 *secid)
+void security_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
 {
-	call_void_hook(inode_getsecid, inode, secid);
+	call_void_hook(inode_getlsmprop, inode, prop);
 }
 
 /**
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7d6ffd3483a8..a9f8d10062b5 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3503,15 +3503,18 @@ static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t
 	return len;
 }
 
-static void selinux_inode_getsecid(struct inode *inode, u32 *secid)
+static void selinux_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
 {
 	struct inode_security_struct *isec = inode_security_novalidate(inode);
-	*secid = isec->sid;
+
+	prop->selinux.secid = isec->sid;
+	/* scaffolding */
+	prop->scaffold.secid = isec->sid;
 }
 
 static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
 {
-	u32 sid;
+	struct lsm_prop prop;
 	struct task_security_struct *tsec;
 	struct cred *new_creds = *new;
 
@@ -3523,8 +3526,8 @@ static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
 
 	tsec = selinux_cred(new_creds);
 	/* Get label from overlay inode and set it in create_sid */
-	selinux_inode_getsecid(d_inode(src), &sid);
-	tsec->create_sid = sid;
+	selinux_inode_getlsmprop(d_inode(src), &prop);
+	tsec->create_sid = prop.selinux.secid;
 	*new = new_creds;
 	return 0;
 }
@@ -7174,7 +7177,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(inode_getsecurity, selinux_inode_getsecurity),
 	LSM_HOOK_INIT(inode_setsecurity, selinux_inode_setsecurity),
 	LSM_HOOK_INIT(inode_listsecurity, selinux_inode_listsecurity),
-	LSM_HOOK_INIT(inode_getsecid, selinux_inode_getsecid),
+	LSM_HOOK_INIT(inode_getlsmprop, selinux_inode_getlsmprop),
 	LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up),
 	LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr),
 	LSM_HOOK_INIT(path_notify, selinux_path_notify),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index e74e22c4232f..5c3cb2837105 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1649,15 +1649,17 @@ static int smack_inode_listsecurity(struct inode *inode, char *buffer,
 }
 
 /**
- * smack_inode_getsecid - Extract inode's security id
+ * smack_inode_getlsmprop - Extract inode's security id
  * @inode: inode to extract the info from
- * @secid: where result will be saved
+ * @prop: where result will be saved
  */
-static void smack_inode_getsecid(struct inode *inode, u32 *secid)
+static void smack_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
 {
 	struct smack_known *skp = smk_of_inode(inode);
 
-	*secid = skp->smk_secid;
+	prop->smack.skp = skp;
+	/* scaffolding */
+	prop->scaffold.secid = skp->smk_secid;
 }
 
 /*
@@ -5110,7 +5112,7 @@ static struct security_hook_list smack_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(inode_getsecurity, smack_inode_getsecurity),
 	LSM_HOOK_INIT(inode_setsecurity, smack_inode_setsecurity),
 	LSM_HOOK_INIT(inode_listsecurity, smack_inode_listsecurity),
-	LSM_HOOK_INIT(inode_getsecid, smack_inode_getsecid),
+	LSM_HOOK_INIT(inode_getlsmprop, smack_inode_getlsmprop),
 
 	LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
 	LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
-- 
cgit v1.2.3


From b0654ca42998440df42ba2ccc3b7dbe3bf5b7bb5 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 9 Oct 2024 10:32:18 -0700
Subject: lsm: create new security_cred_getlsmprop LSM hook

Create a new LSM hook security_cred_getlsmprop() which, like
security_cred_getsecid(), fetches LSM specific attributes from the
cred structure.  The associated data elements in the audit sub-system
are changed from a secid to a lsm_prop to accommodate multiple possible
LSM audit users.

Cc: linux-integrity@vger.kernel.org
Cc: audit@vger.kernel.org
Cc: selinux@vger.kernel.org
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
[PM: subj line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h     |  2 ++
 include/linux/security.h          |  5 +++++
 security/integrity/ima/ima_main.c |  7 ++-----
 security/security.c               | 15 +++++++++++++++
 security/selinux/hooks.c          |  8 ++++++++
 security/smack/smack_lsm.c        | 18 ++++++++++++++++++
 6 files changed, 50 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 23ad7e4f8c67..eb2937599cb0 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -218,6 +218,8 @@ LSM_HOOK(int, 0, cred_prepare, struct cred *new, const struct cred *old,
 LSM_HOOK(void, LSM_RET_VOID, cred_transfer, struct cred *new,
 	 const struct cred *old)
 LSM_HOOK(void, LSM_RET_VOID, cred_getsecid, const struct cred *c, u32 *secid)
+LSM_HOOK(void, LSM_RET_VOID, cred_getlsmprop, const struct cred *c,
+	 struct lsm_prop *prop)
 LSM_HOOK(int, 0, kernel_act_as, struct cred *new, u32 secid)
 LSM_HOOK(int, 0, kernel_create_files_as, struct cred *new, struct inode *inode)
 LSM_HOOK(int, 0, kernel_module_request, char *kmod_name)
diff --git a/include/linux/security.h b/include/linux/security.h
index 2b19ef5d799c..acd2e5d1b0ff 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -488,6 +488,7 @@ void security_cred_free(struct cred *cred);
 int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp);
 void security_transfer_creds(struct cred *new, const struct cred *old);
 void security_cred_getsecid(const struct cred *c, u32 *secid);
+void security_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop);
 int security_kernel_act_as(struct cred *new, u32 secid);
 int security_kernel_create_files_as(struct cred *new, struct inode *inode);
 int security_kernel_module_request(char *kmod_name);
@@ -1229,6 +1230,10 @@ static inline void security_cred_getsecid(const struct cred *c, u32 *secid)
 	*secid = 0;
 }
 
+static inline void security_cred_getlsmprop(const struct cred *c,
+					    struct lsm_prop *prop)
+{ }
+
 static inline int security_kernel_act_as(struct cred *cred, u32 secid)
 {
 	return 0;
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 553a6f54a1e2..91154bb05900 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -541,8 +541,7 @@ static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
 static int ima_bprm_check(struct linux_binprm *bprm)
 {
 	int ret;
-	u32 secid;
-	struct lsm_prop prop = { };
+	struct lsm_prop prop;
 
 	security_current_getlsmprop_subj(&prop);
 	ret = process_measurement(bprm->file, current_cred(),
@@ -550,9 +549,7 @@ static int ima_bprm_check(struct linux_binprm *bprm)
 	if (ret)
 		return ret;
 
-	security_cred_getsecid(bprm->cred, &secid);
-	/* scaffolding */
-	prop.scaffold.secid = secid;
+	security_cred_getlsmprop(bprm->cred, &prop);
 	return process_measurement(bprm->file, bprm->cred, &prop, NULL, 0,
 				   MAY_EXEC, CREDS_CHECK);
 }
diff --git a/security/security.c b/security/security.c
index 5e76e35dda09..0003d5ace5cc 100644
--- a/security/security.c
+++ b/security/security.c
@@ -3272,6 +3272,21 @@ void security_cred_getsecid(const struct cred *c, u32 *secid)
 }
 EXPORT_SYMBOL(security_cred_getsecid);
 
+/**
+ * security_cred_getlsmprop() - Get the LSM data from a set of credentials
+ * @c: credentials
+ * @prop: destination for the LSM data
+ *
+ * Retrieve the security data of the cred structure @c.  In case of
+ * failure, @prop will be cleared.
+ */
+void security_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop)
+{
+	lsmprop_init(prop);
+	call_void_hook(cred_getlsmprop, c, prop);
+}
+EXPORT_SYMBOL(security_cred_getlsmprop);
+
 /**
  * security_kernel_act_as() - Set the kernel credentials to act as secid
  * @new: credentials
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a9f8d10062b5..438dfa17faae 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4037,6 +4037,13 @@ static void selinux_cred_getsecid(const struct cred *c, u32 *secid)
 	*secid = cred_sid(c);
 }
 
+static void selinux_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop)
+{
+	prop->selinux.secid = cred_sid(c);
+	/* scaffolding */
+	prop->scaffold.secid = prop->selinux.secid;
+}
+
 /*
  * set the security data for a kernel service
  * - all the creation contexts are set to unlabelled
@@ -7203,6 +7210,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare),
 	LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer),
 	LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid),
+	LSM_HOOK_INIT(cred_getlsmprop, selinux_cred_getlsmprop),
 	LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as),
 	LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as),
 	LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 5c3cb2837105..52cc6a65d674 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -2150,6 +2150,23 @@ static void smack_cred_getsecid(const struct cred *cred, u32 *secid)
 	rcu_read_unlock();
 }
 
+/**
+ * smack_cred_getlsmprop - get the Smack label for a creds structure
+ * @cred: the object creds
+ * @prop: where to put the data
+ *
+ * Sets the Smack part of the ref
+ */
+static void smack_cred_getlsmprop(const struct cred *cred,
+				  struct lsm_prop *prop)
+{
+	rcu_read_lock();
+	prop->smack.skp = smk_of_task(smack_cred(cred));
+	/* scaffolding */
+	prop->scaffold.secid = prop->smack.skp->smk_secid;
+	rcu_read_unlock();
+}
+
 /**
  * smack_kernel_act_as - Set the subjective context in a set of credentials
  * @new: points to the set of credentials to be modified.
@@ -5132,6 +5149,7 @@ static struct security_hook_list smack_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(cred_prepare, smack_cred_prepare),
 	LSM_HOOK_INIT(cred_transfer, smack_cred_transfer),
 	LSM_HOOK_INIT(cred_getsecid, smack_cred_getsecid),
+	LSM_HOOK_INIT(cred_getlsmprop, smack_cred_getlsmprop),
 	LSM_HOOK_INIT(kernel_act_as, smack_kernel_act_as),
 	LSM_HOOK_INIT(kernel_create_files_as, smack_kernel_create_files_as),
 	LSM_HOOK_INIT(task_setpgid, smack_task_setpgid),
-- 
cgit v1.2.3


From 8afd8c8faa24249e48f5007aee46209299377588 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 9 Oct 2024 10:32:21 -0700
Subject: lsm: remove lsm_prop scaffolding

Remove the scaffold member from the lsm_prop. Remove the
remaining places it is being set.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
[PM: subj line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h       |  6 ------
 security/apparmor/audit.c      |  6 +-----
 security/apparmor/lsm.c        |  4 ----
 security/apparmor/secid.c      |  6 +-----
 security/selinux/hooks.c       | 18 +-----------------
 security/selinux/ss/services.c |  4 ----
 security/smack/smack_lsm.c     | 33 ++++-----------------------------
 7 files changed, 7 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index acd2e5d1b0ff..fd690fa73162 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -156,11 +156,6 @@ enum lockdown_reason {
 	LOCKDOWN_CONFIDENTIALITY_MAX,
 };
 
-/* scaffolding */
-struct lsm_prop_scaffold {
-	u32 secid;
-};
-
 /*
  * Data exported by the security modules
  */
@@ -169,7 +164,6 @@ struct lsm_prop {
 	struct lsm_prop_smack smack;
 	struct lsm_prop_apparmor apparmor;
 	struct lsm_prop_bpf bpf;
-	struct lsm_prop_scaffold scaffold;
 };
 
 extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1];
diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c
index 87df6fa2a48d..73087d76f649 100644
--- a/security/apparmor/audit.c
+++ b/security/apparmor/audit.c
@@ -270,11 +270,7 @@ int aa_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule)
 	struct aa_label *label;
 	int found = 0;
 
-	/* scaffolding */
-	if (!prop->apparmor.label && prop->scaffold.secid)
-		label = aa_secid_to_label(prop->scaffold.secid);
-	else
-		label = prop->apparmor.label;
+	label = prop->apparmor.label;
 
 	if (!label)
 		return -ENOENT;
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 6331bcb35ec0..1edc12862a7d 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -987,8 +987,6 @@ static void apparmor_current_getlsmprop_subj(struct lsm_prop *prop)
 	struct aa_label *label = __begin_current_label_crit_section();
 
 	prop->apparmor.label = label;
-	/* scaffolding */
-	prop->scaffold.secid = label->secid;
 	__end_current_label_crit_section(label);
 }
 
@@ -998,8 +996,6 @@ static void apparmor_task_getlsmprop_obj(struct task_struct *p,
 	struct aa_label *label = aa_get_task_label(p);
 
 	prop->apparmor.label = label;
-	/* scaffolding */
-	prop->scaffold.secid = label->secid;
 	aa_put_label(label);
 }
 
diff --git a/security/apparmor/secid.c b/security/apparmor/secid.c
index 34610888559f..6350d107013a 100644
--- a/security/apparmor/secid.c
+++ b/security/apparmor/secid.c
@@ -102,11 +102,7 @@ int apparmor_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
 {
 	struct aa_label *label;
 
-	/* scaffolding */
-	if (!prop->apparmor.label && prop->scaffold.secid)
-		label = aa_secid_to_label(prop->scaffold.secid);
-	else
-		label = prop->apparmor.label;
+	label = prop->apparmor.label;
 
 	return apparmor_label_to_secctx(label, secdata, seclen);
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 438dfa17faae..025b60c5b605 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3508,8 +3508,6 @@ static void selinux_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
 	struct inode_security_struct *isec = inode_security_novalidate(inode);
 
 	prop->selinux.secid = isec->sid;
-	/* scaffolding */
-	prop->scaffold.secid = isec->sid;
 }
 
 static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
@@ -4040,8 +4038,6 @@ static void selinux_cred_getsecid(const struct cred *c, u32 *secid)
 static void selinux_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop)
 {
 	prop->selinux.secid = cred_sid(c);
-	/* scaffolding */
-	prop->scaffold.secid = prop->selinux.secid;
 }
 
 /*
@@ -4182,16 +4178,12 @@ static int selinux_task_getsid(struct task_struct *p)
 static void selinux_current_getlsmprop_subj(struct lsm_prop *prop)
 {
 	prop->selinux.secid = current_sid();
-	/* scaffolding */
-	prop->scaffold.secid = prop->selinux.secid;
 }
 
 static void selinux_task_getlsmprop_obj(struct task_struct *p,
 					struct lsm_prop *prop)
 {
 	prop->selinux.secid = task_sid_obj(p);
-	/* scaffolding */
-	prop->scaffold.secid = prop->selinux.secid;
 }
 
 static int selinux_task_setnice(struct task_struct *p, int nice)
@@ -6339,8 +6331,6 @@ static void selinux_ipc_getlsmprop(struct kern_ipc_perm *ipcp,
 {
 	struct ipc_security_struct *isec = selinux_ipc(ipcp);
 	prop->selinux.secid = isec->sid;
-	/* scaffolding */
-	prop->scaffold.secid = isec->sid;
 }
 
 static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode)
@@ -6625,13 +6615,7 @@ static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 static int selinux_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
 				     u32 *seclen)
 {
-	u32 secid = prop->selinux.secid;
-
-	/* scaffolding */
-	if (!secid)
-		secid = prop->scaffold.secid;
-
-	return selinux_secid_to_secctx(secid, secdata, seclen);
+	return selinux_secid_to_secctx(prop->selinux.secid, secdata, seclen);
 }
 
 static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index e0c14773a7b7..07ba5b3643dd 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -3661,10 +3661,6 @@ int selinux_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vru
 		goto out;
 	}
 
-	/* scaffolding */
-	if (!prop->selinux.secid && prop->scaffold.secid)
-		prop->selinux.secid = prop->scaffold.secid;
-
 	ctxt = sidtab_search(policy->sidtab, prop->selinux.secid);
 	if (unlikely(!ctxt)) {
 		WARN_ONCE(1, "selinux_audit_rule_match: unrecognized SID %d\n",
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 52cc6a65d674..0c476282e279 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1655,11 +1655,7 @@ static int smack_inode_listsecurity(struct inode *inode, char *buffer,
  */
 static void smack_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
 {
-	struct smack_known *skp = smk_of_inode(inode);
-
-	prop->smack.skp = skp;
-	/* scaffolding */
-	prop->scaffold.secid = skp->smk_secid;
+	prop->smack.skp = smk_of_inode(inode);
 }
 
 /*
@@ -2162,8 +2158,6 @@ static void smack_cred_getlsmprop(const struct cred *cred,
 {
 	rcu_read_lock();
 	prop->smack.skp = smk_of_task(smack_cred(cred));
-	/* scaffolding */
-	prop->scaffold.secid = prop->smack.skp->smk_secid;
 	rcu_read_unlock();
 }
 
@@ -2265,11 +2259,7 @@ static int smack_task_getsid(struct task_struct *p)
  */
 static void smack_current_getlsmprop_subj(struct lsm_prop *prop)
 {
-	struct smack_known *skp = smk_of_current();
-
-	prop->smack.skp = skp;
-	/* scaffolding */
-	prop->scaffold.secid = skp->smk_secid;
+	prop->smack.skp = smk_of_current();
 }
 
 /**
@@ -2282,11 +2272,7 @@ static void smack_current_getlsmprop_subj(struct lsm_prop *prop)
 static void smack_task_getlsmprop_obj(struct task_struct *p,
 				      struct lsm_prop *prop)
 {
-	struct smack_known *skp = smk_of_task_struct_obj(p);
-
-	prop->smack.skp = skp;
-	/* scaffolding */
-	prop->scaffold.secid = skp->smk_secid;
+	prop->smack.skp = smk_of_task_struct_obj(p);
 }
 
 /**
@@ -3466,11 +3452,8 @@ static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
 static void smack_ipc_getlsmprop(struct kern_ipc_perm *ipp, struct lsm_prop *prop)
 {
 	struct smack_known **iskpp = smack_ipc(ipp);
-	struct smack_known *iskp = *iskpp;
 
-	prop->smack.skp = iskp;
-	/* scaffolding */
-	prop->scaffold.secid = iskp->smk_secid;
+	prop->smack.skp = *iskpp;
 }
 
 /**
@@ -4805,10 +4788,6 @@ static int smack_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
 	if (field != AUDIT_SUBJ_USER && field != AUDIT_OBJ_USER)
 		return 0;
 
-	/* scaffolding */
-	if (!skp && prop->scaffold.secid)
-		skp = smack_from_secid(prop->scaffold.secid);
-
 	/*
 	 * No need to do string comparisons. If a match occurs,
 	 * both pointers will point to the same smack_known
@@ -4869,10 +4848,6 @@ static int smack_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
 {
 	struct smack_known *skp = prop->smack.skp;
 
-	/* scaffolding */
-	if (!skp && prop->scaffold.secid)
-		skp = smack_from_secid(prop->scaffold.secid);
-
 	if (secdata)
 		*secdata = skp->smk_known;
 	*seclen = strlen(skp->smk_known);
-- 
cgit v1.2.3


From 8fa714ca334e0880665a14fed13b16e3e01a67b2 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 10 Oct 2024 21:15:35 +0300
Subject: iio: Convert unsigned to unsigned int

Simple type conversion with no functional change implied.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20241010181535.3083262-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio-opaque.h |  2 +-
 include/linux/iio/iio.h        | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index 5aec3945555b..a89e7e43e441 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -70,7 +70,7 @@ struct iio_dev_opaque {
 
 #if defined(CONFIG_DEBUG_FS)
 	struct dentry			*debugfs_dentry;
-	unsigned			cached_reg_addr;
+	unsigned int			cached_reg_addr;
 	char				read_buf[20];
 	unsigned int			read_buf_len;
 #endif
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 18779b631e90..3a9b57187a95 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -282,11 +282,11 @@ struct iio_chan_spec {
 	const struct iio_chan_spec_ext_info *ext_info;
 	const char		*extend_name;
 	const char		*datasheet_name;
-	unsigned		modified:1;
-	unsigned		indexed:1;
-	unsigned		output:1;
-	unsigned		differential:1;
-	unsigned		has_ext_scan_type:1;
+	unsigned int		modified:1;
+	unsigned int		indexed:1;
+	unsigned int		output:1;
+	unsigned int		differential:1;
+	unsigned int		has_ext_scan_type:1;
 };
 
 
@@ -541,13 +541,13 @@ struct iio_info {
 	int (*update_scan_mode)(struct iio_dev *indio_dev,
 				const unsigned long *scan_mask);
 	int (*debugfs_reg_access)(struct iio_dev *indio_dev,
-				  unsigned reg, unsigned writeval,
-				  unsigned *readval);
+				  unsigned int reg, unsigned int writeval,
+				  unsigned int *readval);
 	int (*fwnode_xlate)(struct iio_dev *indio_dev,
 			    const struct fwnode_reference_args *iiospec);
-	int (*hwfifo_set_watermark)(struct iio_dev *indio_dev, unsigned val);
+	int (*hwfifo_set_watermark)(struct iio_dev *indio_dev, unsigned int val);
 	int (*hwfifo_flush_to_buffer)(struct iio_dev *indio_dev,
-				      unsigned count);
+				      unsigned int count);
 };
 
 /**
@@ -609,7 +609,7 @@ struct iio_dev {
 	int				scan_bytes;
 
 	const unsigned long		*available_scan_masks;
-	unsigned			__private masklength;
+	unsigned int			__private masklength;
 	const unsigned long		*active_scan_mask;
 	bool				scan_timestamp;
 	struct iio_trigger		*trig;
-- 
cgit v1.2.3


From 454bbde8f0d465e93e5a3a4003ac6c7e62fa4473 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Wed, 9 Oct 2024 10:28:19 +0800
Subject: net: skb: add pskb_network_may_pull_reason() helper

Introduce the function pskb_network_may_pull_reason() and make
pskb_network_may_pull() a simple inline call to it. The drop reasons of
it just come from pskb_may_pull_reason.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 39f1d16f3628..48f1e0fa2a13 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3130,9 +3130,15 @@ static inline int skb_inner_network_offset(const struct sk_buff *skb)
 	return skb_inner_network_header(skb) - skb->data;
 }
 
+static inline enum skb_drop_reason
+pskb_network_may_pull_reason(struct sk_buff *skb, unsigned int len)
+{
+	return pskb_may_pull_reason(skb, skb_network_offset(skb) + len);
+}
+
 static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len)
 {
-	return pskb_may_pull(skb, skb_network_offset(skb) + len);
+	return pskb_network_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET;
 }
 
 /*
-- 
cgit v1.2.3


From 7948483001039c00dcff4b94c181d7e14693a38c Mon Sep 17 00:00:00 2001
From: Gerhard Engleder <eg@keba.com>
Date: Fri, 11 Oct 2024 21:12:52 +0200
Subject: misc: keba: Add SPI controller device

Add support for the SPI controller auxiliary device. This enables access
to the SPI flash of the FPGA and some other SPI devices.

The actual list of SPI devices is detected by reading some bits out of
the previously registered I2C EEPROM.

Signed-off-by: Gerhard Engleder <eg@keba.com>
Link: https://lore.kernel.org/r/20241011191257.19702-4-gerhard@engleder-embedded.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/keba/cp500.c | 219 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/misc/keba.h |  15 ++++
 2 files changed, 219 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/keba/cp500.c b/drivers/misc/keba/cp500.c
index 1eee130c3a7f..7cebf2929390 100644
--- a/drivers/misc/keba/cp500.c
+++ b/drivers/misc/keba/cp500.c
@@ -12,7 +12,11 @@
 #include <linux/i2c.h>
 #include <linux/misc/keba.h>
 #include <linux/module.h>
+#include <linux/mtd/partitions.h>
+#include <linux/nvmem-consumer.h>
 #include <linux/pci.h>
+#include <linux/spi/flash.h>
+#include <linux/spi/spi.h>
 
 #define CP500 "cp500"
 
@@ -43,6 +47,16 @@
 
 /* EEPROM */
 #define CP500_HW_CPU_EEPROM_NAME	"cp500_cpu_eeprom"
+#define CP500_EEPROM_DA_OFFSET		0x016F
+#define CP500_EEPROM_DA_ESC_TYPE_MASK	0x01
+#define CP500_EEPROM_ESC_LAN9252	0x00
+#define CP500_EEPROM_ESC_ET1100		0x01
+
+/* SPI flash running at full speed */
+#define CP500_FLASH_HZ		(33 * 1000 * 1000)
+
+/* LAN9252 */
+#define CP500_LAN9252_HZ	(10 * 1000 * 1000)
 
 #define CP500_IS_CP035(dev)	((dev)->pci_dev->device == PCI_DEVICE_ID_KEBA_CP035)
 #define CP500_IS_CP505(dev)	((dev)->pci_dev->device == PCI_DEVICE_ID_KEBA_CP505)
@@ -55,25 +69,29 @@ struct cp500_dev_info {
 
 struct cp500_devs {
 	struct cp500_dev_info startup;
+	struct cp500_dev_info spi;
 	struct cp500_dev_info i2c;
 };
 
 /* list of devices within FPGA of CP035 family (CP035, CP056, CP057) */
 static struct cp500_devs cp035_devices = {
-	.startup   = { 0x0000, SZ_4K },
-	.i2c       = { 0x4000, SZ_4K },
+	.startup = { 0x0000, SZ_4K },
+	.spi	 = { 0x1000, SZ_4K },
+	.i2c     = { 0x4000, SZ_4K },
 };
 
 /* list of devices within FPGA of CP505 family (CP503, CP505, CP507) */
 static struct cp500_devs cp505_devices = {
-	.startup   = { 0x0000, SZ_4K },
-	.i2c       = { 0x5000, SZ_4K },
+	.startup = { 0x0000, SZ_4K },
+	.spi     = { 0x4000, SZ_4K },
+	.i2c     = { 0x5000, SZ_4K },
 };
 
 /* list of devices within FPGA of CP520 family (CP520, CP530) */
 static struct cp500_devs cp520_devices = {
-	.startup     = { 0x0000, SZ_4K },
-	.i2c         = { 0x5000, SZ_4K },
+	.startup = { 0x0000, SZ_4K },
+	.spi     = { 0x4000, SZ_4K },
+	.i2c     = { 0x5000, SZ_4K },
 };
 
 struct cp500 {
@@ -85,9 +103,12 @@ struct cp500 {
 		int minor;
 		int build;
 	} version;
+	struct notifier_block nvmem_notifier;
+	atomic_t nvmem_notified;
 
 	/* system FPGA BAR */
 	resource_size_t sys_hwbase;
+	struct keba_spi_auxdev *spi;
 	struct keba_i2c_auxdev *i2c;
 
 	/* ECM EtherCAT BAR */
@@ -97,6 +118,7 @@ struct cp500 {
 };
 
 /* I2C devices */
+#define CP500_EEPROM_ADDR	0x50
 static struct i2c_board_info cp500_i2c_info[] = {
 	{	/* temperature sensor */
 		I2C_BOARD_INFO("emc1403", 0x4c),
@@ -107,30 +129,67 @@ static struct i2c_board_info cp500_i2c_info[] = {
 		 * CP505 family: bridge board
 		 * CP520 family: carrier board
 		 */
-		I2C_BOARD_INFO("24c32", 0x50),
+		I2C_BOARD_INFO("24c32", CP500_EEPROM_ADDR),
 		.dev_name = CP500_HW_CPU_EEPROM_NAME,
 	},
 	{	/* interface board EEPROM */
-		I2C_BOARD_INFO("24c32", 0x51),
+		I2C_BOARD_INFO("24c32", CP500_EEPROM_ADDR + 1),
 	},
 	{	/*
 		 * EEPROM (optional)
 		 * CP505 family: CPU board
 		 * CP520 family: MMI board
 		 */
-		I2C_BOARD_INFO("24c32", 0x52),
+		I2C_BOARD_INFO("24c32", CP500_EEPROM_ADDR + 2),
 	},
 	{	/* extension module 0 EEPROM (optional) */
-		I2C_BOARD_INFO("24c32", 0x53),
+		I2C_BOARD_INFO("24c32", CP500_EEPROM_ADDR + 3),
 	},
 	{	/* extension module 1 EEPROM (optional) */
-		I2C_BOARD_INFO("24c32", 0x54),
+		I2C_BOARD_INFO("24c32", CP500_EEPROM_ADDR + 4),
 	},
 	{	/* extension module 2 EEPROM (optional) */
-		I2C_BOARD_INFO("24c32", 0x55),
+		I2C_BOARD_INFO("24c32", CP500_EEPROM_ADDR + 5),
 	},
 	{	/* extension module 3 EEPROM (optional) */
-		I2C_BOARD_INFO("24c32", 0x56),
+		I2C_BOARD_INFO("24c32", CP500_EEPROM_ADDR + 6),
+	}
+};
+
+/* SPI devices */
+static struct mtd_partition cp500_partitions[] = {
+	{
+		.name       = "system-flash-parts",
+		.size       = MTDPART_SIZ_FULL,
+		.offset     = 0,
+		.mask_flags = 0
+	}
+};
+static const struct flash_platform_data cp500_w25q32 = {
+	.type     = "w25q32",
+	.name     = "system-flash",
+	.parts    = cp500_partitions,
+	.nr_parts = ARRAY_SIZE(cp500_partitions),
+};
+static const struct flash_platform_data cp500_m25p16 = {
+	.type     = "m25p16",
+	.name     = "system-flash",
+	.parts    = cp500_partitions,
+	.nr_parts = ARRAY_SIZE(cp500_partitions),
+};
+static struct spi_board_info cp500_spi_info[] = {
+	{       /* system FPGA configuration bitstream flash */
+		.modalias      = "m25p80",
+		.platform_data = &cp500_m25p16,
+		.max_speed_hz  = CP500_FLASH_HZ,
+		.chip_select   = 0,
+		.mode          = SPI_MODE_3,
+	}, {    /* LAN9252 EtherCAT slave controller */
+		.modalias      = "lan9252",
+		.platform_data = NULL,
+		.max_speed_hz  = CP500_LAN9252_HZ,
+		.chip_select   = 1,
+		.mode          = SPI_MODE_3,
 	}
 };
 
@@ -269,6 +328,125 @@ static int cp500_register_i2c(struct cp500 *cp500)
 	return 0;
 }
 
+static void cp500_spi_release(struct device *dev)
+{
+	struct keba_spi_auxdev *spi =
+		container_of(dev, struct keba_spi_auxdev, auxdev.dev);
+
+	kfree(spi);
+}
+
+static int cp500_register_spi(struct cp500 *cp500, u8 esc_type)
+{
+	int info_size;
+	int ret;
+
+	cp500->spi = kzalloc(sizeof(*cp500->spi), GFP_KERNEL);
+	if (!cp500->spi)
+		return -ENOMEM;
+
+	if (CP500_IS_CP035(cp500))
+		cp500_spi_info[0].platform_data = &cp500_w25q32;
+	if (esc_type == CP500_EEPROM_ESC_LAN9252)
+		info_size = ARRAY_SIZE(cp500_spi_info);
+	else
+		info_size = ARRAY_SIZE(cp500_spi_info) - 1;
+
+	cp500->spi->auxdev.name = "spi";
+	cp500->spi->auxdev.id = 0;
+	cp500->spi->auxdev.dev.release = cp500_spi_release;
+	cp500->spi->auxdev.dev.parent = &cp500->pci_dev->dev;
+	cp500->spi->io = (struct resource) {
+		 /* SPI register area */
+		 .start = (resource_size_t) cp500->sys_hwbase +
+			  cp500->devs->spi.offset,
+		 .end   = (resource_size_t) cp500->sys_hwbase +
+			  cp500->devs->spi.offset +
+			  cp500->devs->spi.size - 1,
+		 .flags = IORESOURCE_MEM,
+	};
+	cp500->spi->info_size = info_size;
+	cp500->spi->info = cp500_spi_info;
+
+	ret = auxiliary_device_init(&cp500->spi->auxdev);
+	if (ret) {
+		kfree(cp500->spi);
+		cp500->spi = NULL;
+
+		return ret;
+	}
+	ret = __auxiliary_device_add(&cp500->spi->auxdev, "keba");
+	if (ret) {
+		auxiliary_device_uninit(&cp500->spi->auxdev);
+		cp500->spi = NULL;
+
+		return ret;
+	}
+
+	return 0;
+}
+
+static int cp500_nvmem_match(struct device *dev, const void *data)
+{
+	const struct cp500 *cp500 = data;
+	struct i2c_client *client;
+
+	/* match only CPU EEPROM below the cp500 device */
+	dev = dev->parent;
+	client = i2c_verify_client(dev);
+	if (!client || client->addr != CP500_EEPROM_ADDR)
+		return 0;
+	while ((dev = dev->parent))
+		if (dev == &cp500->pci_dev->dev)
+			return 1;
+
+	return 0;
+}
+
+static int cp500_nvmem(struct notifier_block *nb, unsigned long action,
+		       void *data)
+{
+	struct nvmem_device *nvmem;
+	struct cp500 *cp500;
+	struct device *dev;
+	int notified;
+	u8 esc_type;
+	int ret;
+
+	if (action != NVMEM_ADD)
+		return NOTIFY_DONE;
+	cp500 = container_of(nb, struct cp500, nvmem_notifier);
+	dev = &cp500->pci_dev->dev;
+
+	/* process CPU EEPROM content only once */
+	notified = atomic_read(&cp500->nvmem_notified);
+	if (notified)
+		return NOTIFY_DONE;
+	nvmem = nvmem_device_find(cp500, cp500_nvmem_match);
+	if (IS_ERR_OR_NULL(nvmem))
+		return NOTIFY_DONE;
+	if (!atomic_try_cmpxchg_relaxed(&cp500->nvmem_notified, &notified, 1)) {
+		nvmem_device_put(nvmem);
+
+		return NOTIFY_DONE;
+	}
+
+	ret = nvmem_device_read(nvmem, CP500_EEPROM_DA_OFFSET, sizeof(esc_type),
+				(void *)&esc_type);
+	nvmem_device_put(nvmem);
+	if (ret != sizeof(esc_type)) {
+		dev_warn(dev, "Failed to read device assembly!\n");
+
+		return NOTIFY_DONE;
+	}
+	esc_type &= CP500_EEPROM_DA_ESC_TYPE_MASK;
+
+	if (cp500_register_spi(cp500, esc_type))
+		dev_warn(dev, "Failed to register SPI!\n");
+
+	return NOTIFY_OK;
+}
+
 static void cp500_register_auxiliary_devs(struct cp500 *cp500)
 {
 	struct device *dev = &cp500->pci_dev->dev;
@@ -285,7 +463,10 @@ static void cp500_unregister_dev(struct auxiliary_device *auxdev)
 
 static void cp500_unregister_auxiliary_devs(struct cp500 *cp500)
 {
-
+	if (cp500->spi) {
+		cp500_unregister_dev(&cp500->spi->auxdev);
+		cp500->spi = NULL;
+	}
 	if (cp500->i2c) {
 		cp500_unregister_dev(&cp500->i2c->auxdev);
 		cp500->i2c = NULL;
@@ -396,15 +577,21 @@ static int cp500_probe(struct pci_dev *pci_dev, const struct pci_device_id *id)
 
 	pci_set_drvdata(pci_dev, cp500);
 
+	cp500->nvmem_notifier.notifier_call = cp500_nvmem;
+	ret = nvmem_register_notifier(&cp500->nvmem_notifier);
+	if (ret != 0)
+		goto out_free_irq;
 
 	ret = cp500_enable(cp500);
 	if (ret != 0)
-		goto out_free_irq;
+		goto out_unregister_nvmem;
 
 	cp500_register_auxiliary_devs(cp500);
 
 	return 0;
 
+out_unregister_nvmem:
+	nvmem_unregister_notifier(&cp500->nvmem_notifier);
 out_free_irq:
 	pci_free_irq_vectors(pci_dev);
 out_disable:
@@ -422,6 +609,8 @@ static void cp500_remove(struct pci_dev *pci_dev)
 
 	cp500_disable(cp500);
 
+	nvmem_unregister_notifier(&cp500->nvmem_notifier);
+
 	pci_set_drvdata(pci_dev, 0);
 
 	pci_free_irq_vectors(pci_dev);
diff --git a/include/linux/misc/keba.h b/include/linux/misc/keba.h
index 323b31a847c5..1bd5409c6f6f 100644
--- a/include/linux/misc/keba.h
+++ b/include/linux/misc/keba.h
@@ -7,6 +7,7 @@
 #include <linux/auxiliary_bus.h>
 
 struct i2c_board_info;
+struct spi_board_info;
 
 /**
  * struct keba_i2c_auxdev - KEBA I2C auxiliary device
@@ -22,4 +23,18 @@ struct keba_i2c_auxdev {
 	struct i2c_board_info *info;
 };
 
+/**
+ * struct keba_spi_auxdev - KEBA SPI auxiliary device
+ * @auxdev: auxiliary device object
+ * @io: address range of SPI controller IO memory
+ * @info_size: number of SPI devices to be probed
+ * @info: SPI devices to be probed
+ */
+struct keba_spi_auxdev {
+	struct auxiliary_device auxdev;
+	struct resource io;
+	int info_size;
+	struct spi_board_info *info;
+};
+
 #endif /* _LINUX_MISC_KEBA_H */
-- 
cgit v1.2.3


From f965d315bcbd65adfe5e3c161e46b5dc0a463f68 Mon Sep 17 00:00:00 2001
From: Gerhard Engleder <eg@keba.com>
Date: Fri, 11 Oct 2024 21:12:55 +0200
Subject: misc: keba: Add fan device

Add support for the fan auxiliary device. This enables monitoring of the
fan.

Signed-off-by: Gerhard Engleder <eg@keba.com>
Link: https://lore.kernel.org/r/20241011191257.19702-7-gerhard@engleder-embedded.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/keba/cp500.c | 83 ++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/misc/keba.h | 10 ++++++
 2 files changed, 84 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/keba/cp500.c b/drivers/misc/keba/cp500.c
index 3cf99eaf45c4..ae3ed1cece32 100644
--- a/drivers/misc/keba/cp500.c
+++ b/drivers/misc/keba/cp500.c
@@ -32,6 +32,7 @@
 /* BAR 0 registers */
 #define CP500_VERSION_REG	0x00
 #define CP500_RECONFIG_REG	0x11	/* upper 8-bits of STARTUP register */
+#define CP500_PRESENT_REG	0x20
 #define CP500_AXI_REG		0x40
 
 /* Bits in BUILD_REG */
@@ -40,6 +41,9 @@
 /* Bits in RECONFIG_REG */
 #define CP500_RECFG_REQ		0x01	/* reconfigure FPGA on next reset */
 
+/* Bits in PRESENT_REG */
+#define CP500_PRESENT_FAN0	0x01
+
 /* MSIX */
 #define CP500_AXI_MSIX		3
 #define CP500_NUM_MSIX		8
@@ -77,27 +81,31 @@ struct cp500_devs {
 	struct cp500_dev_info startup;
 	struct cp500_dev_info spi;
 	struct cp500_dev_info i2c;
+	struct cp500_dev_info fan;
 };
 
 /* list of devices within FPGA of CP035 family (CP035, CP056, CP057) */
 static struct cp500_devs cp035_devices = {
-	.startup = { 0x0000, SZ_4K },
-	.spi	 = { 0x1000, SZ_4K },
-	.i2c     = { 0x4000, SZ_4K },
+	.startup   = { 0x0000, SZ_4K },
+	.spi       = { 0x1000, SZ_4K },
+	.i2c       = { 0x4000, SZ_4K },
+	.fan       = { 0x9000, SZ_4K },
 };
 
 /* list of devices within FPGA of CP505 family (CP503, CP505, CP507) */
 static struct cp500_devs cp505_devices = {
-	.startup = { 0x0000, SZ_4K },
-	.spi     = { 0x4000, SZ_4K },
-	.i2c     = { 0x5000, SZ_4K },
+	.startup   = { 0x0000, SZ_4K },
+	.spi       = { 0x4000, SZ_4K },
+	.i2c       = { 0x5000, SZ_4K },
+	.fan       = { 0x9000, SZ_4K },
 };
 
 /* list of devices within FPGA of CP520 family (CP520, CP530) */
 static struct cp500_devs cp520_devices = {
-	.startup = { 0x0000, SZ_4K },
-	.spi     = { 0x4000, SZ_4K },
-	.i2c     = { 0x5000, SZ_4K },
+	.startup   = { 0x0000, SZ_4K },
+	.spi       = { 0x4000, SZ_4K },
+	.i2c       = { 0x5000, SZ_4K },
+	.fan       = { 0x8000, SZ_4K },
 };
 
 struct cp500_nvmem {
@@ -121,6 +129,7 @@ struct cp500 {
 	resource_size_t sys_hwbase;
 	struct keba_spi_auxdev *spi;
 	struct keba_i2c_auxdev *i2c;
+	struct keba_fan_auxdev *fan;
 
 	/* ECM EtherCAT BAR */
 	resource_size_t ecm_hwbase;
@@ -400,6 +409,54 @@ static int cp500_register_spi(struct cp500 *cp500, u8 esc_type)
 	return 0;
 }
 
+static void cp500_fan_release(struct device *dev)
+{
+	struct keba_fan_auxdev *fan =
+		container_of(dev, struct keba_fan_auxdev, auxdev.dev);
+
+	kfree(fan);
+}
+
+static int cp500_register_fan(struct cp500 *cp500)
+{
+	int ret;
+
+	cp500->fan = kzalloc(sizeof(*cp500->fan), GFP_KERNEL);
+	if (!cp500->fan)
+		return -ENOMEM;
+
+	cp500->fan->auxdev.name = "fan";
+	cp500->fan->auxdev.id = 0;
+	cp500->fan->auxdev.dev.release = cp500_fan_release;
+	cp500->fan->auxdev.dev.parent = &cp500->pci_dev->dev;
+	cp500->fan->io = (struct resource) {
+		 /* fan register area */
+		 .start = (resource_size_t) cp500->sys_hwbase +
+			  cp500->devs->fan.offset,
+		 .end   = (resource_size_t) cp500->sys_hwbase +
+			  cp500->devs->fan.offset +
+			  cp500->devs->fan.size - 1,
+		 .flags = IORESOURCE_MEM,
+	};
+
+	ret = auxiliary_device_init(&cp500->fan->auxdev);
+	if (ret) {
+		kfree(cp500->fan);
+		cp500->fan = NULL;
+
+		return ret;
+	}
+	ret = __auxiliary_device_add(&cp500->fan->auxdev, "keba");
+	if (ret) {
+		auxiliary_device_uninit(&cp500->fan->auxdev);
+		cp500->fan = NULL;
+
+		return ret;
+	}
+
+	return 0;
+}
+
 static int cp500_nvmem_read(void *priv, unsigned int offset, void *val,
 			    size_t bytes)
 {
@@ -549,9 +606,13 @@ static int cp500_nvmem(struct notifier_block *nb, unsigned long action,
 static void cp500_register_auxiliary_devs(struct cp500 *cp500)
 {
 	struct device *dev = &cp500->pci_dev->dev;
+	u8 present = ioread8(cp500->system_startup_addr + CP500_PRESENT_REG);
 
 	if (cp500_register_i2c(cp500))
 		dev_warn(dev, "Failed to register I2C!\n");
+	if (present & CP500_PRESENT_FAN0)
+		if (cp500_register_fan(cp500))
+			dev_warn(dev, "Failed to register fan!\n");
 }
 
 static void cp500_unregister_dev(struct auxiliary_device *auxdev)
@@ -570,6 +631,10 @@ static void cp500_unregister_auxiliary_devs(struct cp500 *cp500)
 		cp500_unregister_dev(&cp500->i2c->auxdev);
 		cp500->i2c = NULL;
 	}
+	if (cp500->fan) {
+		cp500_unregister_dev(&cp500->fan->auxdev);
+		cp500->fan = NULL;
+	}
 }
 
 static irqreturn_t cp500_axi_handler(int irq, void *dev)
diff --git a/include/linux/misc/keba.h b/include/linux/misc/keba.h
index 1bd5409c6f6f..451777acc262 100644
--- a/include/linux/misc/keba.h
+++ b/include/linux/misc/keba.h
@@ -37,4 +37,14 @@ struct keba_spi_auxdev {
 	struct spi_board_info *info;
 };
 
+/**
+ * struct keba_fan_auxdev - KEBA fan auxiliary device
+ * @auxdev: auxiliary device object
+ * @io: address range of fan controller IO memory
+ */
+struct keba_fan_auxdev {
+	struct auxiliary_device auxdev;
+	struct resource io;
+};
+
 #endif /* _LINUX_MISC_KEBA_H */
-- 
cgit v1.2.3


From ca7b844b91920573835ad11daaa30630ce112fe1 Mon Sep 17 00:00:00 2001
From: Gerhard Engleder <eg@keba.com>
Date: Fri, 11 Oct 2024 21:12:56 +0200
Subject: misc: keba: Add battery device

Add support for the battery auxiliary device. This enables monitoring of
the battery.

Signed-off-by: Gerhard Engleder <eg@keba.com>
Link: https://lore.kernel.org/r/20241011191257.19702-8-gerhard@engleder-embedded.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/keba/cp500.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/misc/keba.h | 10 ++++++++
 2 files changed, 69 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/misc/keba/cp500.c b/drivers/misc/keba/cp500.c
index ae3ed1cece32..afd4d7c06cee 100644
--- a/drivers/misc/keba/cp500.c
+++ b/drivers/misc/keba/cp500.c
@@ -82,6 +82,7 @@ struct cp500_devs {
 	struct cp500_dev_info spi;
 	struct cp500_dev_info i2c;
 	struct cp500_dev_info fan;
+	struct cp500_dev_info batt;
 };
 
 /* list of devices within FPGA of CP035 family (CP035, CP056, CP057) */
@@ -90,6 +91,7 @@ static struct cp500_devs cp035_devices = {
 	.spi       = { 0x1000, SZ_4K },
 	.i2c       = { 0x4000, SZ_4K },
 	.fan       = { 0x9000, SZ_4K },
+	.batt      = { 0xA000, SZ_4K },
 };
 
 /* list of devices within FPGA of CP505 family (CP503, CP505, CP507) */
@@ -98,6 +100,7 @@ static struct cp500_devs cp505_devices = {
 	.spi       = { 0x4000, SZ_4K },
 	.i2c       = { 0x5000, SZ_4K },
 	.fan       = { 0x9000, SZ_4K },
+	.batt      = { 0xA000, SZ_4K },
 };
 
 /* list of devices within FPGA of CP520 family (CP520, CP530) */
@@ -106,6 +109,7 @@ static struct cp500_devs cp520_devices = {
 	.spi       = { 0x4000, SZ_4K },
 	.i2c       = { 0x5000, SZ_4K },
 	.fan       = { 0x8000, SZ_4K },
+	.batt      = { 0x9000, SZ_4K },
 };
 
 struct cp500_nvmem {
@@ -130,6 +134,7 @@ struct cp500 {
 	struct keba_spi_auxdev *spi;
 	struct keba_i2c_auxdev *i2c;
 	struct keba_fan_auxdev *fan;
+	struct keba_batt_auxdev *batt;
 
 	/* ECM EtherCAT BAR */
 	resource_size_t ecm_hwbase;
@@ -457,6 +462,54 @@ static int cp500_register_fan(struct cp500 *cp500)
 	return 0;
 }
 
+static void cp500_batt_release(struct device *dev)
+{
+	struct keba_batt_auxdev *fan =
+		container_of(dev, struct keba_batt_auxdev, auxdev.dev);
+
+	kfree(fan);
+}
+
+static int cp500_register_batt(struct cp500 *cp500)
+{
+	int ret;
+
+	cp500->batt = kzalloc(sizeof(*cp500->batt), GFP_KERNEL);
+	if (!cp500->batt)
+		return -ENOMEM;
+
+	cp500->batt->auxdev.name = "batt";
+	cp500->batt->auxdev.id = 0;
+	cp500->batt->auxdev.dev.release = cp500_batt_release;
+	cp500->batt->auxdev.dev.parent = &cp500->pci_dev->dev;
+	cp500->batt->io = (struct resource) {
+		 /* battery register area */
+		 .start = (resource_size_t) cp500->sys_hwbase +
+			  cp500->devs->batt.offset,
+		 .end   = (resource_size_t) cp500->sys_hwbase +
+			  cp500->devs->batt.offset +
+			  cp500->devs->batt.size - 1,
+		 .flags = IORESOURCE_MEM,
+	};
+
+	ret = auxiliary_device_init(&cp500->batt->auxdev);
+	if (ret) {
+		kfree(cp500->batt);
+		cp500->batt = NULL;
+
+		return ret;
+	}
+	ret = __auxiliary_device_add(&cp500->batt->auxdev, "keba");
+	if (ret) {
+		auxiliary_device_uninit(&cp500->batt->auxdev);
+		cp500->batt = NULL;
+
+		return ret;
+	}
+
+	return 0;
+}
+
 static int cp500_nvmem_read(void *priv, unsigned int offset, void *val,
 			    size_t bytes)
 {
@@ -613,6 +666,8 @@ static void cp500_register_auxiliary_devs(struct cp500 *cp500)
 	if (present & CP500_PRESENT_FAN0)
 		if (cp500_register_fan(cp500))
 			dev_warn(dev, "Failed to register fan!\n");
+	if (cp500_register_batt(cp500))
+		dev_warn(dev, "Failed to register battery!\n");
 }
 
 static void cp500_unregister_dev(struct auxiliary_device *auxdev)
@@ -635,6 +690,10 @@ static void cp500_unregister_auxiliary_devs(struct cp500 *cp500)
 		cp500_unregister_dev(&cp500->fan->auxdev);
 		cp500->fan = NULL;
 	}
+	if (cp500->batt) {
+		cp500_unregister_dev(&cp500->batt->auxdev);
+		cp500->batt = NULL;
+	}
 }
 
 static irqreturn_t cp500_axi_handler(int irq, void *dev)
diff --git a/include/linux/misc/keba.h b/include/linux/misc/keba.h
index 451777acc262..ca52716f8437 100644
--- a/include/linux/misc/keba.h
+++ b/include/linux/misc/keba.h
@@ -47,4 +47,14 @@ struct keba_fan_auxdev {
 	struct resource io;
 };
 
+/**
+ * struct keba_batt_auxdev - KEBA battery auxiliary device
+ * @auxdev: auxiliary device object
+ * @io: address range of battery controller IO memory
+ */
+struct keba_batt_auxdev {
+	struct auxiliary_device auxdev;
+	struct resource io;
+};
+
 #endif /* _LINUX_MISC_KEBA_H */
-- 
cgit v1.2.3


From a27b406a49225a17849c86221a32f2d598702719 Mon Sep 17 00:00:00 2001
From: Gerhard Engleder <eg@keba.com>
Date: Fri, 11 Oct 2024 21:12:57 +0200
Subject: misc: keba: Add UART devices

Add support for the UART auxiliary devices. This enables access to up to
3 different UARTs, which are implemented in the FPGA.

Signed-off-by: Gerhard Engleder <eg@keba.com>
Link: https://lore.kernel.org/r/20241011191257.19702-9-gerhard@engleder-embedded.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/keba/cp500.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/misc/keba.h |  12 ++++++
 2 files changed, 116 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/misc/keba/cp500.c b/drivers/misc/keba/cp500.c
index afd4d7c06cee..255d3022dae8 100644
--- a/drivers/misc/keba/cp500.c
+++ b/drivers/misc/keba/cp500.c
@@ -46,6 +46,9 @@
 
 /* MSIX */
 #define CP500_AXI_MSIX		3
+#define CP500_RFB_UART_MSIX	4
+#define CP500_DEBUG_UART_MSIX	5
+#define CP500_SI1_UART_MSIX	6
 #define CP500_NUM_MSIX		8
 #define CP500_NUM_MSIX_NO_MMI	2
 #define CP500_NUM_MSIX_NO_AXI	3
@@ -75,6 +78,7 @@
 struct cp500_dev_info {
 	off_t offset;
 	size_t size;
+	unsigned int msix;
 };
 
 struct cp500_devs {
@@ -83,6 +87,9 @@ struct cp500_devs {
 	struct cp500_dev_info i2c;
 	struct cp500_dev_info fan;
 	struct cp500_dev_info batt;
+	struct cp500_dev_info uart0_rfb;
+	struct cp500_dev_info uart1_dbg;
+	struct cp500_dev_info uart2_si1;
 };
 
 /* list of devices within FPGA of CP035 family (CP035, CP056, CP057) */
@@ -92,6 +99,8 @@ static struct cp500_devs cp035_devices = {
 	.i2c       = { 0x4000, SZ_4K },
 	.fan       = { 0x9000, SZ_4K },
 	.batt      = { 0xA000, SZ_4K },
+	.uart0_rfb = { 0xB000, SZ_4K, CP500_RFB_UART_MSIX },
+	.uart2_si1 = { 0xD000, SZ_4K, CP500_SI1_UART_MSIX },
 };
 
 /* list of devices within FPGA of CP505 family (CP503, CP505, CP507) */
@@ -101,6 +110,8 @@ static struct cp500_devs cp505_devices = {
 	.i2c       = { 0x5000, SZ_4K },
 	.fan       = { 0x9000, SZ_4K },
 	.batt      = { 0xA000, SZ_4K },
+	.uart0_rfb = { 0xB000, SZ_4K, CP500_RFB_UART_MSIX },
+	.uart2_si1 = { 0xD000, SZ_4K, CP500_SI1_UART_MSIX },
 };
 
 /* list of devices within FPGA of CP520 family (CP520, CP530) */
@@ -110,6 +121,8 @@ static struct cp500_devs cp520_devices = {
 	.i2c       = { 0x5000, SZ_4K },
 	.fan       = { 0x8000, SZ_4K },
 	.batt      = { 0x9000, SZ_4K },
+	.uart0_rfb = { 0xC000, SZ_4K, CP500_RFB_UART_MSIX },
+	.uart1_dbg = { 0xD000, SZ_4K, CP500_DEBUG_UART_MSIX },
 };
 
 struct cp500_nvmem {
@@ -135,6 +148,9 @@ struct cp500 {
 	struct keba_i2c_auxdev *i2c;
 	struct keba_fan_auxdev *fan;
 	struct keba_batt_auxdev *batt;
+	struct keba_uart_auxdev *uart0_rfb;
+	struct keba_uart_auxdev *uart1_dbg;
+	struct keba_uart_auxdev *uart2_si1;
 
 	/* ECM EtherCAT BAR */
 	resource_size_t ecm_hwbase;
@@ -510,6 +526,55 @@ static int cp500_register_batt(struct cp500 *cp500)
 	return 0;
 }
 
+static void cp500_uart_release(struct device *dev)
+{
+	struct keba_uart_auxdev *uart =
+		container_of(dev, struct keba_uart_auxdev, auxdev.dev);
+
+	kfree(uart);
+}
+
+static int cp500_register_uart(struct cp500 *cp500,
+			       struct keba_uart_auxdev **uart, const char *name,
+			       struct cp500_dev_info *info, unsigned int irq)
+{
+	int ret;
+
+	*uart = kzalloc(sizeof(**uart), GFP_KERNEL);
+	if (!*uart)
+		return -ENOMEM;
+
+	(*uart)->auxdev.name = name;
+	(*uart)->auxdev.id = 0;
+	(*uart)->auxdev.dev.release = cp500_uart_release;
+	(*uart)->auxdev.dev.parent = &cp500->pci_dev->dev;
+	(*uart)->io = (struct resource) {
+		 /* UART register area */
+		 .start = (resource_size_t) cp500->sys_hwbase + info->offset,
+		 .end   = (resource_size_t) cp500->sys_hwbase + info->offset +
+			  info->size - 1,
+		 .flags = IORESOURCE_MEM,
+	};
+	(*uart)->irq = irq;
+
+	ret = auxiliary_device_init(&(*uart)->auxdev);
+	if (ret) {
+		kfree(*uart);
+		*uart = NULL;
+
+		return ret;
+	}
+	ret = __auxiliary_device_add(&(*uart)->auxdev, "keba");
+	if (ret) {
+		auxiliary_device_uninit(&(*uart)->auxdev);
+		*uart = NULL;
+
+		return ret;
+	}
+
+	return 0;
+}
+
 static int cp500_nvmem_read(void *priv, unsigned int offset, void *val,
 			    size_t bytes)
 {
@@ -668,6 +733,33 @@ static void cp500_register_auxiliary_devs(struct cp500 *cp500)
 			dev_warn(dev, "Failed to register fan!\n");
 	if (cp500_register_batt(cp500))
 		dev_warn(dev, "Failed to register battery!\n");
+	if (cp500->devs->uart0_rfb.size &&
+	    cp500->devs->uart0_rfb.msix < cp500->msix_num) {
+		int irq = pci_irq_vector(cp500->pci_dev,
+					 cp500->devs->uart0_rfb.msix);
+
+		if (cp500_register_uart(cp500, &cp500->uart0_rfb, "rs485-uart",
+					&cp500->devs->uart0_rfb, irq))
+			dev_warn(dev, "Failed to register RFB UART!\n");
+	}
+	if (cp500->devs->uart1_dbg.size &&
+	    cp500->devs->uart1_dbg.msix < cp500->msix_num) {
+		int irq = pci_irq_vector(cp500->pci_dev,
+					 cp500->devs->uart1_dbg.msix);
+
+		if (cp500_register_uart(cp500, &cp500->uart1_dbg, "rs232-uart",
+					&cp500->devs->uart1_dbg, irq))
+			dev_warn(dev, "Failed to register debug UART!\n");
+	}
+	if (cp500->devs->uart2_si1.size &&
+	    cp500->devs->uart2_si1.msix < cp500->msix_num) {
+		int irq = pci_irq_vector(cp500->pci_dev,
+					 cp500->devs->uart2_si1.msix);
+
+		if (cp500_register_uart(cp500, &cp500->uart2_si1, "uart",
+					&cp500->devs->uart2_si1, irq))
+			dev_warn(dev, "Failed to register SI1 UART!\n");
+	}
 }
 
 static void cp500_unregister_dev(struct auxiliary_device *auxdev)
@@ -694,6 +786,18 @@ static void cp500_unregister_auxiliary_devs(struct cp500 *cp500)
 		cp500_unregister_dev(&cp500->batt->auxdev);
 		cp500->batt = NULL;
 	}
+	if (cp500->uart0_rfb) {
+		cp500_unregister_dev(&cp500->uart0_rfb->auxdev);
+		cp500->uart0_rfb = NULL;
+	}
+	if (cp500->uart1_dbg) {
+		cp500_unregister_dev(&cp500->uart1_dbg->auxdev);
+		cp500->uart1_dbg = NULL;
+	}
+	if (cp500->uart2_si1) {
+		cp500_unregister_dev(&cp500->uart2_si1->auxdev);
+		cp500->uart2_si1 = NULL;
+	}
 }
 
 static irqreturn_t cp500_axi_handler(int irq, void *dev)
diff --git a/include/linux/misc/keba.h b/include/linux/misc/keba.h
index ca52716f8437..a81d6fa70851 100644
--- a/include/linux/misc/keba.h
+++ b/include/linux/misc/keba.h
@@ -57,4 +57,16 @@ struct keba_batt_auxdev {
 	struct resource io;
 };
 
+/**
+ * struct keba_uart_auxdev - KEBA UART auxiliary device
+ * @auxdev: auxiliary device object
+ * @io: address range of UART controller IO memory
+ * @irq: number of UART controller interrupt
+ */
+struct keba_uart_auxdev {
+	struct auxiliary_device auxdev;
+	struct resource io;
+	unsigned int irq;
+};
+
 #endif /* _LINUX_MISC_KEBA_H */
-- 
cgit v1.2.3


From cec78a59abc9b1a06f742cb96479fc0bb1fe4e90 Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Tue, 17 Sep 2024 18:42:56 +0800
Subject: list: Remove duplicated and unused macro list_for_each_reverse

Remove macro list_for_each_reverse due to below reasons:

- it is same as list_for_each_prev.
- it is not used by current kernel tree.

Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Link: https://lore.kernel.org/r/20240917-fix_list-v2-1-d2914665e89f@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/list.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 5f4b0a39cf46..29a375889fb8 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -686,14 +686,6 @@ static inline void list_splice_tail_init(struct list_head *list,
 #define list_for_each(pos, head) \
 	for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next)
 
-/**
- * list_for_each_reverse - iterate backwards over a list
- * @pos:	the &struct list_head to use as a loop cursor.
- * @head:	the head for your list.
- */
-#define list_for_each_reverse(pos, head) \
-	for (pos = (head)->prev; pos != (head); pos = pos->prev)
-
 /**
  * list_for_each_rcu - Iterate over a list in an RCU-safe fashion
  * @pos:	the &struct list_head to use as a loop cursor.
-- 
cgit v1.2.3


From 0ebe74c53b8b62bde7b02415d28e75aa25d6c2e6 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sun, 29 Sep 2024 15:11:12 +0100
Subject: drivers/base: Remove unused auxiliary_find_device

auxiliary_find_device has been unused since commit
1c5de097bea3 ("net/mlx5: Fix mlx5_get_next_dev() peer device matching")

which was the only use since it was originally added.

Remove it.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20240929141112.69824-1-linux@treblig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/auxiliary_bus.rst |  1 -
 drivers/base/auxiliary.c                   | 29 -----------------------------
 include/linux/auxiliary_bus.h              |  4 ----
 3 files changed, 34 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/auxiliary_bus.rst b/Documentation/driver-api/auxiliary_bus.rst
index cec84908fbc0..b236de773e1d 100644
--- a/Documentation/driver-api/auxiliary_bus.rst
+++ b/Documentation/driver-api/auxiliary_bus.rst
@@ -24,7 +24,6 @@ Auxiliary Device Creation
 
 .. kernel-doc:: drivers/base/auxiliary.c
    :identifiers: auxiliary_device_init __auxiliary_device_add
-                 auxiliary_find_device
 
 Auxiliary Device Memory Model and Lifespan
 ------------------------------------------
diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c
index 7823888af4f6..69b7c93613d6 100644
--- a/drivers/base/auxiliary.c
+++ b/drivers/base/auxiliary.c
@@ -335,35 +335,6 @@ int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname)
 }
 EXPORT_SYMBOL_GPL(__auxiliary_device_add);
 
-/**
- * auxiliary_find_device - auxiliary device iterator for locating a particular device.
- * @start: Device to begin with
- * @data: Data to pass to match function
- * @match: Callback function to check device
- *
- * This function returns a reference to a device that is 'found'
- * for later use, as determined by the @match callback.
- *
- * The reference returned should be released with put_device().
- *
- * The callback should return 0 if the device doesn't match and non-zero
- * if it does.  If the callback returns non-zero, this function will
- * return to the caller and not iterate over any more devices.
- */
-struct auxiliary_device *auxiliary_find_device(struct device *start,
-					       const void *data,
-					       device_match_t match)
-{
-	struct device *dev;
-
-	dev = bus_find_device(&auxiliary_bus_type, start, data, match);
-	if (!dev)
-		return NULL;
-
-	return to_auxiliary_dev(dev);
-}
-EXPORT_SYMBOL_GPL(auxiliary_find_device);
-
 /**
  * __auxiliary_driver_register - register a driver for auxiliary bus devices
  * @auxdrv: auxiliary_driver structure
diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h
index 31762324bcc9..65dd7f154374 100644
--- a/include/linux/auxiliary_bus.h
+++ b/include/linux/auxiliary_bus.h
@@ -269,8 +269,4 @@ void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv);
 #define module_auxiliary_driver(__auxiliary_driver) \
 	module_driver(__auxiliary_driver, auxiliary_driver_register, auxiliary_driver_unregister)
 
-struct auxiliary_device *auxiliary_find_device(struct device *start,
-					       const void *data,
-					       device_match_t match);
-
 #endif /* _AUXILIARY_BUS_H_ */
-- 
cgit v1.2.3


From cd068d51594d9635bf6688fc78717572b78bce6a Mon Sep 17 00:00:00 2001
From: Keita Aihara <keita.aihara@sony.com>
Date: Fri, 13 Sep 2024 18:44:17 +0900
Subject: mmc: core: Add SD card quirk for broken poweroff notification

GIGASTONE Gaming Plus microSD cards manufactured on 02/2022 report that
they support poweroff notification and cache, but they are not working
correctly.

Flush Cache bit never gets cleared in sd_flush_cache() and Poweroff
Notification Ready bit also never gets set to 1 within 1 second from the
end of busy of CMD49 in sd_poweroff_notify().

This leads to I/O error and runtime PM error state.

I observed that the same card manufactured on 01/2024 works as expected.

This problem seems similar to the Kingston cards fixed with
commit c467c8f08185 ("mmc: Add MMC_QUIRK_BROKEN_SD_CACHE for Kingston
Canvas Go Plus from 11/2019") and should be handled using quirks.

CID for the problematic card is here.
12345641535443002000000145016200

Manufacturer ID is 0x12 and defined as CID_MANFID_GIGASTONE as of now,
but would like comments on what naming is appropriate because MID list
is not public and not sure it's right.

Signed-off-by: Keita Aihara <keita.aihara@sony.com>
Link: https://lore.kernel.org/r/20240913094417.GA4191647@sony.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/card.h   | 7 +++++++
 drivers/mmc/core/quirks.h | 9 +++++++++
 drivers/mmc/core/sd.c     | 2 +-
 include/linux/mmc/card.h  | 1 +
 4 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h
index b7754a1b8d97..8476754b1b17 100644
--- a/drivers/mmc/core/card.h
+++ b/drivers/mmc/core/card.h
@@ -82,6 +82,7 @@ struct mmc_fixup {
 #define CID_MANFID_SANDISK_SD   0x3
 #define CID_MANFID_ATP          0x9
 #define CID_MANFID_TOSHIBA      0x11
+#define CID_MANFID_GIGASTONE    0x12
 #define CID_MANFID_MICRON       0x13
 #define CID_MANFID_SAMSUNG      0x15
 #define CID_MANFID_APACER       0x27
@@ -284,4 +285,10 @@ static inline int mmc_card_broken_cache_flush(const struct mmc_card *c)
 {
 	return c->quirks & MMC_QUIRK_BROKEN_CACHE_FLUSH;
 }
+
+static inline int mmc_card_broken_sd_poweroff_notify(const struct mmc_card *c)
+{
+	return c->quirks & MMC_QUIRK_BROKEN_SD_POWEROFF_NOTIFY;
+}
+
 #endif
diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h
index 92905fc46436..89b512905be1 100644
--- a/drivers/mmc/core/quirks.h
+++ b/drivers/mmc/core/quirks.h
@@ -25,6 +25,15 @@ static const struct mmc_fixup __maybe_unused mmc_sd_fixups[] = {
 		   0, -1ull, SDIO_ANY_ID, SDIO_ANY_ID, add_quirk_sd,
 		   MMC_QUIRK_BROKEN_SD_CACHE, EXT_CSD_REV_ANY),
 
+	/*
+	 * GIGASTONE Gaming Plus microSD cards manufactured on 02/2022 never
+	 * clear Flush Cache bit and set Poweroff Notification Ready bit.
+	 */
+	_FIXUP_EXT("ASTC", CID_MANFID_GIGASTONE, 0x3456, 2022, 2,
+		   0, -1ull, SDIO_ANY_ID, SDIO_ANY_ID, add_quirk_sd,
+		   MMC_QUIRK_BROKEN_SD_CACHE | MMC_QUIRK_BROKEN_SD_POWEROFF_NOTIFY,
+		   EXT_CSD_REV_ANY),
+
 	END_FIXUP
 };
 
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 12fe282bea77..9e62cb7055fe 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -1107,7 +1107,7 @@ static int sd_parse_ext_reg_power(struct mmc_card *card, u8 fno, u8 page,
 	card->ext_power.rev = reg_buf[0] & 0xf;
 
 	/* Power Off Notification support at bit 4. */
-	if (reg_buf[1] & BIT(4))
+	if ((reg_buf[1] & BIT(4)) && !mmc_card_broken_sd_poweroff_notify(card))
 		card->ext_power.feature_support |= SD_EXT_POWER_OFF_NOTIFY;
 
 	/* Power Sustenance support at bit 5. */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index f34407cc2788..543446392776 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -294,6 +294,7 @@ struct mmc_card {
 #define MMC_QUIRK_BROKEN_SD_DISCARD	(1<<14)	/* Disable broken SD discard support */
 #define MMC_QUIRK_BROKEN_SD_CACHE	(1<<15)	/* Disable broken SD cache support */
 #define MMC_QUIRK_BROKEN_CACHE_FLUSH	(1<<16)	/* Don't flush cache until the write has occurred */
+#define MMC_QUIRK_BROKEN_SD_POWEROFF_NOTIFY	(1<<17) /* Disable broken SD poweroff notify support */
 
 	bool			written_flag;	/* Indicates eMMC has been written since power on */
 	bool			reenable_cmdq;	/* Re-enable Command Queue */
-- 
cgit v1.2.3


From fce2ce78af1e14dc1316aaddb5b3308be05cf452 Mon Sep 17 00:00:00 2001
From: Avri Altman <avri.altman@wdc.com>
Date: Sun, 6 Oct 2024 08:11:39 +0300
Subject: mmc: sd: SDUC Support Recognition

Ultra Capacity SD cards (SDUC) was already introduced in SD7.0.  Those
cards support capacity larger than 2TB and up to including 128TB.

ACMD41 was extended to support the host-card handshake during
initialization.  The card expects that the HCS & HO2T bits to be set in
the command argument, and sets the applicable bits in the R3 returned
response.  On the contrary, if a SDUC card is inserted to a
non-supporting host, it will never respond to this ACMD41 until
eventually, the host will timed out and give up.

Also, add SD CSD version 3.0 - designated for SDUC, and properly parse
the csd register as the c_size field got expanded to 28 bits.

Do not enable SDUC for now - leave it to the last patch in the series.

Tested-by: Ricky WU <ricky_wu@realtek.com>
Reviewed-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Avri Altman <avri.altman@wdc.com>
Link: https://lore.kernel.org/r/20241006051148.160278-2-avri.altman@wdc.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/bus.c   |  4 +++-
 drivers/mmc/core/card.h  |  3 +++
 drivers/mmc/core/sd.c    | 28 +++++++++++++++++-----------
 drivers/mmc/core/sd.h    |  2 +-
 drivers/mmc/core/sdio.c  |  2 +-
 include/linux/mmc/card.h |  2 +-
 include/linux/mmc/sd.h   |  1 +
 7 files changed, 27 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index 0ddaee0eae54..30763b342bd3 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -321,7 +321,9 @@ int mmc_add_card(struct mmc_card *card)
 	case MMC_TYPE_SD:
 		type = "SD";
 		if (mmc_card_blockaddr(card)) {
-			if (mmc_card_ext_capacity(card))
+			if (mmc_card_ult_capacity(card))
+				type = "SDUC";
+			else if (mmc_card_ext_capacity(card))
 				type = "SDXC";
 			else
 				type = "SDHC";
diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h
index 8476754b1b17..3205feb1e8ff 100644
--- a/drivers/mmc/core/card.h
+++ b/drivers/mmc/core/card.h
@@ -23,6 +23,7 @@
 #define MMC_CARD_SDXC		(1<<3)		/* card is SDXC */
 #define MMC_CARD_REMOVED	(1<<4)		/* card has been removed */
 #define MMC_STATE_SUSPENDED	(1<<5)		/* card is suspended */
+#define MMC_CARD_SDUC		(1<<6)		/* card is SDUC */
 
 #define mmc_card_present(c)	((c)->state & MMC_STATE_PRESENT)
 #define mmc_card_readonly(c)	((c)->state & MMC_STATE_READONLY)
@@ -30,11 +31,13 @@
 #define mmc_card_ext_capacity(c) ((c)->state & MMC_CARD_SDXC)
 #define mmc_card_removed(c)	((c) && ((c)->state & MMC_CARD_REMOVED))
 #define mmc_card_suspended(c)	((c)->state & MMC_STATE_SUSPENDED)
+#define mmc_card_ult_capacity(c) ((c)->state & MMC_CARD_SDUC)
 
 #define mmc_card_set_present(c)	((c)->state |= MMC_STATE_PRESENT)
 #define mmc_card_set_readonly(c) ((c)->state |= MMC_STATE_READONLY)
 #define mmc_card_set_blockaddr(c) ((c)->state |= MMC_STATE_BLOCKADDR)
 #define mmc_card_set_ext_capacity(c) ((c)->state |= MMC_CARD_SDXC)
+#define mmc_card_set_ult_capacity(c) ((c)->state |= MMC_CARD_SDUC)
 #define mmc_card_set_removed(c) ((c)->state |= MMC_CARD_REMOVED)
 #define mmc_card_set_suspended(c) ((c)->state |= MMC_STATE_SUSPENDED)
 #define mmc_card_clr_suspended(c) ((c)->state &= ~MMC_STATE_SUSPENDED)
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 9e62cb7055fe..63915541c0e4 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -100,7 +100,7 @@ void mmc_decode_cid(struct mmc_card *card)
 /*
  * Given a 128-bit response, decode to our card CSD structure.
  */
-static int mmc_decode_csd(struct mmc_card *card)
+static int mmc_decode_csd(struct mmc_card *card, bool is_sduc)
 {
 	struct mmc_csd *csd = &card->csd;
 	unsigned int e, m, csd_struct;
@@ -144,9 +144,10 @@ static int mmc_decode_csd(struct mmc_card *card)
 			mmc_card_set_readonly(card);
 		break;
 	case 1:
+	case 2:
 		/*
-		 * This is a block-addressed SDHC or SDXC card. Most
-		 * interesting fields are unused and have fixed
+		 * This is a block-addressed SDHC, SDXC or SDUC card.
+		 * Most interesting fields are unused and have fixed
 		 * values. To avoid getting tripped by buggy cards,
 		 * we assume those fixed values ourselves.
 		 */
@@ -159,14 +160,19 @@ static int mmc_decode_csd(struct mmc_card *card)
 		e = unstuff_bits(resp, 96, 3);
 		csd->max_dtr	  = tran_exp[e] * tran_mant[m];
 		csd->cmdclass	  = unstuff_bits(resp, 84, 12);
-		csd->c_size	  = unstuff_bits(resp, 48, 22);
 
-		/* SDXC cards have a minimum C_SIZE of 0x00FFFF */
-		if (csd->c_size >= 0xFFFF)
+		if (csd_struct == 1)
+			m = unstuff_bits(resp, 48, 22);
+		else
+			m = unstuff_bits(resp, 48, 28);
+		csd->c_size = m;
+
+		if (csd->c_size >= 0x400000 && is_sduc)
+			mmc_card_set_ult_capacity(card);
+		else if (csd->c_size >= 0xFFFF)
 			mmc_card_set_ext_capacity(card);
 
-		m = unstuff_bits(resp, 48, 22);
-		csd->capacity     = (1 + m) << 10;
+		csd->capacity     = (1 + (typeof(sector_t))m) << 10;
 
 		csd->read_blkbits = 9;
 		csd->read_partial = 0;
@@ -876,7 +882,7 @@ try_again:
 	return err;
 }
 
-int mmc_sd_get_csd(struct mmc_card *card)
+int mmc_sd_get_csd(struct mmc_card *card, bool is_sduc)
 {
 	int err;
 
@@ -887,7 +893,7 @@ int mmc_sd_get_csd(struct mmc_card *card)
 	if (err)
 		return err;
 
-	err = mmc_decode_csd(card);
+	err = mmc_decode_csd(card, is_sduc);
 	if (err)
 		return err;
 
@@ -1442,7 +1448,7 @@ retry:
 	}
 
 	if (!oldcard) {
-		err = mmc_sd_get_csd(card);
+		err = mmc_sd_get_csd(card, false);
 		if (err)
 			goto free_card;
 
diff --git a/drivers/mmc/core/sd.h b/drivers/mmc/core/sd.h
index fe6dd46927a4..7e8beface2ca 100644
--- a/drivers/mmc/core/sd.h
+++ b/drivers/mmc/core/sd.h
@@ -10,7 +10,7 @@ struct mmc_host;
 struct mmc_card;
 
 int mmc_sd_get_cid(struct mmc_host *host, u32 ocr, u32 *cid, u32 *rocr);
-int mmc_sd_get_csd(struct mmc_card *card);
+int mmc_sd_get_csd(struct mmc_card *card, bool is_sduc);
 void mmc_decode_cid(struct mmc_card *card);
 int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
 	bool reinit);
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 4fb247fde5c0..9566837c9848 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -769,7 +769,7 @@ try_again:
 	 * Read CSD, before selecting the card
 	 */
 	if (!oldcard && mmc_card_sd_combo(card)) {
-		err = mmc_sd_get_csd(card);
+		err = mmc_sd_get_csd(card, false);
 		if (err)
 			goto remove;
 
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 543446392776..eb67d3d5ff5b 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -35,7 +35,7 @@ struct mmc_csd {
 	unsigned int		wp_grp_size;
 	unsigned int		read_blkbits;
 	unsigned int		write_blkbits;
-	unsigned int		capacity;
+	sector_t		capacity;
 	unsigned int		read_partial:1,
 				read_misalign:1,
 				write_partial:1,
diff --git a/include/linux/mmc/sd.h b/include/linux/mmc/sd.h
index 6727576a8755..865cc0ca8543 100644
--- a/include/linux/mmc/sd.h
+++ b/include/linux/mmc/sd.h
@@ -36,6 +36,7 @@
 /* OCR bit definitions */
 #define SD_OCR_S18R		(1 << 24)    /* 1.8V switching request */
 #define SD_ROCR_S18A		SD_OCR_S18R  /* 1.8V switching accepted by card */
+#define SD_OCR_2T		(1 << 27)    /* HO2T/CO2T - SDUC support */
 #define SD_OCR_XPC		(1 << 28)    /* SDXC power control */
 #define SD_OCR_CCS		(1 << 30)    /* Card Capacity Status */
 
-- 
cgit v1.2.3


From 375b535941bea65b37451f0fd398e28bf4f3bdc3 Mon Sep 17 00:00:00 2001
From: Avri Altman <avri.altman@wdc.com>
Date: Sun, 6 Oct 2024 08:11:40 +0300
Subject: mmc: sd: Add Extension memory addressing

SDUC memory addressing spans beyond 2TB and up to 128TB.  Therefore, 38
bits are required to access the entire memory space of all sectors.
Those extra 6 bits are to be carried by CMD22 prior of sending
read/write/erase commands: CMD17, CMD18, CMD24, CMD25, CMD32, and CMD33.

CMD22 will carry the higher order 6 bits, and must precedes any of the
above commands even if it targets sector < 2TB.

No error related to address or length is indicated in CMD22 but rather
in the read/write command itself.

Tested-by: Ricky WU <ricky_wu@realtek.com>
Reviewed-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Avri Altman <avri.altman@wdc.com>
Link: https://lore.kernel.org/r/20241006051148.160278-3-avri.altman@wdc.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/sd_ops.c | 15 +++++++++++++++
 drivers/mmc/core/sd_ops.h |  1 +
 include/linux/mmc/sd.h    |  3 +++
 3 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sd_ops.c b/drivers/mmc/core/sd_ops.c
index f93c392040ae..50d1380e93b8 100644
--- a/drivers/mmc/core/sd_ops.c
+++ b/drivers/mmc/core/sd_ops.c
@@ -16,6 +16,7 @@
 #include <linux/mmc/sd.h>
 
 #include "core.h"
+#include "card.h"
 #include "sd_ops.h"
 #include "mmc_ops.h"
 
@@ -188,6 +189,20 @@ int mmc_send_app_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr)
 	return 0;
 }
 
+int mmc_send_ext_addr(struct mmc_host *host, u32 addr)
+{
+	struct mmc_command cmd = {
+		.opcode = SD_ADDR_EXT,
+		.arg = addr,
+		.flags = MMC_RSP_R1 | MMC_CMD_AC,
+	};
+
+	if (!mmc_card_ult_capacity(host->card))
+		return 0;
+
+	return mmc_wait_for_cmd(host, &cmd, 0);
+}
+
 static int __mmc_send_if_cond(struct mmc_host *host, u32 ocr, u8 pcie_bits,
 			      u32 *resp)
 {
diff --git a/drivers/mmc/core/sd_ops.h b/drivers/mmc/core/sd_ops.h
index 7667fc223b74..fd3f10b9cf86 100644
--- a/drivers/mmc/core/sd_ops.h
+++ b/drivers/mmc/core/sd_ops.h
@@ -21,6 +21,7 @@ int mmc_send_relative_addr(struct mmc_host *host, unsigned int *rca);
 int mmc_app_send_scr(struct mmc_card *card);
 int mmc_app_sd_status(struct mmc_card *card, void *ssr);
 int mmc_app_cmd(struct mmc_host *host, struct mmc_card *card);
+int mmc_send_ext_addr(struct mmc_host *host, u32 addr);
 
 #endif
 
diff --git a/include/linux/mmc/sd.h b/include/linux/mmc/sd.h
index 865cc0ca8543..af5fc70e09a2 100644
--- a/include/linux/mmc/sd.h
+++ b/include/linux/mmc/sd.h
@@ -15,6 +15,9 @@
 #define SD_SEND_IF_COND           8   /* bcr  [11:0] See below   R7  */
 #define SD_SWITCH_VOLTAGE         11  /* ac                      R1  */
 
+/* Class 2 */
+#define SD_ADDR_EXT		 22   /* ac   [5:0]              R1  */
+
   /* class 10 */
 #define SD_SWITCH                 6   /* adtc [31:0] See below   R1  */
 
-- 
cgit v1.2.3


From 403a0293f1c230524e0185b31f69c02a6aed12c7 Mon Sep 17 00:00:00 2001
From: Avri Altman <avri.altman@wdc.com>
Date: Sun, 6 Oct 2024 08:11:42 +0300
Subject: mmc: core: Add open-ended Ext memory addressing

For open-ended read/write - just send CMD22 before issuing the command.

Reviewed-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Avri Altman <avri.altman@wdc.com>
Link: https://lore.kernel.org/r/20241006051148.160278-5-avri.altman@wdc.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/block.c | 5 +++++
 drivers/mmc/core/core.c  | 3 +++
 include/linux/mmc/core.h | 4 ++++
 3 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index f4817ea3017b..c9325febc31a 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -1759,6 +1759,11 @@ static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq,
 		brq->sbc.flags = MMC_RSP_R1 | MMC_CMD_AC;
 		brq->mrq.sbc = &brq->sbc;
 	}
+
+	if (mmc_card_ult_capacity(card)) {
+		brq->cmd.ext_addr = blk_rq_pos(req) >> 32;
+		brq->cmd.has_ext_addr = true;
+	}
 }
 
 #define MMC_MAX_RETRIES		5
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index d6c819dd68ed..a0b2999684b3 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -336,6 +336,9 @@ int mmc_start_request(struct mmc_host *host, struct mmc_request *mrq)
 {
 	int err;
 
+	if (mrq->cmd && mrq->cmd->has_ext_addr)
+		mmc_send_ext_addr(host, mrq->cmd->ext_addr);
+
 	init_completion(&mrq->cmd_completion);
 
 	mmc_retune_hold(host);
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index f0ac2e469b32..a890a71288ef 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -96,6 +96,10 @@ struct mmc_command {
 	unsigned int		busy_timeout;	/* busy detect timeout in ms */
 	struct mmc_data		*data;		/* data segment associated with cmd */
 	struct mmc_request	*mrq;		/* associated request */
+
+	/* for SDUC */
+	bool has_ext_addr;
+	u8 ext_addr;
 };
 
 struct mmc_data {
-- 
cgit v1.2.3


From 79daeb241db7901e4bd53cce9ab046f376a63a4c Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 13 Sep 2024 18:28:16 +0800
Subject: mmc: core: Prepare to support SD UHS-II cards

The SD UHS-II interface was introduced to the SD spec v4.00 several years
ago. The interface is fundamentally different from an electrical and a
protocol point of view, comparing to the legacy SD interface.

However, the legacy SD protocol is supported through a specific transport
layer (SD-TRAN) defined in the UHS-II addendum of the spec. This allows the
SD card to be managed in a very similar way as a legacy SD card, hence a
lot of code can be re-used to support these new types of cards through the
mmc subsystem.

Moreover, an SD card that supports the UHS-II interface shall also be
backwards compatible with the legacy SD interface, which allows a UHS-II
card to be inserted into a legacy slot. As a matter of fact, this is
already supported by mmc subsystem as of today.

To prepare to add support for UHS-II, this change puts the basic foundation
in the mmc core in place, allowing it to be more easily reviewed before
subsequent changes implements the actual support.

Basically, the approach here adds a new UHS-II bus_ops type and adds a
separate initialization path for the UHS-II card. The intent is to avoid us
from sprinkling the legacy initialization path, but also to simplify
implementation of the UHS-II specific bits.

At this point, there is only one new host ops added to manage the various
ios settings needed for UHS-II. Additional host ops that are needed, are
being added from subsequent changes.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20240913102836.6144-3-victorshihgli@gmail.com
---
 drivers/mmc/core/Makefile  |   2 +-
 drivers/mmc/core/core.c    |  17 ++-
 drivers/mmc/core/core.h    |   1 +
 drivers/mmc/core/sd_uhs2.c | 294 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mmc/card.h   |   7 ++
 include/linux/mmc/host.h   |  23 ++++
 6 files changed, 342 insertions(+), 2 deletions(-)
 create mode 100644 drivers/mmc/core/sd_uhs2.c

(limited to 'include/linux')

diff --git a/drivers/mmc/core/Makefile b/drivers/mmc/core/Makefile
index 6a907736cd7a..15b067e8b0d1 100644
--- a/drivers/mmc/core/Makefile
+++ b/drivers/mmc/core/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_MMC)		+= mmc_core.o
 mmc_core-y			:= core.o bus.o host.o \
 				   mmc.o mmc_ops.o sd.o sd_ops.o \
 				   sdio.o sdio_ops.o sdio_bus.o \
-				   sdio_cis.o sdio_io.o sdio_irq.o \
+				   sdio_cis.o sdio_io.o sdio_irq.o sd_uhs2.o\
 				   slot-gpio.o regulator.o
 mmc_core-$(CONFIG_OF)		+= pwrseq.o
 obj-$(CONFIG_PWRSEQ_SIMPLE)	+= pwrseq_simple.o
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 01a7142eada3..54ca9dc2114c 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -2267,6 +2267,18 @@ void mmc_rescan(struct work_struct *work)
 		goto out;
 	}
 
+	/*
+	 * Ideally we should favor initialization of legacy SD cards and defer
+	 * UHS-II enumeration. However, it seems like cards doesn't reliably
+	 * announce their support for UHS-II in the response to the ACMD41,
+	 * while initializing the legacy SD interface. Therefore, let's start
+	 * with UHS-II for now.
+	 */
+	if (!mmc_attach_sd_uhs2(host)) {
+		mmc_release_host(host);
+		goto out;
+	}
+
 	for (i = 0; i < ARRAY_SIZE(freqs); i++) {
 		unsigned int freq = freqs[i];
 		if (freq > host->f_max) {
@@ -2299,10 +2311,13 @@ void mmc_rescan(struct work_struct *work)
 
 void mmc_start_host(struct mmc_host *host)
 {
+	bool power_up = !(host->caps2 &
+			 (MMC_CAP2_NO_PRESCAN_POWERUP | MMC_CAP2_SD_UHS2));
+
 	host->f_init = max(min(freqs[0], host->f_max), host->f_min);
 	host->rescan_disable = 0;
 
-	if (!(host->caps2 & MMC_CAP2_NO_PRESCAN_POWERUP)) {
+	if (power_up) {
 		mmc_claim_host(host);
 		mmc_power_up(host, host->ocr_avail);
 		mmc_release_host(host);
diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h
index fc9d5e9620b1..fc9c066e6468 100644
--- a/drivers/mmc/core/core.h
+++ b/drivers/mmc/core/core.h
@@ -81,6 +81,7 @@ int mmc_detect_card_removed(struct mmc_host *host);
 int mmc_attach_mmc(struct mmc_host *host);
 int mmc_attach_sd(struct mmc_host *host);
 int mmc_attach_sdio(struct mmc_host *host);
+int mmc_attach_sd_uhs2(struct mmc_host *host);
 
 /* Module parameters */
 extern bool use_spi_crc;
diff --git a/drivers/mmc/core/sd_uhs2.c b/drivers/mmc/core/sd_uhs2.c
new file mode 100644
index 000000000000..19d62d45e1ec
--- /dev/null
+++ b/drivers/mmc/core/sd_uhs2.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Linaro Ltd
+ *
+ * Author: Ulf Hansson <ulf.hansson@linaro.org>
+ *
+ * Support for SD UHS-II cards
+ */
+#include <linux/err.h>
+
+#include <linux/mmc/host.h>
+#include <linux/mmc/card.h>
+
+#include "core.h"
+#include "bus.h"
+#include "sd.h"
+#include "mmc_ops.h"
+
+static const unsigned int sd_uhs2_freqs[] = { 52000000, 26000000 };
+
+static int sd_uhs2_power_up(struct mmc_host *host)
+{
+	int err;
+
+	if (host->ios.power_mode == MMC_POWER_ON)
+		return 0;
+
+	host->ios.vdd = fls(host->ocr_avail) - 1;
+	host->ios.clock = host->f_init;
+	host->ios.timing = MMC_TIMING_UHS2_SPEED_A;
+	host->ios.power_mode = MMC_POWER_ON;
+
+	err = host->ops->uhs2_control(host, UHS2_SET_IOS);
+
+	return err;
+}
+
+static int sd_uhs2_power_off(struct mmc_host *host)
+{
+	if (host->ios.power_mode == MMC_POWER_OFF)
+		return 0;
+
+	host->ios.vdd = 0;
+	host->ios.clock = 0;
+	host->ios.timing = MMC_TIMING_LEGACY;
+	host->ios.power_mode = MMC_POWER_OFF;
+
+	return host->ops->uhs2_control(host, UHS2_SET_IOS);
+}
+
+/*
+ * Run the phy initialization sequence, which mainly relies on the UHS-II host
+ * to check that we reach the expected electrical state, between the host and
+ * the card.
+ */
+static int sd_uhs2_phy_init(struct mmc_host *host)
+{
+	return 0;
+}
+
+/*
+ * Do the early initialization of the card, by sending the device init broadcast
+ * command and wait for the process to be completed.
+ */
+static int sd_uhs2_dev_init(struct mmc_host *host)
+{
+	return 0;
+}
+
+/*
+ * Run the enumeration process by sending the enumerate command to the card.
+ * Note that, we currently support only the point to point connection, which
+ * means only one card can be attached per host/slot.
+ */
+static int sd_uhs2_enum(struct mmc_host *host, u32 *node_id)
+{
+	return 0;
+}
+
+/*
+ * Read the UHS-II configuration registers (CFG_REG) of the card, by sending it
+ * commands and by parsing the responses. Store a copy of the relevant data in
+ * card->uhs2_config.
+ */
+static int sd_uhs2_config_read(struct mmc_host *host, struct mmc_card *card)
+{
+	return 0;
+}
+
+/*
+ * Based on the card's and host's UHS-II capabilities, let's update the
+ * configuration of the card and the host. This may also include to move to a
+ * greater speed range/mode. Depending on the updated configuration, we may need
+ * to do a soft reset of the card via sending it a GO_DORMANT_STATE command.
+ *
+ * In the final step, let's check if the card signals "config completion", which
+ * indicates that the card has moved from config state into active state.
+ */
+static int sd_uhs2_config_write(struct mmc_host *host, struct mmc_card *card)
+{
+	return 0;
+}
+
+/*
+ * Initialize the UHS-II card through the SD-TRAN transport layer. This enables
+ * commands/requests to be backwards compatible through the legacy SD protocol.
+ * UHS-II cards has a specific power limit specified for VDD1/VDD2, that should
+ * be set through a legacy CMD6. Note that, the power limit that becomes set,
+ * survives a soft reset through the GO_DORMANT_STATE command.
+ */
+static int sd_uhs2_legacy_init(struct mmc_host *host, struct mmc_card *card)
+{
+	return 0;
+}
+
+/*
+ * Allocate the data structure for the mmc_card and run the UHS-II specific
+ * initialization sequence.
+ */
+static int sd_uhs2_init_card(struct mmc_host *host)
+{
+	struct mmc_card *card;
+	u32 node_id = 0;
+	int err;
+
+	err = sd_uhs2_dev_init(host);
+	if (err)
+		return err;
+
+	err = sd_uhs2_enum(host, &node_id);
+	if (err)
+		return err;
+
+	card = mmc_alloc_card(host, &sd_type);
+	if (IS_ERR(card))
+		return PTR_ERR(card);
+
+	card->uhs2_config.node_id = node_id;
+	card->type = MMC_TYPE_SD;
+
+	err = sd_uhs2_config_read(host, card);
+	if (err)
+		goto err;
+
+	err = sd_uhs2_config_write(host, card);
+	if (err)
+		goto err;
+
+	host->card = card;
+	return 0;
+
+err:
+	mmc_remove_card(card);
+	return err;
+}
+
+static void sd_uhs2_remove(struct mmc_host *host)
+{
+	mmc_remove_card(host->card);
+	host->card = NULL;
+}
+
+static int sd_uhs2_alive(struct mmc_host *host)
+{
+	return mmc_send_status(host->card, NULL);
+}
+
+static void sd_uhs2_detect(struct mmc_host *host)
+{
+	int err;
+
+	mmc_get_card(host->card, NULL);
+	err = _mmc_detect_card_removed(host);
+	mmc_put_card(host->card, NULL);
+
+	if (err) {
+		sd_uhs2_remove(host);
+
+		mmc_claim_host(host);
+		mmc_detach_bus(host);
+		sd_uhs2_power_off(host);
+		mmc_release_host(host);
+	}
+}
+
+static int sd_uhs2_suspend(struct mmc_host *host)
+{
+	return 0;
+}
+
+static int sd_uhs2_resume(struct mmc_host *host)
+{
+	return 0;
+}
+
+static int sd_uhs2_runtime_suspend(struct mmc_host *host)
+{
+	return 0;
+}
+
+static int sd_uhs2_runtime_resume(struct mmc_host *host)
+{
+	return 0;
+}
+
+static int sd_uhs2_shutdown(struct mmc_host *host)
+{
+	return 0;
+}
+
+static int sd_uhs2_hw_reset(struct mmc_host *host)
+{
+	return 0;
+}
+
+static const struct mmc_bus_ops sd_uhs2_ops = {
+	.remove = sd_uhs2_remove,
+	.alive = sd_uhs2_alive,
+	.detect = sd_uhs2_detect,
+	.suspend = sd_uhs2_suspend,
+	.resume = sd_uhs2_resume,
+	.runtime_suspend = sd_uhs2_runtime_suspend,
+	.runtime_resume = sd_uhs2_runtime_resume,
+	.shutdown = sd_uhs2_shutdown,
+	.hw_reset = sd_uhs2_hw_reset,
+};
+
+static int sd_uhs2_attach(struct mmc_host *host)
+{
+	int err;
+
+	err = sd_uhs2_power_up(host);
+	if (err)
+		goto err;
+
+	err = sd_uhs2_phy_init(host);
+	if (err)
+		goto err;
+
+	err = sd_uhs2_init_card(host);
+	if (err)
+		goto err;
+
+	err = sd_uhs2_legacy_init(host, host->card);
+	if (err)
+		goto err;
+
+	mmc_attach_bus(host, &sd_uhs2_ops);
+
+	mmc_release_host(host);
+
+	err = mmc_add_card(host->card);
+	if (err)
+		goto remove_card;
+
+	mmc_claim_host(host);
+	return 0;
+
+remove_card:
+	mmc_remove_card(host->card);
+	host->card = NULL;
+	mmc_claim_host(host);
+	mmc_detach_bus(host);
+err:
+	sd_uhs2_power_off(host);
+	return err;
+}
+
+int mmc_attach_sd_uhs2(struct mmc_host *host)
+{
+	int i, err = 0;
+
+	if (!(host->caps2 & MMC_CAP2_SD_UHS2))
+		return -EOPNOTSUPP;
+
+	/* Turn off the legacy SD interface before trying with UHS-II. */
+	mmc_power_off(host);
+
+	/*
+	 * Start UHS-II initialization at 52MHz and possibly make a retry at
+	 * 26MHz according to the spec. It's required that the host driver
+	 * validates ios->clock, to set a rate within the correct range.
+	 */
+	for (i = 0; i < ARRAY_SIZE(sd_uhs2_freqs); i++) {
+		host->f_init = sd_uhs2_freqs[i];
+		pr_debug("%s: %s: trying to init UHS-II card at %u Hz\n",
+			 mmc_hostname(host), __func__, host->f_init);
+		err = sd_uhs2_attach(host);
+		if (!err)
+			break;
+	}
+
+	return err;
+}
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index eb67d3d5ff5b..0bb1ad1ea4dc 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -209,6 +209,11 @@ struct sd_ext_reg {
 #define SD_EXT_PERF_CMD_QUEUE	(1<<4)
 };
 
+struct sd_uhs2_config {
+	u32			node_id;
+	/* TODO: Extend with more register configs. */
+};
+
 struct sdio_cccr {
 	unsigned int		sdio_vsn;
 	unsigned int		sd_vsn;
@@ -320,6 +325,8 @@ struct mmc_card {
 	struct sd_ext_reg	ext_power;	/* SD extension reg for PM */
 	struct sd_ext_reg	ext_perf;	/* SD extension reg for PERF */
 
+	struct sd_uhs2_config	uhs2_config;	/* SD UHS-II config */
+
 	unsigned int		sdio_funcs;	/* number of SDIO functions */
 	atomic_t		sdio_funcs_probed; /* number of probed SDIO funcs */
 	struct sdio_cccr	cccr;		/* common card info */
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 8fc2b328ec4d..e1d380de3aaa 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -64,6 +64,10 @@ struct mmc_ios {
 #define MMC_TIMING_MMC_HS400	10
 #define MMC_TIMING_SD_EXP	11
 #define MMC_TIMING_SD_EXP_1_2V	12
+#define MMC_TIMING_UHS2_SPEED_A    13
+#define MMC_TIMING_UHS2_SPEED_A_HD 14
+#define MMC_TIMING_UHS2_SPEED_B    15
+#define MMC_TIMING_UHS2_SPEED_B_HD 16
 
 	unsigned char	signal_voltage;		/* signalling voltage (1.8V or 3.3V) */
 
@@ -92,6 +96,14 @@ struct mmc_clk_phase_map {
 	struct mmc_clk_phase phase[MMC_NUM_CLK_PHASES];
 };
 
+struct sd_uhs2_caps {
+	/* TODO: Add UHS-II capabilities for the host. */
+};
+
+enum sd_uhs2_operation {
+	UHS2_SET_IOS,
+};
+
 struct mmc_host;
 
 enum mmc_err_stat {
@@ -219,6 +231,14 @@ struct mmc_host_ops {
 
 	/* Initialize an SD express card, mandatory for MMC_CAP2_SD_EXP. */
 	int	(*init_sd_express)(struct mmc_host *host, struct mmc_ios *ios);
+
+	/*
+	 * The uhs2_control callback is used to execute SD UHS-II specific
+	 * operations. It's mandatory to implement for hosts that supports the
+	 * SD UHS-II interface (MMC_CAP2_SD_UHS2). Expected return values are a
+	 * negative errno in case of a failure or zero for success.
+	 */
+	int	(*uhs2_control)(struct mmc_host *host, enum sd_uhs2_operation op);
 };
 
 struct mmc_cqe_ops {
@@ -379,6 +399,7 @@ struct mmc_host {
 				 MMC_CAP2_HS200_1_2V_SDR)
 #define MMC_CAP2_SD_EXP		(1 << 7)	/* SD express via PCIe */
 #define MMC_CAP2_SD_EXP_1_2V	(1 << 8)	/* SD express 1.2V */
+#define MMC_CAP2_SD_UHS2	(1 << 9)	/* SD UHS-II support */
 #define MMC_CAP2_CD_ACTIVE_HIGH	(1 << 10)	/* Card-detect signal active high */
 #define MMC_CAP2_RO_ACTIVE_HIGH	(1 << 11)	/* Write-protect signal active high */
 #define MMC_CAP2_NO_PRESCAN_POWERUP (1 << 14)	/* Don't power up before scan */
@@ -405,6 +426,8 @@ struct mmc_host {
 #endif
 #define MMC_CAP2_ALT_GPT_TEGRA	(1 << 28)	/* Host with eMMC that has GPT entry at a non-standard location */
 
+	struct sd_uhs2_caps	uhs2_caps;	/* Host UHS-II capabilities */
+
 	int			fixed_drv_type;	/* fixed driver type for non-removable media */
 
 	mmc_pm_flag_t		pm_caps;	/* supported pm features */
-- 
cgit v1.2.3


From 153196d550c747367bdbec5cd545a572c5310451 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 13 Sep 2024 18:28:17 +0800
Subject: mmc: core: Announce successful insertion of an SD UHS-II card

To inform the users about SD UHS-II cards, let's extend the print at card
insertion with a "UHS-II" substring. Within this change, it seems
reasonable to convert from using "ultra high speed" into "UHS-I speed", for
the UHS-I type, as it should makes it more clear.

Note that, the new print for UHS-II cards doesn't include the actual
selected speed mode. Instead, this is going to be added from subsequent
change.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20240913102836.6144-4-victorshihgli@gmail.com
---
 drivers/mmc/core/bus.c   | 4 +++-
 include/linux/mmc/host.h | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index 1470f78acc6f..9283b28bc69f 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -346,7 +346,9 @@ int mmc_add_card(struct mmc_card *card)
 	if (mmc_card_hs(card))
 		speed_mode = "high speed ";
 	else if (mmc_card_uhs(card))
-		speed_mode = "ultra high speed ";
+		speed_mode = "UHS-I speed ";
+	else if (mmc_card_uhs2(card->host))
+		speed_mode = "UHS-II speed ";
 	else if	(mmc_card_ddr52(card))
 		speed_mode = "high speed DDR ";
 	else if (mmc_card_hs200(card))
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index e1d380de3aaa..9a148280bf42 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -638,6 +638,14 @@ static inline int mmc_card_uhs(struct mmc_card *card)
 		card->host->ios.timing <= MMC_TIMING_UHS_DDR50;
 }
 
+static inline bool mmc_card_uhs2(struct mmc_host *host)
+{
+	return host->ios.timing == MMC_TIMING_UHS2_SPEED_A ||
+	       host->ios.timing == MMC_TIMING_UHS2_SPEED_A_HD ||
+	       host->ios.timing == MMC_TIMING_UHS2_SPEED_B ||
+	       host->ios.timing == MMC_TIMING_UHS2_SPEED_B_HD;
+}
+
 void mmc_retune_timer_stop(struct mmc_host *host);
 
 static inline void mmc_retune_needed(struct mmc_host *host)
-- 
cgit v1.2.3


From a56ffd3a83ed2e10e0d9e0b199547bfa0d206aac Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 13 Sep 2024 18:28:18 +0800
Subject: mmc: core: Extend support for mmc regulators with a vqmmc2

To allow an additional external regulator to be controlled by an mmc host
driver, let's add support for a vqmmc2 regulator to the mmc core.

For an SD UHS-II interface the vqmmc2 regulator may correspond to the so
called vdd2 supply, as described by the SD spec. Initially, only 1.8V is
needed, hence limit the new helper function, mmc_regulator_set_vqmmc2() to
this too.

Note that, to allow for flexibility mmc host drivers need to manage the
enable/disable of the vqmmc2 regulator themselves, while the regulator is
looked up through the common mmc_regulator_get_supply().

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20240913102836.6144-5-victorshihgli@gmail.com
---
 drivers/mmc/core/regulator.c | 34 ++++++++++++++++++++++++++++++++++
 include/linux/mmc/host.h     | 11 +++++++++++
 2 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/regulator.c b/drivers/mmc/core/regulator.c
index 01747ab1024e..3dae2e9b7978 100644
--- a/drivers/mmc/core/regulator.c
+++ b/drivers/mmc/core/regulator.c
@@ -226,6 +226,33 @@ int mmc_regulator_set_vqmmc(struct mmc_host *mmc, struct mmc_ios *ios)
 }
 EXPORT_SYMBOL_GPL(mmc_regulator_set_vqmmc);
 
+/**
+ * mmc_regulator_set_vqmmc2 - Set vqmmc2 as per the ios->vqmmc2_voltage
+ * @mmc: The mmc host to regulate
+ * @ios: The io bus settings
+ *
+ * Sets a new voltage level for the vqmmc2 regulator, which may correspond to
+ * the vdd2 regulator for an SD UHS-II interface. This function is expected to
+ * be called by mmc host drivers.
+ *
+ * Returns a negative error code on failure, zero if the voltage level was
+ * changed successfully or a positive value if the level didn't need to change.
+ */
+int mmc_regulator_set_vqmmc2(struct mmc_host *mmc, struct mmc_ios *ios)
+{
+	if (IS_ERR(mmc->supply.vqmmc2))
+		return -EINVAL;
+
+	switch (ios->vqmmc2_voltage) {
+	case MMC_VQMMC2_VOLTAGE_180:
+		return mmc_regulator_set_voltage_if_supported(
+			mmc->supply.vqmmc2, 1700000, 1800000, 1950000);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL_GPL(mmc_regulator_set_vqmmc2);
+
 #else
 
 static inline int mmc_regulator_get_ocrmask(struct regulator *supply)
@@ -252,6 +279,7 @@ int mmc_regulator_get_supply(struct mmc_host *mmc)
 
 	mmc->supply.vmmc = devm_regulator_get_optional(dev, "vmmc");
 	mmc->supply.vqmmc = devm_regulator_get_optional(dev, "vqmmc");
+	mmc->supply.vqmmc2 = devm_regulator_get_optional(dev, "vqmmc2");
 
 	if (IS_ERR(mmc->supply.vmmc)) {
 		if (PTR_ERR(mmc->supply.vmmc) == -EPROBE_DEFER)
@@ -275,6 +303,12 @@ int mmc_regulator_get_supply(struct mmc_host *mmc)
 		dev_dbg(dev, "No vqmmc regulator found\n");
 	}
 
+	if (IS_ERR(mmc->supply.vqmmc2)) {
+		if (PTR_ERR(mmc->supply.vqmmc2) == -EPROBE_DEFER)
+			return -EPROBE_DEFER;
+		dev_dbg(dev, "No vqmmc2 regulator found\n");
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mmc_regulator_get_supply);
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 9a148280bf42..a7f790c75bc8 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -75,6 +75,9 @@ struct mmc_ios {
 #define MMC_SIGNAL_VOLTAGE_180	1
 #define MMC_SIGNAL_VOLTAGE_120	2
 
+	unsigned char	vqmmc2_voltage;
+#define MMC_VQMMC2_VOLTAGE_180	0
+
 	unsigned char	drv_type;		/* driver type (A, B, C, D) */
 
 #define MMC_SET_DRIVER_TYPE_B	0
@@ -308,6 +311,7 @@ struct mmc_pwrseq;
 struct mmc_supply {
 	struct regulator *vmmc;		/* Card power supply */
 	struct regulator *vqmmc;	/* Optional Vccq supply */
+	struct regulator *vqmmc2;	/* Optional supply for phy */
 };
 
 struct mmc_ctx {
@@ -590,6 +594,7 @@ int mmc_regulator_set_ocr(struct mmc_host *mmc,
 			struct regulator *supply,
 			unsigned short vdd_bit);
 int mmc_regulator_set_vqmmc(struct mmc_host *mmc, struct mmc_ios *ios);
+int mmc_regulator_set_vqmmc2(struct mmc_host *mmc, struct mmc_ios *ios);
 #else
 static inline int mmc_regulator_set_ocr(struct mmc_host *mmc,
 				 struct regulator *supply,
@@ -603,6 +608,12 @@ static inline int mmc_regulator_set_vqmmc(struct mmc_host *mmc,
 {
 	return -EINVAL;
 }
+
+static inline int mmc_regulator_set_vqmmc2(struct mmc_host *mmc,
+					   struct mmc_ios *ios)
+{
+	return -EINVAL;
+}
 #endif
 
 int mmc_regulator_get_supply(struct mmc_host *mmc);
-- 
cgit v1.2.3


From a9a75f9dc23c1562dcb261c0a8f3d6fc70d246cd Mon Sep 17 00:00:00 2001
From: Victor Shih <victor.shih@genesyslogic.com.tw>
Date: Fri, 13 Sep 2024 18:28:19 +0800
Subject: mmc: core: Add definitions for SD UHS-II cards

Add UHS-II specific data structures for commands and defines for
registers, as described in Part 1 UHS-II Addendum Version 1.01.

UHS-II related definitions are listed below:
  1. UHS-II card capability: sd_uhs2_caps{}
  2. UHS-II configuration: sd_uhs2_config{}
  3. UHS-II register I/O address and register field definitions: sd_uhs2.h

Signed-off-by: Jason Lai <jason.lai@genesyslogic.com.tw>
Signed-off-by: Victor Shih <victor.shih@genesyslogic.com.tw>
Link: https://lore.kernel.org/r/20240913102836.6144-6-victorshihgli@gmail.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/card.h    |  31 +++++-
 include/linux/mmc/host.h    |  25 ++++-
 include/linux/mmc/sd_uhs2.h | 240 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 294 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/mmc/sd_uhs2.h

(limited to 'include/linux')

diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 0bb1ad1ea4dc..526fce581657 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -188,6 +188,12 @@ struct sd_switch_caps {
 #define SD_MAX_CURRENT_400	(1 << SD_SET_CURRENT_LIMIT_400)
 #define SD_MAX_CURRENT_600	(1 << SD_SET_CURRENT_LIMIT_600)
 #define SD_MAX_CURRENT_800	(1 << SD_SET_CURRENT_LIMIT_800)
+
+#define SD4_SET_POWER_LIMIT_0_72W	0
+#define SD4_SET_POWER_LIMIT_1_44W	1
+#define SD4_SET_POWER_LIMIT_2_16W	2
+#define SD4_SET_POWER_LIMIT_2_88W	3
+#define SD4_SET_POWER_LIMIT_1_80W	4
 };
 
 struct sd_ext_reg {
@@ -211,7 +217,30 @@ struct sd_ext_reg {
 
 struct sd_uhs2_config {
 	u32			node_id;
-	/* TODO: Extend with more register configs. */
+
+	u32			n_fcu;
+	u32			maxblk_len;
+	u8			n_lanes;
+	u8			dadr_len;
+	u8			app_type;
+	u8			phy_minor_rev;
+	u8			phy_major_rev;
+	u8			can_hibernate;
+	u8			n_lss_sync;
+	u8			n_lss_dir;
+	u8			link_minor_rev;
+	u8			link_major_rev;
+	u8			dev_type;
+	u8			n_data_gap;
+
+	u32			n_fcu_set;
+	u32			maxblk_len_set;
+	u8			n_lanes_set;
+	u8			speed_range_set;
+	u8			n_lss_sync_set;
+	u8			n_lss_dir_set;
+	u8			n_data_gap_set;
+	u8			max_retry_set;
 };
 
 struct sdio_cccr {
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index a7f790c75bc8..0980d06ed419 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -17,6 +17,7 @@
 #include <linux/mmc/pm.h>
 #include <linux/dma-direction.h>
 #include <linux/blk-crypto-profile.h>
+#include <linux/mmc/sd_uhs2.h>
 
 struct mmc_ios {
 	unsigned int	clock;			/* clock rate */
@@ -100,7 +101,29 @@ struct mmc_clk_phase_map {
 };
 
 struct sd_uhs2_caps {
-	/* TODO: Add UHS-II capabilities for the host. */
+	u32	dap;
+	u32	gap;
+	u32	group_desc;
+	u32	maxblk_len;
+	u32	n_fcu;
+	u8	n_lanes;
+	u8	addr64;
+	u8	card_type;
+	u8	phy_rev;
+	u8	speed_range;
+	u8	n_lss_sync;
+	u8	n_lss_dir;
+	u8	link_rev;
+	u8	host_type;
+	u8	n_data_gap;
+
+	u32	maxblk_len_set;
+	u32	n_fcu_set;
+	u8	n_lanes_set;
+	u8	n_lss_sync_set;
+	u8	n_lss_dir_set;
+	u8	n_data_gap_set;
+	u8	max_retry_set;
 };
 
 enum sd_uhs2_operation {
diff --git a/include/linux/mmc/sd_uhs2.h b/include/linux/mmc/sd_uhs2.h
new file mode 100644
index 000000000000..7abe9bd870c7
--- /dev/null
+++ b/include/linux/mmc/sd_uhs2.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Header file for UHS-II packets, Host Controller registers and I/O
+ * accessors.
+ *
+ *  Copyright (C) 2014 Intel Corp, All Rights Reserved.
+ */
+#ifndef LINUX_MMC_UHS2_H
+#define LINUX_MMC_UHS2_H
+
+/* LINK Layer definition */
+/*
+ * UHS2 Header:
+ * Refer to UHS-II Addendum Version 1.02 Figure 5-2, the format of CCMD Header is described below:
+ *      bit [3:0]  : DID(Destination ID = Node ID of UHS2 card)
+ *      bit [6:4]  : TYP(Packet Type)
+ *                   000b: CCMD(Control command packet)
+ *                   001b: DCMD(Data command packet)
+ *                   010b: RES(Response packet)
+ *                   011b: DATA(Data payload packet)
+ *                   111b: MSG(Message packet)
+ *                   Others: Reserved
+ *      bit [7]    : NP(Native Packet)
+ *      bit [10:8] : TID(Transaction ID)
+ *      bit [11]   : Reserved
+ *      bit [15:12]: SID(Source ID 0: Node ID of Host)
+ *
+ * Broadcast CCMD issued by Host is represented as DID=SID=0.
+ */
+/*
+ * UHS2 Argument:
+ * Refer to UHS-II Addendum Version 1.02 Figure 6-5, the format of CCMD Argument is described below:
+ *      bit [3:0]  : MSB of IOADR
+ *      bit [5:4]  : PLEN(Payload Length)
+ *                   00b: 0 byte
+ *                   01b: 4 bytes
+ *                   10b: 8 bytes
+ *                   11b: 16 bytes
+ *      bit [6]    : Reserved
+ *      bit [7]    : R/W(Read/Write)
+ *                   0: Control read command
+ *                   1: Control write command
+ *      bit [15:8] : LSB of IOADR
+ *
+ * I/O Address specifies the address of register in UHS-II I/O space accessed by CCMD.
+ * The unit of I/O Address is 4 Bytes. It is transmitted in MSB first, LSB last.
+ */
+#define UHS2_NATIVE_PACKET_POS	7
+#define UHS2_NATIVE_PACKET	(1 << UHS2_NATIVE_PACKET_POS)
+
+#define UHS2_PACKET_TYPE_POS	4
+#define UHS2_PACKET_TYPE_CCMD	(0 << UHS2_PACKET_TYPE_POS)
+#define UHS2_PACKET_TYPE_DCMD	(1 << UHS2_PACKET_TYPE_POS)
+#define UHS2_PACKET_TYPE_RES	(2 << UHS2_PACKET_TYPE_POS)
+#define UHS2_PACKET_TYPE_DATA	(3 << UHS2_PACKET_TYPE_POS)
+#define UHS2_PACKET_TYPE_MSG	(7 << UHS2_PACKET_TYPE_POS)
+
+#define UHS2_DEST_ID_MASK	0x0F
+#define UHS2_DEST_ID		0x1
+
+#define UHS2_SRC_ID_POS		12
+#define UHS2_SRC_ID_MASK	0xF000
+
+#define UHS2_TRANS_ID_POS	8
+#define UHS2_TRANS_ID_MASK	0x0700
+
+/* UHS2 MSG */
+#define UHS2_MSG_CTG_POS	5
+#define UHS2_MSG_CTG_LMSG	0x00
+#define UHS2_MSG_CTG_INT	0x60
+#define UHS2_MSG_CTG_AMSG	0x80
+
+#define UHS2_MSG_CTG_FCREQ	0x00
+#define UHS2_MSG_CTG_FCRDY	0x01
+#define UHS2_MSG_CTG_STAT	0x02
+
+#define UHS2_MSG_CODE_POS			8
+#define UHS2_MSG_CODE_FC_UNRECOVER_ERR		0x8
+#define UHS2_MSG_CODE_STAT_UNRECOVER_ERR	0x8
+#define UHS2_MSG_CODE_STAT_RECOVER_ERR		0x1
+
+/* TRANS Layer definition */
+
+/* Native packets*/
+#define UHS2_NATIVE_CMD_RW_POS	7
+#define UHS2_NATIVE_CMD_WRITE	(1 << UHS2_NATIVE_CMD_RW_POS)
+#define UHS2_NATIVE_CMD_READ	(0 << UHS2_NATIVE_CMD_RW_POS)
+
+#define UHS2_NATIVE_CMD_PLEN_POS	4
+#define UHS2_NATIVE_CMD_PLEN_4B		(1 << UHS2_NATIVE_CMD_PLEN_POS)
+#define UHS2_NATIVE_CMD_PLEN_8B		(2 << UHS2_NATIVE_CMD_PLEN_POS)
+#define UHS2_NATIVE_CMD_PLEN_16B	(3 << UHS2_NATIVE_CMD_PLEN_POS)
+
+#define UHS2_NATIVE_CCMD_GET_MIOADR_MASK	0xF00
+#define UHS2_NATIVE_CCMD_MIOADR_MASK		0x0F
+
+#define UHS2_NATIVE_CCMD_LIOADR_POS		8
+#define UHS2_NATIVE_CCMD_GET_LIOADR_MASK	0x0FF
+
+#define UHS2_CCMD_DEV_INIT_COMPLETE_FLAG	BIT(11)
+#define UHS2_DEV_INIT_PAYLOAD_LEN		1
+#define UHS2_DEV_INIT_RESP_LEN			6
+#define UHS2_DEV_ENUM_PAYLOAD_LEN		1
+#define UHS2_DEV_ENUM_RESP_LEN			8
+#define UHS2_CFG_WRITE_PAYLOAD_LEN		2
+#define UHS2_CFG_WRITE_PHY_SET_RESP_LEN		4
+#define UHS2_CFG_WRITE_GENERIC_SET_RESP_LEN	5
+#define UHS2_GO_DORMANT_PAYLOAD_LEN		1
+
+/*
+ * UHS2 Argument:
+ * Refer to UHS-II Addendum Version 1.02 Figure 6-8, the format of DCMD Argument is described below:
+ *      bit [3:0]  : Reserved
+ *      bit [6:3]  : TMODE(Transfer Mode)
+ *                   bit 3: DAM(Data Access Mode)
+ *                   bit 4: TLUM(TLEN Unit Mode)
+ *                   bit 5: LM(Length Mode)
+ *                   bit 6: DM(Duplex Mode)
+ *      bit [7]    : R/W(Read/Write)
+ *                   0: Control read command
+ *                   1: Control write command
+ *      bit [15:8] : Reserved
+ *
+ * I/O Address specifies the address of register in UHS-II I/O space accessed by CCMD.
+ * The unit of I/O Address is 4 Bytes. It is transmitted in MSB first, LSB last.
+ */
+#define UHS2_DCMD_DM_POS		6
+#define UHS2_DCMD_2L_HD_MODE		(1 << UHS2_DCMD_DM_POS)
+#define UHS2_DCMD_LM_POS		5
+#define UHS2_DCMD_LM_TLEN_EXIST		(1 << UHS2_DCMD_LM_POS)
+#define UHS2_DCMD_TLUM_POS		4
+#define UHS2_DCMD_TLUM_BYTE_MODE	(1 << UHS2_DCMD_TLUM_POS)
+#define UHS2_NATIVE_DCMD_DAM_POS	3
+#define UHS2_NATIVE_DCMD_DAM_IO		(1 << UHS2_NATIVE_DCMD_DAM_POS)
+
+#define UHS2_RES_NACK_POS	7
+#define UHS2_RES_NACK_MASK	(0x1 << UHS2_RES_NACK_POS)
+
+#define UHS2_RES_ECODE_POS	4
+#define UHS2_RES_ECODE_MASK	0x7
+#define UHS2_RES_ECODE_COND	1
+#define UHS2_RES_ECODE_ARG	2
+#define UHS2_RES_ECODE_GEN	3
+
+/* IOADR of device registers */
+#define UHS2_IOADR_GENERIC_CAPS		0x00
+#define UHS2_IOADR_PHY_CAPS		0x02
+#define UHS2_IOADR_LINK_CAPS		0x04
+#define UHS2_IOADR_RSV_CAPS		0x06
+#define UHS2_IOADR_GENERIC_SETTINGS	0x08
+#define UHS2_IOADR_PHY_SETTINGS		0x0A
+#define UHS2_IOADR_LINK_SETTINGS	0x0C
+#define UHS2_IOADR_PRESET		0x40
+
+/* SD application packets */
+#define UHS2_SD_CMD_INDEX_POS	8
+
+#define UHS2_SD_CMD_APP_POS	14
+#define UHS2_SD_CMD_APP		(1 << UHS2_SD_CMD_APP_POS)
+
+/* UHS-II Device Registers */
+#define UHS2_DEV_CONFIG_REG	0x000
+
+/* General Caps and Settings registers */
+#define UHS2_DEV_CONFIG_GEN_CAPS	(UHS2_DEV_CONFIG_REG + 0x000)
+#define UHS2_DEV_CONFIG_N_LANES_POS	8
+#define UHS2_DEV_CONFIG_N_LANES_MASK	0x3F
+#define UHS2_DEV_CONFIG_2L_HD_FD	0x1
+#define UHS2_DEV_CONFIG_2D1U_FD		0x2
+#define UHS2_DEV_CONFIG_1D2U_FD		0x4
+#define UHS2_DEV_CONFIG_2D2U_FD		0x8
+#define UHS2_DEV_CONFIG_DADR_POS	14
+#define UHS2_DEV_CONFIG_DADR_MASK	0x1
+#define UHS2_DEV_CONFIG_APP_POS		16
+#define UHS2_DEV_CONFIG_APP_MASK	0xFF
+#define UHS2_DEV_CONFIG_APP_SD_MEM	0x1
+
+#define UHS2_DEV_CONFIG_GEN_SET			(UHS2_DEV_CONFIG_REG + 0x008)
+#define UHS2_DEV_CONFIG_GEN_SET_N_LANES_POS	8
+#define UHS2_DEV_CONFIG_GEN_SET_2L_FD_HD	0x0
+#define UHS2_DEV_CONFIG_GEN_SET_2D1U_FD		0x2
+#define UHS2_DEV_CONFIG_GEN_SET_1D2U_FD		0x3
+#define UHS2_DEV_CONFIG_GEN_SET_2D2U_FD		0x4
+#define UHS2_DEV_CONFIG_GEN_SET_CFG_COMPLETE	BIT(31)
+
+/* PHY Caps and Settings registers */
+#define UHS2_DEV_CONFIG_PHY_CAPS	(UHS2_DEV_CONFIG_REG + 0x002)
+#define UHS2_DEV_CONFIG_PHY_MINOR_MASK	0xF
+#define UHS2_DEV_CONFIG_PHY_MAJOR_POS	4
+#define UHS2_DEV_CONFIG_PHY_MAJOR_MASK	0x3
+#define UHS2_DEV_CONFIG_CAN_HIBER_POS	15
+#define UHS2_DEV_CONFIG_CAN_HIBER_MASK	0x1
+#define UHS2_DEV_CONFIG_PHY_CAPS1	(UHS2_DEV_CONFIG_REG + 0x003)
+#define UHS2_DEV_CONFIG_N_LSS_SYN_MASK	0xF
+#define UHS2_DEV_CONFIG_N_LSS_DIR_POS	4
+#define UHS2_DEV_CONFIG_N_LSS_DIR_MASK	0xF
+
+#define UHS2_DEV_CONFIG_PHY_SET			(UHS2_DEV_CONFIG_REG + 0x00A)
+#define UHS2_DEV_CONFIG_PHY_SET_SPEED_POS	6
+#define UHS2_DEV_CONFIG_PHY_SET_SPEED_A		0x0
+#define UHS2_DEV_CONFIG_PHY_SET_SPEED_B		0x1
+
+/* LINK-TRAN Caps and Settings registers */
+#define UHS2_DEV_CONFIG_LINK_TRAN_CAPS		(UHS2_DEV_CONFIG_REG + 0x004)
+#define UHS2_DEV_CONFIG_LT_MINOR_MASK		0xF
+#define UHS2_DEV_CONFIG_LT_MAJOR_POS		4
+#define UHS2_DEV_CONFIG_LT_MAJOR_MASK		0x3
+#define UHS2_DEV_CONFIG_N_FCU_POS		8
+#define UHS2_DEV_CONFIG_N_FCU_MASK		0xFF
+#define UHS2_DEV_CONFIG_DEV_TYPE_POS		16
+#define UHS2_DEV_CONFIG_DEV_TYPE_MASK		0x7
+#define UHS2_DEV_CONFIG_MAX_BLK_LEN_POS		20
+#define UHS2_DEV_CONFIG_MAX_BLK_LEN_MASK	0xFFF
+#define UHS2_DEV_CONFIG_LINK_TRAN_CAPS1		(UHS2_DEV_CONFIG_REG + 0x005)
+#define UHS2_DEV_CONFIG_N_DATA_GAP_MASK		0xFF
+
+#define UHS2_DEV_CONFIG_LINK_TRAN_SET		(UHS2_DEV_CONFIG_REG + 0x00C)
+#define UHS2_DEV_CONFIG_LT_SET_MAX_BLK_LEN	0x200
+#define UHS2_DEV_CONFIG_LT_SET_MAX_RETRY_POS	16
+
+/* Preset register */
+#define UHS2_DEV_CONFIG_PRESET	(UHS2_DEV_CONFIG_REG + 0x040)
+
+#define UHS2_DEV_INT_REG	0x100
+
+#define UHS2_DEV_STATUS_REG	0x180
+
+#define UHS2_DEV_CMD_REG		0x200
+#define UHS2_DEV_CMD_FULL_RESET		(UHS2_DEV_CMD_REG + 0x000)
+#define UHS2_DEV_CMD_GO_DORMANT_STATE	(UHS2_DEV_CMD_REG + 0x001)
+#define UHS2_DEV_CMD_DORMANT_HIBER	BIT(7)
+#define UHS2_DEV_CMD_DEVICE_INIT	(UHS2_DEV_CMD_REG + 0x002)
+#define UHS2_DEV_INIT_COMPLETE_FLAG	BIT(11)
+#define UHS2_DEV_CMD_ENUMERATE		(UHS2_DEV_CMD_REG + 0x003)
+#define UHS2_DEV_CMD_TRANS_ABORT	(UHS2_DEV_CMD_REG + 0x004)
+
+#define UHS2_RCLK_MAX	52000000
+#define UHS2_RCLK_MIN	26000000
+
+#endif /* LINUX_MMC_UHS2_H */
-- 
cgit v1.2.3


From 5bb798cfdfd00182065cf55c1ff4b2c08d3be13f Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 28 Sep 2024 18:20:56 +0200
Subject: memstick: Constify struct memstick_device_id

'struct memstick_device_id' are not modified in these drivers.

Constifying this structure moves some data to a read-only section, so
increases overall security.

Update memstick_dev_match(), memstick_bus_match() and struct
memstick_driver accordingly.

On a x86_64, with allmodconfig, as an example:
Before:
======
   text	   data	    bss	    dec	    hex	filename
  74055	   3455	     88	  77598	  12f1e	drivers/memstick/core/ms_block.o

After:
=====
   text	   data	    bss	    dec	    hex	filename
  74087	   3423	     88	  77598	  12f1e	drivers/memstick/core/ms_block.o

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/6509d6f6ed64193f04e747a98ccea7492c976ca8.1727540434.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/memstick/core/memstick.c    | 4 ++--
 drivers/memstick/core/ms_block.c    | 2 +-
 drivers/memstick/core/mspro_block.c | 2 +-
 include/linux/memstick.h            | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c
index 9a3a784054cc..ae4e8b8e6eb7 100644
--- a/drivers/memstick/core/memstick.c
+++ b/drivers/memstick/core/memstick.c
@@ -26,7 +26,7 @@ static DEFINE_IDR(memstick_host_idr);
 static DEFINE_SPINLOCK(memstick_host_lock);
 
 static int memstick_dev_match(struct memstick_dev *card,
-			      struct memstick_device_id *id)
+			      const struct memstick_device_id *id)
 {
 	if (id->match_flags & MEMSTICK_MATCH_ALL) {
 		if ((id->type == card->id.type)
@@ -44,7 +44,7 @@ static int memstick_bus_match(struct device *dev, const struct device_driver *dr
 						 dev);
 	const struct memstick_driver *ms_drv = container_of_const(drv, struct memstick_driver,
 								  driver);
-	struct memstick_device_id *ids = ms_drv->id_table;
+	const struct memstick_device_id *ids = ms_drv->id_table;
 
 	if (ids) {
 		while (ids->match_flags) {
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index c0383959dbb2..20a2466bec23 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -2279,7 +2279,7 @@ out:
 
 #endif /* CONFIG_PM */
 
-static struct memstick_device_id msb_id_tbl[] = {
+static const struct memstick_device_id msb_id_tbl[] = {
 	{MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE,
 	 MEMSTICK_CLASS_FLASH},
 
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 49accfdc89d6..13b317c56069 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1349,7 +1349,7 @@ out_unlock:
 
 #endif /* CONFIG_PM */
 
-static struct memstick_device_id mspro_block_id_tbl[] = {
+static const struct memstick_device_id mspro_block_id_tbl[] = {
 	{MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_PRO, MEMSTICK_CATEGORY_STORAGE_DUO,
 	 MEMSTICK_CLASS_DUO},
 	{}
diff --git a/include/linux/memstick.h b/include/linux/memstick.h
index ebf73d4ee969..107bdcbedf79 100644
--- a/include/linux/memstick.h
+++ b/include/linux/memstick.h
@@ -293,7 +293,7 @@ struct memstick_host {
 };
 
 struct memstick_driver {
-	struct memstick_device_id *id_table;
+	const struct memstick_device_id *id_table;
 	int                       (*probe)(struct memstick_dev *card);
 	void                      (*remove)(struct memstick_dev *card);
 	int                       (*suspend)(struct memstick_dev *card,
-- 
cgit v1.2.3


From 7e019dcc470f27066c98697e43d930df8d54bd9c Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Wed, 9 Oct 2024 09:50:07 -0400
Subject: sched: Improve cache locality of RSEQ concurrency IDs for
 intermittent workloads

commit 223baf9d17f25 ("sched: Fix performance regression introduced by mm_cid")
introduced a per-mm/cpu current concurrency id (mm_cid), which keeps
a reference to the concurrency id allocated for each CPU. This reference
expires shortly after a 100ms delay.

These per-CPU references keep the per-mm-cid data cache-local in
situations where threads are running at least once on each CPU within
each 100ms window, thus keeping the per-cpu reference alive.

However, intermittent workloads behaving in bursts spaced by more than
100ms on each CPU exhibit bad cache locality and degraded performance
compared to purely per-cpu data indexing, because concurrency IDs are
allocated over various CPUs and cores, therefore losing cache locality
of the associated data.

Introduce the following changes to improve per-mm-cid cache locality:

- Add a "recent_cid" field to the per-mm/cpu mm_cid structure to keep
  track of which mm_cid value was last used, and use it as a hint to
  attempt re-allocating the same concurrency ID the next time this
  mm/cpu needs to allocate a concurrency ID,

- Add a per-mm CPUs allowed mask, which keeps track of the union of
  CPUs allowed for all threads belonging to this mm. This cpumask is
  only set during the lifetime of the mm, never cleared, so it
  represents the union of all the CPUs allowed since the beginning of
  the mm lifetime (note that the mm_cpumask() is really arch-specific
  and tailored to the TLB flush needs, and is thus _not_ a viable
  approach for this),

- Add a per-mm nr_cpus_allowed to keep track of the weight of the
  per-mm CPUs allowed mask (for fast access),

- Add a per-mm max_nr_cid to keep track of the highest number of
  concurrency IDs allocated for the mm. This is used for expanding the
  concurrency ID allocation within the upper bound defined by:

    min(mm->nr_cpus_allowed, mm->mm_users)

  When the next unused CID value reaches this threshold, stop trying
  to expand the cid allocation and use the first available cid value
  instead.

  Spreading allocation to use all the cid values within the range

    [ 0, min(mm->nr_cpus_allowed, mm->mm_users) - 1 ]

  improves cache locality while preserving mm_cid compactness within the
  expected user limits,

- In __mm_cid_try_get, only return cid values within the range
  [ 0, mm->nr_cpus_allowed ] rather than [ 0, nr_cpu_ids ]. This
  prevents allocating cids above the number of allowed cpus in
  rare scenarios where cid allocation races with a concurrent
  remote-clear of the per-mm/cpu cid. This improvement is made
  possible by the addition of the per-mm CPUs allowed mask,

- In sched_mm_cid_migrate_to, use mm->nr_cpus_allowed rather than
  t->nr_cpus_allowed. This criterion was really meant to compare
  the number of mm->mm_users to the number of CPUs allowed for the
  entire mm. Therefore, the prior comparison worked fine when all
  threads shared the same CPUs allowed mask, but not so much in
  scenarios where those threads have different masks (e.g. each
  thread pinned to a single CPU). This improvement is made
  possible by the addition of the per-mm CPUs allowed mask.

* Benchmarks

Each thread increments 16kB worth of 8-bit integers in bursts, with
a configurable delay between each thread's execution. Each thread run
one after the other (no threads run concurrently). The order of
thread execution in the sequence is random. The thread execution
sequence begins again after all threads have executed. The 16kB areas
are allocated with rseq_mempool and indexed by either cpu_id, mm_cid
(not cache-local), or cache-local mm_cid. Each thread is pinned to its
own core.

Testing configurations:

8-core/1-L3:        Use 8 cores within a single L3
24-core/24-L3:      Use 24 cores, 1 core per L3
192-core/24-L3:     Use 192 cores (all cores in the system)
384-thread/24-L3:   Use 384 HW threads (all HW threads in the system)

Intermittent workload delays between threads: 200ms, 10ms.

Hardware:

CPU(s):                   384
  On-line CPU(s) list:    0-383
Vendor ID:                AuthenticAMD
  Model name:             AMD EPYC 9654 96-Core Processor
    Thread(s) per core:   2
    Core(s) per socket:   96
    Socket(s):            2
Caches (sum of all):
  L1d:                    6 MiB (192 instances)
  L1i:                    6 MiB (192 instances)
  L2:                     192 MiB (192 instances)
  L3:                     768 MiB (24 instances)

Each result is an average of 5 test runs. The cache-local speedup
is calculated as: (cache-local mm_cid) / (mm_cid).

Intermittent workload delay: 200ms

                     per-cpu     mm_cid    cache-local mm_cid    cache-local speedup
                         (ns)      (ns)                  (ns)
8-core/1-L3             1374      19289                  1336            14.4x
24-core/24-L3           2423      26721                  1594            16.7x
192-core/24-L3          2291      15826                  2153             7.3x
384-thread/24-L3        1874      13234                  1907             6.9x

Intermittent workload delay: 10ms

                     per-cpu     mm_cid    cache-local mm_cid    cache-local speedup
                         (ns)      (ns)                  (ns)
8-core/1-L3               662       756                   686             1.1x
24-core/24-L3            1378      3648                  1035             3.5x
192-core/24-L3           1439     10833                  1482             7.3x
384-thread/24-L3         1503     10570                  1556             6.8x

[ This deprecates the prior "sched: NUMA-aware per-memory-map concurrency IDs"
  patch series with a simpler and more general approach. ]

[ This patch applies on top of v6.12-rc1. ]

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Marco Elver <elver@google.com>
Link: https://lore.kernel.org/lkml/20240823185946.418340-1-mathieu.desnoyers@efficios.com/
---
 fs/exec.c                |  2 +-
 include/linux/mm_types.h | 72 ++++++++++++++++++++++++++++++++++++++++++------
 kernel/fork.c            |  2 +-
 kernel/sched/core.c      | 22 +++++++++------
 kernel/sched/sched.h     | 48 ++++++++++++++++++++++----------
 5 files changed, 112 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 6c53920795c2..aaa605529a75 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -990,7 +990,7 @@ static int exec_mmap(struct mm_struct *mm)
 	active_mm = tsk->active_mm;
 	tsk->active_mm = mm;
 	tsk->mm = mm;
-	mm_init_cid(mm);
+	mm_init_cid(mm, tsk);
 	/*
 	 * This prevents preemption while active_mm is being loaded and
 	 * it and mm are being updated, which could cause problems for
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6e3bdf8e38bc..381d22eba088 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -782,6 +782,7 @@ struct vm_area_struct {
 struct mm_cid {
 	u64 time;
 	int cid;
+	int recent_cid;
 };
 #endif
 
@@ -852,6 +853,27 @@ struct mm_struct {
 		 * When the next mm_cid scan is due (in jiffies).
 		 */
 		unsigned long mm_cid_next_scan;
+		/**
+		 * @nr_cpus_allowed: Number of CPUs allowed for mm.
+		 *
+		 * Number of CPUs allowed in the union of all mm's
+		 * threads allowed CPUs.
+		 */
+		unsigned int nr_cpus_allowed;
+		/**
+		 * @max_nr_cid: Maximum number of concurrency IDs allocated.
+		 *
+		 * Track the highest number of concurrency IDs allocated for the
+		 * mm.
+		 */
+		atomic_t max_nr_cid;
+		/**
+		 * @cpus_allowed_lock: Lock protecting mm cpus_allowed.
+		 *
+		 * Provide mutual exclusion for mm cpus_allowed and
+		 * mm nr_cpus_allowed updates.
+		 */
+		raw_spinlock_t cpus_allowed_lock;
 #endif
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* size of all page tables */
@@ -1170,18 +1192,30 @@ static inline int mm_cid_clear_lazy_put(int cid)
 	return cid & ~MM_CID_LAZY_PUT;
 }
 
+/*
+ * mm_cpus_allowed: Union of all mm's threads allowed CPUs.
+ */
+static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm)
+{
+	unsigned long bitmap = (unsigned long)mm;
+
+	bitmap += offsetof(struct mm_struct, cpu_bitmap);
+	/* Skip cpu_bitmap */
+	bitmap += cpumask_size();
+	return (struct cpumask *)bitmap;
+}
+
 /* Accessor for struct mm_struct's cidmask. */
 static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
 {
-	unsigned long cid_bitmap = (unsigned long)mm;
+	unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm);
 
-	cid_bitmap += offsetof(struct mm_struct, cpu_bitmap);
-	/* Skip cpu_bitmap */
+	/* Skip mm_cpus_allowed */
 	cid_bitmap += cpumask_size();
 	return (struct cpumask *)cid_bitmap;
 }
 
-static inline void mm_init_cid(struct mm_struct *mm)
+static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 {
 	int i;
 
@@ -1189,17 +1223,22 @@ static inline void mm_init_cid(struct mm_struct *mm)
 		struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
 
 		pcpu_cid->cid = MM_CID_UNSET;
+		pcpu_cid->recent_cid = MM_CID_UNSET;
 		pcpu_cid->time = 0;
 	}
+	mm->nr_cpus_allowed = p->nr_cpus_allowed;
+	atomic_set(&mm->max_nr_cid, 0);
+	raw_spin_lock_init(&mm->cpus_allowed_lock);
+	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	cpumask_clear(mm_cidmask(mm));
 }
 
-static inline int mm_alloc_cid_noprof(struct mm_struct *mm)
+static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
 {
 	mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
 	if (!mm->pcpu_cid)
 		return -ENOMEM;
-	mm_init_cid(mm);
+	mm_init_cid(mm, p);
 	return 0;
 }
 #define mm_alloc_cid(...)	alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
@@ -1212,16 +1251,31 @@ static inline void mm_destroy_cid(struct mm_struct *mm)
 
 static inline unsigned int mm_cid_size(void)
 {
-	return cpumask_size();
+	return 2 * cpumask_size();	/* mm_cpus_allowed(), mm_cidmask(). */
+}
+
+static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask)
+{
+	struct cpumask *mm_allowed = mm_cpus_allowed(mm);
+
+	if (!mm)
+		return;
+	/* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
+	raw_spin_lock(&mm->cpus_allowed_lock);
+	cpumask_or(mm_allowed, mm_allowed, cpumask);
+	WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
+	raw_spin_unlock(&mm->cpus_allowed_lock);
 }
 #else /* CONFIG_SCHED_MM_CID */
-static inline void mm_init_cid(struct mm_struct *mm) { }
-static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; }
+static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
+static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
 static inline void mm_destroy_cid(struct mm_struct *mm) { }
+
 static inline unsigned int mm_cid_size(void)
 {
 	return 0;
 }
+static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
 #endif /* CONFIG_SCHED_MM_CID */
 
 struct mmu_gather;
diff --git a/kernel/fork.c b/kernel/fork.c
index 89ceb4a68af2..7d950e93f080 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1298,7 +1298,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (init_new_context(p, mm))
 		goto fail_nocontext;
 
-	if (mm_alloc_cid(mm))
+	if (mm_alloc_cid(mm, p))
 		goto fail_cid;
 
 	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7db711ba6d12..f5ec452e2c5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2696,6 +2696,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
 		put_prev_task(rq, p);
 
 	p->sched_class->set_cpus_allowed(p, ctx);
+	mm_set_cpus_allowed(p->mm, ctx->new_mask);
 
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -10243,6 +10244,7 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
 	 */
 	if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
 		return -1;
+	WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
 	return src_cid;
 }
 
@@ -10255,7 +10257,8 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
 {
 	struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
 	struct mm_struct *mm = t->mm;
-	int src_cid, dst_cid, src_cpu;
+	int src_cid, src_cpu;
+	bool dst_cid_is_set;
 	struct rq *src_rq;
 
 	lockdep_assert_rq_held(dst_rq);
@@ -10272,9 +10275,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
 	 * allocation closest to 0 in cases where few threads migrate around
 	 * many CPUs.
 	 *
-	 * If destination cid is already set, we may have to just clear
-	 * the src cid to ensure compactness in frequent migrations
-	 * scenarios.
+	 * If destination cid or recent cid is already set, we may have
+	 * to just clear the src cid to ensure compactness in frequent
+	 * migrations scenarios.
 	 *
 	 * It is not useful to clear the src cid when the number of threads is
 	 * greater or equal to the number of allowed CPUs, because user-space
@@ -10282,9 +10285,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
 	 * allowed CPUs.
 	 */
 	dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
-	dst_cid = READ_ONCE(dst_pcpu_cid->cid);
-	if (!mm_cid_is_unset(dst_cid) &&
-	    atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
+	dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) ||
+			 !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
+	if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
 		return;
 	src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
 	src_rq = cpu_rq(src_cpu);
@@ -10295,13 +10298,14 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
 							    src_cid);
 	if (src_cid == -1)
 		return;
-	if (!mm_cid_is_unset(dst_cid)) {
+	if (dst_cid_is_set) {
 		__mm_cid_put(mm, src_cid);
 		return;
 	}
 	/* Move src_cid to dst cpu. */
 	mm_cid_snapshot_time(dst_rq, mm);
 	WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
+	WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
 }
 
 static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
@@ -10540,7 +10544,7 @@ void sched_mm_cid_after_execve(struct task_struct *t)
 		 * Matches barrier in sched_mm_cid_remote_clear_old().
 		 */
 		smp_mb();
-		t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+		t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
 	}
 	rseq_set_notify_resume(t);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fba524c81c63..20b6e75604ec 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3596,24 +3596,41 @@ static inline void mm_cid_put(struct mm_struct *mm)
 	__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
 }
 
-static inline int __mm_cid_try_get(struct mm_struct *mm)
+static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
 {
-	struct cpumask *cpumask;
-	int cid;
+	struct cpumask *cidmask = mm_cidmask(mm);
+	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+	int cid = __this_cpu_read(pcpu_cid->recent_cid);
 
-	cpumask = mm_cidmask(mm);
+	/* Try to re-use recent cid. This improves cache locality. */
+	if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask))
+		return cid;
+	/*
+	 * Expand cid allocation if the maximum number of concurrency
+	 * IDs allocated (max_nr_cid) is below the number cpus allowed
+	 * and number of threads. Expanding cid allocation as much as
+	 * possible improves cache locality.
+	 */
+	cid = atomic_read(&mm->max_nr_cid);
+	while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
+		if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
+			continue;
+		if (!cpumask_test_and_set_cpu(cid, cidmask))
+			return cid;
+	}
 	/*
+	 * Find the first available concurrency id.
 	 * Retry finding first zero bit if the mask is temporarily
 	 * filled. This only happens during concurrent remote-clear
 	 * which owns a cid without holding a rq lock.
 	 */
 	for (;;) {
-		cid = cpumask_first_zero(cpumask);
-		if (cid < nr_cpu_ids)
+		cid = cpumask_first_zero(cidmask);
+		if (cid < READ_ONCE(mm->nr_cpus_allowed))
 			break;
 		cpu_relax();
 	}
-	if (cpumask_test_and_set_cpu(cid, cpumask))
+	if (cpumask_test_and_set_cpu(cid, cidmask))
 		return -1;
 
 	return cid;
@@ -3631,7 +3648,8 @@ static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
 	WRITE_ONCE(pcpu_cid->time, rq->clock);
 }
 
-static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
+static inline int __mm_cid_get(struct rq *rq, struct task_struct *t,
+			       struct mm_struct *mm)
 {
 	int cid;
 
@@ -3641,13 +3659,13 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
 	 * guarantee forward progress.
 	 */
 	if (!READ_ONCE(use_cid_lock)) {
-		cid = __mm_cid_try_get(mm);
+		cid = __mm_cid_try_get(t, mm);
 		if (cid >= 0)
 			goto end;
 		raw_spin_lock(&cid_lock);
 	} else {
 		raw_spin_lock(&cid_lock);
-		cid = __mm_cid_try_get(mm);
+		cid = __mm_cid_try_get(t, mm);
 		if (cid >= 0)
 			goto unlock;
 	}
@@ -3667,7 +3685,7 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
 	 * all newcoming allocations observe the use_cid_lock flag set.
 	 */
 	do {
-		cid = __mm_cid_try_get(mm);
+		cid = __mm_cid_try_get(t, mm);
 		cpu_relax();
 	} while (cid < 0);
 	/*
@@ -3684,7 +3702,8 @@ end:
 	return cid;
 }
 
-static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
+static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
+			     struct mm_struct *mm)
 {
 	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
 	struct cpumask *cpumask;
@@ -3701,8 +3720,9 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
 		if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
 			__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
 	}
-	cid = __mm_cid_get(rq, mm);
+	cid = __mm_cid_get(rq, t, mm);
 	__this_cpu_write(pcpu_cid->cid, cid);
+	__this_cpu_write(pcpu_cid->recent_cid, cid);
 
 	return cid;
 }
@@ -3755,7 +3775,7 @@ static inline void switch_mm_cid(struct rq *rq,
 		prev->mm_cid = -1;
 	}
 	if (next->mm_cid_active)
-		next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
+		next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
 }
 
 #else /* !CONFIG_SCHED_MM_CID: */
-- 
cgit v1.2.3


From c2f8fde8689272a55b9319b69dfe7e8f0e2e9dfe Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 14 Oct 2024 11:40:56 +0200
Subject: fs: add helper to use mount option as path or fd

Allow filesystems to use a mount option either as a
file or path.

Link: https://lore.kernel.org/r/20241014-work-overlayfs-v3-1-32b3fed1286e@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs_parser.c            | 20 ++++++++++++++++++++
 include/linux/fs_parser.h |  5 ++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 24727ec34e5a..8f583b814e6e 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -308,6 +308,26 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p,
 }
 EXPORT_SYMBOL(fs_param_is_fd);
 
+int fs_param_is_file_or_string(struct p_log *log,
+			       const struct fs_parameter_spec *p,
+			       struct fs_parameter *param,
+			       struct fs_parse_result *result)
+{
+	switch (param->type) {
+	case fs_value_is_string:
+		return fs_param_is_string(log, p, param, result);
+	case fs_value_is_file:
+		result->uint_32 = param->dirfd;
+		if (result->uint_32 <= INT_MAX)
+			return 0;
+		break;
+	default:
+		break;
+	}
+	return fs_param_bad_value(log, param);
+}
+EXPORT_SYMBOL(fs_param_is_file_or_string);
+
 int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p,
 		    struct fs_parameter *param, struct fs_parse_result *result)
 {
diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h
index 6cf713a7e6c6..3cef566088fc 100644
--- a/include/linux/fs_parser.h
+++ b/include/linux/fs_parser.h
@@ -28,7 +28,8 @@ typedef int fs_param_type(struct p_log *,
  */
 fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64,
 	fs_param_is_enum, fs_param_is_string, fs_param_is_blob, fs_param_is_blockdev,
-	fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid;
+	fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid,
+	fs_param_is_file_or_string;
 
 /*
  * Specification of the type of value a parameter wants.
@@ -133,6 +134,8 @@ static inline bool fs_validate_description(const char *name,
 #define fsparam_bdev(NAME, OPT)	__fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL)
 #define fsparam_path(NAME, OPT)	__fsparam(fs_param_is_path, NAME, OPT, 0, NULL)
 #define fsparam_fd(NAME, OPT)	__fsparam(fs_param_is_fd, NAME, OPT, 0, NULL)
+#define fsparam_file_or_string(NAME, OPT) \
+				__fsparam(fs_param_is_file_or_string, NAME, OPT, 0, NULL)
 #define fsparam_uid(NAME, OPT) __fsparam(fs_param_is_uid, NAME, OPT, 0, NULL)
 #define fsparam_gid(NAME, OPT) __fsparam(fs_param_is_gid, NAME, OPT, 0, NULL)
 
-- 
cgit v1.2.3


From 6e3ea06240adfef7b46e2338dd824541c31de06d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 7 Oct 2024 18:03:23 +0300
Subject: dmaengine: acpi: Drop unused devm_acpi_dma_controller_free()

After introduction a few years ago the devm_acpi_dma_controller_free()
was never used. Drop it.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20241007150436.2183575-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 Documentation/driver-api/driver-model/devres.rst |  1 -
 drivers/dma/acpi-dma.c                           | 15 ---------------
 include/linux/acpi_dma.h                         |  4 ----
 3 files changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 5f2ee8d717b1..f23dbe5d6606 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -459,7 +459,6 @@ SERDEV
 
 SLAVE DMA ENGINE
   devm_acpi_dma_controller_register()
-  devm_acpi_dma_controller_free()
 
 SPI
   devm_spi_alloc_master()
diff --git a/drivers/dma/acpi-dma.c b/drivers/dma/acpi-dma.c
index a58a1600dd65..d5beb96ef510 100644
--- a/drivers/dma/acpi-dma.c
+++ b/drivers/dma/acpi-dma.c
@@ -276,21 +276,6 @@ int devm_acpi_dma_controller_register(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_acpi_dma_controller_register);
 
-/**
- * devm_acpi_dma_controller_free - resource managed acpi_dma_controller_free()
- * @dev:	device that is unregistering as DMA controller
- *
- * Unregister a DMA controller registered with
- * devm_acpi_dma_controller_register(). Normally this function will not need to
- * be called and the resource management code will ensure that the resource is
- * freed.
- */
-void devm_acpi_dma_controller_free(struct device *dev)
-{
-	WARN_ON(devres_release(dev, devm_acpi_dma_release, NULL, NULL));
-}
-EXPORT_SYMBOL_GPL(devm_acpi_dma_controller_free);
-
 /**
  * acpi_dma_update_dma_spec - prepare dma specifier to pass to translation function
  * @adma:	struct acpi_dma of DMA controller
diff --git a/include/linux/acpi_dma.h b/include/linux/acpi_dma.h
index 72cedb916a9c..3ef1ec7a04cb 100644
--- a/include/linux/acpi_dma.h
+++ b/include/linux/acpi_dma.h
@@ -65,7 +65,6 @@ int devm_acpi_dma_controller_register(struct device *dev,
 		struct dma_chan *(*acpi_dma_xlate)
 		(struct acpi_dma_spec *, struct acpi_dma *),
 		void *data);
-void devm_acpi_dma_controller_free(struct device *dev);
 
 struct dma_chan *acpi_dma_request_slave_chan_by_index(struct device *dev,
 						      size_t index);
@@ -94,9 +93,6 @@ static inline int devm_acpi_dma_controller_register(struct device *dev,
 {
 	return -ENODEV;
 }
-static inline void devm_acpi_dma_controller_free(struct device *dev)
-{
-}
 
 static inline struct dma_chan *acpi_dma_request_slave_chan_by_index(
 		struct device *dev, size_t index)
-- 
cgit v1.2.3


From 662f045332addc961940e48eb920caa954abbf09 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 7 Oct 2024 18:03:25 +0300
Subject: dmaengine: acpi: Clean up headers

There is a few things done:
- include only the headers we are direct user of
- when pointer is in use, provide a forward declaration
- add missing headers
- sort alphabetically

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20241007150436.2183575-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/acpi-dma.c   | 13 ++++++++-----
 include/linux/acpi_dma.h |  5 +++--
 2 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/acpi-dma.c b/drivers/dma/acpi-dma.c
index f594ea265c76..2abbe11e797e 100644
--- a/drivers/dma/acpi-dma.c
+++ b/drivers/dma/acpi-dma.c
@@ -9,18 +9,21 @@
  *	    Mika Westerberg <mika.westerberg@linux.intel.com>
  */
 
+#include <linux/acpi.h>
+#include <linux/acpi_dma.h>
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
 #include <linux/err.h>
-#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/ioport.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
-#include <linux/slab.h>
-#include <linux/ioport.h>
-#include <linux/acpi.h>
-#include <linux/acpi_dma.h>
 #include <linux/property.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/types.h>
 
 static LIST_HEAD(acpi_dma_list);
 static DEFINE_MUTEX(acpi_dma_lock);
diff --git a/include/linux/acpi_dma.h b/include/linux/acpi_dma.h
index 3ef1ec7a04cb..e748b2877602 100644
--- a/include/linux/acpi_dma.h
+++ b/include/linux/acpi_dma.h
@@ -11,10 +11,11 @@
 #ifndef __LINUX_ACPI_DMA_H
 #define __LINUX_ACPI_DMA_H
 
-#include <linux/list.h>
-#include <linux/device.h>
 #include <linux/err.h>
 #include <linux/dmaengine.h>
+#include <linux/types.h>
+
+struct device;
 
 /**
  * struct acpi_dma_spec - slave device DMA resources
-- 
cgit v1.2.3


From 6ba55951e70bf7a8d89c03aa5612d2e0c81ded6d Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Thu, 10 Oct 2024 11:27:15 -0500
Subject: logic_pio: Constify fwnode_handle

The fwnode_handle passed into find_io_range_by_fwnode() and
logic_pio_trans_hwaddr() are not modified, so make them const.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20241010-dt-const-v1-2-87a51f558425@kernel.org
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 include/linux/logic_pio.h | 6 +++---
 lib/logic_pio.c           | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/logic_pio.h b/include/linux/logic_pio.h
index babf4e3c28ba..8f1a9408302f 100644
--- a/include/linux/logic_pio.h
+++ b/include/linux/logic_pio.h
@@ -17,7 +17,7 @@ enum {
 
 struct logic_pio_hwaddr {
 	struct list_head list;
-	struct fwnode_handle *fwnode;
+	const struct fwnode_handle *fwnode;
 	resource_size_t hw_start;
 	resource_size_t io_start;
 	resource_size_t size; /* range size populated */
@@ -110,8 +110,8 @@ void logic_outsl(unsigned long addr, const void *buffer, unsigned int count);
 #endif /* CONFIG_INDIRECT_PIO */
 #define MMIO_UPPER_LIMIT (IO_SPACE_LIMIT - PIO_INDIRECT_SIZE)
 
-struct logic_pio_hwaddr *find_io_range_by_fwnode(struct fwnode_handle *fwnode);
-unsigned long logic_pio_trans_hwaddr(struct fwnode_handle *fwnode,
+struct logic_pio_hwaddr *find_io_range_by_fwnode(const struct fwnode_handle *fwnode);
+unsigned long logic_pio_trans_hwaddr(const struct fwnode_handle *fwnode,
 			resource_size_t hw_addr, resource_size_t size);
 int logic_pio_register_range(struct logic_pio_hwaddr *newrange);
 void logic_pio_unregister_range(struct logic_pio_hwaddr *range);
diff --git a/lib/logic_pio.c b/lib/logic_pio.c
index 2ea564a40064..e29496a38d06 100644
--- a/lib/logic_pio.c
+++ b/lib/logic_pio.c
@@ -122,7 +122,7 @@ void logic_pio_unregister_range(struct logic_pio_hwaddr *range)
  *
  * Traverse the io_range_list to find the registered node for @fwnode.
  */
-struct logic_pio_hwaddr *find_io_range_by_fwnode(struct fwnode_handle *fwnode)
+struct logic_pio_hwaddr *find_io_range_by_fwnode(const struct fwnode_handle *fwnode)
 {
 	struct logic_pio_hwaddr *range, *found_range = NULL;
 
@@ -186,7 +186,7 @@ resource_size_t logic_pio_to_hwaddr(unsigned long pio)
  *
  * Returns Logical PIO value if successful, ~0UL otherwise
  */
-unsigned long logic_pio_trans_hwaddr(struct fwnode_handle *fwnode,
+unsigned long logic_pio_trans_hwaddr(const struct fwnode_handle *fwnode,
 				     resource_size_t addr, resource_size_t size)
 {
 	struct logic_pio_hwaddr *range;
-- 
cgit v1.2.3


From 78e2baf3d96edd21c6f26d8afc0e68d02ec2c51c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Oct 2024 17:48:13 +0000
Subject: net: add TIME_WAIT logic to sk_to_full_sk()

TCP will soon attach TIME_WAIT sockets to some ACK and RST.

Make sure sk_to_full_sk() detects this and does not return
a non full socket.

v3: also changed sk_const_to_full_sk()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Brian Vazquez <brianvv@google.com>
Link: https://patch.msgid.link/20241010174817.1543642-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/bpf-cgroup.h | 2 +-
 include/net/inet_sock.h    | 8 ++++++--
 net/core/filter.c          | 6 +-----
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index ce91d9b2acb9..f0f219271daf 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) {		       \
 		typeof(sk) __sk = sk_to_full_sk(sk);			       \
-		if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) &&	       \
+		if (__sk && __sk == skb_to_full_sk(skb) &&	       \
 		    cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS))	       \
 			__ret = __cgroup_bpf_run_filter_skb(__sk, skb,	       \
 						      CGROUP_INET_EGRESS); \
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index f01dd273bea6..56d8bc5593d3 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -321,8 +321,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
 static inline struct sock *sk_to_full_sk(struct sock *sk)
 {
 #ifdef CONFIG_INET
-	if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
+	if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
 		sk = inet_reqsk(sk)->rsk_listener;
+	if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
+		sk = NULL;
 #endif
 	return sk;
 }
@@ -331,8 +333,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk)
 static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
 {
 #ifdef CONFIG_INET
-	if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
+	if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
 		sk = ((const struct request_sock *)sk)->rsk_listener;
+	if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
+		sk = NULL;
 #endif
 	return sk;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index bd0d08bf76bb..202c1d386e19 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6778,8 +6778,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
 		/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
 		 * sock refcnt is decremented to prevent a request_sock leak.
 		 */
-		if (!sk_fullsock(sk2))
-			sk2 = NULL;
 		if (sk2 != sk) {
 			sock_gen_put(sk);
 			/* Ensure there is no need to bump sk2 refcnt */
@@ -6826,8 +6824,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
 		/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
 		 * sock refcnt is decremented to prevent a request_sock leak.
 		 */
-		if (!sk_fullsock(sk2))
-			sk2 = NULL;
 		if (sk2 != sk) {
 			sock_gen_put(sk);
 			/* Ensure there is no need to bump sk2 refcnt */
@@ -7276,7 +7272,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
 {
 	sk = sk_to_full_sk(sk);
 
-	if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
+	if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
 		return (unsigned long)sk;
 
 	return (unsigned long)NULL;
-- 
cgit v1.2.3


From f15e3b3ddb9fab1c1731b6154e2cd6573fb54c4d Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Fri, 11 Oct 2024 18:44:56 +0000
Subject: net: napi: Make napi_defer_hard_irqs per-NAPI

Add defer_hard_irqs to napi_struct in preparation for per-NAPI
settings.

The existing sysfs parameter is respected; writes to sysfs will write to
all NAPI structs for the device and the net_device defer_hard_irq field.
Reads from sysfs show the net_device field.

The ability to set defer_hard_irqs on specific NAPI instances will be
added in a later commit, via netdev-genl.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20241011184527.16393-2-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../networking/net_cachelines/net_device.rst       |  1 +
 include/linux/netdevice.h                          |  3 +-
 net/core/dev.c                                     | 10 +++---
 net/core/dev.h                                     | 36 ++++++++++++++++++++++
 net/core/net-sysfs.c                               |  2 +-
 5 files changed, 45 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 1b018ac35e9a..5a7388b2ab6f 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -186,4 +186,5 @@ struct dpll_pin*                    dpll_pin
 struct hlist_head                   page_pools
 struct dim_irq_moder*               irq_moder
 u64                                 max_pacing_offload_horizon
+u32                                 napi_defer_hard_irqs
 =================================== =========================== =================== =================== ===================================================================================
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e6b93d01e631..2e7bc23660ec 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -373,6 +373,7 @@ struct napi_struct {
 	unsigned int		napi_id;
 	struct hrtimer		timer;
 	struct task_struct	*thread;
+	u32			defer_hard_irqs;
 	/* control-path-only fields follow */
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
@@ -2085,7 +2086,6 @@ struct net_device {
 	unsigned int		real_num_rx_queues;
 	struct netdev_rx_queue	*_rx;
 	unsigned long		gro_flush_timeout;
-	u32			napi_defer_hard_irqs;
 	unsigned int		gro_max_size;
 	unsigned int		gro_ipv4_max_size;
 	rx_handler_func_t __rcu	*rx_handler;
@@ -2413,6 +2413,7 @@ struct net_device {
 	struct dim_irq_moder	*irq_moder;
 
 	u64			max_pacing_offload_horizon;
+	u32			napi_defer_hard_irqs;
 
 	/**
 	 * @lock: protects @net_shaper_hierarchy, feel free to use for other
diff --git a/net/core/dev.c b/net/core/dev.c
index b590eefce3b4..fbaa9eabf77f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6233,7 +6233,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 	if (work_done) {
 		if (n->gro_bitmask)
 			timeout = READ_ONCE(n->dev->gro_flush_timeout);
-		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
+		n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
 	}
 	if (n->defer_hard_irqs_count > 0) {
 		n->defer_hard_irqs_count--;
@@ -6371,7 +6371,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 
 	if (flags & NAPI_F_PREFER_BUSY_POLL) {
-		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
+		napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
 		timeout = READ_ONCE(napi->dev->gro_flush_timeout);
 		if (napi->defer_hard_irqs_count && timeout) {
 			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
@@ -6653,6 +6653,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 	INIT_HLIST_NODE(&napi->napi_hash_node);
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
+	napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
 	init_gro_hash(napi);
 	napi->skb = NULL;
 	INIT_LIST_HEAD(&napi->rx_list);
@@ -11059,7 +11060,7 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
 
 	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
 		dev->gro_flush_timeout = 20000;
-		dev->napi_defer_hard_irqs = 1;
+		netdev_set_defer_hard_irqs(dev, 1);
 	}
 }
 EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
@@ -12003,7 +12004,6 @@ static void __init net_dev_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
-	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
@@ -12015,7 +12015,7 @@ static void __init net_dev_struct_check(void)
 #ifdef CONFIG_NET_XGRESS
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
 #endif
-	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 100);
 }
 
 /*
diff --git a/net/core/dev.h b/net/core/dev.h
index d3ea92949ff3..0716b1048261 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -148,6 +148,42 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
 	WRITE_ONCE(dev->gro_ipv4_max_size, size);
 }
 
+/**
+ * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs
+ * @n: napi struct to get the defer_hard_irqs field from
+ *
+ * Return: the per-NAPI value of the defar_hard_irqs field.
+ */
+static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n)
+{
+	return READ_ONCE(n->defer_hard_irqs);
+}
+
+/**
+ * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi
+ * @n: napi_struct to set the defer_hard_irqs field
+ * @defer: the value the field should be set to
+ */
+static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
+{
+	WRITE_ONCE(n->defer_hard_irqs, defer);
+}
+
+/**
+ * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev
+ * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set
+ * @defer: the defer_hard_irqs value to set
+ */
+static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
+					      u32 defer)
+{
+	struct napi_struct *napi;
+
+	WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
+	list_for_each_entry(napi, &netdev->napi_list, dev_list)
+		napi_set_defer_hard_irqs(napi, defer);
+}
+
 int rps_cpumask_housekeeping(struct cpumask *mask);
 
 #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 05cf5347f25e..25125f356a15 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -429,7 +429,7 @@ static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val
 	if (val > S32_MAX)
 		return -ERANGE;
 
-	WRITE_ONCE(dev->napi_defer_hard_irqs, val);
+	netdev_set_defer_hard_irqs(dev, (u32)val);
 	return 0;
 }
 
-- 
cgit v1.2.3


From acb8d4ed5661d05f794ef2ce34fd11e699e9ca32 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Fri, 11 Oct 2024 18:44:58 +0000
Subject: net: napi: Make gro_flush_timeout per-NAPI

Allow per-NAPI gro_flush_timeout setting.

The existing sysfs parameter is respected; writes to sysfs will write to
all NAPI structs for the device and the net_device gro_flush_timeout
field. Reads from sysfs will read from the net_device field.

The ability to set gro_flush_timeout on specific NAPI instances will be
added in a later commit, via netdev-genl.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20241011184527.16393-4-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../networking/net_cachelines/net_device.rst       |  1 +
 include/linux/netdevice.h                          |  3 +-
 net/core/dev.c                                     | 12 +++----
 net/core/dev.h                                     | 40 ++++++++++++++++++++++
 net/core/net-sysfs.c                               |  2 +-
 5 files changed, 50 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 5a7388b2ab6f..67910ea49160 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -186,5 +186,6 @@ struct dpll_pin*                    dpll_pin
 struct hlist_head                   page_pools
 struct dim_irq_moder*               irq_moder
 u64                                 max_pacing_offload_horizon
+unsigned_long                       gro_flush_timeout
 u32                                 napi_defer_hard_irqs
 =================================== =========================== =================== =================== ===================================================================================
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7bc23660ec..93241d4de437 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -373,6 +373,7 @@ struct napi_struct {
 	unsigned int		napi_id;
 	struct hrtimer		timer;
 	struct task_struct	*thread;
+	unsigned long		gro_flush_timeout;
 	u32			defer_hard_irqs;
 	/* control-path-only fields follow */
 	struct list_head	dev_list;
@@ -2085,7 +2086,6 @@ struct net_device {
 	int			ifindex;
 	unsigned int		real_num_rx_queues;
 	struct netdev_rx_queue	*_rx;
-	unsigned long		gro_flush_timeout;
 	unsigned int		gro_max_size;
 	unsigned int		gro_ipv4_max_size;
 	rx_handler_func_t __rcu	*rx_handler;
@@ -2413,6 +2413,7 @@ struct net_device {
 	struct dim_irq_moder	*irq_moder;
 
 	u64			max_pacing_offload_horizon;
+	unsigned long		gro_flush_timeout;
 	u32			napi_defer_hard_irqs;
 
 	/**
diff --git a/net/core/dev.c b/net/core/dev.c
index fbaa9eabf77f..e21ace3551d5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6232,12 +6232,12 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 
 	if (work_done) {
 		if (n->gro_bitmask)
-			timeout = READ_ONCE(n->dev->gro_flush_timeout);
+			timeout = napi_get_gro_flush_timeout(n);
 		n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
 	}
 	if (n->defer_hard_irqs_count > 0) {
 		n->defer_hard_irqs_count--;
-		timeout = READ_ONCE(n->dev->gro_flush_timeout);
+		timeout = napi_get_gro_flush_timeout(n);
 		if (timeout)
 			ret = false;
 	}
@@ -6372,7 +6372,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 
 	if (flags & NAPI_F_PREFER_BUSY_POLL) {
 		napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
-		timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+		timeout = napi_get_gro_flush_timeout(napi);
 		if (napi->defer_hard_irqs_count && timeout) {
 			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
 			skip_schedule = true;
@@ -6654,6 +6654,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
 	napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+	napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
 	init_gro_hash(napi);
 	napi->skb = NULL;
 	INIT_LIST_HEAD(&napi->rx_list);
@@ -11059,7 +11060,7 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
 	WARN_ON(dev->reg_state == NETREG_REGISTERED);
 
 	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
-		dev->gro_flush_timeout = 20000;
+		netdev_set_gro_flush_timeout(dev, 20000);
 		netdev_set_defer_hard_irqs(dev, 1);
 	}
 }
@@ -12003,7 +12004,6 @@ static void __init net_dev_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
-	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
@@ -12015,7 +12015,7 @@ static void __init net_dev_struct_check(void)
 #ifdef CONFIG_NET_XGRESS
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
 #endif
-	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 100);
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
 }
 
 /*
diff --git a/net/core/dev.h b/net/core/dev.h
index 0716b1048261..7d0aab7e3ef1 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -184,6 +184,46 @@ static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
 		napi_set_defer_hard_irqs(napi, defer);
 }
 
+/**
+ * napi_get_gro_flush_timeout - get the gro_flush_timeout
+ * @n: napi struct to get the gro_flush_timeout from
+ *
+ * Return: the per-NAPI value of the gro_flush_timeout field.
+ */
+static inline unsigned long
+napi_get_gro_flush_timeout(const struct napi_struct *n)
+{
+	return READ_ONCE(n->gro_flush_timeout);
+}
+
+/**
+ * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi
+ * @n: napi struct to set the gro_flush_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout
+ */
+static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
+					      unsigned long timeout)
+{
+	WRITE_ONCE(n->gro_flush_timeout, timeout);
+}
+
+/**
+ * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs
+ * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set
+ * @timeout: the timeout value to set
+ */
+static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
+						unsigned long timeout)
+{
+	struct napi_struct *napi;
+
+	WRITE_ONCE(netdev->gro_flush_timeout, timeout);
+	list_for_each_entry(napi, &netdev->napi_list, dev_list)
+		napi_set_gro_flush_timeout(napi, timeout);
+}
+
 int rps_cpumask_housekeeping(struct cpumask *mask);
 
 #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 25125f356a15..2d9afc6e2161 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -409,7 +409,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
 
 static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
 {
-	WRITE_ONCE(dev->gro_flush_timeout, val);
+	netdev_set_gro_flush_timeout(dev, val);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 86e25f40aa1e9e54e081e55016f65b5c92523989 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Fri, 11 Oct 2024 18:45:00 +0000
Subject: net: napi: Add napi_config

Add a persistent NAPI config area for NAPI configuration to the core.
Drivers opt-in to setting the persistent config for a NAPI by passing an
index when calling netif_napi_add_config.

napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
(after the NAPIs are deleted).

Drivers which call netif_napi_add_config will have persistent per-NAPI
settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.

Per-NAPI settings are saved in napi_disable and restored in napi_enable.

Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20241011184527.16393-6-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../networking/net_cachelines/net_device.rst       |  1 +
 include/linux/netdevice.h                          | 36 +++++++++-
 net/core/dev.c                                     | 80 ++++++++++++++++++++--
 net/core/dev.h                                     | 12 ++++
 4 files changed, 119 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 67910ea49160..db6192b2bb50 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -186,6 +186,7 @@ struct dpll_pin*                    dpll_pin
 struct hlist_head                   page_pools
 struct dim_irq_moder*               irq_moder
 u64                                 max_pacing_offload_horizon
+struct_napi_config*                 napi_config
 unsigned_long                       gro_flush_timeout
 u32                                 napi_defer_hard_irqs
 =================================== =========================== =================== =================== ===================================================================================
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 93241d4de437..8feaca12655e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -342,6 +342,15 @@ struct gro_list {
  */
 #define GRO_HASH_BUCKETS	8
 
+/*
+ * Structure for per-NAPI config
+ */
+struct napi_config {
+	u64 gro_flush_timeout;
+	u32 defer_hard_irqs;
+	unsigned int napi_id;
+};
+
 /*
  * Structure for NAPI scheduling similar to tasklet but with weighting
  */
@@ -379,6 +388,8 @@ struct napi_struct {
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	int			irq;
+	int			index;
+	struct napi_config	*config;
 };
 
 enum {
@@ -1868,9 +1879,6 @@ enum netdev_reg_state {
  *				allocated at register_netdev() time
  *	@real_num_rx_queues: 	Number of RX queues currently active in device
  *	@xdp_prog:		XDP sockets filter program pointer
- *	@gro_flush_timeout:	timeout for GRO layer in NAPI
- *	@napi_defer_hard_irqs:	If not zero, provides a counter that would
- *				allow to avoid NIC hard IRQ, on busy queues.
  *
  *	@rx_handler:		handler for received packets
  *	@rx_handler_data: 	XXX: need comments on this one
@@ -2020,6 +2028,11 @@ enum netdev_reg_state {
  *		   where the clock is recovered.
  *
  *	@max_pacing_offload_horizon: max EDT offload horizon in nsec.
+ *	@napi_config: An array of napi_config structures containing per-NAPI
+ *		      settings.
+ *	@gro_flush_timeout:	timeout for GRO layer in NAPI
+ *	@napi_defer_hard_irqs:	If not zero, provides a counter that would
+ *				allow to avoid NIC hard IRQ, on busy queues.
  *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
@@ -2413,6 +2426,7 @@ struct net_device {
 	struct dim_irq_moder	*irq_moder;
 
 	u64			max_pacing_offload_horizon;
+	struct napi_config	*napi_config;
 	unsigned long		gro_flush_timeout;
 	u32			napi_defer_hard_irqs;
 
@@ -2678,6 +2692,22 @@ netif_napi_add_tx_weight(struct net_device *dev,
 	netif_napi_add_weight(dev, napi, poll, weight);
 }
 
+/**
+ * netif_napi_add_config - initialize a NAPI context with persistent config
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @index: the NAPI index
+ */
+static inline void
+netif_napi_add_config(struct net_device *dev, struct napi_struct *napi,
+		      int (*poll)(struct napi_struct *, int), int index)
+{
+	napi->index = index;
+	napi->config = &dev->napi_config[index];
+	netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
+}
+
 /**
  * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
  * @dev:  network device
diff --git a/net/core/dev.c b/net/core/dev.c
index e21ace3551d5..c682173a7642 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6505,6 +6505,23 @@ EXPORT_SYMBOL(napi_busy_loop);
 
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
+static void __napi_hash_add_with_id(struct napi_struct *napi,
+				    unsigned int napi_id)
+{
+	napi->napi_id = napi_id;
+	hlist_add_head_rcu(&napi->napi_hash_node,
+			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+}
+
+static void napi_hash_add_with_id(struct napi_struct *napi,
+				  unsigned int napi_id)
+{
+	spin_lock(&napi_hash_lock);
+	WARN_ON_ONCE(napi_by_id(napi_id));
+	__napi_hash_add_with_id(napi, napi_id);
+	spin_unlock(&napi_hash_lock);
+}
+
 static void napi_hash_add(struct napi_struct *napi)
 {
 	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
@@ -6517,10 +6534,8 @@ static void napi_hash_add(struct napi_struct *napi)
 		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 			napi_gen_id = MIN_NAPI_ID;
 	} while (napi_by_id(napi_gen_id));
-	napi->napi_id = napi_gen_id;
 
-	hlist_add_head_rcu(&napi->napi_hash_node,
-			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+	__napi_hash_add_with_id(napi, napi_gen_id);
 
 	spin_unlock(&napi_hash_lock);
 }
@@ -6643,6 +6658,28 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
 }
 EXPORT_SYMBOL(netif_queue_set_napi);
 
+static void napi_restore_config(struct napi_struct *n)
+{
+	n->defer_hard_irqs = n->config->defer_hard_irqs;
+	n->gro_flush_timeout = n->config->gro_flush_timeout;
+	/* a NAPI ID might be stored in the config, if so use it. if not, use
+	 * napi_hash_add to generate one for us. It will be saved to the config
+	 * in napi_disable.
+	 */
+	if (n->config->napi_id)
+		napi_hash_add_with_id(n, n->config->napi_id);
+	else
+		napi_hash_add(n);
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+	n->config->defer_hard_irqs = n->defer_hard_irqs;
+	n->config->gro_flush_timeout = n->gro_flush_timeout;
+	n->config->napi_id = n->napi_id;
+	napi_hash_del(n);
+}
+
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -6653,8 +6690,6 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 	INIT_HLIST_NODE(&napi->napi_hash_node);
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
-	napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
-	napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
 	init_gro_hash(napi);
 	napi->skb = NULL;
 	INIT_LIST_HEAD(&napi->rx_list);
@@ -6672,7 +6707,13 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
-	napi_hash_add(napi);
+
+	/* default settings from sysfs are applied to all NAPIs. any per-NAPI
+	 * configuration will be loaded in napi_enable
+	 */
+	napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+	napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+
 	napi_get_frags_check(napi);
 	/* Create kthread for this napi if dev->threaded is set.
 	 * Clear dev->threaded if kthread creation failed so that
@@ -6704,6 +6745,11 @@ void napi_disable(struct napi_struct *n)
 
 	hrtimer_cancel(&n->timer);
 
+	if (n->config)
+		napi_save_config(n);
+	else
+		napi_hash_del(n);
+
 	clear_bit(NAPI_STATE_DISABLE, &n->state);
 }
 EXPORT_SYMBOL(napi_disable);
@@ -6719,6 +6765,11 @@ void napi_enable(struct napi_struct *n)
 {
 	unsigned long new, val = READ_ONCE(n->state);
 
+	if (n->config)
+		napi_restore_config(n);
+	else
+		napi_hash_add(n);
+
 	do {
 		BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
 
@@ -6748,7 +6799,11 @@ void __netif_napi_del(struct napi_struct *napi)
 	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
 		return;
 
-	napi_hash_del(napi);
+	if (napi->config) {
+		napi->index = -1;
+		napi->config = NULL;
+	}
+
 	list_del_rcu(&napi->dev_list);
 	napi_free_frags(napi);
 
@@ -11085,6 +11140,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		unsigned int txqs, unsigned int rxqs)
 {
 	struct net_device *dev;
+	size_t napi_config_sz;
+	unsigned int maxqs;
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
 
@@ -11098,6 +11155,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
+	maxqs = max(txqs, rxqs);
+
 	dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
 		       GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
 	if (!dev)
@@ -11174,6 +11233,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	if (!dev->ethtool)
 		goto free_all;
 
+	napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+	dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+	if (!dev->napi_config)
+		goto free_all;
+
 	strscpy(dev->name, name);
 	dev->name_assign_type = name_assign_type;
 	dev->group = INIT_NETDEV_GROUP;
@@ -11237,6 +11301,8 @@ void free_netdev(struct net_device *dev)
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
+	kvfree(dev->napi_config);
+
 	ref_tracker_dir_exit(&dev->refcnt_tracker);
 #ifdef CONFIG_PCPU_DEV_REFCNT
 	free_percpu(dev->pcpu_refcnt);
diff --git a/net/core/dev.h b/net/core/dev.h
index 7d0aab7e3ef1..7881bced70a9 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -177,11 +177,17 @@ static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
 static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
 					      u32 defer)
 {
+	unsigned int count = max(netdev->num_rx_queues,
+				 netdev->num_tx_queues);
 	struct napi_struct *napi;
+	int i;
 
 	WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
 	list_for_each_entry(napi, &netdev->napi_list, dev_list)
 		napi_set_defer_hard_irqs(napi, defer);
+
+	for (i = 0; i < count; i++)
+		netdev->napi_config[i].defer_hard_irqs = defer;
 }
 
 /**
@@ -217,11 +223,17 @@ static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
 static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
 						unsigned long timeout)
 {
+	unsigned int count = max(netdev->num_rx_queues,
+				 netdev->num_tx_queues);
 	struct napi_struct *napi;
+	int i;
 
 	WRITE_ONCE(netdev->gro_flush_timeout, timeout);
 	list_for_each_entry(napi, &netdev->napi_list, dev_list)
 		napi_set_gro_flush_timeout(napi, timeout);
+
+	for (i = 0; i < count; i++)
+		netdev->napi_config[i].gro_flush_timeout = timeout;
 }
 
 int rps_cpumask_housekeeping(struct cpumask *mask);
-- 
cgit v1.2.3


From 4971266e1595f76be3f844c834c1f9357a97dbde Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 10 Oct 2024 16:25:03 -0700
Subject: bpf: Add kmem_cache iterator

The new "kmem_cache" iterator will traverse the list of slab caches
and call attached BPF programs for each entry.  It should check the
argument (ctx.s) if it's NULL before using it.

Now the iteration grabs the slab_mutex only if it traverse the list and
releases the mutex when it runs the BPF program.  The kmem_cache entry
is protected by a refcount during the execution.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz> #slab
Link: https://lore.kernel.org/r/20241010232505.1339892-2-namhyung@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf_ids.h      |   1 +
 kernel/bpf/Makefile          |   1 +
 kernel/bpf/kmem_cache_iter.c | 175 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 177 insertions(+)
 create mode 100644 kernel/bpf/kmem_cache_iter.c

(limited to 'include/linux')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index c0e3e1426a82..139bdececdcf 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -283,5 +283,6 @@ extern u32 btf_tracing_ids[];
 extern u32 bpf_cgroup_btf_id[];
 extern u32 bpf_local_storage_map_btf_id[];
 extern u32 btf_bpf_map_id[];
+extern u32 bpf_kmem_cache_btf_id[];
 
 #endif
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 9b9c151b5c82..105328f0b9c0 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -52,3 +52,4 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/
 obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
 obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
+obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c
new file mode 100644
index 000000000000..ebc101d7da51
--- /dev/null
+++ b/kernel/bpf/kmem_cache_iter.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */
+
+struct bpf_iter__kmem_cache {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct kmem_cache *, s);
+};
+
+static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	loff_t cnt = 0;
+	bool found = false;
+	struct kmem_cache *s;
+
+	mutex_lock(&slab_mutex);
+
+	/* Find an entry at the given position in the slab_caches list instead
+	 * of keeping a reference (of the last visited entry, if any) out of
+	 * slab_mutex. It might miss something if one is deleted in the middle
+	 * while it releases the lock.  But it should be rare and there's not
+	 * much we can do about it.
+	 */
+	list_for_each_entry(s, &slab_caches, list) {
+		if (cnt == *pos) {
+			/* Make sure this entry remains in the list by getting
+			 * a new reference count.  Note that boot_cache entries
+			 * have a negative refcount, so don't touch them.
+			 */
+			if (s->refcount > 0)
+				s->refcount++;
+			found = true;
+			break;
+		}
+		cnt++;
+	}
+	mutex_unlock(&slab_mutex);
+
+	if (!found)
+		return NULL;
+
+	return s;
+}
+
+static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__kmem_cache ctx = {
+		.meta = &meta,
+		.s = v,
+	};
+	struct bpf_prog *prog;
+	bool destroy = false;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, true);
+	if (prog && !ctx.s)
+		bpf_iter_run_prog(prog, &ctx);
+
+	if (ctx.s == NULL)
+		return;
+
+	mutex_lock(&slab_mutex);
+
+	/* Skip kmem_cache_destroy() for active entries */
+	if (ctx.s->refcount > 1)
+		ctx.s->refcount--;
+	else if (ctx.s->refcount == 1)
+		destroy = true;
+
+	mutex_unlock(&slab_mutex);
+
+	if (destroy)
+		kmem_cache_destroy(ctx.s);
+}
+
+static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct kmem_cache *s = v;
+	struct kmem_cache *next = NULL;
+	bool destroy = false;
+
+	++*pos;
+
+	mutex_lock(&slab_mutex);
+
+	if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) {
+		next = list_next_entry(s, list);
+
+		WARN_ON_ONCE(next->refcount == 0);
+
+		/* boot_caches have negative refcount, don't touch them */
+		if (next->refcount > 0)
+			next->refcount++;
+	}
+
+	/* Skip kmem_cache_destroy() for active entries */
+	if (s->refcount > 1)
+		s->refcount--;
+	else if (s->refcount == 1)
+		destroy = true;
+
+	mutex_unlock(&slab_mutex);
+
+	if (destroy)
+		kmem_cache_destroy(s);
+
+	return next;
+}
+
+static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__kmem_cache ctx = {
+		.meta = &meta,
+		.s = v,
+	};
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, false);
+	if (prog)
+		ret = bpf_iter_run_prog(prog, &ctx);
+
+	return ret;
+}
+
+static const struct seq_operations kmem_cache_iter_seq_ops = {
+	.start  = kmem_cache_iter_seq_start,
+	.next   = kmem_cache_iter_seq_next,
+	.stop   = kmem_cache_iter_seq_stop,
+	.show   = kmem_cache_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache)
+
+static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = {
+	.seq_ops		= &kmem_cache_iter_seq_ops,
+};
+
+static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux,
+					    struct seq_file *seq)
+{
+	seq_puts(seq, "kmem_cache iter\n");
+}
+
+DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta,
+		     struct kmem_cache *s)
+
+static struct bpf_iter_reg bpf_kmem_cache_reg_info = {
+	.target			= "kmem_cache",
+	.feature		= BPF_ITER_RESCHED,
+	.show_fdinfo		= bpf_iter_kmem_cache_show_fdinfo,
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__kmem_cache, s),
+		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+	},
+	.seq_info		= &kmem_cache_iter_seq_info,
+};
+
+static int __init bpf_kmem_cache_iter_init(void)
+{
+	bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0];
+	return bpf_iter_reg_target(&bpf_kmem_cache_reg_info);
+}
+
+late_initcall(bpf_kmem_cache_iter_init);
-- 
cgit v1.2.3


From 6632863226d88383c9e2bedfeeb928ac7f8232b9 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Wed, 9 Oct 2024 13:18:08 +0800
Subject: iommu: Remove iommu_present()

The last callsite of iommu_present() is removed by commit <45c690aea8ee>
("drm/tegra: Use iommu_paging_domain_alloc()"). Remove it to avoid dead
code.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20241009051808.29455-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 25 -------------------------
 include/linux/iommu.h |  6 ------
 2 files changed, 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ce72edd34f07..4c29a4bc368b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1840,31 +1840,6 @@ int bus_iommu_probe(const struct bus_type *bus)
 	return 0;
 }
 
-/**
- * iommu_present() - make platform-specific assumptions about an IOMMU
- * @bus: bus to check
- *
- * Do not use this function. You want device_iommu_mapped() instead.
- *
- * Return: true if some IOMMU is present and aware of devices on the given bus;
- * in general it may not be the only IOMMU, and it may not have anything to do
- * with whatever device you are ultimately interested in.
- */
-bool iommu_present(const struct bus_type *bus)
-{
-	bool ret = false;
-
-	for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) {
-		if (iommu_buses[i] == bus) {
-			spin_lock(&iommu_device_lock);
-			ret = !list_empty(&iommu_device_list);
-			spin_unlock(&iommu_device_lock);
-		}
-	}
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_present);
-
 /**
  * device_iommu_capable() - check for a general IOMMU capability
  * @dev: device to which the capability would be relevant, if available
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bd722f473635..62d1b85c80d3 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -785,7 +785,6 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
 }
 
 extern int bus_iommu_probe(const struct bus_type *bus);
-extern bool iommu_present(const struct bus_type *bus);
 extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap);
 extern bool iommu_group_has_isolated_msi(struct iommu_group *group);
 extern struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus);
@@ -1081,11 +1080,6 @@ struct iommu_iotlb_gather {};
 struct iommu_dirty_bitmap {};
 struct iommu_dirty_ops {};
 
-static inline bool iommu_present(const struct bus_type *bus)
-{
-	return false;
-}
-
 static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
 {
 	return false;
-- 
cgit v1.2.3


From a274465cc3bef2dfd9c9ea5100848dda0a8641e1 Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Thu, 10 Oct 2024 13:54:19 +0100
Subject: net: phy: support 'active-high' property for PHY LEDs

In addition to 'active-low' and 'inactive-high-impedance' also
support 'active-high' property for PHY LED pin configuration.
As only either 'active-high' or 'active-low' can be set at the
same time, WARN and return an error in case both are set.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/91598487773d768f254d5faf06cf65b13e972f0e.1728558223.git.daniel@makrotopia.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/phy/phy_device.c | 6 ++++++
 include/linux/phy.h          | 5 +++--
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 94da999b6fde..563497a3274c 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -3358,11 +3358,17 @@ static int of_phy_led(struct phy_device *phydev,
 	if (index > U8_MAX)
 		return -EINVAL;
 
+	if (of_property_read_bool(led, "active-high"))
+		set_bit(PHY_LED_ACTIVE_HIGH, &modes);
 	if (of_property_read_bool(led, "active-low"))
 		set_bit(PHY_LED_ACTIVE_LOW, &modes);
 	if (of_property_read_bool(led, "inactive-high-impedance"))
 		set_bit(PHY_LED_INACTIVE_HIGH_IMPEDANCE, &modes);
 
+	if (WARN_ON(modes & BIT(PHY_LED_ACTIVE_LOW) &&
+		    modes & BIT(PHY_LED_ACTIVE_HIGH)))
+		return -EINVAL;
+
 	if (modes) {
 		/* Return error if asked to set polarity modes but not supported */
 		if (!phydev->drv->led_polarity_set)
diff --git a/include/linux/phy.h b/include/linux/phy.h
index ff762a3d8270..bf0eb4e5d35c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -877,8 +877,9 @@ struct phy_plca_status {
 
 /* Modes for PHY LED configuration */
 enum phy_led_modes {
-	PHY_LED_ACTIVE_LOW = 0,
-	PHY_LED_INACTIVE_HIGH_IMPEDANCE = 1,
+	PHY_LED_ACTIVE_HIGH = 0,
+	PHY_LED_ACTIVE_LOW = 1,
+	PHY_LED_INACTIVE_HIGH_IMPEDANCE = 2,
 
 	/* keep it last */
 	__PHY_LED_MODES_NUM,
-- 
cgit v1.2.3


From f68303cf1cf2fb96e20df5499c194f1fe5bab9e2 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Thu, 10 Oct 2024 11:27:14 -0500
Subject: PCI: Constify pci_register_io_range() fwnode_handle

pci_register_io_range() does not modify the passed in fwnode_handle, so
make it const.

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20241010-dt-const-v1-1-87a51f558425@kernel.org
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 drivers/pci/pci.c   | 2 +-
 include/linux/pci.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7d85c04fbba2..4b102bd1cfea 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4163,7 +4163,7 @@ EXPORT_SYMBOL(pci_request_regions_exclusive);
  * Record the PCI IO range (expressed as CPU physical address + size).
  * Return a negative value if an error has occurred, zero otherwise
  */
-int pci_register_io_range(struct fwnode_handle *fwnode, phys_addr_t addr,
+int pci_register_io_range(const struct fwnode_handle *fwnode, phys_addr_t addr,
 			resource_size_t	size)
 {
 	int ret = 0;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 573b4c4c2be6..733ff6570e2d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1556,7 +1556,7 @@ int __must_check pci_bus_alloc_resource(struct pci_bus *bus,
 			void *alignf_data);
 
 
-int pci_register_io_range(struct fwnode_handle *fwnode, phys_addr_t addr,
+int pci_register_io_range(const struct fwnode_handle *fwnode, phys_addr_t addr,
 			resource_size_t size);
 unsigned long pci_address_to_pio(phys_addr_t addr);
 phys_addr_t pci_pio_to_address(unsigned long pio);
@@ -2019,7 +2019,7 @@ static inline int pci_request_regions(struct pci_dev *dev, const char *res_name)
 { return -EIO; }
 static inline void pci_release_regions(struct pci_dev *dev) { }
 
-static inline int pci_register_io_range(struct fwnode_handle *fwnode,
+static inline int pci_register_io_range(const struct fwnode_handle *fwnode,
 					phys_addr_t addr, resource_size_t size)
 { return -EINVAL; }
 
-- 
cgit v1.2.3


From ec8c2329da1a8695b3fc80ba67cad9dc75d9a3ec Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Thu, 10 Oct 2024 11:27:16 -0500
Subject: of: Constify struct device_node function arguments

Functions which don't change the refcount or otherwise modify struct
device_node can make struct device_node const.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20241010-dt-const-v1-3-87a51f558425@kernel.org
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 drivers/of/address.c       | 10 +++++-----
 drivers/of/base.c          |  8 ++++----
 drivers/of/cpu.c           |  2 +-
 drivers/of/irq.c           |  4 ++--
 drivers/of/of_private.h    |  2 +-
 drivers/of/overlay.c       | 14 +++++++-------
 drivers/of/resolver.c      |  4 ++--
 include/linux/of.h         | 14 +++++++-------
 include/linux/of_address.h |  4 ++--
 include/linux/of_irq.h     |  4 ++--
 10 files changed, 33 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/address.c b/drivers/of/address.c
index 286f0c161e33..aa1a4e381461 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -147,7 +147,7 @@ static unsigned int of_bus_pci_get_flags(const __be32 *addr)
  * PCI bus specific translator
  */
 
-static bool of_node_is_pcie(struct device_node *np)
+static bool of_node_is_pcie(const struct device_node *np)
 {
 	bool is_pcie = of_node_name_eq(np, "pcie");
 
@@ -230,8 +230,8 @@ static int __of_address_resource_bounds(struct resource *r, u64 start, u64 size)
  * To guard against that we try to register the IO range first.
  * If that fails we know that pci_address_to_pio() will do too.
  */
-int of_pci_range_to_resource(struct of_pci_range *range,
-			     struct device_node *np, struct resource *res)
+int of_pci_range_to_resource(const struct of_pci_range *range,
+			     const struct device_node *np, struct resource *res)
 {
 	u64 start;
 	int err;
@@ -399,7 +399,7 @@ static struct of_bus *of_match_bus(struct device_node *np)
 	return NULL;
 }
 
-static int of_empty_ranges_quirk(struct device_node *np)
+static int of_empty_ranges_quirk(const struct device_node *np)
 {
 	if (IS_ENABLED(CONFIG_PPC)) {
 		/* To save cycles, we cache the result for global "Mac" setting */
@@ -1030,7 +1030,7 @@ EXPORT_SYMBOL_GPL(of_dma_is_coherent);
  * This is currently only enabled on builds that support Apple ARM devices, as
  * an optimization.
  */
-static bool of_mmio_is_nonposted(struct device_node *np)
+static bool of_mmio_is_nonposted(const struct device_node *np)
 {
 	if (!IS_ENABLED(CONFIG_ARCH_APPLE))
 		return false;
diff --git a/drivers/of/base.c b/drivers/of/base.c
index 20603d3c9931..d1aebb979522 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -771,7 +771,7 @@ struct device_node *of_get_child_by_name(const struct device_node *node,
 }
 EXPORT_SYMBOL(of_get_child_by_name);
 
-struct device_node *__of_find_node_by_path(struct device_node *parent,
+struct device_node *__of_find_node_by_path(const struct device_node *parent,
 						const char *path)
 {
 	struct device_node *child;
@@ -1840,7 +1840,7 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align))
  *
  * Return: The alias id if found.
  */
-int of_alias_get_id(struct device_node *np, const char *stem)
+int of_alias_get_id(const struct device_node *np, const char *stem)
 {
 	struct alias_prop *app;
 	int id = -ENODEV;
@@ -1898,7 +1898,7 @@ EXPORT_SYMBOL_GPL(of_alias_get_highest_id);
  *
  * Return: TRUE if console successfully setup. Otherwise return FALSE.
  */
-bool of_console_check(struct device_node *dn, char *name, int index)
+bool of_console_check(const struct device_node *dn, char *name, int index)
 {
 	if (!dn || dn != of_stdout || console_set_on_cmdline)
 		return false;
@@ -1986,7 +1986,7 @@ int of_find_last_cache_level(unsigned int cpu)
  *
  * Return: 0 on success or a standard error code on failure.
  */
-int of_map_id(struct device_node *np, u32 id,
+int of_map_id(const struct device_node *np, u32 id,
 	       const char *map_name, const char *map_mask_name,
 	       struct device_node **target, u32 *id_out)
 {
diff --git a/drivers/of/cpu.c b/drivers/of/cpu.c
index d17b2f851082..5214dc3d05ae 100644
--- a/drivers/of/cpu.c
+++ b/drivers/of/cpu.c
@@ -188,7 +188,7 @@ EXPORT_SYMBOL(of_cpu_node_to_id);
  * Return: An idle state node if found at @index. The refcount is incremented
  * for it, so call of_node_put() on it when done. Returns NULL if not found.
  */
-struct device_node *of_get_cpu_state_node(struct device_node *cpu_node,
+struct device_node *of_get_cpu_state_node(const struct device_node *cpu_node,
 					  int index)
 {
 	struct of_phandle_args args;
diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index a494f56a0d0e..67fc0ceaa5f5 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -720,7 +720,7 @@ struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id,
  * Returns: the MSI domain for this device (or NULL on failure).
  */
 struct irq_domain *of_msi_get_domain(struct device *dev,
-				     struct device_node *np,
+				     const struct device_node *np,
 				     enum irq_domain_bus_token token)
 {
 	struct of_phandle_iterator it;
@@ -742,7 +742,7 @@ EXPORT_SYMBOL_GPL(of_msi_get_domain);
  * @dev: device structure to associate with an MSI irq domain
  * @np: device node for that device
  */
-void of_msi_configure(struct device *dev, struct device_node *np)
+void of_msi_configure(struct device *dev, const struct device_node *np)
 {
 	dev_set_msi_domain(dev,
 			   of_msi_get_domain(dev, np, DOMAIN_BUS_PLATFORM_MSI));
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index 04aa2a91f851..d957cc6ce437 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -127,7 +127,7 @@ void __of_prop_free(struct property *prop);
 struct device_node *__of_node_dup(const struct device_node *np,
 				  const char *full_name);
 
-struct device_node *__of_find_node_by_path(struct device_node *parent,
+struct device_node *__of_find_node_by_path(const struct device_node *parent,
 						const char *path);
 struct device_node *__of_find_node_by_full_path(struct device_node *node,
 						const char *path);
diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index cbdecccca097..19aaa96ee817 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -398,7 +398,7 @@ static int add_changeset_property(struct overlay_changeset *ovcs,
  * invalid @overlay.
  */
 static int add_changeset_node(struct overlay_changeset *ovcs,
-		struct target *target, struct device_node *node)
+		struct target *target, const struct device_node *node)
 {
 	const char *node_kbasename;
 	const __be32 *phandle;
@@ -675,8 +675,8 @@ static int build_changeset(struct overlay_changeset *ovcs)
  * 1) "target" property containing the phandle of the target
  * 2) "target-path" property containing the path of the target
  */
-static struct device_node *find_target(struct device_node *info_node,
-				       struct device_node *target_base)
+static struct device_node *find_target(const struct device_node *info_node,
+				       const struct device_node *target_base)
 {
 	struct device_node *node;
 	char *target_path;
@@ -735,7 +735,7 @@ static struct device_node *find_target(struct device_node *info_node,
  * init_overlay_changeset() must call free_overlay_changeset().
  */
 static int init_overlay_changeset(struct overlay_changeset *ovcs,
-				  struct device_node *target_base)
+				  const struct device_node *target_base)
 {
 	struct device_node *node, *overlay_node;
 	struct fragment *fragment;
@@ -910,7 +910,7 @@ static void free_overlay_changeset(struct overlay_changeset *ovcs)
  */
 
 static int of_overlay_apply(struct overlay_changeset *ovcs,
-			    struct device_node *base)
+			    const struct device_node *base)
 {
 	int ret = 0, ret_revert, ret_tmp;
 
@@ -978,7 +978,7 @@ out:
  */
 
 int of_overlay_fdt_apply(const void *overlay_fdt, u32 overlay_fdt_size,
-			 int *ret_ovcs_id, struct device_node *base)
+			 int *ret_ovcs_id, const struct device_node *base)
 {
 	void *new_fdt;
 	void *new_fdt_align;
@@ -1074,7 +1074,7 @@ EXPORT_SYMBOL_GPL(of_overlay_fdt_apply);
  *
  * Returns 1 if @np is @tree or is contained in @tree, else 0
  */
-static int find_node(struct device_node *tree, struct device_node *np)
+static int find_node(const struct device_node *tree, struct device_node *np)
 {
 	if (tree == np)
 		return 1;
diff --git a/drivers/of/resolver.c b/drivers/of/resolver.c
index 5cf96776dd7d..ee7769525bb2 100644
--- a/drivers/of/resolver.c
+++ b/drivers/of/resolver.c
@@ -147,8 +147,8 @@ static int node_name_cmp(const struct device_node *dn1,
  * of offsets of the phandle reference(s) within the respective property
  * value(s).  The values at these offsets will be fixed up.
  */
-static int adjust_local_phandle_references(struct device_node *local_fixups,
-		struct device_node *overlay, int phandle_delta)
+static int adjust_local_phandle_references(const struct device_node *local_fixups,
+		const struct device_node *overlay, int phandle_delta)
 {
 	struct device_node *overlay_child;
 	struct property *prop_fix, *prop;
diff --git a/include/linux/of.h b/include/linux/of.h
index 85b60ac9eec5..7875b308f13c 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -357,7 +357,7 @@ extern struct device_node *of_get_cpu_node(int cpu, unsigned int *thread);
 extern struct device_node *of_cpu_device_node_get(int cpu);
 extern int of_cpu_node_to_id(struct device_node *np);
 extern struct device_node *of_get_next_cpu_node(struct device_node *prev);
-extern struct device_node *of_get_cpu_state_node(struct device_node *cpu_node,
+extern struct device_node *of_get_cpu_state_node(const struct device_node *cpu_node,
 						 int index);
 extern u64 of_get_cpu_hwid(struct device_node *cpun, unsigned int thread);
 
@@ -395,7 +395,7 @@ extern int of_phandle_iterator_args(struct of_phandle_iterator *it,
 				    int size);
 
 extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align));
-extern int of_alias_get_id(struct device_node *np, const char *stem);
+extern int of_alias_get_id(const struct device_node *np, const char *stem);
 extern int of_alias_get_highest_id(const char *stem);
 
 bool of_machine_compatible_match(const char *const *compats);
@@ -446,9 +446,9 @@ const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur,
  */
 const char *of_prop_next_string(struct property *prop, const char *cur);
 
-bool of_console_check(struct device_node *dn, char *name, int index);
+bool of_console_check(const struct device_node *dn, char *name, int index);
 
-int of_map_id(struct device_node *np, u32 id,
+int of_map_id(const struct device_node *np, u32 id,
 	       const char *map_name, const char *map_mask_name,
 	       struct device_node **target, u32 *id_out);
 
@@ -871,7 +871,7 @@ static inline void of_property_clear_flag(struct property *p, unsigned long flag
 {
 }
 
-static inline int of_map_id(struct device_node *np, u32 id,
+static inline int of_map_id(const struct device_node *np, u32 id,
 			     const char *map_name, const char *map_mask_name,
 			     struct device_node **target, u32 *id_out)
 {
@@ -1734,7 +1734,7 @@ struct of_overlay_notify_data {
 #ifdef CONFIG_OF_OVERLAY
 
 int of_overlay_fdt_apply(const void *overlay_fdt, u32 overlay_fdt_size,
-			 int *ovcs_id, struct device_node *target_base);
+			 int *ovcs_id, const struct device_node *target_base);
 int of_overlay_remove(int *ovcs_id);
 int of_overlay_remove_all(void);
 
@@ -1744,7 +1744,7 @@ int of_overlay_notifier_unregister(struct notifier_block *nb);
 #else
 
 static inline int of_overlay_fdt_apply(const void *overlay_fdt, u32 overlay_fdt_size,
-				       int *ovcs_id, struct device_node *target_base)
+				       int *ovcs_id, const struct device_node *target_base)
 {
 	return -ENOTSUPP;
 }
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index 26a19daf0d09..bd46dbcc6e88 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -83,8 +83,8 @@ extern struct of_pci_range *of_pci_range_parser_one(
 					struct of_pci_range *range);
 extern int of_pci_address_to_resource(struct device_node *dev, int bar,
 				      struct resource *r);
-extern int of_pci_range_to_resource(struct of_pci_range *range,
-				    struct device_node *np,
+extern int of_pci_range_to_resource(const struct of_pci_range *range,
+				    const struct device_node *np,
 				    struct resource *res);
 extern int of_range_to_resource(struct device_node *np, int index,
 				struct resource *res);
diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index d6d3eae2f145..6337ad4e5fe8 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -48,12 +48,12 @@ extern int of_irq_to_resource_table(struct device_node *dev,
 		struct resource *res, int nr_irqs);
 extern struct device_node *of_irq_find_parent(struct device_node *child);
 extern struct irq_domain *of_msi_get_domain(struct device *dev,
-					    struct device_node *np,
+					    const struct device_node *np,
 					    enum irq_domain_bus_token token);
 extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev,
 							u32 id,
 							u32 bus_token);
-extern void of_msi_configure(struct device *dev, struct device_node *np);
+extern void of_msi_configure(struct device *dev, const struct device_node *np);
 u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in);
 #else
 static inline void of_irq_init(const struct of_device_id *matches)
-- 
cgit v1.2.3


From 9c63fea9acd077701fd67bbc21b1e77d6b98b7e0 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Thu, 10 Oct 2024 11:27:17 -0500
Subject: of: Constify struct property pointers

Most accesses to struct property do not modify it, so constify struct
property pointers where ever possible in the DT core code.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20241010-dt-const-v1-4-87a51f558425@kernel.org
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 drivers/of/base.c       | 12 ++++++------
 drivers/of/kobj.c       |  6 +++---
 drivers/of/of_private.h | 10 +++++-----
 drivers/of/overlay.c    |  5 +++--
 drivers/of/property.c   | 10 +++++-----
 drivers/of/resolver.c   |  8 ++++----
 include/linux/of.h      | 14 +++++++-------
 7 files changed, 33 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index d1aebb979522..d94efee4a7fc 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -270,7 +270,7 @@ EXPORT_SYMBOL(of_find_all_nodes);
 const void *__of_get_property(const struct device_node *np,
 			      const char *name, int *lenp)
 {
-	struct property *pp = __of_find_property(np, name, lenp);
+	const struct property *pp = __of_find_property(np, name, lenp);
 
 	return pp ? pp->value : NULL;
 }
@@ -282,7 +282,7 @@ const void *__of_get_property(const struct device_node *np,
 const void *of_get_property(const struct device_node *np, const char *name,
 			    int *lenp)
 {
-	struct property *pp = of_find_property(np, name, lenp);
+	const struct property *pp = of_find_property(np, name, lenp);
 
 	return pp ? pp->value : NULL;
 }
@@ -321,7 +321,7 @@ EXPORT_SYMBOL(of_get_property);
 static int __of_device_is_compatible(const struct device_node *device,
 				     const char *compat, const char *type, const char *name)
 {
-	struct property *prop;
+	const struct property *prop;
 	const char *cp;
 	int index = 0, score = 0;
 
@@ -828,7 +828,7 @@ struct device_node *__of_find_node_by_full_path(struct device_node *node,
 struct device_node *of_find_node_opts_by_path(const char *path, const char **opts)
 {
 	struct device_node *np = NULL;
-	struct property *pp;
+	const struct property *pp;
 	unsigned long flags;
 	const char *separator = strchr(path, ':');
 
@@ -974,7 +974,7 @@ struct device_node *of_find_node_with_property(struct device_node *from,
 	const char *prop_name)
 {
 	struct device_node *np;
-	struct property *pp;
+	const struct property *pp;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&devtree_lock, flags);
@@ -1769,7 +1769,7 @@ static void of_alias_add(struct alias_prop *ap, struct device_node *np,
  */
 void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align))
 {
-	struct property *pp;
+	const struct property *pp;
 
 	of_aliases = of_find_node_by_path("/aliases");
 	of_chosen = of_find_node_by_path("/chosen");
diff --git a/drivers/of/kobj.c b/drivers/of/kobj.c
index 3dbce1e6f184..aeb1709d4e85 100644
--- a/drivers/of/kobj.c
+++ b/drivers/of/kobj.c
@@ -84,7 +84,7 @@ int __of_add_property_sysfs(struct device_node *np, struct property *pp)
 	return rc;
 }
 
-void __of_sysfs_remove_bin_file(struct device_node *np, struct property *prop)
+void __of_sysfs_remove_bin_file(struct device_node *np, const struct property *prop)
 {
 	if (!IS_ENABLED(CONFIG_SYSFS))
 		return;
@@ -93,7 +93,7 @@ void __of_sysfs_remove_bin_file(struct device_node *np, struct property *prop)
 	kfree(prop->attr.attr.name);
 }
 
-void __of_remove_property_sysfs(struct device_node *np, struct property *prop)
+void __of_remove_property_sysfs(struct device_node *np, const struct property *prop)
 {
 	/* at early boot, bail here and defer setup to of_init() */
 	if (of_kset && of_node_is_attached(np))
@@ -101,7 +101,7 @@ void __of_remove_property_sysfs(struct device_node *np, struct property *prop)
 }
 
 void __of_update_property_sysfs(struct device_node *np, struct property *newprop,
-		struct property *oldprop)
+		const struct property *oldprop)
 {
 	/* At early boot, bail out and defer setup to of_init() */
 	if (!of_kset)
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index d957cc6ce437..53a4a5be9997 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -69,9 +69,9 @@ static inline void of_platform_register_reconfig_notifier(void) { }
 #if defined(CONFIG_OF_KOBJ)
 int of_node_is_attached(const struct device_node *node);
 int __of_add_property_sysfs(struct device_node *np, struct property *pp);
-void __of_remove_property_sysfs(struct device_node *np, struct property *prop);
+void __of_remove_property_sysfs(struct device_node *np, const struct property *prop);
 void __of_update_property_sysfs(struct device_node *np, struct property *newprop,
-		struct property *oldprop);
+		const struct property *oldprop);
 int __of_attach_node_sysfs(struct device_node *np);
 void __of_detach_node_sysfs(struct device_node *np);
 #else
@@ -79,9 +79,9 @@ static inline int __of_add_property_sysfs(struct device_node *np, struct propert
 {
 	return 0;
 }
-static inline void __of_remove_property_sysfs(struct device_node *np, struct property *prop) {}
+static inline void __of_remove_property_sysfs(struct device_node *np, const struct property *prop) {}
 static inline void __of_update_property_sysfs(struct device_node *np,
-		struct property *newprop, struct property *oldprop) {}
+		struct property *newprop, const struct property *oldprop) {}
 static inline int __of_attach_node_sysfs(struct device_node *np)
 {
 	return 0;
@@ -142,7 +142,7 @@ extern int __of_update_property(struct device_node *np,
 extern void __of_detach_node(struct device_node *np);
 
 extern void __of_sysfs_remove_bin_file(struct device_node *np,
-				       struct property *prop);
+				       const struct property *prop);
 
 /* illegal phandle value (set when unresolved) */
 #define OF_PHANDLE_ILLEGAL	0xdeadbeef
diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 19aaa96ee817..434f6dd6a86c 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -296,10 +296,11 @@ err_free_target_path:
  * invalid @overlay.
  */
 static int add_changeset_property(struct overlay_changeset *ovcs,
-		struct target *target, struct property *overlay_prop,
+		struct target *target, const struct property *overlay_prop,
 		bool is_symbols_prop)
 {
-	struct property *new_prop = NULL, *prop;
+	struct property *new_prop = NULL;
+	const struct property *prop;
 	int ret = 0;
 
 	if (target->in_livetree)
diff --git a/drivers/of/property.c b/drivers/of/property.c
index 11b922fde7af..33dd72cad4ab 100644
--- a/drivers/of/property.c
+++ b/drivers/of/property.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(of_graph_is_present);
 int of_property_count_elems_of_size(const struct device_node *np,
 				const char *propname, int elem_size)
 {
-	struct property *prop = of_find_property(np, propname, NULL);
+	const struct property *prop = of_find_property(np, propname, NULL);
 
 	if (!prop)
 		return -EINVAL;
@@ -104,7 +104,7 @@ EXPORT_SYMBOL_GPL(of_property_count_elems_of_size);
 static void *of_find_property_value_of_size(const struct device_node *np,
 			const char *propname, u32 min, u32 max, size_t *len)
 {
-	struct property *prop = of_find_property(np, propname, NULL);
+	const struct property *prop = of_find_property(np, propname, NULL);
 
 	if (!prop)
 		return ERR_PTR(-EINVAL);
@@ -530,7 +530,7 @@ int of_property_read_string_helper(const struct device_node *np,
 }
 EXPORT_SYMBOL_GPL(of_property_read_string_helper);
 
-const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur,
+const __be32 *of_prop_next_u32(const struct property *prop, const __be32 *cur,
 			       u32 *pu)
 {
 	const void *curv = cur;
@@ -553,7 +553,7 @@ out_val:
 }
 EXPORT_SYMBOL_GPL(of_prop_next_u32);
 
-const char *of_prop_next_string(struct property *prop, const char *cur)
+const char *of_prop_next_string(const struct property *prop, const char *cur)
 {
 	const void *curv = cur;
 
@@ -1466,7 +1466,7 @@ static int of_fwnode_irq_get(const struct fwnode_handle *fwnode,
 
 static int of_fwnode_add_links(struct fwnode_handle *fwnode)
 {
-	struct property *p;
+	const struct property *p;
 	struct device_node *con_np = to_of_node(fwnode);
 
 	if (IS_ENABLED(CONFIG_X86))
diff --git a/drivers/of/resolver.c b/drivers/of/resolver.c
index ee7769525bb2..779db058c42f 100644
--- a/drivers/of/resolver.c
+++ b/drivers/of/resolver.c
@@ -42,7 +42,7 @@ static void adjust_overlay_phandles(struct device_node *overlay,
 		int phandle_delta)
 {
 	struct device_node *child;
-	struct property *prop;
+	const struct property *prop;
 	phandle phandle;
 
 	/* adjust node's phandle in node */
@@ -71,10 +71,10 @@ static void adjust_overlay_phandles(struct device_node *overlay,
 }
 
 static int update_usages_of_a_phandle_reference(struct device_node *overlay,
-		struct property *prop_fixup, phandle phandle)
+		const struct property *prop_fixup, phandle phandle)
 {
 	struct device_node *refnode;
-	struct property *prop;
+	const struct property *prop;
 	char *value __free(kfree) = kmemdup(prop_fixup->value, prop_fixup->length, GFP_KERNEL);
 	char *cur, *end, *node_path, *prop_name, *s;
 	int offset, len;
@@ -151,7 +151,7 @@ static int adjust_local_phandle_references(const struct device_node *local_fixup
 		const struct device_node *overlay, int phandle_delta)
 {
 	struct device_node *overlay_child;
-	struct property *prop_fix, *prop;
+	const struct property *prop_fix, *prop;
 	int err, i, count;
 	unsigned int off;
 
diff --git a/include/linux/of.h b/include/linux/of.h
index 7875b308f13c..086a60f3b8a6 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -435,7 +435,7 @@ extern int of_detach_node(struct device_node *);
  * of_property_for_each_u32(np, "propname", u)
  *         printk("U32 value: %x\n", u);
  */
-const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur,
+const __be32 *of_prop_next_u32(const struct property *prop, const __be32 *cur,
 			       u32 *pu);
 /*
  * struct property *prop;
@@ -444,7 +444,7 @@ const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur,
  * of_property_for_each_string(np, "propname", prop, s)
  *         printk("String value: %s\n", s);
  */
-const char *of_prop_next_string(struct property *prop, const char *cur);
+const char *of_prop_next_string(const struct property *prop, const char *cur);
 
 bool of_console_check(const struct device_node *dn, char *name, int index);
 
@@ -826,13 +826,13 @@ static inline bool of_console_check(const struct device_node *dn, const char *na
 	return false;
 }
 
-static inline const __be32 *of_prop_next_u32(struct property *prop,
+static inline const __be32 *of_prop_next_u32(const struct property *prop,
 		const __be32 *cur, u32 *pu)
 {
 	return NULL;
 }
 
-static inline const char *of_prop_next_string(struct property *prop,
+static inline const char *of_prop_next_string(const struct property *prop,
 		const char *cur)
 {
 	return NULL;
@@ -899,7 +899,7 @@ static inline const void *of_device_get_match_data(const struct device *dev)
 #define of_node_cmp(s1, s2)		strcasecmp((s1), (s2))
 #endif
 
-static inline int of_prop_val_eq(struct property *p1, struct property *p2)
+static inline int of_prop_val_eq(const struct property *p1, const struct property *p2)
 {
 	return p1->length == p2->length &&
 	       !memcmp(p1->value, p2->value, (size_t)p1->length);
@@ -1252,7 +1252,7 @@ static inline int of_property_read_string_index(const struct device_node *np,
 static inline bool of_property_read_bool(const struct device_node *np,
 					 const char *propname)
 {
-	struct property *prop = of_find_property(np, propname, NULL);
+	const struct property *prop = of_find_property(np, propname, NULL);
 
 	return prop ? true : false;
 }
@@ -1430,7 +1430,7 @@ static inline int of_property_read_s32(const struct device_node *np,
 	     err = of_phandle_iterator_next(it))
 
 #define of_property_for_each_u32(np, propname, u)			\
-	for (struct {struct property *prop; const __be32 *item; } _it =	\
+	for (struct {const struct property *prop; const __be32 *item; } _it =	\
 		{of_find_property(np, propname, NULL),			\
 		 of_prop_next_u32(_it.prop, NULL, &u)};			\
 	     _it.item;							\
-- 
cgit v1.2.3


From d79616b04f0e08178ceb716a5d2ef60ab723d532 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Thu, 10 Oct 2024 11:27:20 -0500
Subject: of/address: Constify of_busses[] array and pointers

The of_busses array is fixed, so it and all struct of_bus pointers can
be const.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20241010-dt-const-v1-7-87a51f558425@kernel.org
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 drivers/of/address.c       | 12 ++++++------
 include/linux/of_address.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/address.c b/drivers/of/address.c
index aa1a4e381461..824bb449e007 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -340,7 +340,7 @@ static int of_bus_default_flags_match(struct device_node *np)
  * Array of bus specific translators
  */
 
-static struct of_bus of_busses[] = {
+static const struct of_bus of_busses[] = {
 #ifdef CONFIG_PCI
 	/* PCI */
 	{
@@ -388,7 +388,7 @@ static struct of_bus of_busses[] = {
 	},
 };
 
-static struct of_bus *of_match_bus(struct device_node *np)
+static const struct of_bus *of_match_bus(struct device_node *np)
 {
 	int i;
 
@@ -419,8 +419,8 @@ static int of_empty_ranges_quirk(const struct device_node *np)
 	return false;
 }
 
-static int of_translate_one(struct device_node *parent, struct of_bus *bus,
-			    struct of_bus *pbus, __be32 *addr,
+static int of_translate_one(const struct device_node *parent, const struct of_bus *bus,
+			    const struct of_bus *pbus, __be32 *addr,
 			    int na, int ns, int pna, const char *rprop)
 {
 	const __be32 *ranges;
@@ -505,7 +505,7 @@ static u64 __of_translate_address(struct device_node *node,
 {
 	struct device_node *dev __free(device_node) = of_node_get(node);
 	struct device_node *parent __free(device_node) = get_parent(dev);
-	struct of_bus *bus, *pbus;
+	const struct of_bus *bus, *pbus;
 	__be32 addr[OF_MAX_ADDR_CELLS];
 	int na, ns, pna, pns;
 
@@ -690,7 +690,7 @@ const __be32 *__of_get_address(struct device_node *dev, int index, int bar_no,
 	const __be32 *prop;
 	unsigned int psize;
 	struct device_node *parent __free(device_node) = of_get_parent(dev);
-	struct of_bus *bus;
+	const struct of_bus *bus;
 	int onesize, i, na, ns;
 
 	if (parent == NULL)
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index bd46dbcc6e88..9e034363788a 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -10,7 +10,7 @@ struct of_bus;
 
 struct of_pci_range_parser {
 	struct device_node *node;
-	struct of_bus *bus;
+	const struct of_bus *bus;
 	const __be32 *range;
 	const __be32 *end;
 	int na;
-- 
cgit v1.2.3


From 74fe1ad4132234f04fcc75e16600449496a67b5b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Oct 2024 18:50:14 +0200
Subject: debugobjects: Prepare for batching

Move the debug_obj::object pointer into a union and add a pointer to the
last node in a batch. That allows to implement batch processing efficiently
by utilizing the stack property of hlist:

When the first object of a batch is added to the list, then the batch
pointer is set to the hlist node of the object itself. Any subsequent add
retrieves the pointer to the last node from the first object in the list
and uses that for storing the last node pointer in the newly added object.

Add the pointer to the data structure and ensure that all relevant pool
sizes are strictly batch sized. The actual batching implementation follows
in subsequent changes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Link: https://lore.kernel.org/all/20241007164914.139204961@linutronix.de
---
 include/linux/debugobjects.h | 12 ++++++++----
 lib/debugobjects.c           | 10 +++++++---
 2 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index 32444686b6ff..8b95545e7924 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -23,13 +23,17 @@ struct debug_obj_descr;
  * @state:	tracked object state
  * @astate:	current active state
  * @object:	pointer to the real object
+ * @batch_last:	pointer to the last hlist node in a batch
  * @descr:	pointer to an object type specific debug description structure
  */
 struct debug_obj {
-	struct hlist_node	node;
-	enum debug_obj_state	state;
-	unsigned int		astate;
-	void			*object;
+	struct hlist_node		node;
+	enum debug_obj_state		state;
+	unsigned int			astate;
+	union {
+		void			*object;
+		struct hlist_node	*batch_last;
+	};
 	const struct debug_obj_descr *descr;
 };
 
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 65cce4cd9643..2fed7b863827 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -21,11 +21,15 @@
 #define ODEBUG_HASH_BITS	14
 #define ODEBUG_HASH_SIZE	(1 << ODEBUG_HASH_BITS)
 
-#define ODEBUG_POOL_SIZE	1024
-#define ODEBUG_POOL_MIN_LEVEL	256
-#define ODEBUG_POOL_PERCPU_SIZE	64
+/* Must be power of two */
 #define ODEBUG_BATCH_SIZE	16
 
+/* Initial values. Must all be a multiple of batch size */
+#define ODEBUG_POOL_SIZE	(64 * ODEBUG_BATCH_SIZE)
+#define ODEBUG_POOL_MIN_LEVEL	(ODEBUG_POOL_SIZE / 4)
+
+#define ODEBUG_POOL_PERCPU_SIZE	(4 * ODEBUG_BATCH_SIZE)
+
 #define ODEBUG_CHUNK_SHIFT	PAGE_SHIFT
 #define ODEBUG_CHUNK_SIZE	(1 << ODEBUG_CHUNK_SHIFT)
 #define ODEBUG_CHUNK_MASK	(~(ODEBUG_CHUNK_SIZE - 1))
-- 
cgit v1.2.3


From e4cf33ca48128d580e25ebe779b7ba7b4b4cf733 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 10 Oct 2024 20:21:14 -0400
Subject: ftrace: Consolidate ftrace_regs accessor functions for archs using
 pt_regs

Most architectures use pt_regs within ftrace_regs making a lot of the
accessor functions just calls to the pt_regs internally. Instead of
duplication this effort, use a HAVE_ARCH_FTRACE_REGS for architectures
that have their own ftrace_regs that is not based on pt_regs and will
define all the accessor functions, and for the architectures that just use
pt_regs, it will leave it undefined, and the default accessor functions
will be used.

Note, this will also make it easier to add new accessor functions to
ftrace_regs as it will mean having to touch less architectures.

Cc: <linux-arch@vger.kernel.org>
Cc: "x86@kernel.org" <x86@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Naveen N Rao <naveen@kernel.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lore.kernel.org/20241010202114.2289f6fd@gandalf.local.home
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Acked-by: Heiko Carstens <hca@linux.ibm.com> # s390
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> # powerpc
Suggested-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 arch/arm64/include/asm/ftrace.h     |  1 +
 arch/loongarch/include/asm/ftrace.h | 25 +------------------------
 arch/powerpc/include/asm/ftrace.h   | 26 +-------------------------
 arch/riscv/include/asm/ftrace.h     |  1 +
 arch/s390/include/asm/ftrace.h      | 26 +-------------------------
 arch/x86/include/asm/ftrace.h       | 21 +--------------------
 include/linux/ftrace.h              | 32 +++++++-------------------------
 include/linux/ftrace_regs.h         | 36 ++++++++++++++++++++++++++++++++++++
 8 files changed, 49 insertions(+), 119 deletions(-)
 create mode 100644 include/linux/ftrace_regs.h

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index bbb69c7751b9..5ccff4de7f09 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -54,6 +54,7 @@ extern void return_to_handler(void);
 unsigned long ftrace_call_adjust(unsigned long addr);
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
+#define HAVE_ARCH_FTRACE_REGS
 struct dyn_ftrace;
 struct ftrace_ops;
 struct ftrace_regs;
diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h
index 0e15d36ce251..8f13eaeaa325 100644
--- a/arch/loongarch/include/asm/ftrace.h
+++ b/arch/loongarch/include/asm/ftrace.h
@@ -43,43 +43,20 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent);
 
 #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
 struct ftrace_ops;
-struct ftrace_regs;
-#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
 
-struct __arch_ftrace_regs {
-	struct pt_regs regs;
-};
+#include <linux/ftrace_regs.h>
 
 static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
 {
 	return &arch_ftrace_regs(fregs)->regs;
 }
 
-static __always_inline unsigned long
-ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs)
-{
-	return instruction_pointer(&arch_ftrace_regs(fregs)->regs);
-}
-
 static __always_inline void
 ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip)
 {
 	instruction_pointer_set(&arch_ftrace_regs(fregs)->regs, ip);
 }
 
-#define ftrace_regs_get_argument(fregs, n) \
-	regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n)
-#define ftrace_regs_get_stack_pointer(fregs) \
-	kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_return_value(fregs) \
-	regs_return_value(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_set_return_value(fregs, ret) \
-	regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret)
-#define ftrace_override_function_with_return(fregs) \
-	override_function_with_return(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_query_register_offset(name) \
-	regs_query_register_offset(name)
-
 #define ftrace_graph_func ftrace_graph_func
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs);
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index e299fd47d201..0edfb874eb02 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -32,12 +32,7 @@ struct dyn_arch_ftrace {
 int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
 #define ftrace_init_nop ftrace_init_nop
 
-struct ftrace_regs;
-#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
-
-struct __arch_ftrace_regs {
-	struct pt_regs regs;
-};
+#include <linux/ftrace_regs.h>
 
 static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
 {
@@ -52,25 +47,6 @@ ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
 	regs_set_return_ip(&arch_ftrace_regs(fregs)->regs, ip);
 }
 
-static __always_inline unsigned long
-ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs)
-{
-	return instruction_pointer(&arch_ftrace_regs(fregs)->regs);
-}
-
-#define ftrace_regs_get_argument(fregs, n) \
-	regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n)
-#define ftrace_regs_get_stack_pointer(fregs) \
-	kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_return_value(fregs) \
-	regs_return_value(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_set_return_value(fregs, ret) \
-	regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret)
-#define ftrace_override_function_with_return(fregs) \
-	override_function_with_return(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_query_register_offset(name) \
-	regs_query_register_offset(name)
-
 struct ftrace_ops;
 
 #define ftrace_graph_func ftrace_graph_func
diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index c6bcdff105b5..3d66437a1029 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -125,6 +125,7 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
 #define arch_ftrace_get_regs(regs) NULL
+#define HAVE_ARCH_FTRACE_REGS
 struct ftrace_ops;
 struct ftrace_regs;
 #define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
index 1498d0a9c762..fc97d75dc752 100644
--- a/arch/s390/include/asm/ftrace.h
+++ b/arch/s390/include/asm/ftrace.h
@@ -51,12 +51,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 	return addr;
 }
 
-struct ftrace_regs;
-#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
-
-struct __arch_ftrace_regs {
-	struct pt_regs regs;
-};
+#include <linux/ftrace_regs.h>
 
 static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
 {
@@ -84,12 +79,6 @@ static __always_inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
-static __always_inline unsigned long
-ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs)
-{
-	return arch_ftrace_regs(fregs)->regs.psw.addr;
-}
-
 static __always_inline void
 ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
 				    unsigned long ip)
@@ -97,19 +86,6 @@ ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
 	arch_ftrace_regs(fregs)->regs.psw.addr = ip;
 }
 
-#define ftrace_regs_get_argument(fregs, n) \
-	regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n)
-#define ftrace_regs_get_stack_pointer(fregs) \
-	kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_return_value(fregs) \
-	regs_return_value(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_set_return_value(fregs, ret) \
-	regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret)
-#define ftrace_override_function_with_return(fregs) \
-	override_function_with_return(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_query_register_offset(name) \
-	regs_query_register_offset(name)
-
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 /*
  * When an ftrace registered caller is tracing a function that is
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 87943f7a299b..8f02d28c571a 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -33,12 +33,8 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 }
 
 #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
-struct ftrace_regs;
-#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
 
-struct __arch_ftrace_regs {
-	struct pt_regs		regs;
-};
+#include <linux/ftrace_regs.h>
 
 static __always_inline struct pt_regs *
 arch_ftrace_get_regs(struct ftrace_regs *fregs)
@@ -52,21 +48,6 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs)
 #define ftrace_regs_set_instruction_pointer(fregs, _ip)	\
 	do { arch_ftrace_regs(fregs)->regs.ip = (_ip); } while (0)
 
-#define ftrace_regs_get_instruction_pointer(fregs) \
-	arch_ftrace_regs(fregs)->regs.ip)
-
-#define ftrace_regs_get_argument(fregs, n) \
-	regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n)
-#define ftrace_regs_get_stack_pointer(fregs) \
-	kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_return_value(fregs) \
-	regs_return_value(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_set_return_value(fregs, ret) \
-	regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret)
-#define ftrace_override_function_with_return(fregs) \
-	override_function_with_return(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_query_register_offset(name) \
-	regs_query_register_offset(name)
 
 struct ftrace_ops;
 #define ftrace_graph_func ftrace_graph_func
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 66f10291a0b2..aa9ddd1e4bb6 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -113,6 +113,8 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val
 
 #ifdef CONFIG_FUNCTION_TRACER
 
+#include <linux/ftrace_regs.h>
+
 extern int ftrace_enabled;
 
 /**
@@ -150,14 +152,11 @@ struct ftrace_regs {
 #define ftrace_regs_size()	sizeof(struct __arch_ftrace_regs)
 
 #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
-
-struct __arch_ftrace_regs {
-	struct pt_regs		regs;
-};
-
-struct ftrace_regs;
-#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
-
+/*
+ * Architectures that define HAVE_DYNAMIC_FTRACE_WITH_ARGS must define their own
+ * arch_ftrace_get_regs() where it only returns pt_regs *if* it is fully
+ * populated. It should return NULL otherwise.
+ */
 static inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
 {
 	return &arch_ftrace_regs(fregs)->regs;
@@ -191,23 +190,6 @@ static __always_inline bool ftrace_regs_has_args(struct ftrace_regs *fregs)
 	return ftrace_get_regs(fregs) != NULL;
 }
 
-#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
-#define ftrace_regs_get_instruction_pointer(fregs) \
-	instruction_pointer(ftrace_get_regs(fregs))
-#define ftrace_regs_get_argument(fregs, n) \
-	regs_get_kernel_argument(ftrace_get_regs(fregs), n)
-#define ftrace_regs_get_stack_pointer(fregs) \
-	kernel_stack_pointer(ftrace_get_regs(fregs))
-#define ftrace_regs_return_value(fregs) \
-	regs_return_value(ftrace_get_regs(fregs))
-#define ftrace_regs_set_return_value(fregs, ret) \
-	regs_set_return_value(ftrace_get_regs(fregs), ret)
-#define ftrace_override_function_with_return(fregs) \
-	override_function_with_return(ftrace_get_regs(fregs))
-#define ftrace_regs_query_register_offset(name) \
-	regs_query_register_offset(name)
-#endif
-
 typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
 			      struct ftrace_ops *op, struct ftrace_regs *fregs);
 
diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h
new file mode 100644
index 000000000000..dea6a0851b74
--- /dev/null
+++ b/include/linux/ftrace_regs.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FTRACE_REGS_H
+#define _LINUX_FTRACE_REGS_H
+
+/*
+ * For archs that just copy pt_regs in ftrace regs, it can use this default.
+ * If an architecture does not use pt_regs, it must define all the below
+ * accessor functions.
+ */
+#ifndef HAVE_ARCH_FTRACE_REGS
+struct __arch_ftrace_regs {
+	struct pt_regs		regs;
+};
+
+#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs))
+
+struct ftrace_regs;
+
+#define ftrace_regs_get_instruction_pointer(fregs) \
+	instruction_pointer(arch_ftrace_get_regs(fregs))
+#define ftrace_regs_get_argument(fregs, n) \
+	regs_get_kernel_argument(arch_ftrace_get_regs(fregs), n)
+#define ftrace_regs_get_stack_pointer(fregs) \
+	kernel_stack_pointer(arch_ftrace_get_regs(fregs))
+#define ftrace_regs_return_value(fregs) \
+	regs_return_value(arch_ftrace_get_regs(fregs))
+#define ftrace_regs_set_return_value(fregs, ret) \
+	regs_set_return_value(arch_ftrace_get_regs(fregs), ret)
+#define ftrace_override_function_with_return(fregs) \
+	override_function_with_return(arch_ftrace_get_regs(fregs))
+#define ftrace_regs_query_register_offset(name) \
+	regs_query_register_offset(name)
+
+#endif /* HAVE_ARCH_FTRACE_REGS */
+
+#endif /* _LINUX_FTRACE_REGS_H */
-- 
cgit v1.2.3


From a066397e8ed1036e8b959050ab6e830ee90d9f58 Mon Sep 17 00:00:00 2001
From: Gregory Price <gourry@gourry.net>
Date: Fri, 13 Sep 2024 19:19:53 -0400
Subject: tpm: fix unsigned/signed mismatch errors related to
 __calc_tpm2_event_size

__calc_tpm2_event_size returns 0 or a positive length, but return values
are often interpreted as ints.  Convert everything over to u32 to avoid
signed/unsigned logic errors.

Signed-off-by: Gregory Price <gourry@gourry.net>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/libstub/tpm.c | 6 +++---
 drivers/firmware/efi/tpm.c         | 2 +-
 include/linux/tpm_eventlog.h       | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/libstub/tpm.c b/drivers/firmware/efi/libstub/tpm.c
index 1fd6823248ab..d31ea3f351d8 100644
--- a/drivers/firmware/efi/libstub/tpm.c
+++ b/drivers/firmware/efi/libstub/tpm.c
@@ -57,7 +57,7 @@ static void efi_retrieve_tcg2_eventlog(int version, efi_physical_addr_t log_loca
 	struct linux_efi_tpm_eventlog *log_tbl = NULL;
 	unsigned long first_entry_addr, last_entry_addr;
 	size_t log_size, last_entry_size;
-	int final_events_size = 0;
+	u32 final_events_size = 0;
 
 	first_entry_addr = (unsigned long) log_location;
 
@@ -110,9 +110,9 @@ static void efi_retrieve_tcg2_eventlog(int version, efi_physical_addr_t log_loca
 	 */
 	if (final_events_table && final_events_table->nr_events) {
 		struct tcg_pcr_event2_head *header;
-		int offset;
+		u32 offset;
 		void *data;
-		int event_size;
+		u32 event_size;
 		int i = final_events_table->nr_events;
 
 		data = (void *)final_events_table;
diff --git a/drivers/firmware/efi/tpm.c b/drivers/firmware/efi/tpm.c
index b0cc2cc11d7e..cdd431027065 100644
--- a/drivers/firmware/efi/tpm.c
+++ b/drivers/firmware/efi/tpm.c
@@ -19,7 +19,7 @@ EXPORT_SYMBOL(efi_tpm_final_log_size);
 static int __init tpm2_calc_event_log_size(void *data, int count, void *size_info)
 {
 	struct tcg_pcr_event2_head *header;
-	int event_size, size = 0;
+	u32 event_size, size = 0;
 
 	while (count > 0) {
 		header = data + size;
diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index 7d68a5cc5881..891368e82558 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -157,7 +157,7 @@ struct tcg_algorithm_info {
  * Return: size of the event on success, 0 on failure
  */
 
-static __always_inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
+static __always_inline u32 __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
 					 struct tcg_pcr_event *event_header,
 					 bool do_mapping)
 {
-- 
cgit v1.2.3


From 58797abed49d6b78c7af99b03b037f20c7ffb203 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 5 Oct 2024 12:04:17 +0200
Subject: power: supply: core: constify power_supply_battery_info::resist_table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The power supply core never modifies the resist table.
Reflect this in the API, so drivers can mark their static tables as
const.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20241005-power-supply-battery-const-v1-1-c1f721927048@weissschuh.net
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/power_supply_core.c | 4 ++--
 include/linux/power_supply.h             | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index 49534458a9f7..a01703fa83c0 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -798,7 +798,7 @@ int power_supply_get_battery_info(struct power_supply *psy,
 		goto out_ret_pointer;
 
 	info->resist_table_size = len / (2 * sizeof(__be32));
-	resist_table = info->resist_table = devm_kcalloc(&psy->dev,
+	info->resist_table = resist_table = devm_kcalloc(&psy->dev,
 							 info->resist_table_size,
 							 sizeof(*resist_table),
 							 GFP_KERNEL);
@@ -982,7 +982,7 @@ EXPORT_SYMBOL_GPL(power_supply_battery_info_get_prop);
  *
  * Return: the battery internal resistance percent
  */
-int power_supply_temp2resist_simple(struct power_supply_resistance_temp_table *table,
+int power_supply_temp2resist_simple(const struct power_supply_resistance_temp_table *table,
 				    int table_len, int temp)
 {
 	int i, high, low;
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 910d407ebe63..9253411c105f 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -752,7 +752,7 @@ struct power_supply_battery_info {
 	int temp_max;
 	struct power_supply_battery_ocv_table *ocv_table[POWER_SUPPLY_OCV_TEMP_MAX];
 	int ocv_table_size[POWER_SUPPLY_OCV_TEMP_MAX];
-	struct power_supply_resistance_temp_table *resist_table;
+	const struct power_supply_resistance_temp_table *resist_table;
 	int resist_table_size;
 	const struct power_supply_vbat_ri_table *vbat2ri_discharging;
 	int vbat2ri_discharging_size;
@@ -805,7 +805,7 @@ power_supply_find_ocv2cap_table(struct power_supply_battery_info *info,
 extern int power_supply_batinfo_ocv2cap(struct power_supply_battery_info *info,
 					int ocv, int temp);
 extern int
-power_supply_temp2resist_simple(struct power_supply_resistance_temp_table *table,
+power_supply_temp2resist_simple(const struct power_supply_resistance_temp_table *table,
 				int table_len, int temp);
 extern int power_supply_vbat2ri(struct power_supply_battery_info *info,
 				int vbat_uv, bool charging);
-- 
cgit v1.2.3


From ce20d5b9e37099a035ab34d4d3f59e1744756385 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 5 Oct 2024 12:04:21 +0200
Subject: power: supply: core: constify power_supply_battery_info::ocv_table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The power supply core never modifies the ocv table.
Reflect this in the API, so drivers can mark their static tables as
const.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20241005-power-supply-battery-const-v1-5-c1f721927048@weissschuh.net
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/power_supply_core.c | 8 ++++----
 include/linux/power_supply.h             | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index a01703fa83c0..5aefba2ddcda 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -777,7 +777,7 @@ int power_supply_get_battery_info(struct power_supply *psy,
 		tab_len = size / (2 * sizeof(__be32));
 		info->ocv_table_size[index] = tab_len;
 
-		table = info->ocv_table[index] =
+		info->ocv_table[index] = table =
 			devm_kcalloc(&psy->dev, tab_len, sizeof(*table), GFP_KERNEL);
 		if (!info->ocv_table[index]) {
 			power_supply_put_battery_info(psy, info);
@@ -1093,7 +1093,7 @@ EXPORT_SYMBOL_GPL(power_supply_get_maintenance_charging_setting);
  *
  * Return: the battery capacity.
  */
-int power_supply_ocv2cap_simple(struct power_supply_battery_ocv_table *table,
+int power_supply_ocv2cap_simple(const struct power_supply_battery_ocv_table *table,
 				int table_len, int ocv)
 {
 	int i, high, low;
@@ -1118,7 +1118,7 @@ int power_supply_ocv2cap_simple(struct power_supply_battery_ocv_table *table,
 }
 EXPORT_SYMBOL_GPL(power_supply_ocv2cap_simple);
 
-struct power_supply_battery_ocv_table *
+const struct power_supply_battery_ocv_table *
 power_supply_find_ocv2cap_table(struct power_supply_battery_info *info,
 				int temp, int *table_len)
 {
@@ -1149,7 +1149,7 @@ EXPORT_SYMBOL_GPL(power_supply_find_ocv2cap_table);
 int power_supply_batinfo_ocv2cap(struct power_supply_battery_info *info,
 				 int ocv, int temp)
 {
-	struct power_supply_battery_ocv_table *table;
+	const struct power_supply_battery_ocv_table *table;
 	int table_len;
 
 	table = power_supply_find_ocv2cap_table(info, temp, &table_len);
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 9253411c105f..4e29ec39c18f 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -750,7 +750,7 @@ struct power_supply_battery_info {
 	int temp_alert_max;
 	int temp_min;
 	int temp_max;
-	struct power_supply_battery_ocv_table *ocv_table[POWER_SUPPLY_OCV_TEMP_MAX];
+	const struct power_supply_battery_ocv_table *ocv_table[POWER_SUPPLY_OCV_TEMP_MAX];
 	int ocv_table_size[POWER_SUPPLY_OCV_TEMP_MAX];
 	const struct power_supply_resistance_temp_table *resist_table;
 	int resist_table_size;
@@ -797,9 +797,9 @@ extern bool power_supply_battery_info_has_prop(struct power_supply_battery_info
 extern int power_supply_battery_info_get_prop(struct power_supply_battery_info *info,
 					      enum power_supply_property psp,
 					      union power_supply_propval *val);
-extern int power_supply_ocv2cap_simple(struct power_supply_battery_ocv_table *table,
+extern int power_supply_ocv2cap_simple(const struct power_supply_battery_ocv_table *table,
 				       int table_len, int ocv);
-extern struct power_supply_battery_ocv_table *
+extern const struct power_supply_battery_ocv_table *
 power_supply_find_ocv2cap_table(struct power_supply_battery_info *info,
 				int temp, int *table_len);
 extern int power_supply_batinfo_ocv2cap(struct power_supply_battery_info *info,
-- 
cgit v1.2.3


From 49000fee9e639f62ba1f965ed2ae4c5ad18d19e2 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 5 Oct 2024 12:05:03 +0200
Subject: power: supply: core: add wakeup source inhibit by power_supply_config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To inhibit wakeup users currently have to use dedicated functions to
register the power supply: {,devm_}power_supply_register_no_ws().
This is inconsistent to other runtime settings which can be configured
through struct power_supply_config.
It's also not obvious what _no_ws() is meant to mean.

Extend power_supply_config to also be able to inhibit the wakeup source.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20241005-power-supply-no-wakeup-source-v1-1-1d62bf9bcb1d@weissschuh.net
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/power_supply_core.c | 3 +++
 include/linux/power_supply.h             | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index 5aefba2ddcda..a2005e3c6f38 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -1410,6 +1410,9 @@ __power_supply_register(struct device *parent,
 	if (rc)
 		goto device_add_failed;
 
+	if (cfg && cfg->no_wakeup_source)
+		ws = false;
+
 	rc = device_init_wakeup(dev, ws);
 	if (rc)
 		goto wakeup_init_failed;
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 4e29ec39c18f..9a64043798bd 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -236,6 +236,8 @@ struct power_supply_config {
 
 	char **supplied_to;
 	size_t num_supplicants;
+
+	bool no_wakeup_source;
 };
 
 /* Description of power supply */
-- 
cgit v1.2.3


From 85d319e14f301e1c68131b74c1dceabae73d1e81 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 5 Oct 2024 12:05:10 +0200
Subject: power: supply: core: remove {,devm_}power_supply_register_no_ws()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The same functionality is available through
power_supply_config::no_wakeup_source, which is more idiomatic.
All users of the old API have been converted.

Also remove the argument "ws" from __power_supply_register(),
as it is now always "true".

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20241005-power-supply-no-wakeup-source-v1-8-1d62bf9bcb1d@weissschuh.net
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/power_supply_core.c | 71 ++------------------------------
 include/linux/power_supply.h             |  8 ----
 2 files changed, 4 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index a2005e3c6f38..f20b0fde9847 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -1342,8 +1342,7 @@ static void psy_unregister_thermal(struct power_supply *psy)
 static struct power_supply *__must_check
 __power_supply_register(struct device *parent,
 				   const struct power_supply_desc *desc,
-				   const struct power_supply_config *cfg,
-				   bool ws)
+				   const struct power_supply_config *cfg)
 {
 	struct device *dev;
 	struct power_supply *psy;
@@ -1410,10 +1409,7 @@ __power_supply_register(struct device *parent,
 	if (rc)
 		goto device_add_failed;
 
-	if (cfg && cfg->no_wakeup_source)
-		ws = false;
-
-	rc = device_init_wakeup(dev, ws);
+	rc = device_init_wakeup(dev, cfg ? !cfg->no_wakeup_source : true);
 	if (rc)
 		goto wakeup_init_failed;
 
@@ -1479,33 +1475,10 @@ struct power_supply *__must_check power_supply_register(struct device *parent,
 		const struct power_supply_desc *desc,
 		const struct power_supply_config *cfg)
 {
-	return __power_supply_register(parent, desc, cfg, true);
+	return __power_supply_register(parent, desc, cfg);
 }
 EXPORT_SYMBOL_GPL(power_supply_register);
 
-/**
- * power_supply_register_no_ws() - Register new non-waking-source power supply
- * @parent:	Device to be a parent of power supply's device, usually
- *		the device which probe function calls this
- * @desc:	Description of power supply, must be valid through whole
- *		lifetime of this power supply
- * @cfg:	Run-time specific configuration accessed during registering,
- *		may be NULL
- *
- * Return: A pointer to newly allocated power_supply on success
- * or ERR_PTR otherwise.
- * Use power_supply_unregister() on returned power_supply pointer to release
- * resources.
- */
-struct power_supply *__must_check
-power_supply_register_no_ws(struct device *parent,
-		const struct power_supply_desc *desc,
-		const struct power_supply_config *cfg)
-{
-	return __power_supply_register(parent, desc, cfg, false);
-}
-EXPORT_SYMBOL_GPL(power_supply_register_no_ws);
-
 static void devm_power_supply_release(struct device *dev, void *res)
 {
 	struct power_supply **psy = res;
@@ -1538,7 +1511,7 @@ devm_power_supply_register(struct device *parent,
 
 	if (!ptr)
 		return ERR_PTR(-ENOMEM);
-	psy = __power_supply_register(parent, desc, cfg, true);
+	psy = __power_supply_register(parent, desc, cfg);
 	if (IS_ERR(psy)) {
 		devres_free(ptr);
 	} else {
@@ -1549,42 +1522,6 @@ devm_power_supply_register(struct device *parent,
 }
 EXPORT_SYMBOL_GPL(devm_power_supply_register);
 
-/**
- * devm_power_supply_register_no_ws() - Register managed non-waking-source power supply
- * @parent:	Device to be a parent of power supply's device, usually
- *		the device which probe function calls this
- * @desc:	Description of power supply, must be valid through whole
- *		lifetime of this power supply
- * @cfg:	Run-time specific configuration accessed during registering,
- *		may be NULL
- *
- * Return: A pointer to newly allocated power_supply on success
- * or ERR_PTR otherwise.
- * The returned power_supply pointer will be automatically unregistered
- * on driver detach.
- */
-struct power_supply *__must_check
-devm_power_supply_register_no_ws(struct device *parent,
-		const struct power_supply_desc *desc,
-		const struct power_supply_config *cfg)
-{
-	struct power_supply **ptr, *psy;
-
-	ptr = devres_alloc(devm_power_supply_release, sizeof(*ptr), GFP_KERNEL);
-
-	if (!ptr)
-		return ERR_PTR(-ENOMEM);
-	psy = __power_supply_register(parent, desc, cfg, false);
-	if (IS_ERR(psy)) {
-		devres_free(ptr);
-	} else {
-		*ptr = psy;
-		devres_add(parent, ptr);
-	}
-	return psy;
-}
-EXPORT_SYMBOL_GPL(devm_power_supply_register_no_ws);
-
 /**
  * power_supply_unregister() - Remove this power supply from system
  * @psy:	Pointer to power supply to unregister
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 9a64043798bd..14eedefb3ac8 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -874,17 +874,9 @@ power_supply_register(struct device *parent,
 				 const struct power_supply_desc *desc,
 				 const struct power_supply_config *cfg);
 extern struct power_supply *__must_check
-power_supply_register_no_ws(struct device *parent,
-				 const struct power_supply_desc *desc,
-				 const struct power_supply_config *cfg);
-extern struct power_supply *__must_check
 devm_power_supply_register(struct device *parent,
 				 const struct power_supply_desc *desc,
 				 const struct power_supply_config *cfg);
-extern struct power_supply *__must_check
-devm_power_supply_register_no_ws(struct device *parent,
-				 const struct power_supply_desc *desc,
-				 const struct power_supply_config *cfg);
 extern void power_supply_unregister(struct power_supply *psy);
 extern int power_supply_powers(struct power_supply *psy, struct device *dev);
 
-- 
cgit v1.2.3


From 0b582611a8f4270fa357a22a546909b2dd5fc5fe Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Tue, 15 Oct 2024 10:28:43 +0900
Subject: ftrace: Use arch_ftrace_regs() for ftrace_regs_*() macros

Since the arch_ftrace_get_regs(fregs) is only valid when the
FL_SAVE_REGS is set, we need to use `&arch_ftrace_regs()->regs` for
ftrace_regs_*() APIs because those APIs are for ftrace_regs, not
complete pt_regs.

Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Florent Revest <revest@chromium.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: bpf <bpf@vger.kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/172895572290.107311.16057631001860177198.stgit@devnote2
Fixes: e4cf33ca4812 ("ftrace: Consolidate ftrace_regs accessor functions for archs using pt_regs")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace_regs.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h
index dea6a0851b74..b78a0a60515b 100644
--- a/include/linux/ftrace_regs.h
+++ b/include/linux/ftrace_regs.h
@@ -17,17 +17,17 @@ struct __arch_ftrace_regs {
 struct ftrace_regs;
 
 #define ftrace_regs_get_instruction_pointer(fregs) \
-	instruction_pointer(arch_ftrace_get_regs(fregs))
+	instruction_pointer(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_get_argument(fregs, n) \
-	regs_get_kernel_argument(arch_ftrace_get_regs(fregs), n)
+	regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n)
 #define ftrace_regs_get_stack_pointer(fregs) \
-	kernel_stack_pointer(arch_ftrace_get_regs(fregs))
+	kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_return_value(fregs) \
-	regs_return_value(arch_ftrace_get_regs(fregs))
+	regs_return_value(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_set_return_value(fregs, ret) \
-	regs_set_return_value(arch_ftrace_get_regs(fregs), ret)
+	regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret)
 #define ftrace_override_function_with_return(fregs) \
-	override_function_with_return(arch_ftrace_get_regs(fregs))
+	override_function_with_return(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_query_register_offset(name) \
 	regs_query_register_offset(name)
 
-- 
cgit v1.2.3


From 2d17932da44fdc1ba835ad05110ab996d2912dbf Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Tue, 15 Oct 2024 10:28:53 +0900
Subject: ftrace: Rename ftrace_regs_return_value to
 ftrace_regs_get_return_value

Rename ftrace_regs_return_value to ftrace_regs_get_return_value as same as
other ftrace_regs_get/set_* APIs. arm64 and riscv are already using this
new name.

Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Florent Revest <revest@chromium.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: bpf <bpf@vger.kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Alan Maguire <alan.maguire@oracle.com>
Link: https://lore.kernel.org/172895573350.107311.7564634260652361511.stgit@devnote2
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace_regs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h
index b78a0a60515b..be1ed0c891d0 100644
--- a/include/linux/ftrace_regs.h
+++ b/include/linux/ftrace_regs.h
@@ -22,7 +22,7 @@ struct ftrace_regs;
 	regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n)
 #define ftrace_regs_get_stack_pointer(fregs) \
 	kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs)
-#define ftrace_regs_return_value(fregs) \
+#define ftrace_regs_get_return_value(fregs) \
 	regs_return_value(&arch_ftrace_regs(fregs)->regs)
 #define ftrace_regs_set_return_value(fregs, ret) \
 	regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret)
-- 
cgit v1.2.3


From 102f085d84607462234ac60f6027973b45a9bde2 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Mon, 14 Oct 2024 10:22:21 +0200
Subject: timers: Rename usleep_idle_range() to usleep_range_idle()

usleep_idle_range() is a variant of usleep_range(). Both are using
usleep_range_state() as a base. To be able to find all the related
functions in one go, rename it usleep_idle_range() to usleep_range_idle().

No functional change.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: SeongJae Park <sj@kernel.org>
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-4-dc8b907cb62f@linutronix.de
---
 include/linux/delay.h | 2 +-
 mm/damon/core.c       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/delay.h b/include/linux/delay.h
index ff9cda975e30..2bc586aa2068 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -68,7 +68,7 @@ static inline void usleep_range(unsigned long min, unsigned long max)
 	usleep_range_state(min, max, TASK_UNINTERRUPTIBLE);
 }
 
-static inline void usleep_idle_range(unsigned long min, unsigned long max)
+static inline void usleep_range_idle(unsigned long min, unsigned long max)
 {
 	usleep_range_state(min, max, TASK_IDLE);
 }
diff --git a/mm/damon/core.c b/mm/damon/core.c
index a83f3b736d51..c725c78b43f0 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1896,7 +1896,7 @@ static void kdamond_usleep(unsigned long usecs)
 	if (usecs > 20 * USEC_PER_MSEC)
 		schedule_timeout_idle(usecs_to_jiffies(usecs));
 	else
-		usleep_idle_range(usecs, usecs + 1);
+		usleep_range_idle(usecs, usecs + 1);
 }
 
 /* Returns negative error code if it's not activated but should return */
-- 
cgit v1.2.3


From f36eb171410839325fff9cd9b7b7400f7e606962 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Mon, 14 Oct 2024 10:22:22 +0200
Subject: timers: Update function descriptions of sleep/delay related functions

A lot of commonly used functions for inserting a sleep or delay lack a
proper function description. Add function descriptions to all of them to
have important information in a central place close to the code.

No functional change.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-5-dc8b907cb62f@linutronix.de
---
 include/asm-generic/delay.h | 41 +++++++++++++++++++++++++++++++----
 include/linux/delay.h       | 48 ++++++++++++++++++++++++++++++----------
 kernel/time/sleep_timeout.c | 53 ++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 120 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/delay.h b/include/asm-generic/delay.h
index e448ac61430c..a8cee41cc51b 100644
--- a/include/asm-generic/delay.h
+++ b/include/asm-generic/delay.h
@@ -12,11 +12,39 @@ extern void __const_udelay(unsigned long xloops);
 extern void __delay(unsigned long loops);
 
 /*
- * The weird n/20000 thing suppresses a "comparison is always false due to
- * limited range of data type" warning with non-const 8-bit arguments.
+ * Implementation details:
+ *
+ * * The weird n/20000 thing suppresses a "comparison is always false due to
+ *   limited range of data type" warning with non-const 8-bit arguments.
+ * * 0x10c7 is 2**32 / 1000000 (rounded up) -> udelay
+ * * 0x5 is 2**32 / 1000000000 (rounded up) -> ndelay
  */
 
-/* 0x10c7 is 2**32 / 1000000 (rounded up) */
+/**
+ * udelay - Inserting a delay based on microseconds with busy waiting
+ * @usec:	requested delay in microseconds
+ *
+ * When delaying in an atomic context ndelay(), udelay() and mdelay() are the
+ * only valid variants of delaying/sleeping to go with.
+ *
+ * When inserting delays in non atomic context which are shorter than the time
+ * which is required to queue e.g. an hrtimer and to enter then the scheduler,
+ * it is also valuable to use udelay(). But it is not simple to specify a
+ * generic threshold for this which will fit for all systems. An approximation
+ * is a threshold for all delays up to 10 microseconds.
+ *
+ * When having a delay which is larger than the architecture specific
+ * %MAX_UDELAY_MS value, please make sure mdelay() is used. Otherwise a overflow
+ * risk is given.
+ *
+ * Please note that ndelay(), udelay() and mdelay() may return early for several
+ * reasons (https://lists.openwall.net/linux-kernel/2011/01/09/56):
+ *
+ * #. computed loops_per_jiffy too low (due to the time taken to execute the
+ *    timer interrupt.)
+ * #. cache behaviour affecting the time it takes to execute the loop function.
+ * #. CPU clock rate changes.
+ */
 #define udelay(n)							\
 	({								\
 		if (__builtin_constant_p(n)) {				\
@@ -29,7 +57,12 @@ extern void __delay(unsigned long loops);
 		}							\
 	})
 
-/* 0x5 is 2**32 / 1000000000 (rounded up) */
+/**
+ * ndelay - Inserting a delay based on nanoseconds with busy waiting
+ * @nsec:	requested delay in nanoseconds
+ *
+ * See udelay() for basic information about ndelay() and it's variants.
+ */
 #define ndelay(n)							\
 	({								\
 		if (__builtin_constant_p(n)) {				\
diff --git a/include/linux/delay.h b/include/linux/delay.h
index 2bc586aa2068..2de509e4adce 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -6,17 +6,7 @@
  * Copyright (C) 1993 Linus Torvalds
  *
  * Delay routines, using a pre-computed "loops_per_jiffy" value.
- *
- * Please note that ndelay(), udelay() and mdelay() may return early for
- * several reasons:
- *  1. computed loops_per_jiffy too low (due to the time taken to
- *     execute the timer interrupt.)
- *  2. cache behaviour affecting the time it takes to execute the
- *     loop function.
- *  3. CPU clock rate changes.
- *
- * Please see this thread:
- *   https://lists.openwall.net/linux-kernel/2011/01/09/56
+ * Sleep routines using timer list timers or hrtimers.
  */
 
 #include <linux/math.h>
@@ -35,12 +25,21 @@ extern unsigned long loops_per_jiffy;
  * The 2nd mdelay() definition ensures GCC will optimize away the 
  * while loop for the common cases where n <= MAX_UDELAY_MS  --  Paul G.
  */
-
 #ifndef MAX_UDELAY_MS
 #define MAX_UDELAY_MS	5
 #endif
 
 #ifndef mdelay
+/**
+ * mdelay - Inserting a delay based on milliseconds with busy waiting
+ * @n:	requested delay in milliseconds
+ *
+ * See udelay() for basic information about mdelay() and it's variants.
+ *
+ * Please double check, whether mdelay() is the right way to go or whether a
+ * refactoring of the code is the better variant to be able to use msleep()
+ * instead.
+ */
 #define mdelay(n) (\
 	(__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
 	({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
@@ -63,16 +62,41 @@ unsigned long msleep_interruptible(unsigned int msecs);
 void usleep_range_state(unsigned long min, unsigned long max,
 			unsigned int state);
 
+/**
+ * usleep_range - Sleep for an approximate time
+ * @min:	Minimum time in microseconds to sleep
+ * @max:	Maximum time in microseconds to sleep
+ *
+ * For basic information please refere to usleep_range_state().
+ *
+ * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep.
+ */
 static inline void usleep_range(unsigned long min, unsigned long max)
 {
 	usleep_range_state(min, max, TASK_UNINTERRUPTIBLE);
 }
 
+/**
+ * usleep_range_idle - Sleep for an approximate time with idle time accounting
+ * @min:	Minimum time in microseconds to sleep
+ * @max:	Maximum time in microseconds to sleep
+ *
+ * For basic information please refere to usleep_range_state().
+ *
+ * The sleeping task has the state TASK_IDLE during the sleep to prevent
+ * contribution to the load avarage.
+ */
 static inline void usleep_range_idle(unsigned long min, unsigned long max)
 {
 	usleep_range_state(min, max, TASK_IDLE);
 }
 
+/**
+ * ssleep - wrapper for seconds around msleep
+ * @seconds:	Requested sleep duration in seconds
+ *
+ * Please refere to msleep() for detailed information.
+ */
 static inline void ssleep(unsigned int seconds)
 {
 	msleep(seconds * 1000);
diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c
index 560d17c30aa5..f3f246e4c8d1 100644
--- a/kernel/time/sleep_timeout.c
+++ b/kernel/time/sleep_timeout.c
@@ -281,7 +281,34 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout);
 
 /**
  * msleep - sleep safely even with waitqueue interruptions
- * @msecs: Time in milliseconds to sleep for
+ * @msecs:	Requested sleep duration in milliseconds
+ *
+ * msleep() uses jiffy based timeouts for the sleep duration. Because of the
+ * design of the timer wheel, the maximum additional percentage delay (slack) is
+ * 12.5%. This is only valid for timers which will end up in level 1 or a higher
+ * level of the timer wheel. For explanation of those 12.5% please check the
+ * detailed description about the basics of the timer wheel.
+ *
+ * The slack of timers which will end up in level 0 depends on sleep duration
+ * (msecs) and HZ configuration and can be calculated in the following way (with
+ * the timer wheel design restriction that the slack is not less than 12.5%):
+ *
+ *   ``slack = MSECS_PER_TICK / msecs``
+ *
+ * When the allowed slack of the callsite is known, the calculation could be
+ * turned around to find the minimal allowed sleep duration to meet the
+ * constraints. For example:
+ *
+ * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``:
+ *   all sleep durations greater or equal 4ms will meet the constraints.
+ * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``:
+ *   all sleep durations greater or equal 8ms will meet the constraints.
+ * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``:
+ *   all sleep durations greater or equal 16ms will meet the constraints.
+ * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``:
+ *   all sleep durations greater or equal 32ms will meet the constraints.
+ *
+ * See also the signal aware variant msleep_interruptible().
  */
 void msleep(unsigned int msecs)
 {
@@ -294,7 +321,15 @@ EXPORT_SYMBOL(msleep);
 
 /**
  * msleep_interruptible - sleep waiting for signals
- * @msecs: Time in milliseconds to sleep for
+ * @msecs:	Requested sleep duration in milliseconds
+ *
+ * See msleep() for some basic information.
+ *
+ * The difference between msleep() and msleep_interruptible() is that the sleep
+ * could be interrupted by a signal delivery and then returns early.
+ *
+ * Returns: The remaining time of the sleep duration transformed to msecs (see
+ * schedule_timeout() for details).
  */
 unsigned long msleep_interruptible(unsigned int msecs)
 {
@@ -312,11 +347,17 @@ EXPORT_SYMBOL(msleep_interruptible);
  * @max:	Maximum time in usecs to sleep
  * @state:	State of the current task that will be while sleeping
  *
+ * usleep_range_state() sleeps at least for the minimum specified time but not
+ * longer than the maximum specified amount of time. The range might reduce
+ * power usage by allowing hrtimers to coalesce an already scheduled interrupt
+ * with this hrtimer. In the worst case, an interrupt is scheduled for the upper
+ * bound.
+ *
+ * The sleeping task is set to the specified state before starting the sleep.
+ *
  * In non-atomic context where the exact wakeup time is flexible, use
- * usleep_range_state() instead of udelay().  The sleep improves responsiveness
- * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
- * power usage by allowing hrtimers to take advantage of an already-
- * scheduled interrupt instead of scheduling a new one just for this sleep.
+ * usleep_range() or its variants instead of udelay(). The sleep improves
+ * responsiveness by avoiding the CPU-hogging busy-wait of udelay().
  */
 void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state)
 {
-- 
cgit v1.2.3


From 82e11e47c1880362e05c065bef7dbe28a749555c Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Mon, 14 Oct 2024 10:22:24 +0200
Subject: timers: Adjust flseep() to reflect reality

fsleep() simply implements the recommendations of the outdated
documentation in "Documentation/timers/timers-howto.rst". This should be a
user friendly interface to choose always the best timeout function
approach:

- udelay() for very short sleep durations shorter than 10 microseconds
- usleep_range() for sleep durations until 20 milliseconds
- msleep() for the others

The actual implementation has several problems:

- It does not take into account that HZ resolution also has an impact on
  granularity of jiffies and has also an impact on the granularity of the
  buckets of timer wheel levels. This means that accuracy for the timeout
  does not have an upper limit. When executing fsleep(20000) on a HZ=100
  system, the possible additional slack will be 50% as the granularity of
  the buckets in the lowest level is 10 milliseconds.

- The upper limit of usleep_range() is twice the requested timeout. When no
  other interrupts occur in this range, the maximum value is used. This
  means that the requested sleep length has then an additional delay of
  100%.

Change the thresholds for the decisions in fsleep() to make sure the
maximum slack which is added to the sleep duration is 25%.

Note: Outdated documentation will be updated in a followup patch.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-7-dc8b907cb62f@linutronix.de
---
 include/linux/delay.h | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/delay.h b/include/linux/delay.h
index 2de509e4adce..89866bab100d 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -11,6 +11,7 @@
 
 #include <linux/math.h>
 #include <linux/sched.h>
+#include <linux/jiffies.h>
 
 extern unsigned long loops_per_jiffy;
 
@@ -102,15 +103,35 @@ static inline void ssleep(unsigned int seconds)
 	msleep(seconds * 1000);
 }
 
-/* see Documentation/timers/timers-howto.rst for the thresholds */
+static const unsigned int max_slack_shift = 2;
+#define USLEEP_RANGE_UPPER_BOUND	((TICK_NSEC << max_slack_shift) / NSEC_PER_USEC)
+
+/**
+ * fsleep - flexible sleep which autoselects the best mechanism
+ * @usecs:	requested sleep duration in microseconds
+ *
+ * flseep() selects the best mechanism that will provide maximum 25% slack
+ * to the requested sleep duration. Therefore it uses:
+ *
+ * * udelay() loop for sleep durations <= 10 microseconds to avoid hrtimer
+ *   overhead for really short sleep durations.
+ * * usleep_range() for sleep durations which would lead with the usage of
+ *   msleep() to a slack larger than 25%. This depends on the granularity of
+ *   jiffies.
+ * * msleep() for all other sleep durations.
+ *
+ * Note: When %CONFIG_HIGH_RES_TIMERS is not set, all sleeps are processed with
+ * the granularity of jiffies and the slack might exceed 25% especially for
+ * short sleep durations.
+ */
 static inline void fsleep(unsigned long usecs)
 {
 	if (usecs <= 10)
 		udelay(usecs);
-	else if (usecs <= 20000)
-		usleep_range(usecs, 2 * usecs);
+	else if (usecs < USLEEP_RANGE_UPPER_BOUND)
+		usleep_range(usecs, usecs + (usecs >> max_slack_shift));
 	else
-		msleep(DIV_ROUND_UP(usecs, 1000));
+		msleep(DIV_ROUND_UP(usecs, USEC_PER_MSEC));
 }
 
 #endif /* defined(_LINUX_DELAY_H) */
-- 
cgit v1.2.3


From 89124747f096fc0fe44be0162c7b4fb3271739e8 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Mon, 14 Oct 2024 10:22:29 +0200
Subject: iopoll/regmap/phy/snd: Fix comment referencing outdated timer
 documentation

Function descriptions in iopoll.h, regmap.h, phy.h and sound/soc/sof/ops.h
copied all the same outdated documentation about sleep/delay function
limitations. In those comments, the generic (and still outdated) timer
documentation file is referenced.

As proper function descriptions for used delay and sleep functions are in
place, simply update the descriptions to reference to them. While at it fix
missing colon after "Returns" in function description and move return value
description to the end of the function description.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch> # for phy.h
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-12-dc8b907cb62f@linutronix.de
---
 include/linux/iopoll.h | 52 +++++++++++++++++++++++++-------------------------
 include/linux/phy.h    |  9 +++++----
 include/linux/regmap.h | 38 ++++++++++++++++++------------------
 sound/soc/sof/ops.h    |  8 ++++----
 4 files changed, 54 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h
index 19a7b00baff4..91324c331a4b 100644
--- a/include/linux/iopoll.h
+++ b/include/linux/iopoll.h
@@ -19,19 +19,19 @@
  * @op: accessor function (takes @args as its arguments)
  * @val: Variable to read the value into
  * @cond: Break condition (usually involving @val)
- * @sleep_us: Maximum time to sleep between reads in us (0
- *            tight-loops).  Should be less than ~20ms since usleep_range
- *            is used (see Documentation/timers/timers-howto.rst).
+ * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please
+ *            read usleep_range() function description for details and
+ *            limitations.
  * @timeout_us: Timeout in us, 0 means never timeout
  * @sleep_before_read: if it is true, sleep @sleep_us before read.
  * @args: arguments for @op poll
  *
- * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
- * case, the last read value at @args is stored in @val. Must not
- * be called from atomic context if sleep_us or timeout_us are used.
- *
  * When available, you'll probably want to use one of the specialized
  * macros defined below rather than this macro directly.
+ *
+ * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either
+ * case, the last read value at @args is stored in @val. Must not
+ * be called from atomic context if sleep_us or timeout_us are used.
  */
 #define read_poll_timeout(op, val, cond, sleep_us, timeout_us, \
 				sleep_before_read, args...) \
@@ -64,22 +64,22 @@
  * @op: accessor function (takes @args as its arguments)
  * @val: Variable to read the value into
  * @cond: Break condition (usually involving @val)
- * @delay_us: Time to udelay between reads in us (0 tight-loops).  Should
- *            be less than ~10us since udelay is used (see
- *            Documentation/timers/timers-howto.rst).
+ * @delay_us: Time to udelay between reads in us (0 tight-loops). Please
+ *            read udelay() function description for details and
+ *            limitations.
  * @timeout_us: Timeout in us, 0 means never timeout
  * @delay_before_read: if it is true, delay @delay_us before read.
  * @args: arguments for @op poll
  *
- * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
- * case, the last read value at @args is stored in @val.
- *
  * This macro does not rely on timekeeping.  Hence it is safe to call even when
  * timekeeping is suspended, at the expense of an underestimation of wall clock
  * time, which is rather minimal with a non-zero delay_us.
  *
  * When available, you'll probably want to use one of the specialized
  * macros defined below rather than this macro directly.
+ *
+ * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either
+ * case, the last read value at @args is stored in @val.
  */
 #define read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, \
 					delay_before_read, args...) \
@@ -119,17 +119,17 @@
  * @addr: Address to poll
  * @val: Variable to read the value into
  * @cond: Break condition (usually involving @val)
- * @sleep_us: Maximum time to sleep between reads in us (0
- *            tight-loops).  Should be less than ~20ms since usleep_range
- *            is used (see Documentation/timers/timers-howto.rst).
+ * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please
+ *            read usleep_range() function description for details and
+ *            limitations.
  * @timeout_us: Timeout in us, 0 means never timeout
  *
- * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
- * case, the last read value at @addr is stored in @val. Must not
- * be called from atomic context if sleep_us or timeout_us are used.
- *
  * When available, you'll probably want to use one of the specialized
  * macros defined below rather than this macro directly.
+ *
+ * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either
+ * case, the last read value at @addr is stored in @val. Must not
+ * be called from atomic context if sleep_us or timeout_us are used.
  */
 #define readx_poll_timeout(op, addr, val, cond, sleep_us, timeout_us)	\
 	read_poll_timeout(op, val, cond, sleep_us, timeout_us, false, addr)
@@ -140,16 +140,16 @@
  * @addr: Address to poll
  * @val: Variable to read the value into
  * @cond: Break condition (usually involving @val)
- * @delay_us: Time to udelay between reads in us (0 tight-loops).  Should
- *            be less than ~10us since udelay is used (see
- *            Documentation/timers/timers-howto.rst).
+ * @delay_us: Time to udelay between reads in us (0 tight-loops). Please
+ *            read udelay() function description for details and
+ *            limitations.
  * @timeout_us: Timeout in us, 0 means never timeout
  *
- * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
- * case, the last read value at @addr is stored in @val.
- *
  * When available, you'll probably want to use one of the specialized
  * macros defined below rather than this macro directly.
+ *
+ * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either
+ * case, the last read value at @addr is stored in @val.
  */
 #define readx_poll_timeout_atomic(op, addr, val, cond, delay_us, timeout_us) \
 	read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, false, addr)
diff --git a/include/linux/phy.h b/include/linux/phy.h
index a98bc91a0cde..504766d4b2d5 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1378,12 +1378,13 @@ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum);
  * @regnum: The register on the MMD to read
  * @val: Variable to read the register into
  * @cond: Break condition (usually involving @val)
- * @sleep_us: Maximum time to sleep between reads in us (0
- *            tight-loops).  Should be less than ~20ms since usleep_range
- *            is used (see Documentation/timers/timers-howto.rst).
+ * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please
+ *            read usleep_range() function description for details and
+ *            limitations.
  * @timeout_us: Timeout in us, 0 means never timeout
  * @sleep_before_read: if it is true, sleep @sleep_us before read.
- * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
+ *
+ * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either
  * case, the last read value at @args is stored in @val. Must not
  * be called from atomic context if sleep_us or timeout_us are used.
  */
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index f9ccad32fc5c..75f162b60ba1 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -106,17 +106,17 @@ struct reg_sequence {
  * @addr: Address to poll
  * @val: Unsigned integer variable to read the value into
  * @cond: Break condition (usually involving @val)
- * @sleep_us: Maximum time to sleep between reads in us (0
- *            tight-loops).  Should be less than ~20ms since usleep_range
- *            is used (see Documentation/timers/timers-howto.rst).
+ * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please
+ *            read usleep_range() function description for details and
+ *            limitations.
  * @timeout_us: Timeout in us, 0 means never timeout
  *
- * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_read
+ * This is modelled after the readx_poll_timeout macros in linux/iopoll.h.
+ *
+ * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_read
  * error return value in case of a error read. In the two former cases,
  * the last read value at @addr is stored in @val. Must not be called
  * from atomic context if sleep_us or timeout_us are used.
- *
- * This is modelled after the readx_poll_timeout macros in linux/iopoll.h.
  */
 #define regmap_read_poll_timeout(map, addr, val, cond, sleep_us, timeout_us) \
 ({ \
@@ -133,20 +133,20 @@ struct reg_sequence {
  * @addr: Address to poll
  * @val: Unsigned integer variable to read the value into
  * @cond: Break condition (usually involving @val)
- * @delay_us: Time to udelay between reads in us (0 tight-loops).
- *            Should be less than ~10us since udelay is used
- *            (see Documentation/timers/timers-howto.rst).
+ * @delay_us: Time to udelay between reads in us (0 tight-loops). Please
+ *            read udelay() function description for details and
+ *            limitations.
  * @timeout_us: Timeout in us, 0 means never timeout
  *
- * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_read
- * error return value in case of a error read. In the two former cases,
- * the last read value at @addr is stored in @val.
- *
  * This is modelled after the readx_poll_timeout_atomic macros in linux/iopoll.h.
  *
  * Note: In general regmap cannot be used in atomic context. If you want to use
  * this macro then first setup your regmap for atomic use (flat or no cache
  * and MMIO regmap).
+ *
+ * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_read
+ * error return value in case of a error read. In the two former cases,
+ * the last read value at @addr is stored in @val.
  */
 #define regmap_read_poll_timeout_atomic(map, addr, val, cond, delay_us, timeout_us) \
 ({ \
@@ -177,17 +177,17 @@ struct reg_sequence {
  * @field: Regmap field to read from
  * @val: Unsigned integer variable to read the value into
  * @cond: Break condition (usually involving @val)
- * @sleep_us: Maximum time to sleep between reads in us (0
- *            tight-loops).  Should be less than ~20ms since usleep_range
- *            is used (see Documentation/timers/timers-howto.rst).
+ * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please
+ *            read usleep_range() function description for details and
+ *            limitations.
  * @timeout_us: Timeout in us, 0 means never timeout
  *
- * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_field_read
+ * This is modelled after the readx_poll_timeout macros in linux/iopoll.h.
+ *
+ * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_field_read
  * error return value in case of a error read. In the two former cases,
  * the last read value at @addr is stored in @val. Must not be called
  * from atomic context if sleep_us or timeout_us are used.
- *
- * This is modelled after the readx_poll_timeout macros in linux/iopoll.h.
  */
 #define regmap_field_read_poll_timeout(field, val, cond, sleep_us, timeout_us) \
 ({ \
diff --git a/sound/soc/sof/ops.h b/sound/soc/sof/ops.h
index 2584621c3b2d..d73644e85b6e 100644
--- a/sound/soc/sof/ops.h
+++ b/sound/soc/sof/ops.h
@@ -597,12 +597,12 @@ snd_sof_is_chain_dma_supported(struct snd_sof_dev *sdev, u32 dai_type)
  * @addr: Address to poll
  * @val: Variable to read the value into
  * @cond: Break condition (usually involving @val)
- * @sleep_us: Maximum time to sleep between reads in us (0
- *            tight-loops).  Should be less than ~20ms since usleep_range
- *            is used (see Documentation/timers/timers-howto.rst).
+ * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please
+ *            read usleep_range() function description for details and
+ *            limitations.
  * @timeout_us: Timeout in us, 0 means never timeout
  *
- * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
+ * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either
  * case, the last read value at @addr is stored in @val. Must not
  * be called from atomic context if sleep_us or timeout_us are used.
  *
-- 
cgit v1.2.3


From 719258c55f7e9c123ef2a0941c6730b6d43c1bc2 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Mon, 9 Sep 2024 15:49:41 +0200
Subject: mfd: palmas: Constify strings with regulator names

The names of regulators are static const strings, so pointers can be made
as pointers to const for code safety.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240909134941.121847-1-krzysztof.kozlowski@linaro.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/palmas.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/palmas.h b/include/linux/mfd/palmas.h
index eda1ffd99c1a..dabcc0dea802 100644
--- a/include/linux/mfd/palmas.h
+++ b/include/linux/mfd/palmas.h
@@ -98,8 +98,8 @@ struct palmas_sleep_requestor_info {
 };
 
 struct palmas_regs_info {
-	char	*name;
-	char	*sname;
+	const char	*name;
+	const char	*sname;
 	u8	vsel_addr;
 	u8	ctrl_addr;
 	u8	tstep_addr;
-- 
cgit v1.2.3


From d5340a18cd321075442677ce1002d8e163b17b67 Mon Sep 17 00:00:00 2001
From: Dzmitry Sankouski <dsankouski@gmail.com>
Date: Thu, 26 Sep 2024 13:28:10 +0300
Subject: mfd: max77693: Remove unused max77693_irq_source declarations

Remove `enum max77693_irq_source` declaration because unused.

Signed-off-by: Dzmitry Sankouski <dsankouski@gmail.com>
Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20240617-starqltechn_integration_upstream-v5-1-125d9228d751@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/max77693-private.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
index 20c5e02ed9da..c324d548619e 100644
--- a/include/linux/mfd/max77693-private.h
+++ b/include/linux/mfd/max77693-private.h
@@ -419,17 +419,6 @@ enum max77693_haptic_reg {
 #define MAX77693_CONFIG2_MEN            6
 #define MAX77693_CONFIG2_HTYP           5
 
-enum max77693_irq_source {
-	LED_INT = 0,
-	TOPSYS_INT,
-	CHG_INT,
-	MUIC_INT1,
-	MUIC_INT2,
-	MUIC_INT3,
-
-	MAX77693_IRQ_GROUP_NR,
-};
-
 #define SRC_IRQ_CHARGER			BIT(0)
 #define SRC_IRQ_TOP			BIT(1)
 #define SRC_IRQ_FLASH			BIT(2)
-- 
cgit v1.2.3


From bf231e5febcf9358d7e70a2c6974548f7f3e4f61 Mon Sep 17 00:00:00 2001
From: Dzmitry Sankouski <dsankouski@gmail.com>
Date: Thu, 26 Sep 2024 12:47:31 +0300
Subject: mfd: sec-core: Add support for the Samsung s2dos05

S2DOS05 is a panel/touchscreen PMIC, often found in
Samsung phones. We define regulator sub-device for which driver will
be added in subsequent patch. The device also has ADC for power and
current measurements.

Signed-off-by: Dzmitry Sankouski <dsankouski@gmail.com>
Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20240617-starqltechn_integration_upstream-v5-2-ea1109029ba5@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/sec-core.c           | 11 +++++++++++
 include/linux/mfd/samsung/core.h |  1 +
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/sec-core.c b/drivers/mfd/sec-core.c
index a6b0d7300b2d..cdfe738e1d76 100644
--- a/drivers/mfd/sec-core.c
+++ b/drivers/mfd/sec-core.c
@@ -34,6 +34,10 @@ static const struct mfd_cell s5m8767_devs[] = {
 	},
 };
 
+static const struct mfd_cell s2dos05_devs[] = {
+	{ .name = "s2dos05-regulator", },
+};
+
 static const struct mfd_cell s2mps11_devs[] = {
 	{ .name = "s2mps11-regulator", },
 	{ .name = "s2mps14-rtc", },
@@ -83,6 +87,9 @@ static const struct of_device_id sec_dt_match[] = {
 	{
 		.compatible = "samsung,s5m8767-pmic",
 		.data = (void *)S5M8767X,
+	}, {
+		.compatible = "samsung,s2dos05",
+		.data = (void *)S2DOS05,
 	}, {
 		.compatible = "samsung,s2mps11-pmic",
 		.data = (void *)S2MPS11X,
@@ -339,6 +346,10 @@ static int sec_pmic_probe(struct i2c_client *i2c)
 		sec_devs = s5m8767_devs;
 		num_sec_devs = ARRAY_SIZE(s5m8767_devs);
 		break;
+	case S2DOS05:
+		sec_devs = s2dos05_devs;
+		num_sec_devs = ARRAY_SIZE(s2dos05_devs);
+		break;
 	case S2MPA01:
 		sec_devs = s2mpa01_devs;
 		num_sec_devs = ARRAY_SIZE(s2mpa01_devs);
diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h
index a212b9f72bc9..750274d41fc0 100644
--- a/include/linux/mfd/samsung/core.h
+++ b/include/linux/mfd/samsung/core.h
@@ -37,6 +37,7 @@ struct gpio_desc;
 
 enum sec_device_type {
 	S5M8767X,
+	S2DOS05,
 	S2MPA01,
 	S2MPS11X,
 	S2MPS13X,
-- 
cgit v1.2.3


From 2b627246c0cf2b54f1b17531dde964cf5b99b2a5 Mon Sep 17 00:00:00 2001
From: Pavan Holla <pholla@chromium.org>
Date: Tue, 10 Sep 2024 10:15:21 +0000
Subject: platform/chrome: Update EC feature flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Define EC_FEATURE_UCSI_PPM to enable usage of the cros_ec_ucsi
driver. Also, add any feature flags that are implemented by the EC
but are missing in the kernel header.

Signed-off-by: Pavan Holla <pholla@chromium.org>
Signed-off-by: Łukasz Bartosik <ukaszb@chromium.org>
Acked-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20240910101527.603452-3-ukaszb@chromium.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/platform_data/cros_ec_commands.h | 32 ++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index e574b790be6f..b3c4993e656e 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -1312,6 +1312,38 @@ enum ec_feature_code {
 	 * The EC supports the AP composing VDMs for us to send.
 	 */
 	EC_FEATURE_TYPEC_AP_VDM_SEND = 46,
+	/*
+	 * The EC supports system safe mode panic recovery.
+	 */
+	EC_FEATURE_SYSTEM_SAFE_MODE = 47,
+	/*
+	 * The EC will reboot on runtime assertion failures.
+	 */
+	EC_FEATURE_ASSERT_REBOOTS = 48,
+	/*
+	 * The EC image is built with tokenized logging enabled.
+	 */
+	EC_FEATURE_TOKENIZED_LOGGING = 49,
+	/*
+	 * The EC supports triggering an STB dump.
+	 */
+	EC_FEATURE_AMD_STB_DUMP = 50,
+	/*
+	 * The EC supports memory dump commands.
+	 */
+	EC_FEATURE_MEMORY_DUMP = 51,
+	/*
+	 * The EC supports DP2.1 capability
+	 */
+	EC_FEATURE_TYPEC_DP2_1 = 52,
+	/*
+	 * The MCU is System Companion Processor Core 1
+	 */
+	EC_FEATURE_SCP_C1 = 53,
+	/*
+	 * The EC supports UCSI PPM.
+	 */
+	EC_FEATURE_UCSI_PPM = 54,
 };
 
 #define EC_FEATURE_MASK_0(event_code) BIT(event_code % 32)
-- 
cgit v1.2.3


From 522249f05c5551aec9ec0ba9b6438f1ec19c138d Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 3 Oct 2024 16:29:22 +0200
Subject: fanotify: allow reporting errors on failure to open fd

When working in "fd mode", fanotify_read() needs to open an fd
from a dentry to report event->fd to userspace.

Opening an fd from dentry can fail for several reasons.
For example, when tasks are gone and we try to open their
/proc files or we try to open a WRONLY file like in sysfs
or when trying to open a file that was deleted on the
remote network server.

Add a new flag FAN_REPORT_FD_ERROR for fanotify_init().
For a group with FAN_REPORT_FD_ERROR, we will send the
event with the error instead of the open fd, otherwise
userspace may not get the error at all.

For an overflow event, we report -EBADF to avoid confusing FAN_NOFD
with -EPERM.  Similarly for pidfd open errors we report either -ESRCH
or the open error instead of FAN_NOPIDFD and FAN_EPIDFD.

In any case, userspace will not know which file failed to
open, so add a debug print for further investigation.

Reported-by: Krishna Vivek Vitta <kvitta@microsoft.com>
Link: https://lore.kernel.org/linux-fsdevel/SI2P153MB07182F3424619EDDD1F393EED46D2@SI2P153MB0718.APCP153.PROD.OUTLOOK.COM/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20241003142922.111539-1-amir73il@gmail.com
---
 fs/notify/fanotify/fanotify_user.c | 85 +++++++++++++++++++++-----------------
 include/linux/fanotify.h           |  1 +
 include/uapi/linux/fanotify.h      |  1 +
 3 files changed, 50 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9644bc72e457..8e2d43fc6f7c 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -266,13 +266,6 @@ static int create_fd(struct fsnotify_group *group, const struct path *path,
 			       group->fanotify_data.f_flags | __FMODE_NONOTIFY,
 			       current_cred());
 	if (IS_ERR(new_file)) {
-		/*
-		 * we still send an event even if we can't open the file.  this
-		 * can happen when say tasks are gone and we try to open their
-		 * /proc files or we try to open a WRONLY file like in sysfs
-		 * we just send the errno to userspace since there isn't much
-		 * else we can do.
-		 */
 		put_unused_fd(client_fd);
 		client_fd = PTR_ERR(new_file);
 	} else {
@@ -663,7 +656,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
 	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
 	struct file *f = NULL, *pidfd_file = NULL;
-	int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;
+	int ret, pidfd = -ESRCH, fd = -EBADF;
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
@@ -691,10 +684,39 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
 	    path && path->mnt && path->dentry) {
 		fd = create_fd(group, path, &f);
-		if (fd < 0)
-			return fd;
+		/*
+		 * Opening an fd from dentry can fail for several reasons.
+		 * For example, when tasks are gone and we try to open their
+		 * /proc files or we try to open a WRONLY file like in sysfs
+		 * or when trying to open a file that was deleted on the
+		 * remote network server.
+		 *
+		 * For a group with FAN_REPORT_FD_ERROR, we will send the
+		 * event with the error instead of the open fd, otherwise
+		 * Userspace may not get the error at all.
+		 * In any case, userspace will not know which file failed to
+		 * open, so add a debug print for further investigation.
+		 */
+		if (fd < 0) {
+			pr_debug("fanotify: create_fd(%pd2) failed err=%d\n",
+				 path->dentry, fd);
+			if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) {
+				/*
+				 * Historically, we've handled EOPENSTALE in a
+				 * special way and silently dropped such
+				 * events. Now we have to keep it to maintain
+				 * backward compatibility...
+				 */
+				if (fd == -EOPENSTALE)
+					fd = 0;
+				return fd;
+			}
+		}
 	}
-	metadata.fd = fd;
+	if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR))
+		metadata.fd = fd;
+	else
+		metadata.fd = fd >= 0 ? fd : FAN_NOFD;
 
 	if (pidfd_mode) {
 		/*
@@ -709,18 +731,16 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 		 * The PIDTYPE_TGID check for an event->pid is performed
 		 * preemptively in an attempt to catch out cases where the event
 		 * listener reads events after the event generating process has
-		 * already terminated. Report FAN_NOPIDFD to the event listener
-		 * in those cases, with all other pidfd creation errors being
-		 * reported as FAN_EPIDFD.
+		 * already terminated.  Depending on flag FAN_REPORT_FD_ERROR,
+		 * report either -ESRCH or FAN_NOPIDFD to the event listener in
+		 * those cases with all other pidfd creation errors reported as
+		 * the error code itself or as FAN_EPIDFD.
 		 */
-		if (metadata.pid == 0 ||
-		    !pid_has_task(event->pid, PIDTYPE_TGID)) {
-			pidfd = FAN_NOPIDFD;
-		} else {
+		if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID))
 			pidfd = pidfd_prepare(event->pid, 0, &pidfd_file);
-			if (pidfd < 0)
-				pidfd = FAN_EPIDFD;
-		}
+
+		if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0)
+			pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD;
 	}
 
 	ret = -EFAULT;
@@ -737,9 +757,6 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	buf += FAN_EVENT_METADATA_LEN;
 	count -= FAN_EVENT_METADATA_LEN;
 
-	if (fanotify_is_perm_event(event->mask))
-		FANOTIFY_PERM(event)->fd = fd;
-
 	if (info_mode) {
 		ret = copy_info_records_to_user(event, info, info_mode, pidfd,
 						buf, count);
@@ -753,15 +770,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	if (pidfd_file)
 		fd_install(pidfd, pidfd_file);
 
+	if (fanotify_is_perm_event(event->mask))
+		FANOTIFY_PERM(event)->fd = fd;
+
 	return metadata.event_len;
 
 out_close_fd:
-	if (fd != FAN_NOFD) {
+	if (f) {
 		put_unused_fd(fd);
 		fput(f);
 	}
 
-	if (pidfd >= 0) {
+	if (pidfd_file) {
 		put_unused_fd(pidfd);
 		fput(pidfd_file);
 	}
@@ -828,15 +848,6 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 		}
 
 		ret = copy_event_to_user(group, event, buf, count);
-		if (unlikely(ret == -EOPENSTALE)) {
-			/*
-			 * We cannot report events with stale fd so drop it.
-			 * Setting ret to 0 will continue the event loop and
-			 * do the right thing if there are no more events to
-			 * read (i.e. return bytes read, -EAGAIN or wait).
-			 */
-			ret = 0;
-		}
 
 		/*
 		 * Permission events get queued to wait for response.  Other
@@ -845,7 +856,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 		if (!fanotify_is_perm_event(event->mask)) {
 			fsnotify_destroy_event(group, &event->fse);
 		} else {
-			if (ret <= 0) {
+			if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) {
 				spin_lock(&group->notification_lock);
 				finish_permission_event(group,
 					FANOTIFY_PERM(event), FAN_DENY, NULL);
@@ -1954,7 +1965,7 @@ static int __init fanotify_user_setup(void)
 				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
 
 	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
-	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
+	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13);
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
 
 	fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 4f1c4f603118..89ff45bd6f01 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -36,6 +36,7 @@
 #define FANOTIFY_ADMIN_INIT_FLAGS	(FANOTIFY_PERM_CLASSES | \
 					 FAN_REPORT_TID | \
 					 FAN_REPORT_PIDFD | \
+					 FAN_REPORT_FD_ERROR | \
 					 FAN_UNLIMITED_QUEUE | \
 					 FAN_UNLIMITED_MARKS)
 
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index a37de58ca571..34f221d3a1b9 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -60,6 +60,7 @@
 #define FAN_REPORT_DIR_FID	0x00000400	/* Report unique directory id */
 #define FAN_REPORT_NAME		0x00000800	/* Report events with name */
 #define FAN_REPORT_TARGET_FID	0x00001000	/* Report dirent target id  */
+#define FAN_REPORT_FD_ERROR	0x00002000	/* event->fd can report error */
 
 /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */
 #define FAN_REPORT_DFID_NAME	(FAN_REPORT_DIR_FID | FAN_REPORT_NAME)
-- 
cgit v1.2.3


From d6083f040d5d8f8d748462c77e90547097df936e Mon Sep 17 00:00:00 2001
From: Leon Hwang <leon.hwang@linux.dev>
Date: Tue, 15 Oct 2024 23:02:06 +0800
Subject: bpf: Prevent tailcall infinite loop caused by freplace

There is a potential infinite loop issue that can occur when using a
combination of tail calls and freplace.

In an upcoming selftest, the attach target for entry_freplace of
tailcall_freplace.c is subprog_tc of tc_bpf2bpf.c, while the tail call in
entry_freplace leads to entry_tc. This results in an infinite loop:

entry_tc -> subprog_tc -> entry_freplace --tailcall-> entry_tc.

The problem arises because the tail_call_cnt in entry_freplace resets to
zero each time entry_freplace is executed, causing the tail call mechanism
to never terminate, eventually leading to a kernel panic.

To fix this issue, the solution is twofold:

1. Prevent updating a program extended by an freplace program to a
   prog_array map.
2. Prevent extending a program that is already part of a prog_array map
   with an freplace program.

This ensures that:

* If a program or its subprogram has been extended by an freplace program,
  it can no longer be updated to a prog_array map.
* If a program has been added to a prog_array map, neither it nor its
  subprograms can be extended by an freplace program.

Moreover, an extension program should not be tailcalled. As such, return
-EINVAL if the program has a type of BPF_PROG_TYPE_EXT when adding it to a
prog_array map.

Additionally, fix a minor code style issue by replacing eight spaces with a
tab for proper formatting.

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20241015150207.70264-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h     | 17 +++++++++++++----
 kernel/bpf/arraymap.c   | 26 ++++++++++++++++++++++++--
 kernel/bpf/core.c       |  1 +
 kernel/bpf/syscall.c    |  7 ++++---
 kernel/bpf/trampoline.c | 47 +++++++++++++++++++++++++++++++++++++++--------
 5 files changed, 81 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 19d8ca8ac960..0c216e71cec7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1292,8 +1292,12 @@ void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len);
 bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr);
 
 #ifdef CONFIG_BPF_JIT
-int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
-int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+			     struct bpf_trampoline *tr,
+			     struct bpf_prog *tgt_prog);
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+			       struct bpf_trampoline *tr,
+			       struct bpf_prog *tgt_prog);
 struct bpf_trampoline *bpf_trampoline_get(u64 key,
 					  struct bpf_attach_target_info *tgt_info);
 void bpf_trampoline_put(struct bpf_trampoline *tr);
@@ -1374,12 +1378,14 @@ void bpf_jit_uncharge_modmem(u32 size);
 bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
 #else
 static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
-					   struct bpf_trampoline *tr)
+					   struct bpf_trampoline *tr,
+					   struct bpf_prog *tgt_prog)
 {
 	return -ENOTSUPP;
 }
 static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
-					     struct bpf_trampoline *tr)
+					     struct bpf_trampoline *tr,
+					     struct bpf_prog *tgt_prog)
 {
 	return -ENOTSUPP;
 }
@@ -1483,6 +1489,9 @@ struct bpf_prog_aux {
 	bool xdp_has_frags;
 	bool exception_cb;
 	bool exception_boundary;
+	bool is_extended; /* true if extended by freplace program */
+	u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
+	struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
 	struct bpf_arena *arena;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 79660e3fca4c..6cdbb4c33d31 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -947,22 +947,44 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
 				   struct file *map_file, int fd)
 {
 	struct bpf_prog *prog = bpf_prog_get(fd);
+	bool is_extended;
 
 	if (IS_ERR(prog))
 		return prog;
 
-	if (!bpf_prog_map_compatible(map, prog)) {
+	if (prog->type == BPF_PROG_TYPE_EXT ||
+	    !bpf_prog_map_compatible(map, prog)) {
 		bpf_prog_put(prog);
 		return ERR_PTR(-EINVAL);
 	}
 
+	mutex_lock(&prog->aux->ext_mutex);
+	is_extended = prog->aux->is_extended;
+	if (!is_extended)
+		prog->aux->prog_array_member_cnt++;
+	mutex_unlock(&prog->aux->ext_mutex);
+	if (is_extended) {
+		/* Extended prog can not be tail callee. It's to prevent a
+		 * potential infinite loop like:
+		 * tail callee prog entry -> tail callee prog subprog ->
+		 * freplace prog entry --tailcall-> tail callee prog entry.
+		 */
+		bpf_prog_put(prog);
+		return ERR_PTR(-EBUSY);
+	}
+
 	return prog;
 }
 
 static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
+	struct bpf_prog *prog = ptr;
+
+	mutex_lock(&prog->aux->ext_mutex);
+	prog->aux->prog_array_member_cnt--;
+	mutex_unlock(&prog->aux->ext_mutex);
 	/* bpf_prog is freed after one RCU or tasks trace grace period */
-	bpf_prog_put(ptr);
+	bpf_prog_put(prog);
 }
 
 static u32 prog_fd_array_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5e77c58e0601..233ea78f8f1b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -131,6 +131,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 	INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
 #endif
 	mutex_init(&fp->aux->used_maps_mutex);
+	mutex_init(&fp->aux->ext_mutex);
 	mutex_init(&fp->aux->dst_mutex);
 
 	return fp;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a8f1808a1ca5..4d04d4d9c1f3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3214,7 +3214,8 @@ static void bpf_tracing_link_release(struct bpf_link *link)
 		container_of(link, struct bpf_tracing_link, link.link);
 
 	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
-						tr_link->trampoline));
+						tr_link->trampoline,
+						tr_link->tgt_prog));
 
 	bpf_trampoline_put(tr_link->trampoline);
 
@@ -3354,7 +3355,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 	 *   in prog->aux
 	 *
 	 * - if prog->aux->dst_trampoline is NULL, the program has already been
-         *   attached to a target and its initial target was cleared (below)
+	 *   attached to a target and its initial target was cleared (below)
 	 *
 	 * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
 	 *   target_btf_id using the link_create API.
@@ -3429,7 +3430,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 	if (err)
 		goto out_unlock;
 
-	err = bpf_trampoline_link_prog(&link->link, tr);
+	err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
 		link = NULL;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index f8302a5ca400..9f36c049f4c2 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -523,7 +523,27 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
 	}
 }
 
-static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog)
+{
+	struct bpf_prog_aux *aux = tgt_prog->aux;
+
+	guard(mutex)(&aux->ext_mutex);
+	if (aux->prog_array_member_cnt)
+		/* Program extensions can not extend target prog when the target
+		 * prog has been updated to any prog_array map as tail callee.
+		 * It's to prevent a potential infinite loop like:
+		 * tgt prog entry -> tgt prog subprog -> freplace prog entry
+		 * --tailcall-> tgt prog entry.
+		 */
+		return -EBUSY;
+
+	aux->is_extended = true;
+	return 0;
+}
+
+static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+				      struct bpf_trampoline *tr,
+				      struct bpf_prog *tgt_prog)
 {
 	enum bpf_tramp_prog_type kind;
 	struct bpf_tramp_link *link_exiting;
@@ -544,6 +564,9 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
 		/* Cannot attach extension if fentry/fexit are in use. */
 		if (cnt)
 			return -EBUSY;
+		err = bpf_freplace_check_tgt_prog(tgt_prog);
+		if (err)
+			return err;
 		tr->extension_prog = link->link.prog;
 		return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
 					  link->link.prog->bpf_func);
@@ -570,17 +593,21 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
 	return err;
 }
 
-int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+			     struct bpf_trampoline *tr,
+			     struct bpf_prog *tgt_prog)
 {
 	int err;
 
 	mutex_lock(&tr->mutex);
-	err = __bpf_trampoline_link_prog(link, tr);
+	err = __bpf_trampoline_link_prog(link, tr, tgt_prog);
 	mutex_unlock(&tr->mutex);
 	return err;
 }
 
-static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+					struct bpf_trampoline *tr,
+					struct bpf_prog *tgt_prog)
 {
 	enum bpf_tramp_prog_type kind;
 	int err;
@@ -591,6 +618,8 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
 		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
 					 tr->extension_prog->bpf_func, NULL);
 		tr->extension_prog = NULL;
+		guard(mutex)(&tgt_prog->aux->ext_mutex);
+		tgt_prog->aux->is_extended = false;
 		return err;
 	}
 	hlist_del_init(&link->tramp_hlist);
@@ -599,12 +628,14 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
 }
 
 /* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+			       struct bpf_trampoline *tr,
+			       struct bpf_prog *tgt_prog)
 {
 	int err;
 
 	mutex_lock(&tr->mutex);
-	err = __bpf_trampoline_unlink_prog(link, tr);
+	err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog);
 	mutex_unlock(&tr->mutex);
 	return err;
 }
@@ -619,7 +650,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link)
 	if (!shim_link->trampoline)
 		return;
 
-	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline));
+	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL));
 	bpf_trampoline_put(shim_link->trampoline);
 }
 
@@ -733,7 +764,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
 		goto err;
 	}
 
-	err = __bpf_trampoline_link_prog(&shim_link->link, tr);
+	err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL);
 	if (err)
 		goto err;
 
-- 
cgit v1.2.3


From ce1dfe6d328966b75821c1f043a940eb2569768a Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Sat, 12 Oct 2024 20:32:43 +0900
Subject: PCI: endpoint: Introduce pci_epc_mem_map()/unmap()

Some endpoint controllers have requirements on the alignment of the
controller physical memory address that must be used to map a RC PCI
address region. For instance, the endpoint controller of the RK3399 SoC
uses at most the lower 20 bits of a physical memory address region as
the lower bits of a RC PCI address region. For mapping a PCI address
region of size bytes starting from pci_addr, the exact number of
address bits used is the number of address bits changing in the address
range [pci_addr..pci_addr + size - 1]. For this example, this creates
the following constraints:
1) The offset into the controller physical memory allocated for a
   mapping depends on the mapping size *and* the starting PCI address
   for the mapping.
2) A mapping size cannot exceed the controller windows size (1MB) minus
   the offset needed into the allocated physical memory, which can end
   up being a smaller size than the desired mapping size.

Handling these constraints independently of the controller being used
in an endpoint function driver is not possible with the current EPC
API as only the ->align field in struct pci_epc_features is provided
but used for BAR (inbound ATU mappings) mapping only. A new API is
needed for function drivers to discover mapping constraints and handle
non-static requirements based on the RC PCI address range to access.

Introduce the endpoint controller operation ->align_addr() to allow
the EPC core functions to obtain the size and the offset into a
controller address region that must be allocated and mapped to access
a RC PCI address region. The size of the mapping provided by the
align_addr() operation can then be used as the size argument for the
function pci_epc_mem_alloc_addr() and the offset into the allocated
controller memory provided can be used to correctly handle data
transfers. For endpoint controllers that have PCI address alignment
constraints, the align_addr() operation may indicate upon return an
effective PCI address mapping size that is smaller (but not 0) than the
requested PCI address region size.

The controller ->align_addr() operation is optional: controllers that
do not have any alignment constraints for mapping RC PCI address regions
do not need to implement this operation. For such controllers, it is
always assumed that the mapping size is equal to the requested size of
the PCI region and that the mapping offset is 0.

The function pci_epc_mem_map() is introduced to use this new controller
operation (if it is defined) to handle controller memory allocation and
mapping to a RC PCI address region in endpoint function drivers.

This function first uses the ->align_addr() controller operation to
determine the controller memory address size (and offset into) needed
for mapping an RC PCI address region. The result of this operation is
used to allocate a controller physical memory region using
pci_epc_mem_alloc_addr() and then to map that memory to the RC PCI
address space with pci_epc_map_addr().

Since ->align_addr() () may indicate that not all of a RC PCI address
region can be mapped, pci_epc_mem_map() may only partially map the RC
PCI address region specified. It is the responsibility of the caller
(an endpoint function driver) to handle such smaller mapping by
repeatedly using pci_epc_mem_map() over the desried PCI address range.

The counterpart of pci_epc_mem_map() to unmap and free a mapped
controller memory address region is pci_epc_mem_unmap().

Both functions operate using the new struct pci_epc_map data structure.
This new structure represents a mapping PCI address, mapping effective
size, the size of the controller memory needed for the mapping as well
as the physical and virtual CPU addresses of the mapping (phys_base and
virt_base fields). For convenience, the physical and virtual CPU
addresses within that mapping to use to access the target RC PCI address
region are also provided (phys_addr and virt_addr fields).

Endpoint function drivers can use struct pci_epc_map to access the
mapped RC PCI address region using the ->virt_addr and ->pci_size
fields.

Co-developed-by: Rick Wertenbroek <rick.wertenbroek@gmail.com>
Signed-off-by: Rick Wertenbroek <rick.wertenbroek@gmail.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/20241012113246.95634-4-dlemoal@kernel.org
[mani: squashed the patch that changed phy_addr_t to u64]
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/pci/endpoint/pci-epc-core.c | 103 ++++++++++++++++++++++++++++++++++++
 include/linux/pci-epc.h             |  38 +++++++++++++
 2 files changed, 141 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index b854f1bab26f..04a85d2f7e2a 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -466,6 +466,109 @@ int pci_epc_map_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
 }
 EXPORT_SYMBOL_GPL(pci_epc_map_addr);
 
+/**
+ * pci_epc_mem_map() - allocate and map a PCI address to a CPU address
+ * @epc: the EPC device on which the CPU address is to be allocated and mapped
+ * @func_no: the physical endpoint function number in the EPC device
+ * @vfunc_no: the virtual endpoint function number in the physical function
+ * @pci_addr: PCI address to which the CPU address should be mapped
+ * @pci_size: the number of bytes to map starting from @pci_addr
+ * @map: where to return the mapping information
+ *
+ * Allocate a controller memory address region and map it to a RC PCI address
+ * region, taking into account the controller physical address mapping
+ * constraints using the controller operation align_addr(). If this operation is
+ * not defined, we assume that there are no alignment constraints for the
+ * mapping.
+ *
+ * The effective size of the PCI address range mapped from @pci_addr is
+ * indicated by @map->pci_size. This size may be less than the requested
+ * @pci_size. The local virtual CPU address for the mapping is indicated by
+ * @map->virt_addr (@map->phys_addr indicates the physical address).
+ * The size and CPU address of the controller memory allocated and mapped are
+ * respectively indicated by @map->map_size and @map->virt_base (and
+ * @map->phys_base for the physical address of @map->virt_base).
+ *
+ * Returns 0 on success and a negative error code in case of error.
+ */
+int pci_epc_mem_map(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
+		    u64 pci_addr, size_t pci_size, struct pci_epc_map *map)
+{
+	size_t map_size = pci_size;
+	size_t map_offset = 0;
+	int ret;
+
+	if (!pci_epc_function_is_valid(epc, func_no, vfunc_no))
+		return -EINVAL;
+
+	if (!pci_size || !map)
+		return -EINVAL;
+
+	/*
+	 * Align the PCI address to map. If the controller defines the
+	 * .align_addr() operation, use it to determine the PCI address to map
+	 * and the size of the mapping. Otherwise, assume that the controller
+	 * has no alignment constraint.
+	 */
+	memset(map, 0, sizeof(*map));
+	map->pci_addr = pci_addr;
+	if (epc->ops->align_addr)
+		map->map_pci_addr =
+			epc->ops->align_addr(epc, pci_addr,
+					     &map_size, &map_offset);
+	else
+		map->map_pci_addr = pci_addr;
+	map->map_size = map_size;
+	if (map->map_pci_addr + map->map_size < pci_addr + pci_size)
+		map->pci_size = map->map_pci_addr + map->map_size - pci_addr;
+	else
+		map->pci_size = pci_size;
+
+	map->virt_base = pci_epc_mem_alloc_addr(epc, &map->phys_base,
+						map->map_size);
+	if (!map->virt_base)
+		return -ENOMEM;
+
+	map->phys_addr = map->phys_base + map_offset;
+	map->virt_addr = map->virt_base + map_offset;
+
+	ret = pci_epc_map_addr(epc, func_no, vfunc_no, map->phys_base,
+			       map->map_pci_addr, map->map_size);
+	if (ret) {
+		pci_epc_mem_free_addr(epc, map->phys_base, map->virt_base,
+				      map->map_size);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_epc_mem_map);
+
+/**
+ * pci_epc_mem_unmap() - unmap and free a CPU address region
+ * @epc: the EPC device on which the CPU address is allocated and mapped
+ * @func_no: the physical endpoint function number in the EPC device
+ * @vfunc_no: the virtual endpoint function number in the physical function
+ * @map: the mapping information
+ *
+ * Unmap and free a CPU address region that was allocated and mapped with
+ * pci_epc_mem_map().
+ */
+void pci_epc_mem_unmap(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
+		       struct pci_epc_map *map)
+{
+	if (!pci_epc_function_is_valid(epc, func_no, vfunc_no))
+		return;
+
+	if (!map || !map->virt_base)
+		return;
+
+	pci_epc_unmap_addr(epc, func_no, vfunc_no, map->phys_base);
+	pci_epc_mem_free_addr(epc, map->phys_base, map->virt_base,
+			      map->map_size);
+}
+EXPORT_SYMBOL_GPL(pci_epc_mem_unmap);
+
 /**
  * pci_epc_clear_bar() - reset the BAR
  * @epc: the EPC device for which the BAR has to be cleared
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 42ef06136bd1..de8cc3658220 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -32,11 +32,43 @@ pci_epc_interface_string(enum pci_epc_interface_type type)
 	}
 }
 
+/**
+ * struct pci_epc_map - information about EPC memory for mapping a RC PCI
+ *                      address range
+ * @pci_addr: start address of the RC PCI address range to map
+ * @pci_size: size of the RC PCI address range mapped from @pci_addr
+ * @map_pci_addr: RC PCI address used as the first address mapped (may be lower
+ *                than @pci_addr)
+ * @map_size: size of the controller memory needed for mapping the RC PCI address
+ *            range @pci_addr..@pci_addr+@pci_size
+ * @phys_base: base physical address of the allocated EPC memory for mapping the
+ *             RC PCI address range
+ * @phys_addr: physical address at which @pci_addr is mapped
+ * @virt_base: base virtual address of the allocated EPC memory for mapping the
+ *             RC PCI address range
+ * @virt_addr: virtual address at which @pci_addr is mapped
+ */
+struct pci_epc_map {
+	u64		pci_addr;
+	size_t		pci_size;
+
+	u64		map_pci_addr;
+	size_t		map_size;
+
+	phys_addr_t	phys_base;
+	phys_addr_t	phys_addr;
+	void __iomem	*virt_base;
+	void __iomem	*virt_addr;
+};
+
 /**
  * struct pci_epc_ops - set of function pointers for performing EPC operations
  * @write_header: ops to populate configuration space header
  * @set_bar: ops to configure the BAR
  * @clear_bar: ops to reset the BAR
+ * @align_addr: operation to get the mapping address, mapping size and offset
+ *		into a controller memory window needed to map an RC PCI address
+ *		region
  * @map_addr: ops to map CPU address to PCI address
  * @unmap_addr: ops to unmap CPU address and PCI address
  * @set_msi: ops to set the requested number of MSI interrupts in the MSI
@@ -61,6 +93,8 @@ struct pci_epc_ops {
 			   struct pci_epf_bar *epf_bar);
 	void	(*clear_bar)(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
 			     struct pci_epf_bar *epf_bar);
+	u64	(*align_addr)(struct pci_epc *epc, u64 pci_addr, size_t *size,
+			      size_t *offset);
 	int	(*map_addr)(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
 			    phys_addr_t addr, u64 pci_addr, size_t size);
 	void	(*unmap_addr)(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
@@ -278,6 +312,10 @@ void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc,
 				     phys_addr_t *phys_addr, size_t size);
 void pci_epc_mem_free_addr(struct pci_epc *epc, phys_addr_t phys_addr,
 			   void __iomem *virt_addr, size_t size);
+int pci_epc_mem_map(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
+		    u64 pci_addr, size_t pci_size, struct pci_epc_map *map);
+void pci_epc_mem_unmap(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
+		       struct pci_epc_map *map);
 
 #else
 static inline void pci_epc_init_notify(struct pci_epc *epc)
-- 
cgit v1.2.3


From 5280a14a6079040205a1d968cd80f20448d047c7 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 15 Oct 2024 12:09:32 -0700
Subject: genirq: Introduce irq_get_nr_irqs() and irq_set_nr_irqs()

Prepare for changing 'nr_irqs' from an exported global variable into a
variable with file scope.

This will prevent accidental changes of assignments to a local variable
'nr_irqs' into assignments to the global 'nr_irqs' variable.

Suppose that a patch would be submitted for review that removes a
declaration of a local variable with the name 'nr_irqs' and that that patch
does not remove all assignments to that local variable. Such a patch
converts an assignment to a local variable into an assignment into a global
variable. If the 'nr_irqs' assignment is more than three lines away from
other changes, the assignment won't be included in the diff context lines
and hence won't be visible without inspecting the modified file.

With these abstraction series applied, such accidental conversions from
assignments to a local variable into an assignment to a global variable are
converted into a compilation error.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20241015190953.1266194-2-bvanassche@acm.org
---
 include/linux/irqnr.h |  2 ++
 kernel/irq/irqdesc.c  | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 3496baa0b07f..7419b807b71b 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -6,6 +6,8 @@
 
 
 extern int nr_irqs;
+unsigned int irq_get_nr_irqs(void) __pure;
+unsigned int irq_set_nr_irqs(unsigned int nr);
 extern struct irq_desc *irq_to_desc(unsigned int irq);
 unsigned int irq_get_next_irq(unsigned int offset);
 
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1dee88ba0ae4..b0733959f8ae 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -141,6 +141,29 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
+/**
+ * irq_get_nr_irqs() - Number of interrupts supported by the system.
+ */
+unsigned int irq_get_nr_irqs(void)
+{
+	return nr_irqs;
+}
+EXPORT_SYMBOL_GPL(irq_get_nr_irqs);
+
+/**
+ * irq_set_nr_irqs() - Set the number of interrupts supported by the system.
+ * @nr: New number of interrupts.
+ *
+ * Return: @nr.
+ */
+unsigned int irq_set_nr_irqs(unsigned int nr)
+{
+	nr_irqs = nr;
+
+	return nr;
+}
+EXPORT_SYMBOL_GPL(irq_set_nr_irqs);
+
 static DEFINE_MUTEX(sparse_irq_lock);
 static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs,
 					MT_FLAGS_ALLOC_RANGE |
-- 
cgit v1.2.3


From 1ad2048bf7146efb83bc033147ca1611a7fe8494 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 15 Oct 2024 12:09:52 -0700
Subject: genirq: Switch to irq_get_nr_irqs()

Use the irq_get_nr_irqs() function instead of the global variable
'nr_irqs'. Cache the result of this function in a local variable in
order not to rely on CSE (common subexpression elimination). Prepare
for changing 'nr_irqs' from an exported global variable into a variable
with file scope.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20241015190953.1266194-22-bvanassche@acm.org
---
 include/linux/irqnr.h  | 33 +++++++++++++++++++--------------
 kernel/irq/irqdomain.c |  2 +-
 kernel/irq/proc.c      |  3 ++-
 3 files changed, 22 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 7419b807b71b..a33088d27c54 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -11,26 +11,31 @@ unsigned int irq_set_nr_irqs(unsigned int nr);
 extern struct irq_desc *irq_to_desc(unsigned int irq);
 unsigned int irq_get_next_irq(unsigned int offset);
 
-# define for_each_irq_desc(irq, desc)					\
-	for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs;		\
-	     irq++, desc = irq_to_desc(irq))				\
-		if (!desc)						\
-			;						\
-		else
-
+#define for_each_irq_desc(irq, desc)                                      \
+	for (unsigned int __nr_irqs__ = irq_get_nr_irqs(); __nr_irqs__;   \
+	     __nr_irqs__ = 0)                                             \
+		for (irq = 0, desc = irq_to_desc(irq); irq < __nr_irqs__; \
+		     irq++, desc = irq_to_desc(irq))                      \
+			if (!desc)                                        \
+				;                                         \
+			else
 
 # define for_each_irq_desc_reverse(irq, desc)				\
-	for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0;	\
-	     irq--, desc = irq_to_desc(irq))				\
+	for (irq = irq_get_nr_irqs() - 1, desc = irq_to_desc(irq);	\
+	     irq >= 0; irq--, desc = irq_to_desc(irq))			\
 		if (!desc)						\
 			;						\
 		else
 
-# define for_each_active_irq(irq)			\
-	for (irq = irq_get_next_irq(0); irq < nr_irqs;	\
-	     irq = irq_get_next_irq(irq + 1))
+#define for_each_active_irq(irq)                                        \
+	for (unsigned int __nr_irqs__ = irq_get_nr_irqs(); __nr_irqs__; \
+	     __nr_irqs__ = 0)                                           \
+		for (irq = irq_get_next_irq(0); irq < __nr_irqs__;      \
+		     irq = irq_get_next_irq(irq + 1))
 
-#define for_each_irq_nr(irq)                   \
-       for (irq = 0; irq < nr_irqs; irq++)
+#define for_each_irq_nr(irq)                                            \
+	for (unsigned int __nr_irqs__ = irq_get_nr_irqs(); __nr_irqs__; \
+	     __nr_irqs__ = 0)                                           \
+		for (irq = 0; irq < __nr_irqs__; irq++)
 
 #endif
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e0bff21f30e0..ec6d8e72d980 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1225,7 +1225,7 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
 		virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE,
 					 affinity);
 	} else {
-		hint = hwirq % nr_irqs;
+		hint = hwirq % irq_get_nr_irqs();
 		if (hint == 0)
 			hint++;
 		virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE,
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9081ada81c3d..d226282c5b66 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -457,11 +457,12 @@ int __weak arch_show_interrupts(struct seq_file *p, int prec)
 }
 
 #ifndef ACTUAL_NR_IRQS
-# define ACTUAL_NR_IRQS nr_irqs
+# define ACTUAL_NR_IRQS irq_get_nr_irqs()
 #endif
 
 int show_interrupts(struct seq_file *p, void *v)
 {
+	const unsigned int nr_irqs = irq_get_nr_irqs();
 	static int prec;
 
 	int i = *(loff_t *) v, j;
-- 
cgit v1.2.3


From ef4c675dc2961ee533bdc1ea20390761df0af5be Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 15 Oct 2024 12:09:53 -0700
Subject: genirq: Unexport nr_irqs

Unexport nr_irqs and declare it static now that all code that reads or
modifies nr_irqs has been converted to number_of_interrupts() /
set_number_of_interrupts(). Change the type of 'nr_irqs' from 'int' into
'unsigned int' to match the return type and argument type of the
irq_get_nr_iqs() / irq_set_nr_irqs() functions.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20241015190953.1266194-23-bvanassche@acm.org
---
 include/linux/irqnr.h | 1 -
 kernel/irq/irqdesc.c  | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index a33088d27c54..e97206c721a0 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -5,7 +5,6 @@
 #include <uapi/linux/irqnr.h>
 
 
-extern int nr_irqs;
 unsigned int irq_get_nr_irqs(void) __pure;
 unsigned int irq_set_nr_irqs(unsigned int nr);
 extern struct irq_desc *irq_to_desc(unsigned int irq);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index b0733959f8ae..479cf1caa610 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -138,8 +138,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
 	desc_smp_init(desc, node, affinity);
 }
 
-int nr_irqs = NR_IRQS;
-EXPORT_SYMBOL_GPL(nr_irqs);
+static unsigned int nr_irqs = NR_IRQS;
 
 /**
  * irq_get_nr_irqs() - Number of interrupts supported by the system.
-- 
cgit v1.2.3


From cf70da29c4993bf23df68b67a82dfa3da8234e75 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 5 Oct 2024 12:06:16 +0200
Subject: power: supply: core: unexport power_supply_property_is_writeable()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit ("power: supply: Drop use_cnt check from power_supply_property_is_writeable()"),
this function does not check use_cnt anymore, making it unsuitable for
general usage. As it is only used by the psy core anyways, remove it
from the public header and unexport it to avoid misusage.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20241005-power-supply-cleanups-v1-2-45303b2d0a4d@weissschuh.net
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/power_supply.h       | 3 +++
 drivers/power/supply/power_supply_core.c  | 1 -
 drivers/power/supply/power_supply_hwmon.c | 1 +
 include/linux/power_supply.h              | 2 --
 4 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/power_supply.h b/drivers/power/supply/power_supply.h
index 3cbafc58bdad..1bc57615ba68 100644
--- a/drivers/power/supply/power_supply.h
+++ b/drivers/power/supply/power_supply.h
@@ -13,6 +13,9 @@ struct device;
 struct device_type;
 struct power_supply;
 
+extern int power_supply_property_is_writeable(struct power_supply *psy,
+					      enum power_supply_property psp);
+
 #ifdef CONFIG_SYSFS
 
 extern void power_supply_init_attrs(void);
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index e0c87a08bc26..16085eff0084 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -1231,7 +1231,6 @@ int power_supply_property_is_writeable(struct power_supply *psy,
 {
 	return psy->desc->property_is_writeable && psy->desc->property_is_writeable(psy, psp);
 }
-EXPORT_SYMBOL_GPL(power_supply_property_is_writeable);
 
 void power_supply_external_power_changed(struct power_supply *psy)
 {
diff --git a/drivers/power/supply/power_supply_hwmon.c b/drivers/power/supply/power_supply_hwmon.c
index 6fbbfb1c685e..01be04903d7d 100644
--- a/drivers/power/supply/power_supply_hwmon.c
+++ b/drivers/power/supply/power_supply_hwmon.c
@@ -7,6 +7,7 @@
 #include <linux/hwmon.h>
 #include <linux/power_supply.h>
 #include <linux/slab.h>
+#include "power_supply.h"
 
 struct power_supply_hwmon {
 	struct power_supply *psy;
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 14eedefb3ac8..c1d6cb7f4c40 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -865,8 +865,6 @@ static inline int power_supply_set_property(struct power_supply *psy,
 			    const union power_supply_propval *val)
 { return 0; }
 #endif
-extern int power_supply_property_is_writeable(struct power_supply *psy,
-					enum power_supply_property psp);
 extern void power_supply_external_power_changed(struct power_supply *psy);
 
 extern struct power_supply *__must_check
-- 
cgit v1.2.3


From 8060bcb109f2d9b85451e84a7a08042da40368df Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Wed, 16 Oct 2024 16:18:31 +0300
Subject: usb: typec: Add attribute file showing the supported USB modes of the
 port

This attribute file, named "usb_capability", will show the
supported USB modes, which are USB 2.0, USB 3.2 and USB4.
These modes are defined in the USB Type-C (R2.0) and USB
Power Delivery (R3.0 V2.0) Specifications.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20241016131834.898599-2-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-class-typec | 13 +++++
 drivers/usb/typec/class.c                   | 81 +++++++++++++++++++++++++++++
 drivers/usb/typec/class.h                   |  1 +
 include/linux/usb/typec.h                   | 17 ++++++
 4 files changed, 112 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-typec b/Documentation/ABI/testing/sysfs-class-typec
index 281b995beb05..3ee757208122 100644
--- a/Documentation/ABI/testing/sysfs-class-typec
+++ b/Documentation/ABI/testing/sysfs-class-typec
@@ -149,6 +149,19 @@ Description:
 		advertise to the partner. The currently used capabilities are in
 		brackets. Selection happens by writing to the file.
 
+What:		/sys/class/typec/<port>/usb_capability
+Date:		November 2024
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:	Lists the supported USB Modes. The default USB mode that is used
+		next time with the Enter_USB Message is in brackets. The default
+		mode can be changed by writing to the file when supported by the
+		driver.
+
+		Valid values:
+		- usb2 (USB 2.0)
+		- usb3 (USB 3.2)
+		- usb4 (USB4)
+
 USB Type-C partner devices (eg. /sys/class/typec/port0-partner/)
 
 What:		/sys/class/typec/<port>-partner/accessory_mode
diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index 9262fcd4144f..2f12269d1465 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -219,6 +219,13 @@ static ssize_t usb_power_delivery_revision_show(struct device *dev,
 						char *buf);
 static DEVICE_ATTR_RO(usb_power_delivery_revision);
 
+static const char * const usb_modes[] = {
+	[USB_MODE_NONE] = "none",
+	[USB_MODE_USB2] = "usb2",
+	[USB_MODE_USB3] = "usb3",
+	[USB_MODE_USB4] = "usb4"
+};
+
 /* ------------------------------------------------------------------------- */
 /* Alternate Modes */
 
@@ -1289,6 +1296,67 @@ EXPORT_SYMBOL_GPL(typec_unregister_cable);
 /* ------------------------------------------------------------------------- */
 /* USB Type-C ports */
 
+/**
+ * typec_port_set_usb_mode - Set the operational USB mode for the port
+ * @port: USB Type-C port
+ * @mode: USB Mode (USB2, USB3 or USB4)
+ *
+ * @mode will be used with the next Enter_USB message. Existing connections are
+ * not affected.
+ */
+void typec_port_set_usb_mode(struct typec_port *port, enum usb_mode mode)
+{
+	port->usb_mode = mode;
+}
+EXPORT_SYMBOL_GPL(typec_port_set_usb_mode);
+
+static ssize_t
+usb_capability_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct typec_port *port = to_typec_port(dev);
+	int len = 0;
+	int i;
+
+	for (i = USB_MODE_USB2; i < USB_MODE_USB4 + 1; i++) {
+		if (!(BIT(i - 1) & port->cap->usb_capability))
+			continue;
+
+		if (i == port->usb_mode)
+			len += sysfs_emit_at(buf, len, "[%s] ", usb_modes[i]);
+		else
+			len += sysfs_emit_at(buf, len, "%s ", usb_modes[i]);
+	}
+
+	sysfs_emit_at(buf, len - 1, "\n");
+
+	return len;
+}
+
+static ssize_t
+usb_capability_store(struct device *dev, struct device_attribute *attr,
+		     const char *buf, size_t size)
+{
+	struct typec_port *port = to_typec_port(dev);
+	int ret = 0;
+	int mode;
+
+	if (!port->ops || !port->ops->default_usb_mode_set)
+		return -EOPNOTSUPP;
+
+	mode = sysfs_match_string(usb_modes, buf);
+	if (mode < 0)
+		return mode;
+
+	ret = port->ops->default_usb_mode_set(port, mode);
+	if (ret)
+		return ret;
+
+	port->usb_mode = mode;
+
+	return size;
+}
+static DEVICE_ATTR_RW(usb_capability);
+
 /**
  * typec_port_set_usb_power_delivery - Assign USB PD for port.
  * @port: USB Type-C port.
@@ -1757,6 +1825,7 @@ static struct attribute *typec_attrs[] = {
 	&dev_attr_vconn_source.attr,
 	&dev_attr_port_type.attr,
 	&dev_attr_orientation.attr,
+	&dev_attr_usb_capability.attr,
 	NULL,
 };
 
@@ -1790,6 +1859,11 @@ static umode_t typec_attr_is_visible(struct kobject *kobj,
 		if (port->cap->orientation_aware)
 			return 0444;
 		return 0;
+	} else if (attr == &dev_attr_usb_capability.attr) {
+		if (!port->cap->usb_capability)
+			return 0;
+		if (!port->ops || !port->ops->default_usb_mode_set)
+			return 0444;
 	}
 
 	return attr->mode;
@@ -2428,6 +2502,13 @@ struct typec_port *typec_register_port(struct device *parent,
 	port->con.attach = typec_partner_attach;
 	port->con.deattach = typec_partner_deattach;
 
+	if (cap->usb_capability & USB_CAPABILITY_USB4)
+		port->usb_mode = USB_MODE_USB4;
+	else if (cap->usb_capability & USB_CAPABILITY_USB3)
+		port->usb_mode = USB_MODE_USB3;
+	else if (cap->usb_capability & USB_CAPABILITY_USB2)
+		port->usb_mode = USB_MODE_USB2;
+
 	device_initialize(&port->dev);
 	port->dev.class = &typec_class;
 	port->dev.parent = parent;
diff --git a/drivers/usb/typec/class.h b/drivers/usb/typec/class.h
index 7485cdb9dd20..85bc50aa54f7 100644
--- a/drivers/usb/typec/class.h
+++ b/drivers/usb/typec/class.h
@@ -55,6 +55,7 @@ struct typec_port {
 	enum typec_role			vconn_role;
 	enum typec_pwr_opmode		pwr_opmode;
 	enum typec_port_type		port_type;
+	enum usb_mode			usb_mode;
 	struct mutex			port_type_lock;
 
 	enum typec_orientation		orientation;
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index 549275f8ac1b..f7edced5b10b 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -87,6 +87,17 @@ enum typec_orientation {
 	TYPEC_ORIENTATION_REVERSE,
 };
 
+enum usb_mode {
+	USB_MODE_NONE,
+	USB_MODE_USB2,
+	USB_MODE_USB3,
+	USB_MODE_USB4
+};
+
+#define USB_CAPABILITY_USB2	BIT(0)
+#define USB_CAPABILITY_USB3	BIT(1)
+#define USB_CAPABILITY_USB4	BIT(2)
+
 /*
  * struct enter_usb_data - Enter_USB Message details
  * @eudo: Enter_USB Data Object
@@ -240,6 +251,7 @@ struct typec_partner_desc {
  * @port_type_set: Set port type
  * @pd_get: Get available USB Power Delivery Capabilities.
  * @pd_set: Set USB Power Delivery Capabilities.
+ * @default_usb_mode_set: USB Mode to be used by default with Enter_USB Message
  */
 struct typec_operations {
 	int (*try_role)(struct typec_port *port, int role);
@@ -250,6 +262,7 @@ struct typec_operations {
 			     enum typec_port_type type);
 	struct usb_power_delivery **(*pd_get)(struct typec_port *port);
 	int (*pd_set)(struct typec_port *port, struct usb_power_delivery *pd);
+	int (*default_usb_mode_set)(struct typec_port *port, enum usb_mode mode);
 };
 
 enum usb_pd_svdm_ver {
@@ -267,6 +280,7 @@ enum usb_pd_svdm_ver {
  * @svdm_version: USB PD Structured VDM version if supported
  * @prefer_role: Initial role preference (DRP ports).
  * @accessory: Supported Accessory Modes
+ * @usb_capability: Supported USB Modes
  * @fwnode: Optional fwnode of the port
  * @driver_data: Private pointer for driver specific info
  * @pd: Optional USB Power Delivery Support
@@ -283,6 +297,7 @@ struct typec_capability {
 	int			prefer_role;
 	enum typec_accessory	accessory[TYPEC_MAX_ACCESSORY];
 	unsigned int		orientation_aware:1;
+	u8			usb_capability;
 
 	struct fwnode_handle	*fwnode;
 	void			*driver_data;
@@ -350,6 +365,8 @@ int typec_port_set_usb_power_delivery(struct typec_port *port, struct usb_power_
 int typec_partner_set_usb_power_delivery(struct typec_partner *partner,
 					 struct usb_power_delivery *pd);
 
+void typec_port_set_usb_mode(struct typec_port *port, enum usb_mode mode);
+
 /**
  * struct typec_connector - Representation of Type-C port for external drivers
  * @attach: notification about device removal
-- 
cgit v1.2.3


From 2140a952c4e9c73993ae6d9c2cc674d263d4beab Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Wed, 16 Oct 2024 16:18:32 +0300
Subject: usb: typec: Add attribute file showing the USB Modes of the partner

This attribute file shows the supported USB modes (USB 2.0,
USB 3.0 and USB4) of the partner, and the currently active
mode.

The active mode is determined primarily by checking the
speed of the enumerated USB device. When USB Power Delivery
is supported, the active USB mode should be always the mode
that was used with the Enter_USB Message, regardless of the
result of the USB enumeration. The port drivers can
separately assign the mode with a dedicated API.

If USB Power Delivery Identity is supplied for the partner
device, the supported modes are extracted from it.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20241016131834.898599-3-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-class-typec |  14 ++++
 drivers/usb/typec/class.c                   | 124 +++++++++++++++++++++++++++-
 drivers/usb/typec/class.h                   |   2 +
 include/linux/usb/typec.h                   |   5 ++
 4 files changed, 141 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-typec b/Documentation/ABI/testing/sysfs-class-typec
index 3ee757208122..38e101c17a00 100644
--- a/Documentation/ABI/testing/sysfs-class-typec
+++ b/Documentation/ABI/testing/sysfs-class-typec
@@ -233,6 +233,20 @@ Description:
 		directory exists, it will have an attribute file for every VDO
 		in Discover Identity command result.
 
+What:		/sys/class/typec/<port>-partner/usb_mode
+Date:		November 2024
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:	The USB Modes that the partner device supports. The active mode
+		is displayed in brackets. The active USB mode can be changed by
+		writing to this file when the port driver is able to send Data
+		Reset Message to the partner. That requires USB Power Delivery
+		contract between the partner and the port.
+
+		Valid values:
+		- usb2 (USB 2.0)
+		- usb3 (USB 3.2)
+		- usb4 (USB4)
+
 USB Type-C cable devices (eg. /sys/class/typec/port0-cable/)
 
 Note: Electronically Marked Cables will have a device also for one cable plug
diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index 2f12269d1465..953cfb1b093e 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -618,6 +618,75 @@ EXPORT_SYMBOL_GPL(typec_unregister_altmode);
 /* ------------------------------------------------------------------------- */
 /* Type-C Partners */
 
+/**
+ * typec_partner_set_usb_mode - Assign active USB Mode for the partner
+ * @partner: USB Type-C partner
+ * @mode: USB Mode (USB2, USB3 or USB4)
+ *
+ * The port drivers can use this function to assign the active USB Mode to
+ * @partner. The USB Mode can change for example due to Data Reset.
+ */
+void typec_partner_set_usb_mode(struct typec_partner *partner, enum usb_mode mode)
+{
+	if (!partner || partner->usb_mode == mode)
+		return;
+
+	partner->usb_capability |= BIT(mode - 1);
+	partner->usb_mode = mode;
+	sysfs_notify(&partner->dev.kobj, NULL, "usb_mode");
+}
+EXPORT_SYMBOL_GPL(typec_partner_set_usb_mode);
+
+static ssize_t
+usb_mode_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct typec_partner *partner = to_typec_partner(dev);
+	int len = 0;
+	int i;
+
+	for (i = USB_MODE_USB2; i < USB_MODE_USB4 + 1; i++) {
+		if (!(BIT(i - 1) & partner->usb_capability))
+			continue;
+
+		if (i == partner->usb_mode)
+			len += sysfs_emit_at(buf, len, "[%s] ", usb_modes[i]);
+		else
+			len += sysfs_emit_at(buf, len, "%s ", usb_modes[i]);
+	}
+
+	sysfs_emit_at(buf, len - 1, "\n");
+
+	return len;
+}
+
+static ssize_t usb_mode_store(struct device *dev, struct device_attribute *attr,
+			      const char *buf, size_t size)
+{
+	struct typec_partner *partner = to_typec_partner(dev);
+	struct typec_port *port = to_typec_port(dev->parent);
+	int mode;
+	int ret;
+
+	if (!port->ops || !port->ops->enter_usb_mode)
+		return -EOPNOTSUPP;
+
+	mode = sysfs_match_string(usb_modes, buf);
+	if (mode < 0)
+		return mode;
+
+	if (mode == partner->usb_mode)
+		return size;
+
+	ret = port->ops->enter_usb_mode(port, mode);
+	if (ret)
+		return ret;
+
+	typec_partner_set_usb_mode(partner, mode);
+
+	return size;
+}
+static DEVICE_ATTR_RW(usb_mode);
+
 static ssize_t accessory_mode_show(struct device *dev,
 				   struct device_attribute *attr,
 				   char *buf)
@@ -664,6 +733,7 @@ static struct attribute *typec_partner_attrs[] = {
 	&dev_attr_supports_usb_power_delivery.attr,
 	&dev_attr_number_of_alternate_modes.attr,
 	&dev_attr_type.attr,
+	&dev_attr_usb_mode.attr,
 	&dev_attr_usb_power_delivery_revision.attr,
 	NULL
 };
@@ -671,6 +741,14 @@ static struct attribute *typec_partner_attrs[] = {
 static umode_t typec_partner_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n)
 {
 	struct typec_partner *partner = to_typec_partner(kobj_to_dev(kobj));
+	struct typec_port *port = to_typec_port(partner->dev.parent);
+
+	if (attr == &dev_attr_usb_mode.attr) {
+		if (!partner->usb_capability)
+			return 0;
+		if (!port->ops || !port->ops->enter_usb_mode)
+			return 0444;
+	}
 
 	if (attr == &dev_attr_number_of_alternate_modes.attr) {
 		if (partner->num_altmodes < 0)
@@ -744,10 +822,33 @@ static void typec_partner_unlink_device(struct typec_partner *partner, struct de
  */
 int typec_partner_set_identity(struct typec_partner *partner)
 {
-	if (!partner->identity)
+	u8 usb_capability = partner->usb_capability;
+	struct device *dev = &partner->dev;
+	struct usb_pd_identity *id;
+
+	id = get_pd_identity(dev);
+	if (!id)
 		return -EINVAL;
 
-	typec_report_identity(&partner->dev);
+	if (to_typec_port(dev->parent)->data_role == TYPEC_HOST)  {
+		u32 devcap = PD_VDO_UFP_DEVCAP(id->vdo[0]);
+
+		if (devcap & (DEV_USB2_CAPABLE | DEV_USB2_BILLBOARD))
+			usb_capability |= USB_CAPABILITY_USB2;
+		if (devcap & DEV_USB3_CAPABLE)
+			usb_capability |= USB_CAPABILITY_USB3;
+		if (devcap & DEV_USB4_CAPABLE)
+			usb_capability |= USB_CAPABILITY_USB4;
+	} else {
+		usb_capability = PD_VDO_DFP_HOSTCAP(id->vdo[0]);
+	}
+
+	if (partner->usb_capability != usb_capability) {
+		partner->usb_capability = usb_capability;
+		sysfs_notify(&dev->kobj, NULL, "usb_mode");
+	}
+
+	typec_report_identity(dev);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(typec_partner_set_identity);
@@ -917,6 +1018,7 @@ struct typec_partner *typec_register_partner(struct typec_port *port,
 	partner->usb_pd = desc->usb_pd;
 	partner->accessory = desc->accessory;
 	partner->num_altmodes = -1;
+	partner->usb_capability = desc->usb_capability;
 	partner->pd_revision = desc->pd_revision;
 	partner->svdm_version = port->cap->svdm_version;
 	partner->attach = desc->attach;
@@ -936,6 +1038,15 @@ struct typec_partner *typec_register_partner(struct typec_port *port,
 	partner->dev.type = &typec_partner_dev_type;
 	dev_set_name(&partner->dev, "%s-partner", dev_name(&port->dev));
 
+	if (port->usb2_dev) {
+		partner->usb_capability |= USB_CAPABILITY_USB2;
+		partner->usb_mode = USB_MODE_USB2;
+	}
+	if (port->usb3_dev) {
+		partner->usb_capability |= USB_CAPABILITY_USB2 | USB_CAPABILITY_USB3;
+		partner->usb_mode = USB_MODE_USB3;
+	}
+
 	ret = device_register(&partner->dev);
 	if (ret) {
 		dev_err(&port->dev, "failed to register partner (%d)\n", ret);
@@ -1935,13 +2046,18 @@ static void typec_partner_attach(struct typec_connector *con, struct device *dev
 	struct typec_port *port = container_of(con, struct typec_port, con);
 	struct typec_partner *partner = typec_get_partner(port);
 	struct usb_device *udev = to_usb_device(dev);
+	enum usb_mode usb_mode;
 
-	if (udev->speed < USB_SPEED_SUPER)
+	if (udev->speed < USB_SPEED_SUPER) {
+		usb_mode = USB_MODE_USB2;
 		port->usb2_dev = dev;
-	else
+	} else {
+		usb_mode = USB_MODE_USB3;
 		port->usb3_dev = dev;
+	}
 
 	if (partner) {
+		typec_partner_set_usb_mode(partner, usb_mode);
 		typec_partner_link_device(partner, dev);
 		put_device(&partner->dev);
 	}
diff --git a/drivers/usb/typec/class.h b/drivers/usb/typec/class.h
index 85bc50aa54f7..b3076a24ad2e 100644
--- a/drivers/usb/typec/class.h
+++ b/drivers/usb/typec/class.h
@@ -35,6 +35,8 @@ struct typec_partner {
 	int				num_altmodes;
 	u16				pd_revision; /* 0300H = "3.0" */
 	enum usb_pd_svdm_ver		svdm_version;
+	enum usb_mode			usb_mode;
+	u8				usb_capability;
 
 	struct usb_power_delivery	*pd;
 
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index f7edced5b10b..d616b8807000 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -220,6 +220,7 @@ struct typec_cable_desc {
  * @accessory: Audio, Debug or none.
  * @identity: Discover Identity command data
  * @pd_revision: USB Power Delivery Specification Revision if supported
+ * @usb_capability: Supported USB Modes
  * @attach: Notification about attached USB device
  * @deattach: Notification about removed USB device
  *
@@ -237,6 +238,7 @@ struct typec_partner_desc {
 	enum typec_accessory	accessory;
 	struct usb_pd_identity	*identity;
 	u16			pd_revision; /* 0300H = "3.0" */
+	u8			usb_capability;
 
 	void (*attach)(struct typec_partner *partner, struct device *dev);
 	void (*deattach)(struct typec_partner *partner, struct device *dev);
@@ -252,6 +254,7 @@ struct typec_partner_desc {
  * @pd_get: Get available USB Power Delivery Capabilities.
  * @pd_set: Set USB Power Delivery Capabilities.
  * @default_usb_mode_set: USB Mode to be used by default with Enter_USB Message
+ * @enter_usb_mode: Change the active USB Mode
  */
 struct typec_operations {
 	int (*try_role)(struct typec_port *port, int role);
@@ -263,6 +266,7 @@ struct typec_operations {
 	struct usb_power_delivery **(*pd_get)(struct typec_port *port);
 	int (*pd_set)(struct typec_port *port, struct usb_power_delivery *pd);
 	int (*default_usb_mode_set)(struct typec_port *port, enum usb_mode mode);
+	int (*enter_usb_mode)(struct typec_port *port, enum usb_mode mode);
 };
 
 enum usb_pd_svdm_ver {
@@ -365,6 +369,7 @@ int typec_port_set_usb_power_delivery(struct typec_port *port, struct usb_power_
 int typec_partner_set_usb_power_delivery(struct typec_partner *partner,
 					 struct usb_power_delivery *pd);
 
+void typec_partner_set_usb_mode(struct typec_partner *partner, enum usb_mode usb_mode);
 void typec_port_set_usb_mode(struct typec_port *port, enum usb_mode mode);
 
 /**
-- 
cgit v1.2.3


From f35533a0e60946ee3fb8adccf8a36024c6f1fe40 Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Wed, 16 Oct 2024 18:23:23 +0800
Subject: soundwire: sdw_intel: include linux/acpi.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For the acpi_handle stuff.

Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Acked-by: Vinod Koul <vkoul@kernel.org>
Link: https://patch.msgid.link/20241016102333.294448-2-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/soundwire/sdw_intel.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 37ae69365fe2..fae345987b8c 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -4,6 +4,7 @@
 #ifndef __SDW_INTEL_H
 #define __SDW_INTEL_H
 
+#include <linux/acpi.h>
 #include <linux/irqreturn.h>
 #include <linux/soundwire/sdw.h>
 
-- 
cgit v1.2.3


From 4b224ff80d6609811ec6ab5406a16c92825cfb1a Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Wed, 16 Oct 2024 18:23:24 +0800
Subject: ASoC/soundwire: remove sdw_slave_extended_id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This structure is used to copy information from the 'sdw_slave'
structures, it's better to create a flexible array of 'sdw_slave'
pointers and directly access the information. This will also help
access additional information stored in the 'sdw_slave' structure,
such as an SDCA context.

This patch does not add new functionality, it only modified how the
information is retrieved.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://patch.msgid.link/20241016102333.294448-3-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/soundwire/amd_init.c        | 12 ++++++------
 drivers/soundwire/intel_init.c      | 13 ++++++-------
 include/linux/soundwire/sdw.h       |  6 +++---
 include/linux/soundwire/sdw_amd.h   |  7 ++-----
 include/linux/soundwire/sdw_intel.h |  7 ++-----
 include/sound/soc-acpi.h            |  3 +--
 sound/soc/amd/ps/pci-ps.c           |  3 +--
 sound/soc/soc-acpi.c                | 30 ++++++++++++++++--------------
 sound/soc/sof/amd/acp-common.c      |  3 +--
 sound/soc/sof/intel/hda.c           | 16 +++++++++-------
 10 files changed, 47 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/amd_init.c b/drivers/soundwire/amd_init.c
index db040f435059..53d1d707ca1a 100644
--- a/drivers/soundwire/amd_init.c
+++ b/drivers/soundwire/amd_init.c
@@ -177,7 +177,7 @@ EXPORT_SYMBOL_NS(sdw_amd_probe, SOUNDWIRE_AMD_INIT);
 void sdw_amd_exit(struct sdw_amd_ctx *ctx)
 {
 	sdw_amd_cleanup(ctx);
-	kfree(ctx->ids);
+	kfree(ctx->peripherals);
 	kfree(ctx);
 }
 EXPORT_SYMBOL_NS(sdw_amd_exit, SOUNDWIRE_AMD_INIT);
@@ -204,10 +204,11 @@ int sdw_amd_get_slave_info(struct sdw_amd_ctx *ctx)
 			num_slaves++;
 	}
 
-	ctx->ids = kcalloc(num_slaves, sizeof(*ctx->ids), GFP_KERNEL);
-	if (!ctx->ids)
+	ctx->peripherals = kmalloc(struct_size(ctx->peripherals, array, num_slaves),
+				   GFP_KERNEL);
+	if (!ctx->peripherals)
 		return -ENOMEM;
-	ctx->num_slaves = num_slaves;
+	ctx->peripherals->num_peripherals = num_slaves;
 	for (index = 0; index < ctx->count; index++) {
 		if (!(ctx->link_mask & BIT(index)))
 			continue;
@@ -215,8 +216,7 @@ int sdw_amd_get_slave_info(struct sdw_amd_ctx *ctx)
 		if (amd_manager) {
 			bus = &amd_manager->bus;
 			list_for_each_entry(slave, &bus->slaves, node) {
-				ctx->ids[i].id = slave->id;
-				ctx->ids[i].link_id = bus->link_id;
+				ctx->peripherals->array[i] = slave;
 				i++;
 			}
 		}
diff --git a/drivers/soundwire/intel_init.c b/drivers/soundwire/intel_init.c
index a09134b97cd6..12e7a98f319f 100644
--- a/drivers/soundwire/intel_init.c
+++ b/drivers/soundwire/intel_init.c
@@ -252,17 +252,16 @@ static struct sdw_intel_ctx
 			num_slaves++;
 	}
 
-	ctx->ids = kcalloc(num_slaves, sizeof(*ctx->ids), GFP_KERNEL);
-	if (!ctx->ids)
+	ctx->peripherals = kmalloc(struct_size(ctx->peripherals, array, num_slaves),
+				   GFP_KERNEL);
+	if (!ctx->peripherals)
 		goto err;
-
-	ctx->num_slaves = num_slaves;
+	ctx->peripherals->num_peripherals = num_slaves;
 	i = 0;
 	list_for_each_entry(link, &ctx->link_list, list) {
 		bus = &link->cdns->bus;
 		list_for_each_entry(slave, &bus->slaves, node) {
-			ctx->ids[i].id = slave->id;
-			ctx->ids[i].link_id = bus->link_id;
+			ctx->peripherals->array[i] = slave;
 			i++;
 		}
 	}
@@ -371,7 +370,7 @@ void sdw_intel_exit(struct sdw_intel_ctx *ctx)
 	}
 
 	sdw_intel_cleanup(ctx);
-	kfree(ctx->ids);
+	kfree(ctx->peripherals);
 	kfree(ctx->ldev);
 	kfree(ctx);
 }
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 5e0dd47a0412..283c8bfdde49 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -488,9 +488,9 @@ struct sdw_slave_id {
 	__u8 sdw_version:4;
 };
 
-struct sdw_extended_slave_id {
-	int link_id;
-	struct sdw_slave_id id;
+struct sdw_peripherals {
+	int num_peripherals;
+	struct sdw_slave *array[];
 };
 
 /*
diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h
index 28a4eb77717f..585b4c58a8a6 100644
--- a/include/linux/soundwire/sdw_amd.h
+++ b/include/linux/soundwire/sdw_amd.h
@@ -115,19 +115,16 @@ struct sdw_amd_acpi_info {
  * struct sdw_amd_ctx - context allocated by the controller driver probe
  *
  * @count: link count
- * @num_slaves: total number of devices exposed across all enabled links
  * @link_mask: bit-wise mask listing SoundWire links reported by the
  * Controller
- * @ids: array of slave_id, representing Slaves exposed across all enabled
- * links
  * @pdev: platform device structure
+ * @peripherals: array representing Peripherals exposed across all enabled links
  */
 struct sdw_amd_ctx {
 	int count;
-	int num_slaves;
 	u32 link_mask;
-	struct sdw_extended_slave_id *ids;
 	struct platform_device *pdev[AMD_SDW_MAX_MANAGER_COUNT];
+	struct sdw_peripherals *peripherals;
 };
 
 /**
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index fae345987b8c..491ddd27270f 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -287,31 +287,28 @@ struct hdac_bus;
  * hardware capabilities after all power dependencies are settled.
  * @link_mask: bit-wise mask listing SoundWire links reported by the
  * Controller
- * @num_slaves: total number of devices exposed across all enabled links
  * @handle: ACPI parent handle
  * @ldev: information for each link (controller-specific and kept
  * opaque here)
- * @ids: array of slave_id, representing Slaves exposed across all enabled
- * links
  * @link_list: list to handle interrupts across all links
  * @shim_lock: mutex to handle concurrent rmw access to shared SHIM registers.
  * @shim_mask: flags to track initialization of SHIM shared registers
  * @shim_base: sdw shim base.
  * @alh_base: sdw alh base.
+ * @peripherals: array representing Peripherals exposed across all enabled links
  */
 struct sdw_intel_ctx {
 	int count;
 	void __iomem *mmio_base;
 	u32 link_mask;
-	int num_slaves;
 	acpi_handle handle;
 	struct sdw_intel_link_dev **ldev;
-	struct sdw_extended_slave_id *ids;
 	struct list_head link_list;
 	struct mutex shim_lock; /* lock for access to shared SHIM registers */
 	u32 shim_mask;
 	u32 shim_base;
 	u32 alh_base;
+	struct sdw_peripherals *peripherals;
 };
 
 /**
diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h
index 60d3b86a4660..ac7f9e791ee1 100644
--- a/include/sound/soc-acpi.h
+++ b/include/sound/soc-acpi.h
@@ -233,7 +233,6 @@ static inline bool snd_soc_acpi_sof_parent(struct device *dev)
 
 bool snd_soc_acpi_sdw_link_slaves_found(struct device *dev,
 					const struct snd_soc_acpi_link_adr *link,
-					struct sdw_extended_slave_id *ids,
-					int num_slaves);
+					struct sdw_peripherals *peripherals);
 
 #endif
diff --git a/sound/soc/amd/ps/pci-ps.c b/sound/soc/amd/ps/pci-ps.c
index c72d666d51bd..4365499c8f82 100644
--- a/sound/soc/amd/ps/pci-ps.c
+++ b/sound/soc/amd/ps/pci-ps.c
@@ -302,8 +302,7 @@ static struct snd_soc_acpi_mach *acp63_sdw_machine_select(struct device *dev)
 			link = mach->links;
 			for (i = 0; i < acp_data->info.count && link->num_adr; link++, i++) {
 				if (!snd_soc_acpi_sdw_link_slaves_found(dev, link,
-									acp_data->sdw->ids,
-									acp_data->sdw->num_slaves))
+									acp_data->sdw->peripherals))
 					break;
 			}
 			if (i == acp_data->info.count || !link->num_adr)
diff --git a/sound/soc/soc-acpi.c b/sound/soc/soc-acpi.c
index 6d693b2ad5a3..270f9777942f 100644
--- a/sound/soc/soc-acpi.c
+++ b/sound/soc/soc-acpi.c
@@ -131,8 +131,7 @@ EXPORT_SYMBOL_GPL(snd_soc_acpi_codec_list);
 /* Check if all Slaves defined on the link can be found */
 bool snd_soc_acpi_sdw_link_slaves_found(struct device *dev,
 					const struct snd_soc_acpi_link_adr *link,
-					struct sdw_extended_slave_id *ids,
-					int num_slaves)
+					struct sdw_peripherals *peripherals)
 {
 	unsigned int part_id, link_id, unique_id, mfg_id, version;
 	int i, j, k;
@@ -146,22 +145,25 @@ bool snd_soc_acpi_sdw_link_slaves_found(struct device *dev,
 		link_id = SDW_DISCO_LINK_ID(adr);
 		version = SDW_VERSION(adr);
 
-		for (j = 0; j < num_slaves; j++) {
+		for (j = 0; j < peripherals->num_peripherals; j++) {
+			struct sdw_slave *peripheral = peripherals->array[j];
+
 			/* find out how many identical parts were reported on that link */
-			if (ids[j].link_id == link_id &&
-			    ids[j].id.part_id == part_id &&
-			    ids[j].id.mfg_id == mfg_id &&
-			    ids[j].id.sdw_version == version)
+			if (peripheral->bus->link_id == link_id &&
+			    peripheral->id.part_id == part_id &&
+			    peripheral->id.mfg_id == mfg_id &&
+			    peripheral->id.sdw_version == version)
 				reported_part_count++;
 		}
 
-		for (j = 0; j < num_slaves; j++) {
+		for (j = 0; j < peripherals->num_peripherals; j++) {
+			struct sdw_slave *peripheral = peripherals->array[j];
 			int expected_part_count = 0;
 
-			if (ids[j].link_id != link_id ||
-			    ids[j].id.part_id != part_id ||
-			    ids[j].id.mfg_id != mfg_id ||
-			    ids[j].id.sdw_version != version)
+			if (peripheral->bus->link_id != link_id ||
+			    peripheral->id.part_id != part_id ||
+			    peripheral->id.mfg_id != mfg_id ||
+			    peripheral->id.sdw_version != version)
 				continue;
 
 			/* find out how many identical parts are expected */
@@ -180,7 +182,7 @@ bool snd_soc_acpi_sdw_link_slaves_found(struct device *dev,
 				 */
 				unique_id = SDW_UNIQUE_ID(adr);
 				if (reported_part_count == 1 ||
-				    ids[j].id.unique_id == unique_id) {
+				    peripheral->id.unique_id == unique_id) {
 					dev_dbg(dev, "found part_id %#x at link %d\n", part_id, link_id);
 					break;
 				}
@@ -189,7 +191,7 @@ bool snd_soc_acpi_sdw_link_slaves_found(struct device *dev,
 					part_id, reported_part_count, expected_part_count, link_id);
 			}
 		}
-		if (j == num_slaves) {
+		if (j == peripherals->num_peripherals) {
 			dev_dbg(dev, "Slave part_id %#x not found\n", part_id);
 			return false;
 		}
diff --git a/sound/soc/sof/amd/acp-common.c b/sound/soc/sof/amd/acp-common.c
index dbcaac84cb73..fc792956bb97 100644
--- a/sound/soc/sof/amd/acp-common.c
+++ b/sound/soc/sof/amd/acp-common.c
@@ -145,8 +145,7 @@ static struct snd_soc_acpi_mach *amd_sof_sdw_machine_select(struct snd_sof_dev *
 			link = mach->links;
 			for (i = 0; i < acp_data->info.count && link->num_adr; link++, i++) {
 				if (!snd_soc_acpi_sdw_link_slaves_found(sdev->dev, link,
-									acp_data->sdw->ids,
-									acp_data->sdw->num_slaves))
+									acp_data->sdw->peripherals))
 					break;
 			}
 			if (i == acp_data->info.count || !link->num_adr)
diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c
index e4cb4ffc7270..9abc4c071ae5 100644
--- a/sound/soc/sof/intel/hda.c
+++ b/sound/soc/sof/intel/hda.c
@@ -1064,7 +1064,7 @@ static struct snd_soc_acpi_mach *hda_sdw_machine_select(struct snd_sof_dev *sdev
 {
 	struct snd_sof_pdata *pdata = sdev->pdata;
 	const struct snd_soc_acpi_link_adr *link;
-	struct sdw_extended_slave_id *ids;
+	struct sdw_peripherals *peripherals;
 	struct snd_soc_acpi_mach *mach;
 	struct sof_intel_hda_dev *hdev;
 	u32 link_mask;
@@ -1083,7 +1083,7 @@ static struct snd_soc_acpi_mach *hda_sdw_machine_select(struct snd_sof_dev *sdev
 		return NULL;
 	}
 
-	if (!hdev->sdw->num_slaves) {
+	if (!hdev->sdw->peripherals || !hdev->sdw->peripherals->num_peripherals) {
 		dev_warn(sdev->dev, "No SoundWire peripheral detected in ACPI tables\n");
 		return NULL;
 	}
@@ -1119,8 +1119,7 @@ static struct snd_soc_acpi_mach *hda_sdw_machine_select(struct snd_sof_dev *sdev
 			 * are not found on this link.
 			 */
 			if (!snd_soc_acpi_sdw_link_slaves_found(sdev->dev, link,
-								hdev->sdw->ids,
-								hdev->sdw->num_slaves))
+								hdev->sdw->peripherals))
 				break;
 		}
 		/* Found if all Slaves are checked */
@@ -1136,10 +1135,13 @@ static struct snd_soc_acpi_mach *hda_sdw_machine_select(struct snd_sof_dev *sdev
 	}
 
 	dev_info(sdev->dev, "No SoundWire machine driver found for the ACPI-reported configuration:\n");
-	ids = hdev->sdw->ids;
-	for (i = 0; i < hdev->sdw->num_slaves; i++)
+	peripherals = hdev->sdw->peripherals;
+	for (i = 0; i < peripherals->num_peripherals; i++)
 		dev_info(sdev->dev, "link %d mfg_id 0x%04x part_id 0x%04x version %#x\n",
-			 ids[i].link_id, ids[i].id.mfg_id, ids[i].id.part_id, ids[i].id.sdw_version);
+			 peripherals->array[i]->bus->link_id,
+			 peripherals->array[i]->id.mfg_id,
+			 peripherals->array[i]->id.part_id,
+			 peripherals->array[i]->id.sdw_version);
 
 	return NULL;
 }
-- 
cgit v1.2.3


From 3a513da1ae33972e59efeef7908061f1f24af480 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Wed, 16 Oct 2024 18:23:25 +0800
Subject: ASoC: SDCA: add initial module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add new module for SDCA (SoundWire Device Class for Audio) support.
For now just add a parser to identify the SDCA revision and the
function mask.

Note that the SDCA definitions and related MIPI DisCo properties are
defined only for ACPI platforms and extracted with _DSD helpers. There
is currently no support for Device Tree in the specification, the
'depends on ACPI' reflects this design limitation. This might change
in a future revision of the specification but for SDCA 1.0 ACPI is the
only supported type of platform firmware.

The SDCA library is defined with static inline fallbacks, which will
allow for unconditional addition of SDCA support in common parts of
the code.

The design follows a four-step process:

1) Basic information related to Functions is extracted from MIPI DisCo
tables and stored in the 'struct sdw_slave'. Devm_ based memory
allocation is not allowed at this point prior to a driver probe, so we only
store the function node, address and type.

2) When a codec driver probes, it will register subdevices for each
Function identified in phase 1)

3) a driver will probe for each subdevice and addition parsing/memory
allocation takes place at this level. devm_ based allocation is highly
encouraged to make error handling manageable.

4) Before the peripheral device becomes physically attached, register
access is not permitted and the regmaps are cache-only. When
peripheral device is enumerated, the bus level uses the
'update_status' notification; after optional device-level
initialization, the codec driver will notify each of the subdevices so
that they can start interacting with the hardware.

Note that the context extracted in 1) should be arguably be handled
completely in the codec driver probe. That would however make it
difficult to use the ACPI information for machine quirks, and
e.g. select different machine driver and topologies as done for the
RT712_VB handling later in the series. To make the implementation of
quirks simpler, this patchset extracts a minimal amount of context
(interface revision and number/type of Functions) before the codec
driver probe, and stores this context in the scope of the 'struct
sdw_slave'.

The SDCA library can also be used in a vendor-specific driver without
creating subdevices, e.g. to retrieve the 'initialization-table'
values to write platform-specific values as needed.

For more technical details, the SDCA specification is available for
public downloads at https://www.mipi.org/mipi-sdca-v1-0-download

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://patch.msgid.link/20241016102333.294448-4-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/soundwire/sdw.h   |   3 +
 include/sound/sdca.h            |  54 +++++++++++++
 include/sound/sdca_function.h   |  55 +++++++++++++
 sound/soc/Kconfig               |   1 +
 sound/soc/Makefile              |   1 +
 sound/soc/sdca/Kconfig          |  11 +++
 sound/soc/sdca/Makefile         |   5 ++
 sound/soc/sdca/sdca_device.c    |  24 ++++++
 sound/soc/sdca/sdca_functions.c | 173 ++++++++++++++++++++++++++++++++++++++++
 9 files changed, 327 insertions(+)
 create mode 100644 include/sound/sdca.h
 create mode 100644 include/sound/sdca_function.h
 create mode 100644 sound/soc/sdca/Kconfig
 create mode 100644 sound/soc/sdca/Makefile
 create mode 100644 sound/soc/sdca/sdca_device.c
 create mode 100644 sound/soc/sdca/sdca_functions.c

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 283c8bfdde49..49d690f3d29a 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -10,6 +10,7 @@
 #include <linux/irqdomain.h>
 #include <linux/mod_devicetable.h>
 #include <linux/bitfield.h>
+#include <sound/sdca.h>
 
 struct sdw_bus;
 struct sdw_slave;
@@ -663,6 +664,7 @@ struct sdw_slave_ops {
  * @is_mockup_device: status flag used to squelch errors in the command/control
  * protocol for SoundWire mockup devices
  * @sdw_dev_lock: mutex used to protect callbacks/remove races
+ * @sdca_data: structure containing all device data for SDCA helpers
  */
 struct sdw_slave {
 	struct sdw_slave_id id;
@@ -686,6 +688,7 @@ struct sdw_slave {
 	bool first_interrupt_done;
 	bool is_mockup_device;
 	struct mutex sdw_dev_lock; /* protect callbacks/remove races */
+	struct sdca_device_data sdca_data;
 };
 
 #define dev_to_sdw_dev(_dev) container_of(_dev, struct sdw_slave, dev)
diff --git a/include/sound/sdca.h b/include/sound/sdca.h
new file mode 100644
index 000000000000..34473ca4c789
--- /dev/null
+++ b/include/sound/sdca.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ *
+ * Copyright(c) 2024 Intel Corporation
+ */
+
+#ifndef __SDCA_H__
+#define __SDCA_H__
+
+struct sdw_slave;
+
+#define SDCA_MAX_FUNCTION_COUNT 8
+
+/**
+ * sdca_device_desc - short descriptor for an SDCA Function
+ * @adr: ACPI address (used for SDCA register access)
+ * @type: Function topology type
+ * @name: human-readable string
+ */
+struct sdca_function_desc {
+	u64 adr;
+	u32 type;
+	const char *name;
+};
+
+/**
+ * sdca_device_data - structure containing all SDCA related information
+ * @sdca_interface_revision: value read from _DSD property, mainly to check
+ * for changes between silicon versions
+ * @num_functions: total number of supported SDCA functions. Invalid/unsupported
+ * functions will be skipped.
+ * @sdca_func: array of function descriptors
+ */
+struct sdca_device_data {
+	u32 interface_revision;
+	int num_functions;
+	struct sdca_function_desc sdca_func[SDCA_MAX_FUNCTION_COUNT];
+};
+
+#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SND_SOC_SDCA)
+
+void sdca_lookup_functions(struct sdw_slave *slave);
+void sdca_lookup_interface_revision(struct sdw_slave *slave);
+
+#else
+
+static inline void sdca_lookup_functions(struct sdw_slave *slave) {}
+static inline void sdca_lookup_interface_revision(struct sdw_slave *slave) {}
+
+#endif
+
+#endif
diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
new file mode 100644
index 000000000000..a01eec86b9a6
--- /dev/null
+++ b/include/sound/sdca_function.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ *
+ * Copyright(c) 2024 Intel Corporation
+ */
+
+#ifndef __SDCA_FUNCTION_H__
+#define __SDCA_FUNCTION_H__
+
+/*
+ * SDCA Function Types from SDCA specification v1.0a Section 5.1.2
+ * all Function types not described are reserved
+ * Note that SIMPLE_AMP, SIMPLE_MIC and SIMPLE_JACK Function Types
+ * are NOT defined in SDCA 1.0a, but they were defined in earlier
+ * drafts and are planned for 1.1.
+ */
+
+enum sdca_function_type {
+	SDCA_FUNCTION_TYPE_SMART_AMP	= 0x01,	/* Amplifier with protection features */
+	SDCA_FUNCTION_TYPE_SIMPLE_AMP	= 0x02,	/* subset of SmartAmp */
+	SDCA_FUNCTION_TYPE_SMART_MIC	= 0x03,	/* Smart microphone with acoustic triggers */
+	SDCA_FUNCTION_TYPE_SIMPLE_MIC	= 0x04,	/* subset of SmartMic */
+	SDCA_FUNCTION_TYPE_SPEAKER_MIC	= 0x05,	/* Combination of SmartMic and SmartAmp */
+	SDCA_FUNCTION_TYPE_UAJ		= 0x06,	/* 3.5mm Universal Audio jack */
+	SDCA_FUNCTION_TYPE_RJ		= 0x07,	/* Retaskable jack */
+	SDCA_FUNCTION_TYPE_SIMPLE_JACK	= 0x08,	/* Subset of UAJ */
+	SDCA_FUNCTION_TYPE_HID		= 0x0A,	/* Human Interface Device, for e.g. buttons */
+	SDCA_FUNCTION_TYPE_IMP_DEF	= 0x1F,	/* Implementation-defined function */
+};
+
+/* Human-readable names used for kernel logs and Function device registration/bind */
+#define	SDCA_FUNCTION_TYPE_SMART_AMP_NAME	"SmartAmp"
+#define	SDCA_FUNCTION_TYPE_SIMPLE_AMP_NAME	"SimpleAmp"
+#define	SDCA_FUNCTION_TYPE_SMART_MIC_NAME	"SmartMic"
+#define	SDCA_FUNCTION_TYPE_SIMPLE_MIC_NAME	"SimpleMic"
+#define	SDCA_FUNCTION_TYPE_SPEAKER_MIC_NAME	"SpeakerMic"
+#define	SDCA_FUNCTION_TYPE_UAJ_NAME		"UAJ"
+#define	SDCA_FUNCTION_TYPE_RJ_NAME		"RJ"
+#define	SDCA_FUNCTION_TYPE_SIMPLE_NAME		"SimpleJack"
+#define	SDCA_FUNCTION_TYPE_HID_NAME		"HID"
+
+enum sdca_entity0_controls {
+	SDCA_CONTROL_ENTITY_0_COMMIT_GROUP_MASK		= 0x01,
+	SDCA_CONTROL_ENTITY_0_INTSTAT_CLEAR		= 0x02,
+	SDCA_CONTROL_ENTITY_0_INT_ENABLE		= 0x03,
+	SDCA_CONTROL_ENTITY_0_FUNCTION_SDCA_VERSION	= 0x04,
+	SDCA_CONTROL_ENTITY_0_FUNCTION_TOPOLOGY		= 0x05,
+	SDCA_CONTROL_ENTITY_0_FUNCTION_MANUFACTURER_ID	= 0x06,
+	SDCA_CONTROL_ENTITY_0_FUNCTION_ID		= 0x07,
+	SDCA_CONTROL_ENTITY_0_FUNCTION_VERSION		= 0x08
+};
+
+#endif
diff --git a/sound/soc/Kconfig b/sound/soc/Kconfig
index e87bd15a8b43..8e01b421fe8d 100644
--- a/sound/soc/Kconfig
+++ b/sound/soc/Kconfig
@@ -108,6 +108,7 @@ source "sound/soc/pxa/Kconfig"
 source "sound/soc/qcom/Kconfig"
 source "sound/soc/rockchip/Kconfig"
 source "sound/soc/samsung/Kconfig"
+source "sound/soc/sdca/Kconfig"
 source "sound/soc/sh/Kconfig"
 source "sound/soc/sof/Kconfig"
 source "sound/soc/spear/Kconfig"
diff --git a/sound/soc/Makefile b/sound/soc/Makefile
index 775bb38c2ed4..5307b0b62a93 100644
--- a/sound/soc/Makefile
+++ b/sound/soc/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_SND_SOC)	+= pxa/
 obj-$(CONFIG_SND_SOC)	+= qcom/
 obj-$(CONFIG_SND_SOC)	+= rockchip/
 obj-$(CONFIG_SND_SOC)	+= samsung/
+obj-$(CONFIG_SND_SOC)	+= sdca/
 obj-$(CONFIG_SND_SOC)	+= sh/
 obj-$(CONFIG_SND_SOC)	+= sof/
 obj-$(CONFIG_SND_SOC)	+= spear/
diff --git a/sound/soc/sdca/Kconfig b/sound/soc/sdca/Kconfig
new file mode 100644
index 000000000000..07f6822fa614
--- /dev/null
+++ b/sound/soc/sdca/Kconfig
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config SND_SOC_SDCA
+	tristate "ASoC SDCA library"
+	depends on ACPI
+	help
+	  This option enables support for the MIPI SoundWire Device
+	  Class for Audio (SDCA).
+
+config SND_SOC_SDCA_OPTIONAL
+	def_tristate SND_SOC_SDCA || !SND_SOC_SDCA
diff --git a/sound/soc/sdca/Makefile b/sound/soc/sdca/Makefile
new file mode 100644
index 000000000000..c296bd5a0a7c
--- /dev/null
+++ b/sound/soc/sdca/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+snd-soc-sdca-objs	:= sdca_functions.o sdca_device.o
+
+obj-$(CONFIG_SND_SOC_SDCA)	+= snd-soc-sdca.o
diff --git a/sound/soc/sdca/sdca_device.c b/sound/soc/sdca/sdca_device.c
new file mode 100644
index 000000000000..58f5f6f0f723
--- /dev/null
+++ b/sound/soc/sdca/sdca_device.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2024 Intel Corporation
+
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ */
+
+#include <linux/acpi.h>
+#include <linux/soundwire/sdw.h>
+#include <sound/sdca.h>
+
+void sdca_lookup_interface_revision(struct sdw_slave *slave)
+{
+	struct fwnode_handle *fwnode = slave->dev.fwnode;
+
+	/*
+	 * if this property is not present, then the sdca_interface_revision will
+	 * remain zero, which will be considered as 'not defined' or 'invalid'.
+	 */
+	fwnode_property_read_u32(fwnode, "mipi-sdw-sdca-interface-revision",
+				 &slave->sdca_data.interface_revision);
+}
+EXPORT_SYMBOL_NS(sdca_lookup_interface_revision, SND_SOC_SDCA);
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
new file mode 100644
index 000000000000..a6ad57430dd4
--- /dev/null
+++ b/sound/soc/sdca/sdca_functions.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2024 Intel Corporation
+
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ */
+
+#include <linux/acpi.h>
+#include <linux/soundwire/sdw.h>
+#include <sound/sdca.h>
+#include <sound/sdca_function.h>
+
+static int patch_sdca_function_type(struct device *dev,
+				    u32 interface_revision,
+				    u32 *function_type,
+				    const char **function_name)
+{
+	unsigned long function_type_patch = 0;
+
+	/*
+	 * Unfortunately early SDCA specifications used different indices for Functions,
+	 * for backwards compatibility we have to reorder the values found
+	 */
+	if (interface_revision >= 0x0801)
+		goto skip_early_draft_order;
+
+	switch (*function_type) {
+	case 1:
+		function_type_patch = SDCA_FUNCTION_TYPE_SMART_AMP;
+		break;
+	case 2:
+		function_type_patch = SDCA_FUNCTION_TYPE_SMART_MIC;
+		break;
+	case 3:
+		function_type_patch = SDCA_FUNCTION_TYPE_SPEAKER_MIC;
+		break;
+	case 4:
+		function_type_patch = SDCA_FUNCTION_TYPE_UAJ;
+		break;
+	case 5:
+		function_type_patch = SDCA_FUNCTION_TYPE_RJ;
+		break;
+	case 6:
+		function_type_patch = SDCA_FUNCTION_TYPE_HID;
+		break;
+	default:
+		dev_warn(dev, "%s: SDCA version %#x unsupported function type %d, skipped\n",
+			 __func__, interface_revision, *function_type);
+		return -EINVAL;
+	}
+
+skip_early_draft_order:
+	if (function_type_patch)
+		*function_type = function_type_patch;
+
+	/* now double-check the values */
+	switch (*function_type) {
+	case SDCA_FUNCTION_TYPE_SMART_AMP:
+		*function_name = SDCA_FUNCTION_TYPE_SMART_AMP_NAME;
+		break;
+	case SDCA_FUNCTION_TYPE_SMART_MIC:
+		*function_name = SDCA_FUNCTION_TYPE_SMART_MIC_NAME;
+		break;
+	case SDCA_FUNCTION_TYPE_UAJ:
+		*function_name = SDCA_FUNCTION_TYPE_UAJ_NAME;
+		break;
+	case SDCA_FUNCTION_TYPE_HID:
+		*function_name = SDCA_FUNCTION_TYPE_HID_NAME;
+		break;
+	case SDCA_FUNCTION_TYPE_SIMPLE_AMP:
+	case SDCA_FUNCTION_TYPE_SIMPLE_MIC:
+	case SDCA_FUNCTION_TYPE_SPEAKER_MIC:
+	case SDCA_FUNCTION_TYPE_RJ:
+	case SDCA_FUNCTION_TYPE_IMP_DEF:
+		dev_warn(dev, "%s: found unsupported SDCA function type %d, skipped\n",
+			 __func__, *function_type);
+		return -EINVAL;
+	default:
+		dev_err(dev, "%s: found invalid SDCA function type %d, skipped\n",
+			__func__, *function_type);
+		return -EINVAL;
+	}
+
+	dev_info(dev, "%s: found SDCA function %s (type %d)\n",
+		 __func__, *function_name, *function_type);
+
+	return 0;
+}
+
+static int find_sdca_function(struct acpi_device *adev, void *data)
+{
+	struct fwnode_handle *function_node = acpi_fwnode_handle(adev);
+	struct sdca_device_data *sdca_data = data;
+	struct device *dev = &adev->dev;
+	struct fwnode_handle *control5; /* used to identify function type */
+	const char *function_name;
+	u32 function_type;
+	int func_index;
+	u64 addr;
+	int ret;
+
+	if (sdca_data->num_functions >= SDCA_MAX_FUNCTION_COUNT) {
+		dev_err(dev, "%s: maximum number of functions exceeded\n", __func__);
+		return -EINVAL;
+	}
+
+	/*
+	 * The number of functions cannot exceed 8, we could use
+	 * acpi_get_local_address() but the value is stored as u64 so
+	 * we might as well avoid casts and intermediate levels
+	 */
+	ret = acpi_get_local_u64_address(adev->handle, &addr);
+	if (ret < 0)
+		return ret;
+
+	if (!addr) {
+		dev_err(dev, "%s: no addr\n", __func__);
+		return -ENODEV;
+	}
+
+	/*
+	 * Extracting the topology type for an SDCA function is a
+	 * convoluted process.
+	 * The Function type is only visible as a result of a read
+	 * from a control. In theory this would mean reading from the hardware,
+	 * but the SDCA/DisCo specs defined the notion of "DC value" - a constant
+	 * represented with a DSD subproperty.
+	 * Drivers have to query the properties for the control
+	 * SDCA_CONTROL_ENTITY_0_FUNCTION_TOPOLOGY (0x05)
+	 */
+	control5 = fwnode_get_named_child_node(function_node,
+					       "mipi-sdca-control-0x5-subproperties");
+	if (!control5)
+		return -ENODEV;
+
+	ret = fwnode_property_read_u32(control5, "mipi-sdca-control-dc-value",
+				       &function_type);
+
+	fwnode_handle_put(control5);
+
+	if (ret < 0) {
+		dev_err(dev, "%s: the function type can only be determined from ACPI information\n",
+			__func__);
+		return ret;
+	}
+
+	ret = patch_sdca_function_type(dev, sdca_data->interface_revision,
+				       &function_type, &function_name);
+	if (ret < 0)
+		return ret;
+
+	/* store results */
+	func_index = sdca_data->num_functions;
+	sdca_data->sdca_func[func_index].adr = addr;
+	sdca_data->sdca_func[func_index].type = function_type;
+	sdca_data->sdca_func[func_index].name = function_name;
+	sdca_data->num_functions++;
+
+	return 0;
+}
+
+void sdca_lookup_functions(struct sdw_slave *slave)
+{
+	struct device *dev = &slave->dev;
+	struct acpi_device *adev = to_acpi_device_node(dev->fwnode);
+
+	acpi_dev_for_each_child(adev, find_sdca_function, &slave->sdca_data);
+}
+EXPORT_SYMBOL_NS(sdca_lookup_functions, SND_SOC_SDCA);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("SDCA library");
-- 
cgit v1.2.3


From 6a136805e3c1532d2414b298c024429943cfd482 Mon Sep 17 00:00:00 2001
From: Théo Lebrun <theo.lebrun@bootlin.com>
Date: Mon, 7 Oct 2024 15:49:18 +0200
Subject: clk: divider: Introduce CLK_DIVIDER_EVEN_INTEGERS flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add CLK_DIVIDER_EVEN_INTEGERS flag to support divisor of 2, 4, 6, etc.
The same divisor can be done using a table, which would be big and
wasteful for a clock dividor of width 8 (256 entries).

Require increasing flags size from u8 to u16 because
CLK_DIVIDER_EVEN_INTEGERS is the eighth flag. u16 is used inside struct
clk_divider; `unsigned long` is used for function arguments.

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://lore.kernel.org/r/20241007-mbly-clk-v5-3-e9d8994269cb@bootlin.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-divider.c    | 16 ++++++++++++----
 include/linux/clk-provider.h | 15 ++++++++++-----
 2 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c
index a2c2b5203b0a..c1f426b8a504 100644
--- a/drivers/clk/clk-divider.c
+++ b/drivers/clk/clk-divider.c
@@ -72,6 +72,8 @@ static unsigned int _get_maxdiv(const struct clk_div_table *table, u8 width,
 		return clk_div_mask(width);
 	if (flags & CLK_DIVIDER_POWER_OF_TWO)
 		return 1 << clk_div_mask(width);
+	if (flags & CLK_DIVIDER_EVEN_INTEGERS)
+		return 2 * (clk_div_mask(width) + 1);
 	if (table)
 		return _get_table_maxdiv(table, width);
 	return clk_div_mask(width) + 1;
@@ -97,6 +99,8 @@ static unsigned int _get_div(const struct clk_div_table *table,
 		return 1 << val;
 	if (flags & CLK_DIVIDER_MAX_AT_ZERO)
 		return val ? val : clk_div_mask(width) + 1;
+	if (flags & CLK_DIVIDER_EVEN_INTEGERS)
+		return 2 * (val + 1);
 	if (table)
 		return _get_table_div(table, val);
 	return val + 1;
@@ -122,6 +126,8 @@ static unsigned int _get_val(const struct clk_div_table *table,
 		return __ffs(div);
 	if (flags & CLK_DIVIDER_MAX_AT_ZERO)
 		return (div == clk_div_mask(width) + 1) ? 0 : div;
+	if (flags & CLK_DIVIDER_EVEN_INTEGERS)
+		return (div >> 1) - 1;
 	if (table)
 		return  _get_table_val(table, div);
 	return div - 1;
@@ -538,7 +544,8 @@ struct clk_hw *__clk_hw_register_divider(struct device *dev,
 		struct device_node *np, const char *name,
 		const char *parent_name, const struct clk_hw *parent_hw,
 		const struct clk_parent_data *parent_data, unsigned long flags,
-		void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags,
+		void __iomem *reg, u8 shift, u8 width,
+		unsigned long clk_divider_flags,
 		const struct clk_div_table *table, spinlock_t *lock)
 {
 	struct clk_divider *div;
@@ -610,8 +617,8 @@ EXPORT_SYMBOL_GPL(__clk_hw_register_divider);
 struct clk *clk_register_divider_table(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		void __iomem *reg, u8 shift, u8 width,
-		u8 clk_divider_flags, const struct clk_div_table *table,
-		spinlock_t *lock)
+		unsigned long clk_divider_flags,
+		const struct clk_div_table *table, spinlock_t *lock)
 {
 	struct clk_hw *hw;
 
@@ -664,7 +671,8 @@ struct clk_hw *__devm_clk_hw_register_divider(struct device *dev,
 		struct device_node *np, const char *name,
 		const char *parent_name, const struct clk_hw *parent_hw,
 		const struct clk_parent_data *parent_data, unsigned long flags,
-		void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags,
+		void __iomem *reg, u8 shift, u8 width,
+		unsigned long clk_divider_flags,
 		const struct clk_div_table *table, spinlock_t *lock)
 {
 	struct clk_hw **ptr, *hw;
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 7e43caabb54b..dbe793964c24 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -689,13 +689,15 @@ struct clk_div_table {
  * CLK_DIVIDER_BIG_ENDIAN - By default little endian register accesses are used
  *	for the divider register.  Setting this flag makes the register accesses
  *	big endian.
+ * CLK_DIVIDER_EVEN_INTEGERS - clock divisor is 2, 4, 6, 8, 10, etc.
+ *	Formula is 2 * (value read from hardware + 1).
  */
 struct clk_divider {
 	struct clk_hw	hw;
 	void __iomem	*reg;
 	u8		shift;
 	u8		width;
-	u8		flags;
+	u16		flags;
 	const struct clk_div_table	*table;
 	spinlock_t	*lock;
 };
@@ -711,6 +713,7 @@ struct clk_divider {
 #define CLK_DIVIDER_READ_ONLY		BIT(5)
 #define CLK_DIVIDER_MAX_AT_ZERO		BIT(6)
 #define CLK_DIVIDER_BIG_ENDIAN		BIT(7)
+#define CLK_DIVIDER_EVEN_INTEGERS	BIT(8)
 
 extern const struct clk_ops clk_divider_ops;
 extern const struct clk_ops clk_divider_ro_ops;
@@ -740,19 +743,21 @@ struct clk_hw *__clk_hw_register_divider(struct device *dev,
 		struct device_node *np, const char *name,
 		const char *parent_name, const struct clk_hw *parent_hw,
 		const struct clk_parent_data *parent_data, unsigned long flags,
-		void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags,
+		void __iomem *reg, u8 shift, u8 width,
+		unsigned long clk_divider_flags,
 		const struct clk_div_table *table, spinlock_t *lock);
 struct clk_hw *__devm_clk_hw_register_divider(struct device *dev,
 		struct device_node *np, const char *name,
 		const char *parent_name, const struct clk_hw *parent_hw,
 		const struct clk_parent_data *parent_data, unsigned long flags,
-		void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags,
+		void __iomem *reg, u8 shift, u8 width,
+		unsigned long clk_divider_flags,
 		const struct clk_div_table *table, spinlock_t *lock);
 struct clk *clk_register_divider_table(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		void __iomem *reg, u8 shift, u8 width,
-		u8 clk_divider_flags, const struct clk_div_table *table,
-		spinlock_t *lock);
+		unsigned long clk_divider_flags,
+		const struct clk_div_table *table, spinlock_t *lock);
 /**
  * clk_register_divider - register a divider clock with the clock framework
  * @dev: device registering this clock
-- 
cgit v1.2.3


From 44fcc479a574a4055fb6aed1f786d39999466383 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Thu, 17 Oct 2024 06:37:33 +0200
Subject: power: supply: hwmon: move interface to private header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The interface of power_supply_hwmon.c is only meant to be used by the
psy core. Remove it from the public header file and use the private one
instead.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20241017-power-supply-cleanups-v2-1-cb0f5deab088@weissschuh.net
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/power_supply.h | 17 +++++++++++++++++
 include/linux/power_supply.h        | 13 -------------
 2 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/power_supply.h b/drivers/power/supply/power_supply.h
index 4c558fb3db72..7434a6f24775 100644
--- a/drivers/power/supply/power_supply.h
+++ b/drivers/power/supply/power_supply.h
@@ -44,3 +44,20 @@ static inline int power_supply_create_triggers(struct power_supply *psy)
 static inline void power_supply_remove_triggers(struct power_supply *psy) {}
 
 #endif /* CONFIG_LEDS_TRIGGERS */
+
+#ifdef CONFIG_POWER_SUPPLY_HWMON
+
+int power_supply_add_hwmon_sysfs(struct power_supply *psy);
+void power_supply_remove_hwmon_sysfs(struct power_supply *psy);
+
+#else
+
+static inline int power_supply_add_hwmon_sysfs(struct power_supply *psy)
+{
+	return 0;
+}
+
+static inline
+void power_supply_remove_hwmon_sysfs(struct power_supply *psy) {}
+
+#endif /* CONFIG_POWER_SUPPLY_HWMON */
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index c1d6cb7f4c40..b98106e1a90f 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -937,19 +937,6 @@ static inline bool power_supply_is_watt_property(enum power_supply_property psp)
 	return false;
 }
 
-#ifdef CONFIG_POWER_SUPPLY_HWMON
-int power_supply_add_hwmon_sysfs(struct power_supply *psy);
-void power_supply_remove_hwmon_sysfs(struct power_supply *psy);
-#else
-static inline int power_supply_add_hwmon_sysfs(struct power_supply *psy)
-{
-	return 0;
-}
-
-static inline
-void power_supply_remove_hwmon_sysfs(struct power_supply *psy) {}
-#endif
-
 #ifdef CONFIG_SYSFS
 ssize_t power_supply_charge_behaviour_show(struct device *dev,
 					   unsigned int available_behaviours,
-- 
cgit v1.2.3


From 0784181b44af831a3fa52e1e5ff77c388d699dba Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 26 Sep 2024 16:17:37 +0100
Subject: lockdep: Add lockdep_cleanup_dead_cpu()

Add a function to check that an offline CPU has left the tracing
infrastructure in a sane state.

Commit 9bb69ba4c177 ("ACPI: processor_idle: use raw_safe_halt() in
acpi_idle_play_dead()") fixed an issue where the acpi_idle_play_dead()
function called safe_halt() instead of raw_safe_halt(), which had the
side-effect of setting the hardirqs_enabled flag for the offline CPU.

On x86 this triggered warnings from lockdep_assert_irqs_disabled() when
the CPU was brought back online again later. These warnings were too
early for the exception to be handled correctly, leading to a
triple-fault.

Add lockdep_cleanup_dead_cpu() to check for this kind of failure mode,
print the events leading up to it, and correct it so that the CPU can
come online again correctly. Re-introducing the original bug now merely
results in this warning instead:

[   61.556652] smpboot: CPU 1 is now offline
[   61.556769] CPU 1 left hardirqs enabled!
[   61.556915] irq event stamp: 128149
[   61.556965] hardirqs last  enabled at (128149): [<ffffffff81720a36>] acpi_idle_play_dead+0x46/0x70
[   61.557055] hardirqs last disabled at (128148): [<ffffffff81124d50>] do_idle+0x90/0xe0
[   61.557117] softirqs last  enabled at (128078): [<ffffffff81cec74c>] __do_softirq+0x31c/0x423
[   61.557199] softirqs last disabled at (128065): [<ffffffff810baae1>] __irq_exit_rcu+0x91/0x100

[boqun: Capitalize the title and reword the message a bit]

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://lore.kernel.org/r/f7bd2b3b999051bb3ef4be34526a9262008285f5.camel@infradead.org
---
 include/linux/irqflags.h |  6 ++++++
 kernel/cpu.c             |  1 +
 kernel/locking/lockdep.c | 24 ++++++++++++++++++++++++
 3 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 3f003d5fde53..57b074e0cfbb 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -18,6 +18,8 @@
 #include <asm/irqflags.h>
 #include <asm/percpu.h>
 
+struct task_struct;
+
 /* Currently lockdep_softirqs_on/off is used only by lockdep */
 #ifdef CONFIG_PROVE_LOCKING
   extern void lockdep_softirqs_on(unsigned long ip);
@@ -25,12 +27,16 @@
   extern void lockdep_hardirqs_on_prepare(void);
   extern void lockdep_hardirqs_on(unsigned long ip);
   extern void lockdep_hardirqs_off(unsigned long ip);
+  extern void lockdep_cleanup_dead_cpu(unsigned int cpu,
+				       struct task_struct *idle);
 #else
   static inline void lockdep_softirqs_on(unsigned long ip) { }
   static inline void lockdep_softirqs_off(unsigned long ip) { }
   static inline void lockdep_hardirqs_on_prepare(void) { }
   static inline void lockdep_hardirqs_on(unsigned long ip) { }
   static inline void lockdep_hardirqs_off(unsigned long ip) { }
+  static inline void lockdep_cleanup_dead_cpu(unsigned int cpu,
+					      struct task_struct *idle) {}
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d293d52a3e00..c4aaf73dec9e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1338,6 +1338,7 @@ static int takedown_cpu(unsigned int cpu)
 
 	cpuhp_bp_sync_dead(cpu);
 
+	lockdep_cleanup_dead_cpu(cpu, idle_thread_get(cpu));
 	tick_cleanup_dead_cpu(cpu);
 
 	/*
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 536bd471557f..6fd4af217e71 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4586,6 +4586,30 @@ void lockdep_softirqs_off(unsigned long ip)
 		debug_atomic_inc(redundant_softirqs_off);
 }
 
+/**
+ * lockdep_cleanup_dead_cpu - Ensure CPU lockdep state is cleanly stopped
+ *
+ * @cpu: index of offlined CPU
+ * @idle: task pointer for offlined CPU's idle thread
+ *
+ * Invoked after the CPU is dead. Ensures that the tracing infrastructure
+ * is left in a suitable state for the CPU to be subsequently brought
+ * online again.
+ */
+void lockdep_cleanup_dead_cpu(unsigned int cpu, struct task_struct *idle)
+{
+	if (unlikely(!debug_locks))
+		return;
+
+	if (unlikely(per_cpu(hardirqs_enabled, cpu))) {
+		pr_warn("CPU %u left hardirqs enabled!", cpu);
+		if (idle)
+			print_irqtrace_events(idle);
+		/* Clean it up for when the CPU comes online again. */
+		per_cpu(hardirqs_enabled, cpu) = 0;
+	}
+}
+
 static int
 mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
 {
-- 
cgit v1.2.3


From d7fe143cb115076fed0126ad8cf5ba6c3e575e43 Mon Sep 17 00:00:00 2001
From: Ahmed Ehab <bottaawesome633@gmail.com>
Date: Sun, 25 Aug 2024 01:10:30 +0300
Subject: locking/lockdep: Avoid creating new name string literals in
 lockdep_set_subclass()

Syzbot reports a problem that a warning will be triggered while
searching a lock class in look_up_lock_class().

The cause of the issue is that a new name is created and used by
lockdep_set_subclass() instead of using the existing one. This results
in a lock instance has a different name pointer than previous registered
one stored in lock class, and WARN_ONCE() is triggered because of that
in look_up_lock_class().

To fix this, change lockdep_set_subclass() to use the existing name
instead of a new one. Hence, no new name will be created by
lockdep_set_subclass(). Hence, the warning is avoided.

[boqun: Reword the commit log to state the correct issue]

Reported-by: <syzbot+7f4a6f7f7051474e40ad@syzkaller.appspotmail.com>
Fixes: de8f5e4f2dc1f ("lockdep: Introduce wait-type checks")
Cc: stable@vger.kernel.org
Signed-off-by: Ahmed Ehab <bottaawesome633@gmail.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://lore.kernel.org/lkml/20240824221031.7751-1-bottaawesome633@gmail.com/
---
 include/linux/lockdep.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 217f7abf2cbf..67964dc4db95 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -173,7 +173,7 @@ static inline void lockdep_init_map(struct lockdep_map *lock, const char *name,
 			      (lock)->dep_map.lock_type)
 
 #define lockdep_set_subclass(lock, sub)					\
-	lockdep_init_map_type(&(lock)->dep_map, #lock, (lock)->dep_map.key, sub,\
+	lockdep_init_map_type(&(lock)->dep_map, (lock)->dep_map.name, (lock)->dep_map.key, sub,\
 			      (lock)->dep_map.wait_type_inner,		\
 			      (lock)->dep_map.wait_type_outer,		\
 			      (lock)->dep_map.lock_type)
-- 
cgit v1.2.3


From 6eaa83ec229b1e99130456c4f6658430d509c76c Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 17 Oct 2024 17:11:08 +0300
Subject: PCI: Remove unused PCI_SUBTRACTIVE_DECODE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

2fe2abf896c1 ("PCI: augment bus resource table with a list") added
PCI_SUBTRACTIVE_DECODE which is put into the struct pci_bus_resource flags
field but is never read. There seems to never have been users for it.

Remove both PCI_SUBTRACTIVE_DECODE and the flags field from the struct
pci_bus_resource.

Link: https://lore.kernel.org/r/20241017141111.44612-1-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 arch/s390/pci/pci_bus.c |  2 +-
 arch/x86/pci/fixup.c    |  2 +-
 drivers/pci/bus.c       |  4 +---
 drivers/pci/probe.c     |  5 ++---
 include/linux/pci.h     | 12 +-----------
 5 files changed, 6 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
index daa5d7450c7d..5630af5deb8b 100644
--- a/arch/s390/pci/pci_bus.c
+++ b/arch/s390/pci/pci_bus.c
@@ -53,7 +53,7 @@ static int zpci_bus_prepare_device(struct zpci_dev *zdev)
 		zpci_setup_bus_resources(zdev);
 		for (i = 0; i < PCI_STD_NUM_BARS; i++) {
 			if (zdev->bars[i].res)
-				pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res, 0);
+				pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res);
 		}
 	}
 
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 98a9bb92d75c..0681ecfe3430 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -757,7 +757,7 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
 		dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n",
 			 res);
 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
-		pci_bus_add_resource(dev->bus, res, 0);
+		pci_bus_add_resource(dev->bus, res);
 	}
 
 	base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) |
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 55c853686051..9cf6d0f3ab2b 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -46,8 +46,7 @@ void pci_free_resource_list(struct list_head *resources)
 }
 EXPORT_SYMBOL(pci_free_resource_list);
 
-void pci_bus_add_resource(struct pci_bus *bus, struct resource *res,
-			  unsigned int flags)
+void pci_bus_add_resource(struct pci_bus *bus, struct resource *res)
 {
 	struct pci_bus_resource *bus_res;
 
@@ -58,7 +57,6 @@ void pci_bus_add_resource(struct pci_bus *bus, struct resource *res,
 	}
 
 	bus_res->res = res;
-	bus_res->flags = flags;
 	list_add_tail(&bus_res->list, &bus->resources);
 }
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4f68414c3086..4243b1e6ece2 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -546,8 +546,7 @@ void pci_read_bridge_bases(struct pci_bus *child)
 	if (dev->transparent) {
 		pci_bus_for_each_resource(child->parent, res) {
 			if (res && res->flags) {
-				pci_bus_add_resource(child, res,
-						     PCI_SUBTRACTIVE_DECODE);
+				pci_bus_add_resource(child, res);
 				pci_info(dev, "  bridge window %pR (subtractive decode)\n",
 					   res);
 			}
@@ -1032,7 +1031,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge)
 		if (res->flags & IORESOURCE_BUS)
 			pci_bus_insert_busn_res(bus, bus->number, res->end);
 		else
-			pci_bus_add_resource(bus, res, 0);
+			pci_bus_add_resource(bus, res);
 
 		if (offset) {
 			if (resource_type(res) == IORESOURCE_IO)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 573b4c4c2be6..6a9cf80d0d4b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -633,18 +633,9 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge);
  * Use pci_bus_for_each_resource() to iterate through all the resources.
  */
 
-/*
- * PCI_SUBTRACTIVE_DECODE means the bridge forwards the window implicitly
- * and there's no way to program the bridge with the details of the window.
- * This does not apply to ACPI _CRS windows, even with the _DEC subtractive-
- * decode bit set, because they are explicit and can be programmed with _SRS.
- */
-#define PCI_SUBTRACTIVE_DECODE	0x1
-
 struct pci_bus_resource {
 	struct list_head	list;
 	struct resource		*res;
-	unsigned int		flags;
 };
 
 #define PCI_REGION_FLAG_MASK	0x0fU	/* These bits of resource flags tell us the PCI region flags */
@@ -1498,8 +1489,7 @@ void pci_add_resource(struct list_head *resources, struct resource *res);
 void pci_add_resource_offset(struct list_head *resources, struct resource *res,
 			     resource_size_t offset);
 void pci_free_resource_list(struct list_head *resources);
-void pci_bus_add_resource(struct pci_bus *bus, struct resource *res,
-			  unsigned int flags);
+void pci_bus_add_resource(struct pci_bus *bus, struct resource *res);
 struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n);
 void pci_bus_remove_resources(struct pci_bus *bus);
 void pci_bus_remove_resource(struct pci_bus *bus, struct resource *res);
-- 
cgit v1.2.3


From 469c9cb941480718db52876bbdb95a7a79b34889 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 17 Oct 2024 17:11:09 +0300
Subject: PCI: Move struct pci_bus_resource into bus.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The struct pci_bus_resource is only used in bus.c, so move it there.

Link: https://lore.kernel.org/r/20241017141111.44612-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/pci/bus.c   | 12 ++++++++++++
 include/linux/pci.h | 12 ------------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 9cf6d0f3ab2b..e0a2441be6d3 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -18,6 +18,18 @@
 
 #include "pci.h"
 
+/*
+ * The first PCI_BRIDGE_RESOURCE_NUM PCI bus resources (those that correspond
+ * to P2P or CardBus bridge windows) go in a table.  Additional ones (for
+ * buses below host bridges or subtractive decode bridges) go in the list.
+ * Use pci_bus_for_each_resource() to iterate through all the resources.
+ */
+
+struct pci_bus_resource {
+	struct list_head	list;
+	struct resource		*res;
+};
+
 void pci_add_resource_offset(struct list_head *resources, struct resource *res,
 			     resource_size_t offset)
 {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 6a9cf80d0d4b..5a9d849b28ef 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -626,18 +626,6 @@ void pci_set_host_bridge_release(struct pci_host_bridge *bridge,
 
 int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge);
 
-/*
- * The first PCI_BRIDGE_RESOURCE_NUM PCI bus resources (those that correspond
- * to P2P or CardBus bridge windows) go in a table.  Additional ones (for
- * buses below host bridges or subtractive decode bridges) go in the list.
- * Use pci_bus_for_each_resource() to iterate through all the resources.
- */
-
-struct pci_bus_resource {
-	struct list_head	list;
-	struct resource		*res;
-};
-
 #define PCI_REGION_FLAG_MASK	0x0fU	/* These bits of resource flags tell us the PCI region flags */
 
 struct pci_bus {
-- 
cgit v1.2.3


From 08ef26ea9ab315b895d57f8fbad41e02ff345bb9 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Oct 2024 16:23:58 +0200
Subject: fs: add file_ref

As atomic_inc_not_zero() is implemented with a try_cmpxchg() loop it has
O(N^2) behaviour under contention with N concurrent operations and it is
in a hot path in __fget_files_rcu().

The rcuref infrastructures remedies this problem by using an
unconditional increment relying on safe- and dead zones to make this
work and requiring rcu protection for the data structure in question.
This not just scales better it also introduces overflow protection.

However, in contrast to generic rcuref, files require a memory barrier
and thus cannot rely on *_relaxed() atomic operations and also require
to be built on atomic_long_t as having massive amounts of reference
isn't unheard of even if it is just an attack.

As suggested by Linus, add a file specific variant instead of making
this a generic library.

Files are SLAB_TYPESAFE_BY_RCU and thus don't have "regular" rcu
protection. In short, freeing of files isn't delayed until a grace
period has elapsed. Instead, they are freed immediately and thus can be
reused (multiple times) within the same grace period.

So when picking a file from the file descriptor table via its file
descriptor number it is thus possible to see an elevated reference count
on file->f_count even though the file has already been recycled possibly
multiple times by another task.

To guard against this the vfs will pick the file from the file
descriptor table twice. Once before the refcount increment and once
after to compare the pointers (grossly simplified). If they match then
the file is still valid. If not the caller needs to fput() it.

The unconditional increment makes the following race possible as
illustrated by rcuref:

> Deconstruction race
> ===================
>
> The release operation must be protected by prohibiting a grace period in
> order to prevent a possible use after free:
>
>      T1                              T2
>      put()                           get()
>      // ref->refcnt = ONEREF
>      if (!atomic_add_negative(-1, &ref->refcnt))
>              return false;                           <- Not taken
>
>      // ref->refcnt == NOREF
>      --> preemption
>                                      // Elevates ref->refcnt to ONEREF
>                                      if (!atomic_add_negative(1, &ref->refcnt))
>                                              return true;                    <- taken
>
>                                      if (put(&p->ref)) { <-- Succeeds
>                                              remove_pointer(p);
>                                              kfree_rcu(p, rcu);
>                                      }
>
>              RCU grace period ends, object is freed
>
>      atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);      <- UAF
>
> [...] it prevents the grace period which keeps the object alive until
> all put() operations complete.

Having files by SLAB_TYPESAFE_BY_RCU shouldn't cause any problems for
this deconstruction race. Afaict, the only interesting case would be
someone freeing the file and someone immediately recycling it within the
same grace period and reinitializing file->f_count to ONEREF while a
concurrent fput() is doing atomic_cmpxchg(&ref->refcnt, NOREF, DEAD) as
in the race above.

But this is safe from SLAB_TYPESAFE_BY_RCU's perspective and it should
be safe from rcuref's perspective.

      T1                              T2                                                    T3
      fput()                          fget()
      // f_count->refcnt = ONEREF
      if (!atomic_add_negative(-1, &f_count->refcnt))
              return false;                           <- Not taken

      // f_count->refcnt == NOREF
      --> preemption
                                      // Elevates f_count->refcnt to ONEREF
                                      if (!atomic_add_negative(1, &f_count->refcnt))
                                              return true;                    <- taken

                                      if (put(&f_count)) { <-- Succeeds
                                              remove_pointer(p);
                                              /*
                                               * Cache is SLAB_TYPESAFE_BY_RCU
                                               * so this is freed without a grace period.
                                               */
                                              kmem_cache_free(p);
                                      }

                                                                                             kmem_cache_alloc()
                                                                                             init_file() {
                                                                                                     // Sets f_count->refcnt to ONEREF
                                                                                                     rcuref_long_init(&f->f_count, 1);
                                                                                             }

                        Object has been reused within the same grace period
                        via kmem_cache_alloc()'s SLAB_TYPESAFE_BY_RCU.

      /*
       * With SLAB_TYPESAFE_BY_RCU this would be a safe UAF access and
       * it would work correctly because the atomic_cmpxchg()
       * will fail because the refcount has been reset to ONEREF by T3.
       */
      atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);      <- UAF

However, there are other cases to consider:

(1) Benign race due to multiple atomic_long_read()

    CPU1                                                    CPU2

    file_ref_put()
    // last reference
    // => count goes negative/FILE_REF_NOREF
    atomic_long_add_negative_release(-1, &ref->refcnt)
    -> __file_ref_put()
                                                        file_ref_get()
                                                        // goes back from negative/FILE_REF_NOREF to 0
                                                        // and file_ref_get() succeeds
                                                        atomic_long_add_negative(1, &ref->refcnt)

                                                        // This is immediately followed by file_ref_put()
                                                        // managing to set FILE_REF_DEAD
                                                        file_ref_put()

       // __file_ref_put() continues and sees
       // cnt > FILE_REF_RELEASED // and splats with
       // "imbalanced put on file reference count"
       cnt = atomic_long_read(&ref->refcnt);

    The race however is benign and the problem is the
    atomic_long_read(). Instead of performing a separate read this uses
    atomic_long_dec_return() and pass the value to __file_ref_put().
    Thanks to Linus for pointing out that braino.

(2) SLAB_TYPESAFE_BY_RCU may cause recycled files to be marked dead

    When a file is recycled the following race exists:

    CPU1                                                       CPU2
    // @file is already dead and thus
    // cnt >= FILE_REF_RELEASED.
    file_ref_get(file)
    atomic_long_add_negative(1, &ref->refcnt)
       // We thus call into __file_ref_get()
    -> __file_ref_get()

       // which sees cnt >= FILE_REF_RELEASED
       cnt = atomic_long_read(&ref->refcnt);
                                                               // In the meantime @file gets freed
                                                               kmem_cache_free()

                                                               // and is immediately recycled
                                                               file = kmem_cache_zalloc()
                                                               // and the reference count is reinitialized
                                                               // and the file alive again in someone
                                                               // else's file descriptor table
                                                               file_ref_init(&ref->refcnt, 1);

       // the __file_ref_get() slowpath now continues
       // and as it saw earlier that cnt >= FILE_REF_RELEASED
       // it wants to ensure that we're staying in the middle
       // of the deadzone and unconditionally sets
       // FILE_REF_DEAD.
       // This marks @file dead for CPU2...
       atomic_long_set(&ref->refcnt, FILE_REF_DEAD);

                                                               // Caller issues a close() system call to close @file
                                                               close(fd)
                                                               file = file_close_fd_locked()
                                                               filp_flush()
                                                               // The caller sees that cnt >= FILE_REF_RELEASED
                                                               // and warns the first time...
                                                               CHECK_DATA_CORRUPTION(file_count(file) == 0)

                                                               // and then splats a second time because
                                                               // __file_ref_put() sees cnt >= FILE_REF_RELEASED
                                                               file_ref_put(&ref->refcnt);
                                                               -> __file_ref_put()

    My initial inclination was to replace the unconditional
    atomic_long_set() with an atomic_long_try_cmpxchg() but Linus
    pointed out that:

    > I think we should just make file_ref_get() do a simple
    >
    >        return !atomic_long_add_negative(1, &ref->refcnt));
    >
    > and nothing else. Yes, multiple CPU's can race, and you can increment
    > more than once, but the gap - even on 32-bit - between DEAD and
    > becoming close to REF_RELEASED is so big that we simply don't care.
    > That's the point of having a gap.

I've been testing this with will-it-scale using fstat() on a machine
that Jens gave me access (thank you very much!):

processor       : 511
vendor_id       : AuthenticAMD
cpu family      : 25
model           : 160
model name      : AMD EPYC 9754 128-Core Processor

and I consistently get a 3-5% improvement on 256+ threads.

Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202410151043.5d224a27-oliver.sang@intel.com
Closes: https://lore.kernel.org/all/202410151611.f4cd71f2-oliver.sang@intel.com
Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-2-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/file.c                |  63 +++++++++++++++++
 include/linux/file_ref.h | 177 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 include/linux/file_ref.h

(limited to 'include/linux')

diff --git a/fs/file.c b/fs/file.c
index eb093e736972..ab15e2a3d280 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -20,10 +20,73 @@
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/close_range.h>
+#include <linux/file_ref.h>
 #include <net/sock.h>
 
 #include "internal.h"
 
+/**
+ * __file_ref_put - Slowpath of file_ref_put()
+ * @ref:	Pointer to the reference count
+ * @cnt:	Current reference count
+ *
+ * Invoked when the reference count is outside of the valid zone.
+ *
+ * Return:
+ *	True if this was the last reference with no future references
+ *	possible. This signals the caller that it can safely schedule the
+ *	object, which is protected by the reference counter, for
+ *	deconstruction.
+ *
+ *	False if there are still active references or the put() raced
+ *	with a concurrent get()/put() pair. Caller is not allowed to
+ *	deconstruct the protected object.
+ */
+bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
+{
+	/* Did this drop the last reference? */
+	if (likely(cnt == FILE_REF_NOREF)) {
+		/*
+		 * Carefully try to set the reference count to FILE_REF_DEAD.
+		 *
+		 * This can fail if a concurrent get() operation has
+		 * elevated it again or the corresponding put() even marked
+		 * it dead already. Both are valid situations and do not
+		 * require a retry. If this fails the caller is not
+		 * allowed to deconstruct the object.
+		 */
+		if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
+			return false;
+
+		/*
+		 * The caller can safely schedule the object for
+		 * deconstruction. Provide acquire ordering.
+		 */
+		smp_acquire__after_ctrl_dep();
+		return true;
+	}
+
+	/*
+	 * If the reference count was already in the dead zone, then this
+	 * put() operation is imbalanced. Warn, put the reference count back to
+	 * DEAD and tell the caller to not deconstruct the object.
+	 */
+	if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
+		atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
+		return false;
+	}
+
+	/*
+	 * This is a put() operation on a saturated refcount. Restore the
+	 * mean saturation value and tell the caller to not deconstruct the
+	 * object.
+	 */
+	if (cnt > FILE_REF_MAXREF)
+		atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
+	return false;
+}
+EXPORT_SYMBOL_GPL(__file_ref_put);
+
 unsigned int sysctl_nr_open __read_mostly = 1024*1024;
 unsigned int sysctl_nr_open_min = BITS_PER_LONG;
 /* our min() is unusable in constant expressions ;-/ */
diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h
new file mode 100644
index 000000000000..9b3a8d9b17ab
--- /dev/null
+++ b/include/linux/file_ref.h
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _LINUX_FILE_REF_H
+#define _LINUX_FILE_REF_H
+
+#include <linux/atomic.h>
+#include <linux/preempt.h>
+#include <linux/types.h>
+
+/*
+ * file_ref is a reference count implementation specifically for use by
+ * files. It takes inspiration from rcuref but differs in key aspects
+ * such as support for SLAB_TYPESAFE_BY_RCU type caches.
+ *
+ * FILE_REF_ONEREF                FILE_REF_MAXREF
+ * 0x0000000000000000UL      0x7FFFFFFFFFFFFFFFUL
+ * <-------------------valid ------------------->
+ *
+ *                       FILE_REF_SATURATED
+ * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL
+ * <-----------------------saturation zone---------------------->
+ *
+ * FILE_REF_RELEASED                   FILE_REF_DEAD
+ * 0xC000000000000000UL         0xE000000000000000UL
+ * <-------------------dead zone------------------->
+ *
+ * FILE_REF_NOREF
+ * 0xFFFFFFFFFFFFFFFFUL
+ */
+
+#ifdef CONFIG_64BIT
+#define FILE_REF_ONEREF		0x0000000000000000UL
+#define FILE_REF_MAXREF		0x7FFFFFFFFFFFFFFFUL
+#define FILE_REF_SATURATED	0xA000000000000000UL
+#define FILE_REF_RELEASED	0xC000000000000000UL
+#define FILE_REF_DEAD		0xE000000000000000UL
+#define FILE_REF_NOREF		0xFFFFFFFFFFFFFFFFUL
+#else
+#define FILE_REF_ONEREF		0x00000000U
+#define FILE_REF_MAXREF		0x7FFFFFFFU
+#define FILE_REF_SATURATED	0xA0000000U
+#define FILE_REF_RELEASED	0xC0000000U
+#define FILE_REF_DEAD		0xE0000000U
+#define FILE_REF_NOREF		0xFFFFFFFFU
+#endif
+
+typedef struct {
+#ifdef CONFIG_64BIT
+	atomic64_t refcnt;
+#else
+	atomic_t refcnt;
+#endif
+} file_ref_t;
+
+/**
+ * file_ref_init - Initialize a file reference count
+ * @ref: Pointer to the reference count
+ * @cnt: The initial reference count typically '1'
+ */
+static inline void file_ref_init(file_ref_t *ref, unsigned long cnt)
+{
+	atomic_long_set(&ref->refcnt, cnt - 1);
+}
+
+bool __file_ref_put(file_ref_t *ref, unsigned long cnt);
+
+/**
+ * file_ref_get - Acquire one reference on a file
+ * @ref: Pointer to the reference count
+ *
+ * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF.
+ *
+ * Provides full memory ordering.
+ *
+ * Return: False if the attempt to acquire a reference failed. This happens
+ *         when the last reference has been put already. True if a reference
+ *         was successfully acquired
+ */
+static __always_inline __must_check bool file_ref_get(file_ref_t *ref)
+{
+	/*
+	 * Unconditionally increase the reference count with full
+	 * ordering. The saturation and dead zones provide enough
+	 * tolerance for this.
+	 *
+	 * If this indicates negative the file in question the fail can
+	 * be freed and immediately reused due to SLAB_TYPSAFE_BY_RCU.
+	 * Hence, unconditionally altering the file reference count to
+	 * e.g., reset the file reference count back to the middle of
+	 * the deadzone risk end up marking someone else's file as dead
+	 * behind their back.
+	 *
+	 * It would be possible to do a careful:
+	 *
+	 * cnt = atomic_long_inc_return();
+	 * if (likely(cnt >= 0))
+	 *	return true;
+	 *
+	 * and then something like:
+	 *
+	 * if (cnt >= FILE_REF_RELEASE)
+	 *	atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD),
+	 *
+	 * to set the value back to the middle of the deadzone. But it's
+	 * practically impossible to go from FILE_REF_DEAD to
+	 * FILE_REF_ONEREF. It would need 2305843009213693952/2^61
+	 * file_ref_get()s to resurrect such a dead file.
+	 */
+	return !atomic_long_add_negative(1, &ref->refcnt);
+}
+
+/**
+ * file_ref_inc - Acquire one reference on a file
+ * @ref: Pointer to the reference count
+ *
+ * Acquire an additional reference on a file. Warns if the caller didn't
+ * already hold a reference.
+ */
+static __always_inline void file_ref_inc(file_ref_t *ref)
+{
+	long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt);
+	WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference");
+}
+
+/**
+ * file_ref_put -- Release a file reference
+ * @ref:	Pointer to the reference count
+ *
+ * Provides release memory ordering, such that prior loads and stores
+ * are done before, and provides an acquire ordering on success such
+ * that free() must come after.
+ *
+ * Return: True if this was the last reference with no future references
+ *         possible. This signals the caller that it can safely release
+ *         the object which is protected by the reference counter.
+ *         False if there are still active references or the put() raced
+ *         with a concurrent get()/put() pair. Caller is not allowed to
+ *         release the protected object.
+ */
+static __always_inline __must_check bool file_ref_put(file_ref_t *ref)
+{
+	long cnt;
+
+	/*
+	 * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put()
+	 * calls don't risk UAFs when a file is recyclyed, it is still
+	 * vulnerable to UAFs caused by freeing the whole slab page once
+	 * it becomes unused. Prevent file_ref_put() from being
+	 * preempted protects against this.
+	 */
+	guard(preempt)();
+	/*
+	 * Unconditionally decrease the reference count. The saturation
+	 * and dead zones provide enough tolerance for this. If this
+	 * fails then we need to handle the last reference drop and
+	 * cases inside the saturation and dead zones.
+	 */
+	cnt = atomic_long_dec_return(&ref->refcnt);
+	if (cnt >= 0)
+		return false;
+	return __file_ref_put(ref, cnt);
+}
+
+/**
+ * file_ref_read - Read the number of file references
+ * @ref: Pointer to the reference count
+ *
+ * Return: The number of held references (0 ... N)
+ */
+static inline unsigned long file_ref_read(file_ref_t *ref)
+{
+	unsigned long c = atomic_long_read(&ref->refcnt);
+
+	/* Return 0 if within the DEAD zone. */
+	return c >= FILE_REF_RELEASED ? 0 : c + 1;
+}
+
+#endif
-- 
cgit v1.2.3


From 9a8dbdadae509e5717ff6e5aa572ca0974d2101d Mon Sep 17 00:00:00 2001
From: John Garry <john.g.garry@oracle.com>
Date: Sat, 19 Oct 2024 12:51:06 +0000
Subject: block/fs: Pass an iocb to generic_atomic_write_valid()

Darrick and Hannes both thought it better that generic_atomic_write_valid()
should be passed a struct iocb, and not just the member of that struct
which is referenced; see [0] and [1].

I think that makes a more generic and clean API, so make that change.

[0] https://lore.kernel.org/linux-block/680ce641-729b-4150-b875-531a98657682@suse.de/
[1] https://lore.kernel.org/linux-xfs/20240620212401.GA3058325@frogsfrogsfrogs/

Fixes: c34fc6f26ab8 ("fs: Initial atomic write support")
Suggested-by: Darrick J. Wong <djwong@kernel.org>
Suggested-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Link: https://lore.kernel.org/r/20241019125113.369994-2-john.g.garry@oracle.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/fops.c       | 8 ++++----
 fs/read_write.c    | 4 ++--
 include/linux/fs.h | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/block/fops.c b/block/fops.c
index e696ae53bf1e..968b47b615c4 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -35,13 +35,13 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
 	return opf;
 }
 
-static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos,
+static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
 				struct iov_iter *iter, bool is_atomic)
 {
-	if (is_atomic && !generic_atomic_write_valid(iter, pos))
+	if (is_atomic && !generic_atomic_write_valid(iocb, iter))
 		return true;
 
-	return pos & (bdev_logical_block_size(bdev) - 1) ||
+	return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) ||
 		!bdev_iter_is_aligned(bdev, iter);
 }
 
@@ -374,7 +374,7 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (!iov_iter_count(iter))
 		return 0;
 
-	if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic))
+	if (blkdev_dio_invalid(bdev, iocb, iter, is_atomic))
 		return -EINVAL;
 
 	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
diff --git a/fs/read_write.c b/fs/read_write.c
index 64dc24afdb3a..2c3263530828 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1830,7 +1830,7 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
 	return 0;
 }
 
-bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
+bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
 {
 	size_t len = iov_iter_count(iter);
 
@@ -1840,7 +1840,7 @@ bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
 	if (!is_power_of_2(len))
 		return false;
 
-	if (!IS_ALIGNED(pos, len))
+	if (!IS_ALIGNED(iocb->ki_pos, len))
 		return false;
 
 	return true;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e3c603d01337..fbfa032d1d90 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3721,6 +3721,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path)
 	return !c;
 }
 
-bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos);
+bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
 
 #endif /* _LINUX_FS_H */
-- 
cgit v1.2.3


From c3be7ebbbce5201e151f17e28a6c807602f369c9 Mon Sep 17 00:00:00 2001
From: John Garry <john.g.garry@oracle.com>
Date: Sat, 19 Oct 2024 12:51:07 +0000
Subject: fs/block: Check for IOCB_DIRECT in generic_atomic_write_valid()

Currently FMODE_CAN_ATOMIC_WRITE is set if the bdev can atomic write and
the file is open for direct IO. This does not work if the file is not
opened for direct IO, yet fcntl(O_DIRECT) is used on the fd later.

Change to check for direct IO on a per-IO basis in
generic_atomic_write_valid(). Since we want to report -EOPNOTSUPP for
non-direct IO for an atomic write, change to return an error code.

Relocate the block fops atomic write checks to the common write path, as to
catch non-direct IO.

Fixes: c34fc6f26ab8 ("fs: Initial atomic write support")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Link: https://lore.kernel.org/r/20241019125113.369994-3-john.g.garry@oracle.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/fops.c       | 18 ++++++++++--------
 fs/read_write.c    | 13 ++++++++-----
 include/linux/fs.h |  2 +-
 3 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/block/fops.c b/block/fops.c
index 968b47b615c4..2d01c9007681 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -36,11 +36,8 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
 }
 
 static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
-				struct iov_iter *iter, bool is_atomic)
+				struct iov_iter *iter)
 {
-	if (is_atomic && !generic_atomic_write_valid(iocb, iter))
-		return true;
-
 	return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) ||
 		!bdev_iter_is_aligned(bdev, iter);
 }
@@ -368,13 +365,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
-	bool is_atomic = iocb->ki_flags & IOCB_ATOMIC;
 	unsigned int nr_pages;
 
 	if (!iov_iter_count(iter))
 		return 0;
 
-	if (blkdev_dio_invalid(bdev, iocb, iter, is_atomic))
+	if (blkdev_dio_invalid(bdev, iocb, iter))
 		return -EINVAL;
 
 	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
@@ -383,7 +379,7 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 			return __blkdev_direct_IO_simple(iocb, iter, bdev,
 							nr_pages);
 		return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
-	} else if (is_atomic) {
+	} else if (iocb->ki_flags & IOCB_ATOMIC) {
 		return -EINVAL;
 	}
 	return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
@@ -625,7 +621,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
 	if (!bdev)
 		return -ENXIO;
 
-	if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT)
+	if (bdev_can_atomic_write(bdev))
 		filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
 
 	ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
@@ -700,6 +696,12 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
 		return -EOPNOTSUPP;
 
+	if (iocb->ki_flags & IOCB_ATOMIC) {
+		ret = generic_atomic_write_valid(iocb, from);
+		if (ret)
+			return ret;
+	}
+
 	size -= iocb->ki_pos;
 	if (iov_iter_count(from) > size) {
 		shorted = iov_iter_count(from) - size;
diff --git a/fs/read_write.c b/fs/read_write.c
index 2c3263530828..befec0b5c537 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1830,18 +1830,21 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
 	return 0;
 }
 
-bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
+int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
 {
 	size_t len = iov_iter_count(iter);
 
 	if (!iter_is_ubuf(iter))
-		return false;
+		return -EINVAL;
 
 	if (!is_power_of_2(len))
-		return false;
+		return -EINVAL;
 
 	if (!IS_ALIGNED(iocb->ki_pos, len))
-		return false;
+		return -EINVAL;
 
-	return true;
+	if (!(iocb->ki_flags & IOCB_DIRECT))
+		return -EOPNOTSUPP;
+
+	return 0;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fbfa032d1d90..ba47fb283730 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3721,6 +3721,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path)
 	return !c;
 }
 
-bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
+int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
 
 #endif /* _LINUX_FS_H */
-- 
cgit v1.2.3


From 1eadb157947163ca72ba8963b915fdc099ce6cca Mon Sep 17 00:00:00 2001
From: John Garry <john.g.garry@oracle.com>
Date: Sat, 19 Oct 2024 12:51:08 +0000
Subject: block: Add bdev atomic write limits helpers

Add helpers to get atomic write limits for a bdev, so that we don't access
request_queue helpers outside the block layer.

We check if the bdev can actually atomic write in these helpers, so we
can avoid users missing using this check.

Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Link: https://lore.kernel.org/r/20241019125113.369994-4-john.g.garry@oracle.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50c3b959da28..c2cc3c146d74 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1674,6 +1674,22 @@ static inline bool bdev_can_atomic_write(struct block_device *bdev)
 	return true;
 }
 
+static inline unsigned int
+bdev_atomic_write_unit_min_bytes(struct block_device *bdev)
+{
+	if (!bdev_can_atomic_write(bdev))
+		return 0;
+	return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev));
+}
+
+static inline unsigned int
+bdev_atomic_write_unit_max_bytes(struct block_device *bdev)
+{
+	if (!bdev_can_atomic_write(bdev))
+		return 0;
+	return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev));
+}
+
 #define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }
 
 #endif /* _LINUX_BLKDEV_H */
-- 
cgit v1.2.3


From e896474fe4851ffc4dd860c92daa906783090346 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Oct 2024 23:08:34 -0400
Subject: getname_maybe_null() - the third variant of pathname copy-in

Semantics used by statx(2) (and later *xattrat(2)): without AT_EMPTY_PATH
it's standard getname() (i.e. ERR_PTR(-ENOENT) on empty string,
ERR_PTR(-EFAULT) on NULL), with AT_EMPTY_PATH both empty string and
NULL are accepted.

Calling conventions: getname_maybe_null(user_pointer, flags) returns
	* pointer to struct filename when non-empty string had been
successfully read
	* ERR_PTR(...) on error
	* NULL if an empty string or NULL pointer had been given
with AT_EMPTY_PATH in the flags argument.

It tries to avoid allocation in the last case; it's not always
able to do so, in which case the temporary struct filename instance
is freed and NULL returned anyway.

Fast path is inlined.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c         | 30 +++++++++++++++++++++++-------
 fs/stat.c          | 28 ++++------------------------
 include/linux/fs.h | 10 ++++++++++
 3 files changed, 37 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index aaf3cd6c802f..2bfe476c3bd0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -211,22 +211,38 @@ getname_flags(const char __user *filename, int flags)
 	return result;
 }
 
-struct filename *
-getname_uflags(const char __user *filename, int uflags)
+struct filename *getname_uflags(const char __user *filename, int uflags)
 {
 	int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
 
 	return getname_flags(filename, flags);
 }
 
-struct filename *
-getname(const char __user * filename)
+struct filename *getname(const char __user * filename)
 {
 	return getname_flags(filename, 0);
 }
 
-struct filename *
-getname_kernel(const char * filename)
+struct filename *__getname_maybe_null(const char __user *pathname)
+{
+	struct filename *name;
+	char c;
+
+	/* try to save on allocations; loss on um, though */
+	if (get_user(c, pathname))
+		return ERR_PTR(-EFAULT);
+	if (!c)
+		return NULL;
+
+	name = getname_flags(pathname, LOOKUP_EMPTY);
+	if (!IS_ERR(name) && !(name->name[0])) {
+		putname(name);
+		name = NULL;
+	}
+	return name;
+}
+
+struct filename *getname_kernel(const char * filename)
 {
 	struct filename *result;
 	int len = strlen(filename) + 1;
@@ -264,7 +280,7 @@ EXPORT_SYMBOL(getname_kernel);
 
 void putname(struct filename *name)
 {
-	if (IS_ERR(name))
+	if (IS_ERR_OR_NULL(name))
 		return;
 
 	if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
diff --git a/fs/stat.c b/fs/stat.c
index 41e598376d7e..b74831dc7ae6 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -326,18 +326,11 @@ int vfs_fstatat(int dfd, const char __user *filename,
 {
 	int ret;
 	int statx_flags = flags | AT_NO_AUTOMOUNT;
-	struct filename *name;
+	struct filename *name = getname_maybe_null(filename, flags);
 
-	/*
-	 * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH)
-	 *
-	 * If AT_EMPTY_PATH is set, we expect the common case to be that
-	 * empty path, and avoid doing all the extra pathname work.
-	 */
-	if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+	if (!name && dfd >= 0)
 		return vfs_fstat(dfd, stat);
 
-	name = getname_flags(filename, getname_statx_lookup_flags(statx_flags));
 	ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
 	putname(name);
 
@@ -774,24 +767,11 @@ SYSCALL_DEFINE5(statx,
 		struct statx __user *, buffer)
 {
 	int ret;
-	unsigned lflags;
-	struct filename *name;
+	struct filename *name = getname_maybe_null(filename, flags);
 
-	/*
-	 * Short-circuit handling of NULL and "" paths.
-	 *
-	 * For a NULL path we require and accept only the AT_EMPTY_PATH flag
-	 * (possibly |'d with AT_STATX flags).
-	 *
-	 * However, glibc on 32-bit architectures implements fstatat as statx
-	 * with the "" pathname and AT_NO_AUTOMOUNT | AT_EMPTY_PATH flags.
-	 * Supporting this results in the uglification below.
-	 */
-	lflags = flags & ~(AT_NO_AUTOMOUNT | AT_STATX_SYNC_TYPE);
-	if (lflags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+	if (!name && dfd >= 0)
 		return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer);
 
-	name = getname_flags(filename, getname_statx_lookup_flags(flags));
 	ret = do_statx(dfd, name, flags, mask, buffer);
 	putname(name);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e3c603d01337..403258ac2ea2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2766,6 +2766,16 @@ extern struct filename *getname_flags(const char __user *, int);
 extern struct filename *getname_uflags(const char __user *, int);
 extern struct filename *getname(const char __user *);
 extern struct filename *getname_kernel(const char *);
+extern struct filename *__getname_maybe_null(const char __user *);
+static inline struct filename *getname_maybe_null(const char __user *name, int flags)
+{
+	if (!(flags & AT_EMPTY_PATH))
+		return getname(name);
+
+	if (!name)
+		return NULL;
+	return __getname_maybe_null(name);
+}
 extern void putname(struct filename *name);
 
 extern int finish_open(struct file *file, struct dentry *dentry,
-- 
cgit v1.2.3


From af264e5989055ac33f413c4c80874345cda0cc97 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Fri, 18 Oct 2024 19:05:57 +0200
Subject: mtd: spinand: Constify struct nand_ecc_engine_ops

'struct nand_ecc_engine_ops' are not modified in these drivers.

Constifying this structure moves some data to a read-only section, so
increases overall security, especially when the structure holds some
function pointers.

Update the prototype of mxic_ecc_get_pipelined_ops() accordingly.

On a x86_64, with allmodconfig, as an example:
Before:
======
   text	   data	    bss	    dec	    hex	filename
  16709	   1374	     16	  18099	   46b3	drivers/mtd/nand/ecc-mxic.o

After:
=====
   text	   data	    bss	    dec	    hex	filename
  16789	   1294	     16	  18099	   46b3	drivers/mtd/nand/ecc-mxic.o

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/72597e9de2320a4109be2112e696399592edacd4.1729271136.git.christophe.jaillet@wanadoo.fr
---
 drivers/mtd/nand/ecc-mxic.c       |  6 +++---
 drivers/mtd/nand/ecc-sw-bch.c     |  2 +-
 drivers/mtd/nand/ecc-sw-hamming.c |  2 +-
 drivers/mtd/nand/spi/core.c       |  2 +-
 drivers/spi/spi-mtk-snfi.c        |  2 +-
 drivers/spi/spi-mxic.c            | 10 +++++-----
 include/linux/mtd/nand-ecc-mxic.h |  4 ++--
 include/linux/mtd/nand.h          |  2 +-
 8 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/ecc-mxic.c b/drivers/mtd/nand/ecc-mxic.c
index 47e10945b8d2..86895e09328f 100644
--- a/drivers/mtd/nand/ecc-mxic.c
+++ b/drivers/mtd/nand/ecc-mxic.c
@@ -723,21 +723,21 @@ static int mxic_ecc_finish_io_req_pipelined(struct nand_device *nand,
 	return ret;
 }
 
-static struct nand_ecc_engine_ops mxic_ecc_engine_external_ops = {
+static const struct nand_ecc_engine_ops mxic_ecc_engine_external_ops = {
 	.init_ctx = mxic_ecc_init_ctx_external,
 	.cleanup_ctx = mxic_ecc_cleanup_ctx,
 	.prepare_io_req = mxic_ecc_prepare_io_req_external,
 	.finish_io_req = mxic_ecc_finish_io_req_external,
 };
 
-static struct nand_ecc_engine_ops mxic_ecc_engine_pipelined_ops = {
+static const struct nand_ecc_engine_ops mxic_ecc_engine_pipelined_ops = {
 	.init_ctx = mxic_ecc_init_ctx_pipelined,
 	.cleanup_ctx = mxic_ecc_cleanup_ctx,
 	.prepare_io_req = mxic_ecc_prepare_io_req_pipelined,
 	.finish_io_req = mxic_ecc_finish_io_req_pipelined,
 };
 
-struct nand_ecc_engine_ops *mxic_ecc_get_pipelined_ops(void)
+const struct nand_ecc_engine_ops *mxic_ecc_get_pipelined_ops(void)
 {
 	return &mxic_ecc_engine_pipelined_ops;
 }
diff --git a/drivers/mtd/nand/ecc-sw-bch.c b/drivers/mtd/nand/ecc-sw-bch.c
index 405552d014a8..0d9310dd6f52 100644
--- a/drivers/mtd/nand/ecc-sw-bch.c
+++ b/drivers/mtd/nand/ecc-sw-bch.c
@@ -384,7 +384,7 @@ static int nand_ecc_sw_bch_finish_io_req(struct nand_device *nand,
 	return max_bitflips;
 }
 
-static struct nand_ecc_engine_ops nand_ecc_sw_bch_engine_ops = {
+static const struct nand_ecc_engine_ops nand_ecc_sw_bch_engine_ops = {
 	.init_ctx = nand_ecc_sw_bch_init_ctx,
 	.cleanup_ctx = nand_ecc_sw_bch_cleanup_ctx,
 	.prepare_io_req = nand_ecc_sw_bch_prepare_io_req,
diff --git a/drivers/mtd/nand/ecc-sw-hamming.c b/drivers/mtd/nand/ecc-sw-hamming.c
index 254db2e7f8bb..f2d0effad9d2 100644
--- a/drivers/mtd/nand/ecc-sw-hamming.c
+++ b/drivers/mtd/nand/ecc-sw-hamming.c
@@ -638,7 +638,7 @@ static int nand_ecc_sw_hamming_finish_io_req(struct nand_device *nand,
 	return max_bitflips;
 }
 
-static struct nand_ecc_engine_ops nand_ecc_sw_hamming_engine_ops = {
+static const struct nand_ecc_engine_ops nand_ecc_sw_hamming_engine_ops = {
 	.init_ctx = nand_ecc_sw_hamming_init_ctx,
 	.cleanup_ctx = nand_ecc_sw_hamming_cleanup_ctx,
 	.prepare_io_req = nand_ecc_sw_hamming_prepare_io_req,
diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 4d76f9f71a0e..b1df7f627161 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -337,7 +337,7 @@ static int spinand_ondie_ecc_finish_io_req(struct nand_device *nand,
 	return ret;
 }
 
-static struct nand_ecc_engine_ops spinand_ondie_ecc_engine_ops = {
+static const struct nand_ecc_engine_ops spinand_ondie_ecc_engine_ops = {
 	.init_ctx = spinand_ondie_ecc_init_ctx,
 	.cleanup_ctx = spinand_ondie_ecc_cleanup_ctx,
 	.prepare_io_req = spinand_ondie_ecc_prepare_io_req,
diff --git a/drivers/spi/spi-mtk-snfi.c b/drivers/spi/spi-mtk-snfi.c
index ddd98ddb7913..b2a135e99e4d 100644
--- a/drivers/spi/spi-mtk-snfi.c
+++ b/drivers/spi/spi-mtk-snfi.c
@@ -776,7 +776,7 @@ static int mtk_snand_ecc_finish_io_req(struct nand_device *nand,
 	return snf->ecc_stats.failed ? -EBADMSG : snf->ecc_stats.bitflips;
 }
 
-static struct nand_ecc_engine_ops mtk_snfi_ecc_engine_ops = {
+static const struct nand_ecc_engine_ops mtk_snfi_ecc_engine_ops = {
 	.init_ctx = mtk_snand_ecc_init_ctx,
 	.cleanup_ctx = mtk_snand_ecc_cleanup_ctx,
 	.prepare_io_req = mtk_snand_ecc_prepare_io_req,
diff --git a/drivers/spi/spi-mxic.c b/drivers/spi/spi-mxic.c
index 6156d691630a..7ef3ee55cba3 100644
--- a/drivers/spi/spi-mxic.c
+++ b/drivers/spi/spi-mxic.c
@@ -640,7 +640,7 @@ static int mxic_spi_transfer_one(struct spi_controller *host,
 /* ECC wrapper */
 static int mxic_spi_mem_ecc_init_ctx(struct nand_device *nand)
 {
-	struct nand_ecc_engine_ops *ops = mxic_ecc_get_pipelined_ops();
+	const struct nand_ecc_engine_ops *ops = mxic_ecc_get_pipelined_ops();
 	struct mxic_spi *mxic = nand->ecc.engine->priv;
 
 	mxic->ecc.use_pipelined_conf = true;
@@ -650,7 +650,7 @@ static int mxic_spi_mem_ecc_init_ctx(struct nand_device *nand)
 
 static void mxic_spi_mem_ecc_cleanup_ctx(struct nand_device *nand)
 {
-	struct nand_ecc_engine_ops *ops = mxic_ecc_get_pipelined_ops();
+	const struct nand_ecc_engine_ops *ops = mxic_ecc_get_pipelined_ops();
 	struct mxic_spi *mxic = nand->ecc.engine->priv;
 
 	mxic->ecc.use_pipelined_conf = false;
@@ -661,7 +661,7 @@ static void mxic_spi_mem_ecc_cleanup_ctx(struct nand_device *nand)
 static int mxic_spi_mem_ecc_prepare_io_req(struct nand_device *nand,
 					   struct nand_page_io_req *req)
 {
-	struct nand_ecc_engine_ops *ops = mxic_ecc_get_pipelined_ops();
+	const struct nand_ecc_engine_ops *ops = mxic_ecc_get_pipelined_ops();
 
 	return ops->prepare_io_req(nand, req);
 }
@@ -669,12 +669,12 @@ static int mxic_spi_mem_ecc_prepare_io_req(struct nand_device *nand,
 static int mxic_spi_mem_ecc_finish_io_req(struct nand_device *nand,
 					  struct nand_page_io_req *req)
 {
-	struct nand_ecc_engine_ops *ops = mxic_ecc_get_pipelined_ops();
+	const struct nand_ecc_engine_ops *ops = mxic_ecc_get_pipelined_ops();
 
 	return ops->finish_io_req(nand, req);
 }
 
-static struct nand_ecc_engine_ops mxic_spi_mem_ecc_engine_pipelined_ops = {
+static const struct nand_ecc_engine_ops mxic_spi_mem_ecc_engine_pipelined_ops = {
 	.init_ctx = mxic_spi_mem_ecc_init_ctx,
 	.cleanup_ctx = mxic_spi_mem_ecc_cleanup_ctx,
 	.prepare_io_req = mxic_spi_mem_ecc_prepare_io_req,
diff --git a/include/linux/mtd/nand-ecc-mxic.h b/include/linux/mtd/nand-ecc-mxic.h
index b125926e458c..0da4b2999576 100644
--- a/include/linux/mtd/nand-ecc-mxic.h
+++ b/include/linux/mtd/nand-ecc-mxic.h
@@ -16,7 +16,7 @@ struct mxic_ecc_engine;
 
 #if IS_ENABLED(CONFIG_MTD_NAND_ECC_MXIC) && IS_REACHABLE(CONFIG_MTD_NAND_CORE)
 
-struct nand_ecc_engine_ops *mxic_ecc_get_pipelined_ops(void);
+const struct nand_ecc_engine_ops *mxic_ecc_get_pipelined_ops(void);
 struct nand_ecc_engine *mxic_ecc_get_pipelined_engine(struct platform_device *spi_pdev);
 void mxic_ecc_put_pipelined_engine(struct nand_ecc_engine *eng);
 int mxic_ecc_process_data_pipelined(struct nand_ecc_engine *eng,
@@ -24,7 +24,7 @@ int mxic_ecc_process_data_pipelined(struct nand_ecc_engine *eng,
 
 #else /* !CONFIG_MTD_NAND_ECC_MXIC */
 
-static inline struct nand_ecc_engine_ops *mxic_ecc_get_pipelined_ops(void)
+static inline const struct nand_ecc_engine_ops *mxic_ecc_get_pipelined_ops(void)
 {
 	return NULL;
 }
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 1e4208040956..0e2f228e8b4a 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -293,7 +293,7 @@ enum nand_ecc_engine_integration {
 struct nand_ecc_engine {
 	struct device *dev;
 	struct list_head node;
-	struct nand_ecc_engine_ops *ops;
+	const struct nand_ecc_engine_ops *ops;
 	enum nand_ecc_engine_integration integration;
 	void *priv;
 };
-- 
cgit v1.2.3


From 192514ae05687a60ac230569b2a215fb2d3b8d90 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Mon, 10 Jun 2024 10:57:32 +0200
Subject: soc: mediatek: Add MediaTek DVFS Resource Collector (DVFSRC) driver

The Dynamic Voltage and Frequency Scaling Resource Collector (DVFSRC) is a
Hardware module used to collect all the requests from both software and the
various remote processors embedded into the SoC and decide about a minimum
operating voltage and a minimum DRAM frequency to fulfill those requests in
an effort to provide the best achievable performance per watt.

This hardware IP is capable of transparently performing direct register R/W
on all of the DVFSRC-controlled regulators and SoC bandwidth knobs.

This driver includes support for MT8183, MT8192 and MT8195.

Co-Developed-by: Dawei Chien <dawei.chien@mediatek.com>
[Angelo: Partial refactoring and cleanups]
Reviewed-by: Georgi Djakov <djakov@kerenl.org>
Link: https://lore.kernel.org/r/20240610085735.147134-5-angelogioacchino.delregno@collabora.com
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 drivers/soc/mediatek/Kconfig             |  11 +
 drivers/soc/mediatek/Makefile            |   1 +
 drivers/soc/mediatek/mtk-dvfsrc.c        | 545 +++++++++++++++++++++++++++++++
 include/linux/soc/mediatek/dvfsrc.h      |  36 ++
 include/linux/soc/mediatek/mtk_sip_svc.h |   3 +
 5 files changed, 596 insertions(+)
 create mode 100644 drivers/soc/mediatek/mtk-dvfsrc.c
 create mode 100644 include/linux/soc/mediatek/dvfsrc.h

(limited to 'include/linux')

diff --git a/drivers/soc/mediatek/Kconfig b/drivers/soc/mediatek/Kconfig
index 1b7afb19ccd6..d7293977f06e 100644
--- a/drivers/soc/mediatek/Kconfig
+++ b/drivers/soc/mediatek/Kconfig
@@ -26,6 +26,17 @@ config MTK_DEVAPC
 	  The violation information is logged for further analysis or
 	  countermeasures.
 
+config MTK_DVFSRC
+	tristate "MediaTek DVFSRC Support"
+	depends on ARCH_MEDIATEK
+	help
+	  Say yes here to add support for the MediaTek Dynamic Voltage
+	  and Frequency Scaling Resource Collector (DVFSRC): a HW
+	  IP found on many MediaTek SoCs, which is responsible for
+	  collecting DVFS requests from various SoC IPs, other than
+	  software, and performing bandwidth scaling to provide the
+	  best achievable performance-per-watt.
+
 config MTK_INFRACFG
 	bool "MediaTek INFRACFG Support"
 	select REGMAP
diff --git a/drivers/soc/mediatek/Makefile b/drivers/soc/mediatek/Makefile
index 6830512848fd..0665573e3c4b 100644
--- a/drivers/soc/mediatek/Makefile
+++ b/drivers/soc/mediatek/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_MTK_CMDQ) += mtk-cmdq-helper.o
 obj-$(CONFIG_MTK_DEVAPC) += mtk-devapc.o
+obj-$(CONFIG_MTK_DVFSRC) += mtk-dvfsrc.o
 obj-$(CONFIG_MTK_INFRACFG) += mtk-infracfg.o
 obj-$(CONFIG_MTK_PMIC_WRAP) += mtk-pmic-wrap.o
 obj-$(CONFIG_MTK_REGULATOR_COUPLER) += mtk-regulator-coupler.o
diff --git a/drivers/soc/mediatek/mtk-dvfsrc.c b/drivers/soc/mediatek/mtk-dvfsrc.c
new file mode 100644
index 000000000000..83bf46fdcf2d
--- /dev/null
+++ b/drivers/soc/mediatek/mtk-dvfsrc.c
@@ -0,0 +1,545 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 MediaTek Inc.
+ * Copyright (c) 2024 Collabora Ltd.
+ *                    AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/bitfield.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/soc/mediatek/dvfsrc.h>
+#include <linux/soc/mediatek/mtk_sip_svc.h>
+
+/* DVFSRC_LEVEL */
+#define DVFSRC_V1_LEVEL_TARGET_LEVEL	GENMASK(15, 0)
+#define DVFSRC_TGT_LEVEL_IDLE		0x00
+#define DVFSRC_V1_LEVEL_CURRENT_LEVEL	GENMASK(31, 16)
+
+/* DVFSRC_SW_REQ, DVFSRC_SW_REQ2 */
+#define DVFSRC_V1_SW_REQ2_DRAM_LEVEL	GENMASK(1, 0)
+#define DVFSRC_V1_SW_REQ2_VCORE_LEVEL	GENMASK(3, 2)
+
+#define DVFSRC_V2_SW_REQ_DRAM_LEVEL	GENMASK(3, 0)
+#define DVFSRC_V2_SW_REQ_VCORE_LEVEL	GENMASK(6, 4)
+
+/* DVFSRC_VCORE */
+#define DVFSRC_V2_VCORE_REQ_VSCP_LEVEL	GENMASK(14, 12)
+
+#define DVFSRC_POLL_TIMEOUT_US		1000
+#define STARTUP_TIME_US			1
+
+#define MTK_SIP_DVFSRC_INIT		0x0
+#define MTK_SIP_DVFSRC_START		0x1
+
+struct dvfsrc_bw_constraints {
+	u16 max_dram_nom_bw;
+	u16 max_dram_peak_bw;
+	u16 max_dram_hrt_bw;
+};
+
+struct dvfsrc_opp {
+	u32 vcore_opp;
+	u32 dram_opp;
+};
+
+struct dvfsrc_opp_desc {
+	const struct dvfsrc_opp *opps;
+	u32 num_opp;
+};
+
+struct dvfsrc_soc_data;
+struct mtk_dvfsrc {
+	struct device *dev;
+	struct platform_device *icc;
+	struct platform_device *regulator;
+	const struct dvfsrc_soc_data *dvd;
+	const struct dvfsrc_opp_desc *curr_opps;
+	void __iomem *regs;
+	int dram_type;
+};
+
+struct dvfsrc_soc_data {
+	const int *regs;
+	const struct dvfsrc_opp_desc *opps_desc;
+	u32 (*get_target_level)(struct mtk_dvfsrc *dvfsrc);
+	u32 (*get_current_level)(struct mtk_dvfsrc *dvfsrc);
+	u32 (*get_vcore_level)(struct mtk_dvfsrc *dvfsrc);
+	u32 (*get_vscp_level)(struct mtk_dvfsrc *dvfsrc);
+	void (*set_dram_bw)(struct mtk_dvfsrc *dvfsrc, u64 bw);
+	void (*set_dram_peak_bw)(struct mtk_dvfsrc *dvfsrc, u64 bw);
+	void (*set_dram_hrt_bw)(struct mtk_dvfsrc *dvfsrc, u64 bw);
+	void (*set_opp_level)(struct mtk_dvfsrc *dvfsrc, u32 level);
+	void (*set_vcore_level)(struct mtk_dvfsrc *dvfsrc, u32 level);
+	void (*set_vscp_level)(struct mtk_dvfsrc *dvfsrc, u32 level);
+	int (*wait_for_opp_level)(struct mtk_dvfsrc *dvfsrc, u32 level);
+	int (*wait_for_vcore_level)(struct mtk_dvfsrc *dvfsrc, u32 level);
+	const struct dvfsrc_bw_constraints *bw_constraints;
+};
+
+static u32 dvfsrc_readl(struct mtk_dvfsrc *dvfs, u32 offset)
+{
+	return readl(dvfs->regs + dvfs->dvd->regs[offset]);
+}
+
+static void dvfsrc_writel(struct mtk_dvfsrc *dvfs, u32 offset, u32 val)
+{
+	writel(val, dvfs->regs + dvfs->dvd->regs[offset]);
+}
+
+enum dvfsrc_regs {
+	DVFSRC_SW_REQ,
+	DVFSRC_SW_REQ2,
+	DVFSRC_LEVEL,
+	DVFSRC_TARGET_LEVEL,
+	DVFSRC_SW_BW,
+	DVFSRC_SW_PEAK_BW,
+	DVFSRC_SW_HRT_BW,
+	DVFSRC_VCORE,
+	DVFSRC_REGS_MAX,
+};
+
+static const int dvfsrc_mt8183_regs[] = {
+	[DVFSRC_SW_REQ] = 0x4,
+	[DVFSRC_SW_REQ2] = 0x8,
+	[DVFSRC_LEVEL] = 0xDC,
+	[DVFSRC_SW_BW] = 0x160,
+};
+
+static const int dvfsrc_mt8195_regs[] = {
+	[DVFSRC_SW_REQ] = 0xc,
+	[DVFSRC_VCORE] = 0x6c,
+	[DVFSRC_SW_PEAK_BW] = 0x278,
+	[DVFSRC_SW_BW] = 0x26c,
+	[DVFSRC_SW_HRT_BW] = 0x290,
+	[DVFSRC_LEVEL] = 0xd44,
+	[DVFSRC_TARGET_LEVEL] = 0xd48,
+};
+
+static const struct dvfsrc_opp *dvfsrc_get_current_opp(struct mtk_dvfsrc *dvfsrc)
+{
+	u32 level = dvfsrc->dvd->get_current_level(dvfsrc);
+
+	return &dvfsrc->curr_opps->opps[level];
+}
+
+static bool dvfsrc_is_idle(struct mtk_dvfsrc *dvfsrc)
+{
+	if (!dvfsrc->dvd->get_target_level)
+		return true;
+
+	return dvfsrc->dvd->get_target_level(dvfsrc) == DVFSRC_TGT_LEVEL_IDLE;
+}
+
+static int dvfsrc_wait_for_vcore_level_v1(struct mtk_dvfsrc *dvfsrc, u32 level)
+{
+	const struct dvfsrc_opp *curr;
+
+	return readx_poll_timeout_atomic(dvfsrc_get_current_opp, dvfsrc, curr,
+					 curr->vcore_opp >= level, STARTUP_TIME_US,
+					 DVFSRC_POLL_TIMEOUT_US);
+}
+
+static int dvfsrc_wait_for_opp_level_v1(struct mtk_dvfsrc *dvfsrc, u32 level)
+{
+	const struct dvfsrc_opp *target, *curr;
+	int ret;
+
+	target = &dvfsrc->curr_opps->opps[level];
+	ret = readx_poll_timeout_atomic(dvfsrc_get_current_opp, dvfsrc, curr,
+					curr->dram_opp >= target->dram_opp &&
+					curr->vcore_opp >= target->vcore_opp,
+					STARTUP_TIME_US, DVFSRC_POLL_TIMEOUT_US);
+	if (ret < 0) {
+		dev_warn(dvfsrc->dev,
+			 "timeout! target OPP: %u, dram: %d, vcore: %d\n", level,
+			 curr->dram_opp, curr->vcore_opp);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int dvfsrc_wait_for_opp_level_v2(struct mtk_dvfsrc *dvfsrc, u32 level)
+{
+	const struct dvfsrc_opp *target, *curr;
+	int ret;
+
+	target = &dvfsrc->curr_opps->opps[level];
+	ret = readx_poll_timeout_atomic(dvfsrc_get_current_opp, dvfsrc, curr,
+					curr->dram_opp >= target->dram_opp &&
+					curr->vcore_opp >= target->vcore_opp,
+					STARTUP_TIME_US, DVFSRC_POLL_TIMEOUT_US);
+	if (ret < 0) {
+		dev_warn(dvfsrc->dev,
+			 "timeout! target OPP: %u, dram: %d\n", level, curr->dram_opp);
+		return ret;
+	}
+
+	return 0;
+}
+
+static u32 dvfsrc_get_target_level_v1(struct mtk_dvfsrc *dvfsrc)
+{
+	u32 val = dvfsrc_readl(dvfsrc, DVFSRC_LEVEL);
+
+	return FIELD_GET(DVFSRC_V1_LEVEL_TARGET_LEVEL, val);
+}
+
+static u32 dvfsrc_get_current_level_v1(struct mtk_dvfsrc *dvfsrc)
+{
+	u32 val = dvfsrc_readl(dvfsrc, DVFSRC_LEVEL);
+	u32 current_level = FIELD_GET(DVFSRC_V1_LEVEL_CURRENT_LEVEL, val);
+
+	return ffs(current_level) - 1;
+}
+
+static u32 dvfsrc_get_target_level_v2(struct mtk_dvfsrc *dvfsrc)
+{
+	return dvfsrc_readl(dvfsrc, DVFSRC_TARGET_LEVEL);
+}
+
+static u32 dvfsrc_get_current_level_v2(struct mtk_dvfsrc *dvfsrc)
+{
+	u32 val = dvfsrc_readl(dvfsrc, DVFSRC_LEVEL);
+	u32 level = ffs(val);
+
+	/* Valid levels */
+	if (level < dvfsrc->curr_opps->num_opp)
+		return dvfsrc->curr_opps->num_opp - level;
+
+	/* Zero for level 0 or invalid level */
+	return 0;
+}
+
+static u32 dvfsrc_get_vcore_level_v1(struct mtk_dvfsrc *dvfsrc)
+{
+	u32 val = dvfsrc_readl(dvfsrc, DVFSRC_SW_REQ2);
+
+	return FIELD_GET(DVFSRC_V1_SW_REQ2_VCORE_LEVEL, val);
+}
+
+static void dvfsrc_set_vcore_level_v1(struct mtk_dvfsrc *dvfsrc, u32 level)
+{
+	u32 val = dvfsrc_readl(dvfsrc, DVFSRC_SW_REQ2);
+
+	val &= ~DVFSRC_V1_SW_REQ2_VCORE_LEVEL;
+	val |= FIELD_PREP(DVFSRC_V1_SW_REQ2_VCORE_LEVEL, level);
+
+	dvfsrc_writel(dvfsrc, DVFSRC_SW_REQ2, val);
+}
+
+static u32 dvfsrc_get_vcore_level_v2(struct mtk_dvfsrc *dvfsrc)
+{
+	u32 val = dvfsrc_readl(dvfsrc, DVFSRC_SW_REQ);
+
+	return FIELD_GET(DVFSRC_V2_SW_REQ_VCORE_LEVEL, val);
+}
+
+static void dvfsrc_set_vcore_level_v2(struct mtk_dvfsrc *dvfsrc, u32 level)
+{
+	u32 val = dvfsrc_readl(dvfsrc, DVFSRC_SW_REQ);
+
+	val &= ~DVFSRC_V2_SW_REQ_VCORE_LEVEL;
+	val |= FIELD_PREP(DVFSRC_V2_SW_REQ_VCORE_LEVEL, level);
+
+	dvfsrc_writel(dvfsrc, DVFSRC_SW_REQ, val);
+}
+
+static u32 dvfsrc_get_vscp_level_v2(struct mtk_dvfsrc *dvfsrc)
+{
+	u32 val = dvfsrc_readl(dvfsrc, DVFSRC_VCORE);
+
+	return FIELD_GET(DVFSRC_V2_VCORE_REQ_VSCP_LEVEL, val);
+}
+
+static void dvfsrc_set_vscp_level_v2(struct mtk_dvfsrc *dvfsrc, u32 level)
+{
+	u32 val = dvfsrc_readl(dvfsrc, DVFSRC_VCORE);
+
+	val &= ~DVFSRC_V2_VCORE_REQ_VSCP_LEVEL;
+	val |= FIELD_PREP(DVFSRC_V2_VCORE_REQ_VSCP_LEVEL, level);
+
+	dvfsrc_writel(dvfsrc, DVFSRC_VCORE, val);
+}
+
+static void __dvfsrc_set_dram_bw_v1(struct mtk_dvfsrc *dvfsrc, u32 reg,
+				    u16 max_bw, u16 min_bw, u64 bw)
+{
+	u32 new_bw = (u32)div_u64(bw, 100 * 1000);
+
+	/* If bw constraints (in mbps) are defined make sure to respect them */
+	if (max_bw)
+		new_bw = min(new_bw, max_bw);
+	if (min_bw && new_bw > 0)
+		new_bw = max(new_bw, min_bw);
+
+	dvfsrc_writel(dvfsrc, reg, new_bw);
+}
+
+static void dvfsrc_set_dram_bw_v1(struct mtk_dvfsrc *dvfsrc, u64 bw)
+{
+	u64 max_bw = dvfsrc->dvd->bw_constraints->max_dram_nom_bw;
+
+	__dvfsrc_set_dram_bw_v1(dvfsrc, DVFSRC_SW_BW, max_bw, 0, bw);
+};
+
+static void dvfsrc_set_dram_peak_bw_v1(struct mtk_dvfsrc *dvfsrc, u64 bw)
+{
+	u64 max_bw = dvfsrc->dvd->bw_constraints->max_dram_peak_bw;
+
+	__dvfsrc_set_dram_bw_v1(dvfsrc, DVFSRC_SW_PEAK_BW, max_bw, 0, bw);
+}
+
+static void dvfsrc_set_dram_hrt_bw_v1(struct mtk_dvfsrc *dvfsrc, u64 bw)
+{
+	u64 max_bw = dvfsrc->dvd->bw_constraints->max_dram_hrt_bw;
+
+	__dvfsrc_set_dram_bw_v1(dvfsrc, DVFSRC_SW_HRT_BW, max_bw, 0, bw);
+}
+
+static void dvfsrc_set_opp_level_v1(struct mtk_dvfsrc *dvfsrc, u32 level)
+{
+	const struct dvfsrc_opp *opp = &dvfsrc->curr_opps->opps[level];
+	u32 val;
+
+	/* Translate Pstate to DVFSRC level and set it to DVFSRC HW */
+	val = FIELD_PREP(DVFSRC_V1_SW_REQ2_DRAM_LEVEL, opp->dram_opp);
+	val |= FIELD_PREP(DVFSRC_V1_SW_REQ2_VCORE_LEVEL, opp->vcore_opp);
+
+	dev_dbg(dvfsrc->dev, "vcore_opp: %d, dram_opp: %d\n", opp->vcore_opp, opp->dram_opp);
+	dvfsrc_writel(dvfsrc, DVFSRC_SW_REQ, val);
+}
+
+int mtk_dvfsrc_send_request(const struct device *dev, u32 cmd, u64 data)
+{
+	struct mtk_dvfsrc *dvfsrc = dev_get_drvdata(dev);
+	bool state;
+	int ret;
+
+	dev_dbg(dvfsrc->dev, "cmd: %d, data: %llu\n", cmd, data);
+
+	switch (cmd) {
+	case MTK_DVFSRC_CMD_BW:
+		dvfsrc->dvd->set_dram_bw(dvfsrc, data);
+		return 0;
+	case MTK_DVFSRC_CMD_HRT_BW:
+		if (dvfsrc->dvd->set_dram_hrt_bw)
+			dvfsrc->dvd->set_dram_hrt_bw(dvfsrc, data);
+		return 0;
+	case MTK_DVFSRC_CMD_PEAK_BW:
+		if (dvfsrc->dvd->set_dram_peak_bw)
+			dvfsrc->dvd->set_dram_peak_bw(dvfsrc, data);
+		return 0;
+	case MTK_DVFSRC_CMD_OPP:
+		if (!dvfsrc->dvd->set_opp_level)
+			return 0;
+
+		dvfsrc->dvd->set_opp_level(dvfsrc, data);
+		break;
+	case MTK_DVFSRC_CMD_VCORE_LEVEL:
+		dvfsrc->dvd->set_vcore_level(dvfsrc, data);
+		break;
+	case MTK_DVFSRC_CMD_VSCP_LEVEL:
+		if (!dvfsrc->dvd->set_vscp_level)
+			return 0;
+
+		dvfsrc->dvd->set_vscp_level(dvfsrc, data);
+		break;
+	default:
+		dev_err(dvfsrc->dev, "unknown command: %d\n", cmd);
+		return -EOPNOTSUPP;
+	}
+
+	/* DVFSRC needs at least 2T(~196ns) to handle a request */
+	udelay(STARTUP_TIME_US);
+
+	ret = readx_poll_timeout_atomic(dvfsrc_is_idle, dvfsrc, state, state,
+					STARTUP_TIME_US, DVFSRC_POLL_TIMEOUT_US);
+	if (ret < 0) {
+		dev_warn(dvfsrc->dev,
+			 "%d: idle timeout, data: %llu, last: %d -> %d\n", cmd, data,
+			 dvfsrc->dvd->get_current_level(dvfsrc),
+			 dvfsrc->dvd->get_target_level(dvfsrc));
+		return ret;
+	}
+
+	if (cmd == MTK_DVFSRC_CMD_OPP)
+		ret = dvfsrc->dvd->wait_for_opp_level(dvfsrc, data);
+	else
+		ret = dvfsrc->dvd->wait_for_vcore_level(dvfsrc, data);
+
+	if (ret < 0) {
+		dev_warn(dvfsrc->dev,
+			 "%d: wait timeout, data: %llu, last: %d -> %d\n",
+			 cmd, data,
+			 dvfsrc->dvd->get_current_level(dvfsrc),
+			 dvfsrc->dvd->get_target_level(dvfsrc));
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(mtk_dvfsrc_send_request);
+
+int mtk_dvfsrc_query_info(const struct device *dev, u32 cmd, int *data)
+{
+	struct mtk_dvfsrc *dvfsrc = dev_get_drvdata(dev);
+
+	switch (cmd) {
+	case MTK_DVFSRC_CMD_VCORE_LEVEL:
+		*data = dvfsrc->dvd->get_vcore_level(dvfsrc);
+		break;
+	case MTK_DVFSRC_CMD_VSCP_LEVEL:
+		*data = dvfsrc->dvd->get_vscp_level(dvfsrc);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(mtk_dvfsrc_query_info);
+
+static int mtk_dvfsrc_probe(struct platform_device *pdev)
+{
+	struct arm_smccc_res ares;
+	struct mtk_dvfsrc *dvfsrc;
+	int ret;
+
+	dvfsrc = devm_kzalloc(&pdev->dev, sizeof(*dvfsrc), GFP_KERNEL);
+	if (!dvfsrc)
+		return -ENOMEM;
+
+	dvfsrc->dvd = of_device_get_match_data(&pdev->dev);
+	dvfsrc->dev = &pdev->dev;
+
+	dvfsrc->regs = devm_platform_get_and_ioremap_resource(pdev, 0, NULL);
+	if (IS_ERR(dvfsrc->regs))
+		return PTR_ERR(dvfsrc->regs);
+
+	arm_smccc_smc(MTK_SIP_DVFSRC_VCOREFS_CONTROL, MTK_SIP_DVFSRC_INIT,
+		      0, 0, 0, 0, 0, 0, &ares);
+	if (ares.a0)
+		return dev_err_probe(&pdev->dev, -EINVAL, "DVFSRC init failed: %lu\n", ares.a0);
+
+	dvfsrc->dram_type = ares.a1;
+	dev_dbg(&pdev->dev, "DRAM Type: %d\n", dvfsrc->dram_type);
+
+	dvfsrc->curr_opps = &dvfsrc->dvd->opps_desc[dvfsrc->dram_type];
+	platform_set_drvdata(pdev, dvfsrc);
+
+	ret = devm_of_platform_populate(&pdev->dev);
+	if (ret)
+		return dev_err_probe(&pdev->dev, ret, "Failed to populate child devices\n");
+
+	/* Everything is set up - make it run! */
+	arm_smccc_smc(MTK_SIP_DVFSRC_VCOREFS_CONTROL, MTK_SIP_DVFSRC_START,
+		      0, 0, 0, 0, 0, 0, &ares);
+	if (ares.a0)
+		return dev_err_probe(&pdev->dev, -EINVAL, "Cannot start DVFSRC: %lu\n", ares.a0);
+
+	return 0;
+}
+
+static const struct dvfsrc_opp dvfsrc_opp_mt8183_lp4[] = {
+	{ 0, 0 }, { 0, 1 }, { 0, 2 }, { 1, 2 },
+};
+
+static const struct dvfsrc_opp dvfsrc_opp_mt8183_lp3[] = {
+	{ 0, 0 }, { 0, 1 }, { 1, 1 }, { 1, 2 },
+};
+
+static const struct dvfsrc_opp_desc dvfsrc_opp_mt8183_desc[] = {
+	[0] = {
+		.opps = dvfsrc_opp_mt8183_lp4,
+		.num_opp = ARRAY_SIZE(dvfsrc_opp_mt8183_lp4),
+	},
+	[1] = {
+		.opps = dvfsrc_opp_mt8183_lp3,
+		.num_opp = ARRAY_SIZE(dvfsrc_opp_mt8183_lp3),
+	},
+	[2] = {
+		.opps = dvfsrc_opp_mt8183_lp3,
+		.num_opp = ARRAY_SIZE(dvfsrc_opp_mt8183_lp3),
+	}
+};
+
+static const struct dvfsrc_bw_constraints dvfsrc_bw_constr_mt8183 = { 0, 0, 0 };
+
+static const struct dvfsrc_soc_data mt8183_data = {
+	.opps_desc = dvfsrc_opp_mt8183_desc,
+	.regs = dvfsrc_mt8183_regs,
+	.get_target_level = dvfsrc_get_target_level_v1,
+	.get_current_level = dvfsrc_get_current_level_v1,
+	.get_vcore_level = dvfsrc_get_vcore_level_v1,
+	.set_dram_bw = dvfsrc_set_dram_bw_v1,
+	.set_opp_level = dvfsrc_set_opp_level_v1,
+	.set_vcore_level = dvfsrc_set_vcore_level_v1,
+	.wait_for_opp_level = dvfsrc_wait_for_opp_level_v1,
+	.wait_for_vcore_level = dvfsrc_wait_for_vcore_level_v1,
+	.bw_constraints = &dvfsrc_bw_constr_mt8183,
+};
+
+static const struct dvfsrc_opp dvfsrc_opp_mt8195_lp4[] = {
+	{ 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 },
+	{ 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1 },
+	{ 0, 2 }, { 1, 2 }, { 2, 2 }, { 3, 2 },
+	{ 1, 3 }, { 2, 3 }, { 3, 3 }, { 1, 4 },
+	{ 2, 4 }, { 3, 4 }, { 2, 5 }, { 3, 5 },
+	{ 3, 6 },
+};
+
+static const struct dvfsrc_opp_desc dvfsrc_opp_mt8195_desc[] = {
+	[0] = {
+		.opps = dvfsrc_opp_mt8195_lp4,
+		.num_opp = ARRAY_SIZE(dvfsrc_opp_mt8195_lp4),
+	}
+};
+
+static const struct dvfsrc_bw_constraints dvfsrc_bw_constr_mt8195 = {
+	.max_dram_nom_bw = 255,
+	.max_dram_peak_bw = 255,
+	.max_dram_hrt_bw = 1023,
+};
+
+static const struct dvfsrc_soc_data mt8195_data = {
+	.opps_desc = dvfsrc_opp_mt8195_desc,
+	.regs = dvfsrc_mt8195_regs,
+	.get_target_level = dvfsrc_get_target_level_v2,
+	.get_current_level = dvfsrc_get_current_level_v2,
+	.get_vcore_level = dvfsrc_get_vcore_level_v2,
+	.get_vscp_level = dvfsrc_get_vscp_level_v2,
+	.set_dram_bw = dvfsrc_set_dram_bw_v1,
+	.set_dram_peak_bw = dvfsrc_set_dram_peak_bw_v1,
+	.set_dram_hrt_bw = dvfsrc_set_dram_hrt_bw_v1,
+	.set_vcore_level = dvfsrc_set_vcore_level_v2,
+	.set_vscp_level = dvfsrc_set_vscp_level_v2,
+	.wait_for_opp_level = dvfsrc_wait_for_opp_level_v2,
+	.wait_for_vcore_level = dvfsrc_wait_for_vcore_level_v1,
+	.bw_constraints = &dvfsrc_bw_constr_mt8195,
+};
+
+static const struct of_device_id mtk_dvfsrc_of_match[] = {
+	{ .compatible = "mediatek,mt8183-dvfsrc", .data = &mt8183_data },
+	{ .compatible = "mediatek,mt8195-dvfsrc", .data = &mt8195_data },
+	{ /* sentinel */ }
+};
+
+static struct platform_driver mtk_dvfsrc_driver = {
+	.probe	= mtk_dvfsrc_probe,
+	.driver = {
+		.name = "mtk-dvfsrc",
+		.of_match_table = mtk_dvfsrc_of_match,
+	},
+};
+module_platform_driver(mtk_dvfsrc_driver);
+
+MODULE_AUTHOR("AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>");
+MODULE_AUTHOR("Dawei Chien <dawei.chien@mediatek.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MediaTek DVFSRC driver");
diff --git a/include/linux/soc/mediatek/dvfsrc.h b/include/linux/soc/mediatek/dvfsrc.h
new file mode 100644
index 000000000000..1498b3ed396b
--- /dev/null
+++ b/include/linux/soc/mediatek/dvfsrc.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (c) 2021 MediaTek Inc.
+ * Copyright (c) 2024 Collabora Ltd.
+ *                    AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+
+#ifndef __MEDIATEK_DVFSRC_H
+#define __MEDIATEK_DVFSRC_H
+
+enum mtk_dvfsrc_cmd {
+	MTK_DVFSRC_CMD_BW,
+	MTK_DVFSRC_CMD_HRT_BW,
+	MTK_DVFSRC_CMD_PEAK_BW,
+	MTK_DVFSRC_CMD_OPP,
+	MTK_DVFSRC_CMD_VCORE_LEVEL,
+	MTK_DVFSRC_CMD_VSCP_LEVEL,
+	MTK_DVFSRC_CMD_MAX,
+};
+
+#if IS_ENABLED(CONFIG_MTK_DVFSRC)
+
+int mtk_dvfsrc_send_request(const struct device *dev, u32 cmd, u64 data);
+int mtk_dvfsrc_query_info(const struct device *dev, u32 cmd, int *data);
+
+#else
+
+static inline int mtk_dvfsrc_send_request(const struct device *dev, u32 cmd, u64 data)
+{ return -ENODEV; }
+
+static inline int mtk_dvfsrc_query_info(const struct device *dev, u32 cmd, int *data)
+{ return -ENODEV; }
+
+#endif /* CONFIG_MTK_DVFSRC */
+
+#endif
diff --git a/include/linux/soc/mediatek/mtk_sip_svc.h b/include/linux/soc/mediatek/mtk_sip_svc.h
index 0761128b4354..abe24a73ee19 100644
--- a/include/linux/soc/mediatek/mtk_sip_svc.h
+++ b/include/linux/soc/mediatek/mtk_sip_svc.h
@@ -22,6 +22,9 @@
 	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, MTK_SIP_SMC_CONVENTION, \
 			   ARM_SMCCC_OWNER_SIP, fn_id)
 
+/* DVFSRC SMC calls */
+#define MTK_SIP_DVFSRC_VCOREFS_CONTROL	MTK_SIP_SMC_CMD(0x506)
+
 /* IOMMU related SMC call */
 #define MTK_SIP_KERNEL_IOMMU_CONTROL	MTK_SIP_SMC_CMD(0x514)
 
-- 
cgit v1.2.3


From 424a55a4a9087887fcfcee561648df497701a4a2 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 10 Oct 2024 07:40:34 +1100
Subject: uaccess: add copy_struct_to_user helper

This is based on copy_struct_from_user(), but there is one additional
case to consider when creating a syscall that returns an
extensible-struct to userspace -- how should data in the struct that
cannot fit into the userspace struct be handled (ksize > usize)?

There are three possibilies:

 1. The interface is like sched_getattr(2), where new information will
    be silently not provided to userspace. This is probably what most
    interfaces will want to do, as it provides the most possible
    backwards-compatibility.

 2. The interface is like lsm_list_modules(2), where you want to return
    an error like -EMSGSIZE if not providing information could result in
    the userspace program making a serious mistake (such as one that
    could lead to a security problem) or if you want to provide some
    flag to userspace so they know that they are missing some
    information.

 3. The interface is like statx(2), where there some kind of a request
    mask that indicates what data userspace would like. One could
    imagine that statx2(2) (using extensible structs) would want to
    return -EMSGSIZE if the user explicitly requested a field that their
    structure is too small to fit, but not return an error if the field
    was not explicitly requested. This is kind of a mix between (1) and
    (2) based on the requested mask.

The copy_struct_to_user() helper includes a an extra argument that is
used to return a boolean flag indicating whether there was a non-zero
byte in the trailing bytes that were not copied to userspace. This can
be used in the following ways to handle all three cases, respectively:

 1. Just pass NULL, as you don't care about this case.

 2. Return an error (say -EMSGSIZE) if the argument was set to true by
    copy_struct_to_user().

 3. If the argument was set to true by copy_struct_to_user(), check if
    there is a flag that implies a field larger than usize.

    This is the only case where callers of copy_struct_to_user() should
    check usize themselves. This will probably require scanning an array
    that specifies what flags were added for each version of the flags
    struct and returning an error if the request mask matches any of the
    flags that were added in versions of the struct that are larger than
    usize.

At the moment we don't have any users of (3), so this patch doesn't
include any helpers to make the necessary scanning easier, but it should
be fairly easy to add some if necessary.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Link: https://lore.kernel.org/r/20241010-extensible-structs-check_fields-v3-1-d2833dfe6edd@cyphar.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uaccess.h | 97 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 39c7cf82b0c2..1e7c68b4c262 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -394,6 +394,103 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
 	return 0;
 }
 
+/**
+ * copy_struct_to_user: copy a struct to userspace
+ * @dst:   Destination address, in userspace. This buffer must be @ksize
+ *         bytes long.
+ * @usize: (Alleged) size of @dst struct.
+ * @src:   Source address, in kernel space.
+ * @ksize: Size of @src struct.
+ * @ignored_trailing: Set to %true if there was a non-zero byte in @src that
+ * userspace cannot see because they are using an smaller struct.
+ *
+ * Copies a struct from kernel space to userspace, in a way that guarantees
+ * backwards-compatibility for struct syscall arguments (as long as future
+ * struct extensions are made such that all new fields are *appended* to the
+ * old struct, and zeroed-out new fields have the same meaning as the old
+ * struct).
+ *
+ * Some syscalls may wish to make sure that userspace knows about everything in
+ * the struct, and if there is a non-zero value that userspce doesn't know
+ * about, they want to return an error (such as -EMSGSIZE) or have some other
+ * fallback (such as adding a "you're missing some information" flag). If
+ * @ignored_trailing is non-%NULL, it will be set to %true if there was a
+ * non-zero byte that could not be copied to userspace (ie. was past @usize).
+ *
+ * While unconditionally returning an error in this case is the simplest
+ * solution, for maximum backward compatibility you should try to only return
+ * -EMSGSIZE if the user explicitly requested the data that couldn't be copied.
+ * Note that structure sizes can change due to header changes and simple
+ * recompilations without code changes(!), so if you care about
+ * @ignored_trailing you probably want to make sure that any new field data is
+ * associated with a flag. Otherwise you might assume that a program knows
+ * about data it does not.
+ *
+ * @ksize is just sizeof(*src), and @usize should've been passed by userspace.
+ * The recommended usage is something like the following:
+ *
+ *   SYSCALL_DEFINE2(foobar, struct foo __user *, uarg, size_t, usize)
+ *   {
+ *      int err;
+ *      bool ignored_trailing;
+ *      struct foo karg = {};
+ *
+ *      if (usize > PAGE_SIZE)
+ *		return -E2BIG;
+ *      if (usize < FOO_SIZE_VER0)
+ *		return -EINVAL;
+ *
+ *      // ... modify karg somehow ...
+ *
+ *      err = copy_struct_to_user(uarg, usize, &karg, sizeof(karg),
+ *				  &ignored_trailing);
+ *      if (err)
+ *		return err;
+ *      if (ignored_trailing)
+ *		return -EMSGSIZE:
+ *
+ *      // ...
+ *   }
+ *
+ * There are three cases to consider:
+ *  * If @usize == @ksize, then it's copied verbatim.
+ *  * If @usize < @ksize, then the kernel is trying to pass userspace a newer
+ *    struct than it supports. Thus we only copy the interoperable portions
+ *    (@usize) and ignore the rest (but @ignored_trailing is set to %true if
+ *    any of the trailing (@ksize - @usize) bytes are non-zero).
+ *  * If @usize > @ksize, then the kernel is trying to pass userspace an older
+ *    struct than userspace supports. In order to make sure the
+ *    unknown-to-the-kernel fields don't contain garbage values, we zero the
+ *    trailing (@usize - @ksize) bytes.
+ *
+ * Returns (in all cases, some data may have been copied):
+ *  * -EFAULT: access to userspace failed.
+ */
+static __always_inline __must_check int
+copy_struct_to_user(void __user *dst, size_t usize, const void *src,
+		    size_t ksize, bool *ignored_trailing)
+{
+	size_t size = min(ksize, usize);
+	size_t rest = max(ksize, usize) - size;
+
+	/* Double check if ksize is larger than a known object size. */
+	if (WARN_ON_ONCE(ksize > __builtin_object_size(src, 1)))
+		return -E2BIG;
+
+	/* Deal with trailing bytes. */
+	if (usize > ksize) {
+		if (clear_user(dst + size, rest))
+			return -EFAULT;
+	}
+	if (ignored_trailing)
+		*ignored_trailing = ksize < usize &&
+			memchr_inv(src + size, 0, rest) != NULL;
+	/* Copy the interoperable parts of the struct. */
+	if (copy_to_user(dst, src, size))
+		return -EFAULT;
+	return 0;
+}
+
 bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size);
 
 long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
-- 
cgit v1.2.3


From 3e482e2840546a3ff725d5adf8d0779ac1c3cf4f Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Fri, 18 Oct 2024 14:37:25 -0700
Subject: dmaengine: idxd: Move DSA/IAA device IDs to IDXD driver

Since the DSA/IAA device IDs are only used by the IDXD driver, there is
no need to define them as public IDs. Move their definitions to the IDXD
driver to limit their scope. This change helps reduce unnecessary
exposure of the device IDs in the global space, making the codebase
cleaner and better encapsulated.

There is no functional change.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Link: https://lore.kernel.org/r/20241018213725.4167413-1-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/registers.h | 4 ++++
 include/linux/pci_ids.h      | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index e16dbf9ab324..c426511f2104 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -6,6 +6,10 @@
 #include <uapi/linux/idxd.h>
 
 /* PCI Config */
+#define PCI_DEVICE_ID_INTEL_DSA_GNRD	0x11fb
+#define PCI_DEVICE_ID_INTEL_DSA_DMR	0x1212
+#define PCI_DEVICE_ID_INTEL_IAA_DMR	0x1216
+
 #define DEVICE_VERSION_1		0x100
 #define DEVICE_VERSION_2		0x200
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 4cf6aaed5f35..e4bddb927795 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2709,9 +2709,6 @@
 #define PCI_DEVICE_ID_INTEL_82815_MC	0x1130
 #define PCI_DEVICE_ID_INTEL_82815_CGC	0x1132
 #define PCI_DEVICE_ID_INTEL_SST_TNG	0x119a
-#define PCI_DEVICE_ID_INTEL_DSA_GNRD	0x11fb
-#define PCI_DEVICE_ID_INTEL_DSA_DMR	0x1212
-#define PCI_DEVICE_ID_INTEL_IAA_DMR	0x1216
 #define PCI_DEVICE_ID_INTEL_82092AA_0	0x1221
 #define PCI_DEVICE_ID_INTEL_82437	0x122d
 #define PCI_DEVICE_ID_INTEL_82371FB_0	0x122e
-- 
cgit v1.2.3


From 6474353a5e3d0b2cf610153cea0c61f576a36d0a Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 25 Sep 2024 11:05:16 +0200
Subject: epoll: annotate racy check

Epoll relies on a racy fastpath check during __fput() in
eventpoll_release() to avoid the hit of pointlessly acquiring a
semaphore. Annotate that race by using WRITE_ONCE() and READ_ONCE().

Link: https://lore.kernel.org/r/66edfb3c.050a0220.3195df.001a.GAE@google.com
Link: https://lore.kernel.org/r/20240925-fungieren-anbauen-79b334b00542@brauner
Reviewed-by: Jan Kara <jack@suse.cz>
Reported-by: syzbot+3b6b32dc50537a49bb4a@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/eventpoll.c            | 6 ++++--
 include/linux/eventpoll.h | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1ae4542f0bd8..90fbab6b6f03 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -823,7 +823,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
 	to_free = NULL;
 	head = file->f_ep;
 	if (head->first == &epi->fllink && !epi->fllink.next) {
-		file->f_ep = NULL;
+		/* See eventpoll_release() for details. */
+		WRITE_ONCE(file->f_ep, NULL);
 		if (!is_file_epoll(file)) {
 			struct epitems_head *v;
 			v = container_of(head, struct epitems_head, epitems);
@@ -1603,7 +1604,8 @@ allocate:
 			spin_unlock(&file->f_lock);
 			goto allocate;
 		}
-		file->f_ep = head;
+		/* See eventpoll_release() for details. */
+		WRITE_ONCE(file->f_ep, head);
 		to_free = NULL;
 	}
 	hlist_add_head_rcu(&epi->fllink, file->f_ep);
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 3337745d81bd..0c0d00fcd131 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -42,7 +42,7 @@ static inline void eventpoll_release(struct file *file)
 	 * because the file in on the way to be removed and nobody ( but
 	 * eventpoll ) has still a reference to this file.
 	 */
-	if (likely(!file->f_ep))
+	if (likely(!READ_ONCE(file->f_ep)))
 		return;
 
 	/*
-- 
cgit v1.2.3


From 900bbaae67e980945dec74d36f8afe0de7556d5a Mon Sep 17 00:00:00 2001
From: Xuewen Yan <xuewen.yan@unisoc.com>
Date: Fri, 26 Apr 2024 16:05:48 +0800
Subject: epoll: Add synchronous wakeup support for ep_poll_callback

Now, the epoll only use wake_up() interface to wake up task.
However, sometimes, there are epoll users which want to use
the synchronous wakeup flag to hint the scheduler, such as
Android binder driver.
So add a wake_up_sync() define, and use the wake_up_sync()
when the sync is true in ep_poll_callback().

Co-developed-by: Jing Xia <jing.xia@unisoc.com>
Signed-off-by: Jing Xia <jing.xia@unisoc.com>
Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
Link: https://lore.kernel.org/r/20240426080548.8203-1-xuewen.yan@unisoc.com
Tested-by: Brian Geffon <bgeffon@google.com>
Reviewed-by: Brian Geffon <bgeffon@google.com>
Reported-by: Benoit Lize <lizeb@google.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/eventpoll.c       | 5 ++++-
 include/linux/wait.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 90fbab6b6f03..1a06e462b6ef 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1373,7 +1373,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 				break;
 			}
 		}
-		wake_up(&ep->wq);
+		if (sync)
+			wake_up_sync(&ep->wq);
+		else
+			wake_up(&ep->wq);
 	}
 	if (waitqueue_active(&ep->poll_wait))
 		pwake++;
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 8aa3372f21a0..2b322a9b88a2 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -221,6 +221,7 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head);
 #define wake_up_all(x)			__wake_up(x, TASK_NORMAL, 0, NULL)
 #define wake_up_locked(x)		__wake_up_locked((x), TASK_NORMAL, 1)
 #define wake_up_all_locked(x)		__wake_up_locked((x), TASK_NORMAL, 0)
+#define wake_up_sync(x)			__wake_up_sync(x, TASK_NORMAL)
 
 #define wake_up_interruptible(x)	__wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
 #define wake_up_interruptible_nr(x, nr)	__wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
-- 
cgit v1.2.3


From 99bdadbde9c418f29b78b7241732268dbc0a05cc Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Tue, 15 Oct 2024 22:21:54 +0200
Subject: acl: Realign struct posix_acl to save 8 bytes

Reduce posix_acl's struct size by 8 bytes by realigning its members.

Cc: Christian Brauner <brauner@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Link: https://lore.kernel.org/r/20241015202158.2376-1-thorsten.blum@linux.dev
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/posix_acl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 0e65b3d634d9..2d6a4badd306 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -28,8 +28,8 @@ struct posix_acl_entry {
 
 struct posix_acl {
 	refcount_t		a_refcount;
-	struct rcu_head		a_rcu;
 	unsigned int		a_count;
+	struct rcu_head		a_rcu;
 	struct posix_acl_entry	a_entries[];
 };
 
-- 
cgit v1.2.3


From 8c6e03ffedc5463d1aa1ba89f6ceb082518a3520 Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Fri, 18 Oct 2024 14:14:21 +0200
Subject: acl: Annotate struct posix_acl with __counted_by()

Add the __counted_by compiler attribute to the flexible array member
a_entries to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and
CONFIG_FORTIFY_SOURCE.

Use struct_size() to calculate the number of bytes to allocate for new
and cloned acls and remove the local size variables.

Change the posix_acl_alloc() function parameter count from int to
unsigned int to match posix_acl's a_count data type. Add identifier
names to the function definition to silence two checkpatch warnings.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Link: https://lore.kernel.org/r/20241018121426.155247-2-thorsten.blum@linux.dev
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/posix_acl.c            | 13 ++++++-------
 include/linux/posix_acl.h |  4 ++--
 2 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 6c66a37522d0..4050942ab52f 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -200,11 +200,11 @@ EXPORT_SYMBOL(posix_acl_init);
  * Allocate a new ACL with the specified number of entries.
  */
 struct posix_acl *
-posix_acl_alloc(int count, gfp_t flags)
+posix_acl_alloc(unsigned int count, gfp_t flags)
 {
-	const size_t size = sizeof(struct posix_acl) +
-	                    count * sizeof(struct posix_acl_entry);
-	struct posix_acl *acl = kmalloc(size, flags);
+	struct posix_acl *acl;
+
+	acl = kmalloc(struct_size(acl, a_entries, count), flags);
 	if (acl)
 		posix_acl_init(acl, count);
 	return acl;
@@ -220,9 +220,8 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
 	struct posix_acl *clone = NULL;
 
 	if (acl) {
-		int size = sizeof(struct posix_acl) + acl->a_count *
-		           sizeof(struct posix_acl_entry);
-		clone = kmemdup(acl, size, flags);
+		clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count),
+				flags);
 		if (clone)
 			refcount_set(&clone->a_refcount, 1);
 	}
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 2d6a4badd306..e2d47eb1a7f3 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -30,7 +30,7 @@ struct posix_acl {
 	refcount_t		a_refcount;
 	unsigned int		a_count;
 	struct rcu_head		a_rcu;
-	struct posix_acl_entry	a_entries[];
+	struct posix_acl_entry	a_entries[] __counted_by(a_count);
 };
 
 #define FOREACH_ACL_ENTRY(pa, acl, pe) \
@@ -62,7 +62,7 @@ posix_acl_release(struct posix_acl *acl)
 /* posix_acl.c */
 
 extern void posix_acl_init(struct posix_acl *, int);
-extern struct posix_acl *posix_acl_alloc(int, gfp_t);
+extern struct posix_acl *posix_acl_alloc(unsigned int count, gfp_t flags);
 extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
 extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
 extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
-- 
cgit v1.2.3


From c2114a0d17631497df99388905353c11f4d5f0dd Mon Sep 17 00:00:00 2001
From: Yassine Oudjana <y.oudjana@protonmail.com>
Date: Thu, 17 Oct 2024 11:51:35 +0300
Subject: pmdomain: mediatek: Add support for MT6735

Add support for SCPSYS power domains of MT6735. All non-CPU power domains
are added except for MD2 (C2K modem), which is left out due to issues
with powering it on.

Signed-off-by: Yassine Oudjana <y.oudjana@protonmail.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20241017085136.68053-3-y.oudjana@protonmail.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/pmdomain/mediatek/mt6735-pm-domains.h | 96 +++++++++++++++++++++++++++
 drivers/pmdomain/mediatek/mtk-pm-domains.c    |  5 ++
 drivers/pmdomain/mediatek/mtk-pm-domains.h    |  2 +
 include/linux/soc/mediatek/infracfg.h         |  5 ++
 4 files changed, 108 insertions(+)
 create mode 100644 drivers/pmdomain/mediatek/mt6735-pm-domains.h

(limited to 'include/linux')

diff --git a/drivers/pmdomain/mediatek/mt6735-pm-domains.h b/drivers/pmdomain/mediatek/mt6735-pm-domains.h
new file mode 100644
index 000000000000..71896be68e22
--- /dev/null
+++ b/drivers/pmdomain/mediatek/mt6735-pm-domains.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __SOC_MEDIATEK_MT6735_PM_DOMAINS_H
+#define __SOC_MEDIATEK_MT6735_PM_DOMAINS_H
+
+#include "mtk-pm-domains.h"
+#include <dt-bindings/power/mediatek,mt6735-power-controller.h>
+
+/*
+ * MT6735 power domain support
+ */
+
+static const struct scpsys_domain_data scpsys_domain_data_mt6735[] = {
+	[MT6735_POWER_DOMAIN_MD1] = {
+		.name = "md1",
+		.sta_mask = PWR_STATUS_MD1,
+		.ctl_offs = SPM_MD1_PWR_CON,
+		.pwr_sta_offs = SPM_PWR_STATUS,
+		.pwr_sta2nd_offs = SPM_PWR_STATUS_2ND,
+		.sram_pdn_bits = GENMASK(8, 8),
+		.sram_pdn_ack_bits = 0,
+		.bp_cfg = {
+			BUS_PROT_INFRA_UPDATE_TOPAXI(MT6735_TOP_AXI_PROT_EN_MD1),
+		},
+	},
+	[MT6735_POWER_DOMAIN_CONN] = {
+		.name = "conn",
+		.sta_mask = PWR_STATUS_CONN,
+		.ctl_offs = SPM_CONN_PWR_CON,
+		.pwr_sta_offs = SPM_PWR_STATUS,
+		.pwr_sta2nd_offs = SPM_PWR_STATUS_2ND,
+		.sram_pdn_bits = GENMASK(8, 8),
+		.sram_pdn_ack_bits = 0,
+		.bp_cfg = {
+			BUS_PROT_INFRA_UPDATE_TOPAXI(MT6735_TOP_AXI_PROT_EN_CONN),
+		},
+	},
+	[MT6735_POWER_DOMAIN_DIS] = {
+		.name = "dis",
+		.sta_mask = PWR_STATUS_DISP,
+		.ctl_offs = SPM_DIS_PWR_CON,
+		.pwr_sta_offs = SPM_PWR_STATUS,
+		.pwr_sta2nd_offs = SPM_PWR_STATUS_2ND,
+		.sram_pdn_bits = GENMASK(11, 8),
+		.sram_pdn_ack_bits = GENMASK(12, 12),
+		.bp_cfg = {
+			BUS_PROT_INFRA_UPDATE_TOPAXI(MT8173_TOP_AXI_PROT_EN_MM_M0),
+		},
+	},
+	[MT6735_POWER_DOMAIN_MFG] = {
+		.name = "mfg",
+		.sta_mask = PWR_STATUS_MFG,
+		.ctl_offs = SPM_MFG_PWR_CON,
+		.pwr_sta_offs = SPM_PWR_STATUS,
+		.pwr_sta2nd_offs = SPM_PWR_STATUS_2ND,
+		.sram_pdn_bits = GENMASK(11, 8),
+		.sram_pdn_ack_bits = GENMASK(12, 12),
+		.bp_cfg = {
+			BUS_PROT_INFRA_UPDATE_TOPAXI(MT8173_TOP_AXI_PROT_EN_MFG_S),
+		},
+	},
+	[MT6735_POWER_DOMAIN_ISP] = {
+		.name = "isp",
+		.sta_mask = PWR_STATUS_ISP,
+		.ctl_offs = SPM_ISP_PWR_CON,
+		.pwr_sta_offs = SPM_PWR_STATUS,
+		.pwr_sta2nd_offs = SPM_PWR_STATUS_2ND,
+		.sram_pdn_bits = GENMASK(11, 8),
+		.sram_pdn_ack_bits = GENMASK(13, 12),
+	},
+	[MT6735_POWER_DOMAIN_VDE] = {
+		.name = "vde",
+		.sta_mask = PWR_STATUS_VDEC,
+		.ctl_offs = SPM_VDE_PWR_CON,
+		.pwr_sta_offs = SPM_PWR_STATUS,
+		.pwr_sta2nd_offs = SPM_PWR_STATUS_2ND,
+		.sram_pdn_bits = GENMASK(11, 8),
+		.sram_pdn_ack_bits = GENMASK(12, 12),
+	},
+	[MT6735_POWER_DOMAIN_VEN] = {
+		.name = "ven",
+		.sta_mask = BIT(8),
+		.ctl_offs = SPM_VEN_PWR_CON,
+		.pwr_sta_offs = SPM_PWR_STATUS,
+		.pwr_sta2nd_offs = SPM_PWR_STATUS_2ND,
+		.sram_pdn_bits = GENMASK(11, 8),
+		.sram_pdn_ack_bits = GENMASK(15, 12),
+	},
+};
+
+static const struct scpsys_soc_data mt6735_scpsys_data = {
+	.domains_data = scpsys_domain_data_mt6735,
+	.num_domains = ARRAY_SIZE(scpsys_domain_data_mt6735),
+};
+
+#endif /* __SOC_MEDIATEK_MT6735_PM_DOMAINS_H */
diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c
index 3580913f25d3..b866b006af69 100644
--- a/drivers/pmdomain/mediatek/mtk-pm-domains.c
+++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c
@@ -16,6 +16,7 @@
 #include <linux/regulator/consumer.h>
 #include <linux/soc/mediatek/infracfg.h>
 
+#include "mt6735-pm-domains.h"
 #include "mt6795-pm-domains.h"
 #include "mt8167-pm-domains.h"
 #include "mt8173-pm-domains.h"
@@ -608,6 +609,10 @@ static void scpsys_domain_cleanup(struct scpsys *scpsys)
 }
 
 static const struct of_device_id scpsys_of_match[] = {
+	{
+		.compatible = "mediatek,mt6735-power-controller",
+		.data = &mt6735_scpsys_data,
+	},
 	{
 		.compatible = "mediatek,mt6795-power-controller",
 		.data = &mt6795_scpsys_data,
diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.h b/drivers/pmdomain/mediatek/mtk-pm-domains.h
index aaba5e6b0536..2ac96804b985 100644
--- a/drivers/pmdomain/mediatek/mtk-pm-domains.h
+++ b/drivers/pmdomain/mediatek/mtk-pm-domains.h
@@ -21,6 +21,7 @@
 #define SPM_ISP_PWR_CON			0x0238
 #define SPM_DIS_PWR_CON			0x023c
 #define SPM_CONN_PWR_CON		0x0280
+#define SPM_MD1_PWR_CON			0x0284
 #define SPM_VEN2_PWR_CON		0x0298
 #define SPM_AUDIO_PWR_CON		0x029c
 #define SPM_MFG_2D_PWR_CON		0x02c0
@@ -30,6 +31,7 @@
 #define SPM_PWR_STATUS			0x060c
 #define SPM_PWR_STATUS_2ND		0x0610
 
+#define PWR_STATUS_MD1			BIT(0)
 #define PWR_STATUS_CONN			BIT(1)
 #define PWR_STATUS_DISP			BIT(3)
 #define PWR_STATUS_MFG			BIT(4)
diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h
index 6c6cccc848f4..9956e18c5ffa 100644
--- a/include/linux/soc/mediatek/infracfg.h
+++ b/include/linux/soc/mediatek/infracfg.h
@@ -434,6 +434,11 @@
 #define MT7622_TOP_AXI_PROT_EN_WB		(BIT(2) | BIT(6) | \
 						 BIT(7) | BIT(8))
 
+#define MT6735_TOP_AXI_PROT_EN_CONN		(BIT(2) | BIT(8))
+#define MT6735_TOP_AXI_PROT_EN_MD1		(BIT(24) | BIT(25) | \
+						 BIT(26) | BIT(27) | \
+						 BIT(28))
+
 #define INFRA_TOPAXI_PROTECTEN			0x0220
 #define INFRA_TOPAXI_PROTECTSTA1		0x0228
 #define INFRA_TOPAXI_PROTECTEN_SET		0x0260
-- 
cgit v1.2.3


From d811ac148f0afd2f3f7e1cd7f54de8da973ec5e3 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 15 Oct 2024 15:56:35 +0200
Subject: virtchnl: fix m68k build.

The kernel test robot reported a build failure on m68k in the intel
driver due to the recent shapers-related changes.

The mentioned arch has funny alignment properties, let's be explicit
about the binary layout expectation introducing a padding field.

Fixes: 608a5c05c39b ("virtchnl: support queue rate limit and quanta size configuration")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202410131710.71Wt6LKO-lkp@intel.com/
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/e45d1c9f17356d431b03b419f60b8b763d2ff768.1729000481.git.pabeni@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/avf/virtchnl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 223e433c39fe..13a11f3c09b8 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -1499,6 +1499,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_queue_chunk);
 
 struct virtchnl_quanta_cfg {
 	u16 quanta_size;
+	u16 pad;
 	struct virtchnl_queue_chunk queue_select;
 };
 
-- 
cgit v1.2.3


From 2c50ec98fc6cab28df35e0a22a2bcc7957d9d0ab Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 2 Oct 2024 14:06:47 -0600
Subject: block: remove redundant passthrough check in blk_mq_need_time_stamp()

Simply checking the rq_flags is enough to determine if accounting is
being done for this request.

Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4fecf46ef681..59e9adf815a4 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -857,12 +857,6 @@ void blk_mq_end_request_batch(struct io_comp_batch *ib);
  */
 static inline bool blk_mq_need_time_stamp(struct request *rq)
 {
-	/*
-	 * passthrough io doesn't use iostat accounting, cgroup stats
-	 * and io scheduler functionalities.
-	 */
-	if (blk_rq_is_passthrough(rq))
-		return false;
 	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
 }
 
-- 
cgit v1.2.3


From 9dfd9ea93aeab57d897bb7fc7c0707f26b0b9af8 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Thu, 3 Oct 2024 00:11:43 +0200
Subject: block: introduce add_disk_fwnode()

Introduce add_disk_fwnode() as a replacement of device_add_disk() that
permits to pass and attach a fwnode to disk dev.

This variant can be useful for eMMC that might have the partition table
for the disk defined in DT. A parser can later make use of the attached
fwnode to parse the related table and init the hardcoded partition for
the disk.

device_add_disk() is converted to a simple wrapper of add_disk_fwnode()
with the fwnode entry set as NULL.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20241002221306.4403-4-ansuelsmth@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c          | 28 ++++++++++++++++++++++++----
 include/linux/blkdev.h |  3 +++
 2 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 1c05dd4c6980..bc30eee7ab16 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -383,16 +383,18 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
 }
 
 /**
- * device_add_disk - add disk information to kernel list
+ * add_disk_fwnode - add disk information to kernel list with fwnode
  * @parent: parent device for the disk
  * @disk: per-device partitioning information
  * @groups: Additional per-device sysfs groups
+ * @fwnode: attached disk fwnode
  *
  * This function registers the partitioning information in @disk
- * with the kernel.
+ * with the kernel. Also attach a fwnode to the disk device.
  */
-int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
-				 const struct attribute_group **groups)
+int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
+				 const struct attribute_group **groups,
+				 struct fwnode_handle *fwnode)
 
 {
 	struct device *ddev = disk_to_dev(disk);
@@ -452,6 +454,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
 	ddev->parent = parent;
 	ddev->groups = groups;
 	dev_set_name(ddev, "%s", disk->disk_name);
+	if (fwnode)
+		device_set_node(ddev, fwnode);
 	if (!(disk->flags & GENHD_FL_HIDDEN))
 		ddev->devt = MKDEV(disk->major, disk->first_minor);
 	ret = device_add(ddev);
@@ -553,6 +557,22 @@ out_exit_elevator:
 		elevator_exit(disk->queue);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(add_disk_fwnode);
+
+/**
+ * device_add_disk - add disk information to kernel list
+ * @parent: parent device for the disk
+ * @disk: per-device partitioning information
+ * @groups: Additional per-device sysfs groups
+ *
+ * This function registers the partitioning information in @disk
+ * with the kernel.
+ */
+int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
+				 const struct attribute_group **groups)
+{
+	return add_disk_fwnode(parent, disk, groups, NULL);
+}
 EXPORT_SYMBOL(device_add_disk);
 
 static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50c3b959da28..a6aae750b4ac 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -725,6 +725,9 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
 #define for_each_bio(_bio)		\
 	for (; _bio; _bio = _bio->bi_next)
 
+int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
+				 const struct attribute_group **groups,
+				 struct fwnode_handle *fwnode);
 int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
 				 const struct attribute_group **groups);
 static inline int __must_check add_disk(struct gendisk *disk)
-- 
cgit v1.2.3


From 110234da18ab482f6f583d28eff26b9569bf5622 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Mon, 7 Oct 2024 08:32:35 -0700
Subject: block: enable passthrough command statistics

Applications using the passthrough interfaces for IO want to continue
seeing the disk stats. These requests had been fenced off from this
block layer feature. While the block layer doesn't necessarily know what
a passthrough command does, we do know the data size and direction,
which is enough to account for the command's stats.

Since tracking these has the potential to produce unexpected results,
the passthrough stats are locked behind a new queue flag that needs to
be enabled with the /sys/block/<dev>/queue/iostats_passthrough
attribute.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20241007153236.2818562-1-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/ABI/stable/sysfs-block |  7 +++++++
 block/blk-mq.c                       | 32 +++++++++++++++++++++++++++++++-
 block/blk-sysfs.c                    | 30 ++++++++++++++++++++++++++++++
 include/linux/blkdev.h               |  5 +++++
 4 files changed, 73 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index cea8856f798d..835361110715 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -424,6 +424,13 @@ Description:
 		[RW] This file is used to control (on/off) the iostats
 		accounting of the disk.
 
+What:		/sys/block/<disk>/queue/iostats_passthrough
+Date:		October 2024
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] This file is used to control (on/off) the iostats
+		accounting of the disk for passthrough commands.
+
 
 What:		/sys/block/<disk>/queue/logical_block_size
 Date:		May 2009
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f4f78c03f735..7d05a56e3639 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -988,13 +988,43 @@ static inline void blk_account_io_done(struct request *req, u64 now)
 	}
 }
 
+static inline bool blk_rq_passthrough_stats(struct request *req)
+{
+	struct bio *bio = req->bio;
+
+	if (!blk_queue_passthrough_stat(req->q))
+		return false;
+
+	/* Requests without a bio do not transfer data. */
+	if (!bio)
+		return false;
+
+	/*
+	 * Stats are accumulated in the bdev, so must have one attached to a
+	 * bio to track stats. Most drivers do not set the bdev for passthrough
+	 * requests, but nvme is one that will set it.
+	 */
+	if (!bio->bi_bdev)
+		return false;
+
+	/*
+	 * We don't know what a passthrough command does, but we know the
+	 * payload size and data direction. Ensuring the size is aligned to the
+	 * block size filters out most commands with payloads that don't
+	 * represent sector access.
+	 */
+	if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1))
+		return false;
+	return true;
+}
+
 static inline void blk_account_io_start(struct request *req)
 {
 	trace_block_io_start(req);
 
 	if (!blk_queue_io_stat(req->q))
 		return;
-	if (blk_rq_is_passthrough(req))
+	if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req))
 		return;
 
 	req->rq_flags |= RQF_IO_STAT;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8717d43e0792..741b95dfdbf6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -272,6 +272,34 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page)
 	return queue_var_show(disk_nr_zones(disk), page);
 }
 
+static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page)
+{
+	return queue_var_show(blk_queue_passthrough_stat(disk->queue), page);
+}
+
+static ssize_t queue_iostats_passthrough_store(struct gendisk *disk,
+					       const char *page, size_t count)
+{
+	struct queue_limits lim;
+	unsigned long ios;
+	ssize_t ret;
+
+	ret = queue_var_store(&ios, page, count);
+	if (ret < 0)
+		return ret;
+
+	lim = queue_limits_start_update(disk->queue);
+	if (ios)
+		lim.flags |= BLK_FLAG_IOSTATS_PASSTHROUGH;
+	else
+		lim.flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH;
+
+	ret = queue_limits_commit_update(disk->queue, &lim);
+	if (ret)
+		return ret;
+
+	return count;
+}
 static ssize_t queue_nomerges_show(struct gendisk *disk, char *page)
 {
 	return queue_var_show((blk_queue_nomerges(disk->queue) << 1) |
@@ -460,6 +488,7 @@ QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
 QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
 
 QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
+QUEUE_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
 QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
 QUEUE_RW_ENTRY(queue_poll, "io_poll");
 QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
@@ -586,6 +615,7 @@ static struct attribute *queue_attrs[] = {
 	&queue_max_open_zones_entry.attr,
 	&queue_max_active_zones_entry.attr,
 	&queue_nomerges_entry.attr,
+	&queue_iostats_passthrough_entry.attr,
 	&queue_iostats_entry.attr,
 	&queue_stable_writes_entry.attr,
 	&queue_add_random_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a6aae750b4ac..6b78a68e0bd9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -349,6 +349,9 @@ typedef unsigned int __bitwise blk_flags_t;
 /* I/O topology is misaligned */
 #define BLK_FLAG_MISALIGNED		((__force blk_flags_t)(1u << 1))
 
+/* passthrough command IO accounting */
+#define BLK_FLAG_IOSTATS_PASSTHROUGH	((__force blk_flags_t)(1u << 2))
+
 struct queue_limits {
 	blk_features_t		features;
 	blk_flags_t		flags;
@@ -617,6 +620,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 	test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
 #define blk_queue_nonrot(q)	(!((q)->limits.features & BLK_FEAT_ROTATIONAL))
 #define blk_queue_io_stat(q)	((q)->limits.features & BLK_FEAT_IO_STAT)
+#define blk_queue_passthrough_stat(q)	\
+	((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH)
 #define blk_queue_dax(q)	((q)->limits.features & BLK_FEAT_DAX)
 #define blk_queue_pci_p2pdma(q)	((q)->limits.features & BLK_FEAT_PCI_P2PDMA)
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
-- 
cgit v1.2.3


From b21d948f4cc73e3296f2365c7afca721dd6893fa Mon Sep 17 00:00:00 2001
From: Greg Joyce <gjoyce@linux.ibm.com>
Date: Thu, 29 Aug 2024 12:56:11 -0500
Subject: block: sed-opal: add ioctl IOC_OPAL_SET_SID_PW

After a SED drive is provisioned, there is no way to change the SID
password via the ioctl() interface. A new ioctl IOC_OPAL_SET_SID_PW
will allow the password to be changed. The valid current password is
required.

Signed-off-by: Greg Joyce <gjoyce@linux.ibm.com>
Reviewed-by: Daniel Wagner <dwagner@suse.de>
Link: https://lore.kernel.org/r/20240829175639.6478-2-gjoyce@linux.ibm.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/sed-opal.c              | 26 ++++++++++++++++++++++++++
 include/linux/sed-opal.h      |  1 +
 include/uapi/linux/sed-opal.h |  1 +
 3 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 598fd3e7fcc8..5a28f23f7f22 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -3037,6 +3037,29 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
 	return ret;
 }
 
+static int opal_set_new_sid_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
+{
+	int ret;
+	struct opal_key *newkey = &opal_pw->new_user_pw.opal_key;
+	struct opal_key *oldkey = &opal_pw->session.opal_key;
+
+	const struct opal_step pw_steps[] = {
+		{ start_SIDASP_opal_session, oldkey },
+		{ set_sid_cpin_pin, newkey },
+		{ end_opal_session, }
+	};
+
+	if (!dev)
+		return -ENODEV;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev);
+	ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps));
+	mutex_unlock(&dev->dev_lock);
+
+	return ret;
+}
+
 static int opal_activate_user(struct opal_dev *dev,
 			      struct opal_session_info *opal_session)
 {
@@ -3286,6 +3309,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 	case IOC_OPAL_DISCOVERY:
 		ret = opal_get_discv(dev, p);
 		break;
+	case IOC_OPAL_SET_SID_PW:
+		ret = opal_set_new_sid_pw(dev, p);
+		break;
 
 	default:
 		break;
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 2ac50822554e..80f33a93f944 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -52,6 +52,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	case IOC_OPAL_GET_GEOMETRY:
 	case IOC_OPAL_DISCOVERY:
 	case IOC_OPAL_REVERT_LSP:
+	case IOC_OPAL_SET_SID_PW:
 		return true;
 	}
 	return false;
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index d3994b7716bc..9025dd5a4f0f 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -215,5 +215,6 @@ struct opal_revert_lsp {
 #define IOC_OPAL_GET_GEOMETRY       _IOR('p', 238, struct opal_geometry)
 #define IOC_OPAL_DISCOVERY          _IOW('p', 239, struct opal_discovery)
 #define IOC_OPAL_REVERT_LSP         _IOW('p', 240, struct opal_revert_lsp)
+#define IOC_OPAL_SET_SID_PW         _IOW('p', 241, struct opal_new_pw)
 
 #endif /* _UAPI_SED_OPAL_H */
-- 
cgit v1.2.3


From 8cf8dfceebdaf282da8a836b2bb578808a12698c Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 22 Oct 2024 15:41:34 +0200
Subject: seccomp: Stub for !HAVE_ARCH_SECCOMP_FILTER

If we have CONFIG_SECCOMP but not CONFIG_HAVE_ARCH_SECCOMP_FILTER
we get a compilation error:

../kernel/entry/common.c: In function 'syscall_trace_enter':
../kernel/entry/common.c:55:23: error: implicit declaration of function '__secure_computing' [-Werror=implicit-function-declaration]
   55 |                 ret = __secure_computing(NULL);
      |                       ^~~~~~~~~~~~~~~~~~

This is because generic entry calls __secure_computing()
unconditionally.

Provide the needed stub similar to how current ARM does
this by calling secure_computing_strict() in the absence
of secure_computing(). This is similar to what is done for
ARM in arch/arm/kernel/ptrace.c.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20241022-seccomp-compile-error-v2-1-c9f08a4f8ebb@linaro.org
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/seccomp.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 709ad84809e1..341980599c71 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -32,6 +32,11 @@ static inline int secure_computing(void)
 }
 #else
 extern void secure_computing_strict(int this_syscall);
+static inline int __secure_computing(const struct seccomp_data *sd)
+{
+	secure_computing_strict(sd->nr);
+	return 0;
+}
 #endif
 
 extern long prctl_get_seccomp(void);
-- 
cgit v1.2.3


From cfcdf395c21eeac4543d2b8fef9d29ae9e4559e9 Mon Sep 17 00:00:00 2001
From: Jerome Brunet <jbrunet@baylibre.com>
Date: Tue, 8 Oct 2024 18:07:02 +0200
Subject: regulator: core: add callback to perform runtime init

Provide an initialisation callback to handle runtime parameters.
The idea is similar to the regulator_init() callback, but it provides
regulator specific structures, instead of just the driver specific data.

As an example, this allows the driver to amend the regulator constraints
based on runtime parameters if necessary.

Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Link: https://patch.msgid.link/20241008-regulator-ignored-data-v2-2-d1251e0ee507@baylibre.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c         | 6 ++++++
 include/linux/regulator/driver.h | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 24bb7f5b12e3..eecb05a0d08c 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -5758,6 +5758,12 @@ regulator_register(struct device *dev,
 		goto wash;
 	}
 
+	if (regulator_desc->init_cb) {
+		ret = regulator_desc->init_cb(rdev, config);
+		if (ret < 0)
+			goto wash;
+	}
+
 	if ((rdev->supply_name && !rdev->supply) &&
 		(rdev->constraints->always_on ||
 		 rdev->constraints->boot_on)) {
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index f230a472ccd3..d2f4427504f0 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -365,6 +365,8 @@ struct regulator_desc {
 	int (*of_parse_cb)(struct device_node *,
 			    const struct regulator_desc *,
 			    struct regulator_config *);
+	int (*init_cb)(struct regulator_dev *,
+		       struct regulator_config *);
 	int id;
 	unsigned int continuous_voltage_range:1;
 	unsigned n_voltages;
-- 
cgit v1.2.3


From 602ff58ae4fe4289b0ca71cba9fb82f7de92cd64 Mon Sep 17 00:00:00 2001
From: Jerome Brunet <jbrunet@baylibre.com>
Date: Tue, 8 Oct 2024 18:07:03 +0200
Subject: regulator: core: remove machine init callback from config

The machine specific regulator_init() appears to be unused.
It does not allow a lot of interaction with the regulator framework,
since nothing from the framework is passed along (desc, config,
etc ...)

Machine specific init may also be done with the added init_cb() in
the regulator description, so remove regulator_init().

Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Link: https://patch.msgid.link/20241008-regulator-ignored-data-v2-3-d1251e0ee507@baylibre.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c          | 7 -------
 include/linux/regulator/machine.h | 3 +--
 2 files changed, 1 insertion(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index eecb05a0d08c..f8e36c9f5943 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -5775,13 +5775,6 @@ regulator_register(struct device *dev,
 		resolved_early = true;
 	}
 
-	/* perform any regulator specific init */
-	if (init_data && init_data->regulator_init) {
-		ret = init_data->regulator_init(rdev->reg_data);
-		if (ret < 0)
-			goto wash;
-	}
-
 	if (config->ena_gpiod) {
 		ret = regulator_ena_gpio_request(rdev, config);
 		if (ret != 0) {
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 0cd76d264727..d0d700ff337a 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -285,8 +285,7 @@ struct regulator_init_data {
 	int num_consumer_supplies;
 	struct regulator_consumer_supply *consumer_supplies;
 
-	/* optional regulator machine specific init */
-	int (*regulator_init)(void *driver_data);
+	/* optional regulator machine specific data */
 	void *driver_data;	/* core does not touch this */
 };
 
-- 
cgit v1.2.3


From 51e32e897539663957f7a0950f66b48f8896efee Mon Sep 17 00:00:00 2001
From: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
Date: Sat, 19 Oct 2024 14:16:00 +0300
Subject: clk: Provide devm_clk_bulk_get_all_enabled() helper

Commit 265b07df758a ("clk: Provide managed helper to get and enable bulk
clocks") added devm_clk_bulk_get_all_enable() function, but missed to
return the number of clocks stored in the clk_bulk_data table referenced
by the clks argument.  Without knowing the number, it's not possible to
iterate these clocks when needed, hence the argument is useless and
could have been simply removed.

Introduce devm_clk_bulk_get_all_enabled() variant, which is consistent
with devm_clk_bulk_get_all() in terms of the returned value:

 > 0 if one or more clocks have been stored
 = 0 if there are no clocks
 < 0 if an error occurred

Moreover, the naming is consistent with devm_clk_get_enabled(), i.e. use
the past form of 'enable'.

To reduce code duplication and improve patch readability, make
devm_clk_bulk_get_all_enable() use the new helper, as suggested by
Stephen Boyd.

Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
Link: https://lore.kernel.org/r/20241019-clk_bulk_ena_fix-v4-1-57f108f64e70@collabora.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-devres.c |  9 +++++----
 include/linux/clk.h      | 21 ++++++++++++++++-----
 2 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
index 82ae1f26e634..5368d92d9b39 100644
--- a/drivers/clk/clk-devres.c
+++ b/drivers/clk/clk-devres.c
@@ -218,8 +218,8 @@ static void devm_clk_bulk_release_all_enable(struct device *dev, void *res)
 	clk_bulk_put_all(devres->num_clks, devres->clks);
 }
 
-int __must_check devm_clk_bulk_get_all_enable(struct device *dev,
-					      struct clk_bulk_data **clks)
+int __must_check devm_clk_bulk_get_all_enabled(struct device *dev,
+					       struct clk_bulk_data **clks)
 {
 	struct clk_bulk_devres *devres;
 	int ret;
@@ -244,11 +244,12 @@ int __must_check devm_clk_bulk_get_all_enable(struct device *dev,
 	} else {
 		clk_bulk_put_all(devres->num_clks, devres->clks);
 		devres_free(devres);
+		return ret;
 	}
 
-	return ret;
+	return devres->num_clks;
 }
-EXPORT_SYMBOL_GPL(devm_clk_bulk_get_all_enable);
+EXPORT_SYMBOL_GPL(devm_clk_bulk_get_all_enabled);
 
 static int devm_clk_match(struct device *dev, void *res, void *data)
 {
diff --git a/include/linux/clk.h b/include/linux/clk.h
index 851a0f2cf42c..1dcee6d701e4 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -496,11 +496,13 @@ int __must_check devm_clk_bulk_get_all(struct device *dev,
 				       struct clk_bulk_data **clks);
 
 /**
- * devm_clk_bulk_get_all_enable - Get and enable all clocks of the consumer (managed)
+ * devm_clk_bulk_get_all_enabled - Get and enable all clocks of the consumer (managed)
  * @dev: device for clock "consumer"
  * @clks: pointer to the clk_bulk_data table of consumer
  *
- * Returns success (0) or negative errno.
+ * Returns a positive value for the number of clocks obtained while the
+ * clock references are stored in the clk_bulk_data table in @clks field.
+ * Returns 0 if there're none and a negative value if something failed.
  *
  * This helper function allows drivers to get all clocks of the
  * consumer and enables them in one operation with management.
@@ -508,8 +510,8 @@ int __must_check devm_clk_bulk_get_all(struct device *dev,
  * is unbound.
  */
 
-int __must_check devm_clk_bulk_get_all_enable(struct device *dev,
-					      struct clk_bulk_data **clks);
+int __must_check devm_clk_bulk_get_all_enabled(struct device *dev,
+					       struct clk_bulk_data **clks);
 
 /**
  * devm_clk_get - lookup and obtain a managed reference to a clock producer.
@@ -1034,7 +1036,7 @@ static inline int __must_check devm_clk_bulk_get_all(struct device *dev,
 	return 0;
 }
 
-static inline int __must_check devm_clk_bulk_get_all_enable(struct device *dev,
+static inline int __must_check devm_clk_bulk_get_all_enabled(struct device *dev,
 						struct clk_bulk_data **clks)
 {
 	return 0;
@@ -1136,6 +1138,15 @@ static inline void clk_restore_context(void) {}
 
 #endif
 
+/* Deprecated. Use devm_clk_bulk_get_all_enabled() */
+static inline int __must_check
+devm_clk_bulk_get_all_enable(struct device *dev, struct clk_bulk_data **clks)
+{
+	int ret = devm_clk_bulk_get_all_enabled(dev, clks);
+
+	return ret > 0 ? 0 : ret;
+}
+
 /* clk_prepare_enable helps cases using clk_enable in non-atomic context. */
 static inline int clk_prepare_enable(struct clk *clk)
 {
-- 
cgit v1.2.3


From 44059790a5cb9258ae6137387e4c39b717fd2ced Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 23 Oct 2024 07:53:04 +0200
Subject: kfifo: don't include dma-mapping.h in kfifo.h

Nothing in kfifo.h directly needs dma-mapping.h, only two macros
use DMA_MAPPING_ERROR when actually instantiated.  Drop the
dma-mapping.h include to reduce include bloat.

Add an explicity <linux/io.h> include to drivers/mailbox/omap-mailbox.c
as that file uses __raw_readl and __raw_writel through a complicated
include chain involving <linux/dma-mapping.h>

Fixes: d52b761e4b1a ("kfifo: add kfifo_dma_out_prepare_mapped()")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20241023055317.313234-1-hch@lst.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/mailbox/omap-mailbox.c | 1 +
 include/linux/kfifo.h          | 1 -
 samples/kfifo/dma-example.c    | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index 6797770474a5..680243751d62 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/kfifo.h>
 #include <linux/err.h>
+#include <linux/io.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 564868bdce89..fd743d4c4b4b 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -37,7 +37,6 @@
  */
 
 #include <linux/array_size.h>
-#include <linux/dma-mapping.h>
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
 #include <linux/types.h>
diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c
index 48df719dac8c..8076ac410161 100644
--- a/samples/kfifo/dma-example.c
+++ b/samples/kfifo/dma-example.c
@@ -9,6 +9,7 @@
 #include <linux/kfifo.h>
 #include <linux/module.h>
 #include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
 
 /*
  * This module shows how to handle fifo dma operations.
-- 
cgit v1.2.3


From e44ef3f66c5472c2cbc6957c684d7279c26b0db1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 Oct 2024 05:21:08 +0000
Subject: netpoll: remove ndo_netpoll_setup() second argument

npinfo is not used in any of the ndo_netpoll_setup() methods.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20241018052108.2610827-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/bonding/bond_main.c | 2 +-
 drivers/net/macvlan.c           | 2 +-
 drivers/net/team/team_core.c    | 3 +--
 include/linux/netdevice.h       | 3 +--
 net/8021q/vlan_dev.c            | 2 +-
 net/bridge/br_device.c          | 2 +-
 net/core/netpoll.c              | 2 +-
 net/dsa/user.c                  | 3 +--
 8 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index b1bffd8e9a95..3928287f5865 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1476,7 +1476,7 @@ static void bond_netpoll_cleanup(struct net_device *bond_dev)
 			slave_disable_netpoll(slave);
 }
 
-static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
+static int bond_netpoll_setup(struct net_device *dev)
 {
 	struct bonding *bond = netdev_priv(dev);
 	struct list_head *iter;
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index cf18e66de142..edbd5afcec41 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1117,7 +1117,7 @@ static void macvlan_dev_poll_controller(struct net_device *dev)
 	return;
 }
 
-static int macvlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo)
+static int macvlan_dev_netpoll_setup(struct net_device *dev)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct net_device *real_dev = vlan->lowerdev;
diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c
index 18191d5a8bd4..a1b27b69f010 100644
--- a/drivers/net/team/team_core.c
+++ b/drivers/net/team/team_core.c
@@ -1946,8 +1946,7 @@ static void team_netpoll_cleanup(struct net_device *dev)
 	mutex_unlock(&team->lock);
 }
 
-static int team_netpoll_setup(struct net_device *dev,
-			      struct netpoll_info *npifo)
+static int team_netpoll_setup(struct net_device *dev)
 {
 	struct team *team = netdev_priv(dev);
 	struct team_port *port;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8feaca12655e..86a0b7eb9461 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1425,8 +1425,7 @@ struct net_device_ops {
 						        __be16 proto, u16 vid);
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	void                    (*ndo_poll_controller)(struct net_device *dev);
-	int			(*ndo_netpoll_setup)(struct net_device *dev,
-						     struct netpoll_info *info);
+	int			(*ndo_netpoll_setup)(struct net_device *dev);
 	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
 #endif
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 458040e8a0e0..91d134961357 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -725,7 +725,7 @@ static void vlan_dev_poll_controller(struct net_device *dev)
 	return;
 }
 
-static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo)
+static int vlan_dev_netpoll_setup(struct net_device *dev)
 {
 	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
 	struct net_device *real_dev = vlan->real_dev;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 26b79feb385d..0ab4613aa07a 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -328,7 +328,7 @@ int br_netpoll_enable(struct net_bridge_port *p)
 	return __br_netpoll_enable(p);
 }
 
-static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
+static int br_netpoll_setup(struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
 	struct net_bridge_port *p;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index aa49b92e9194..94b7f07a952f 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -641,7 +641,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
 
 		ops = ndev->netdev_ops;
 		if (ops->ndo_netpoll_setup) {
-			err = ops->ndo_netpoll_setup(ndev, npinfo);
+			err = ops->ndo_netpoll_setup(ndev);
 			if (err)
 				goto free_npinfo;
 		}
diff --git a/net/dsa/user.c b/net/dsa/user.c
index 64f660d2334b..91a1fa5f8ab0 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -1308,8 +1308,7 @@ static int dsa_user_set_pauseparam(struct net_device *dev,
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
-static int dsa_user_netpoll_setup(struct net_device *dev,
-				  struct netpoll_info *ni)
+static int dsa_user_netpoll_setup(struct net_device *dev)
 {
 	struct net_device *conduit = dsa_user_to_conduit(dev);
 	struct dsa_user_priv *p = netdev_priv(dev);
-- 
cgit v1.2.3


From 7cfc1b1fa8673fe386304194d4cc2c8fe555bbf9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 Oct 2024 05:23:10 +0000
Subject: net: netdev_tx_sent_queue() small optimization

Change smp_mb() imediately following a set_bit()
with smp_mb__after_atomic().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20241018052310.2612084-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 86a0b7eb9461..bbd30f3c5d29 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3517,7 +3517,7 @@ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue,
 	 * because in netdev_tx_completed_queue we update the dql_completed
 	 * before checking the XOFF flag.
 	 */
-	smp_mb();
+	smp_mb__after_atomic();
 
 	/* check again in case another CPU has just made room avail */
 	if (unlikely(dql_avail(&dev_queue->dql) >= 0))
-- 
cgit v1.2.3


From e55f45b0cda71ac2e9b4c6dee8852dc8d6f22ef0 Mon Sep 17 00:00:00 2001
From: Jerome Brunet <jbrunet@baylibre.com>
Date: Wed, 23 Oct 2024 10:35:43 +0200
Subject: regulator: doc: add missing documentation for init_cb

Add comment documenting introduced init_cb.
This solves the following warning when building the kernel documentation:

./include/linux/regulator/driver.h:435: warning: Function parameter or struct member 'init_cb' not described in 'regulator_desc'

Fixes: cfcdf395c21e ("regulator: core: add callback to perform runtime init")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/lkml/20241023155120.6c4fea20@canb.auug.org.au/
Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Link: https://patch.msgid.link/20241023-regulator-doc-fixup-v1-1-ec018742ad73@baylibre.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index d2f4427504f0..5b66caf1695d 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -269,6 +269,11 @@ enum regulator_type {
  *               config but it cannot store it for later usage.
  *               Callback should return 0 on success or negative ERRNO
  *               indicating failure.
+ * @init_cb: Optional callback called after the parsing of init_data.
+ *           Allows the regulator to perform runtime init if necessary,
+ *           such as synching the regulator and the parsed constraints.
+ *           Callback should return 0 on success or negative ERRNO
+ *           indicating failure.
  * @id: Numerical identifier for the regulator.
  * @ops: Regulator operations table.
  * @irq: Interrupt number for the regulator.
-- 
cgit v1.2.3


From d1bc2d5cca434e7c854fd16ef021c16d74293b60 Mon Sep 17 00:00:00 2001
From: Jerome Brunet <jbrunet@baylibre.com>
Date: Wed, 23 Oct 2024 10:35:44 +0200
Subject: regulator: doc: remove documentation comment for regulator_init

Remove documentation comment related to regulator_init callback.
This solves the following warning when building the kernel documentation:

./include/linux/regulator/machine.h:290: warning: Excess struct member 'regulator_init' description in 'regulator_init_data'

Fixes: 602ff58ae4fe ("regulator: core: remove machine init callback from config")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/lkml/20241023155257.0fa7211d@canb.auug.org.au/
Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Link: https://patch.msgid.link/20241023-regulator-doc-fixup-v1-2-ec018742ad73@baylibre.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/machine.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index d0d700ff337a..b3db09a7429b 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -273,8 +273,6 @@ struct regulator_consumer_supply {
  *               be usable.
  * @num_consumer_supplies: Number of consumer device supplies.
  * @consumer_supplies: Consumer device supply configuration.
- *
- * @regulator_init: Callback invoked when the regulator has been registered.
  * @driver_data: Data passed to regulator_init.
  */
 struct regulator_init_data {
-- 
cgit v1.2.3


From 50a191a8a12b33dfad3b27c6ba4e76c5ba39db73 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 10 Aug 2024 19:00:35 +0200
Subject: sysctl: update comments to new registration APIs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The sysctl registration APIs do not need a terminating table entry
anymore and with commit acc154691fc7 ("sysctl: Warn on an empty procname
element") even emit warnings if such a sentinel entry is supplied.

While at it also remove the mention of "table->de" which was removed in
commit 3fbfa98112fc ("[PATCH] sysctl: remove the proc_dir_entry member
for the sysctl tables") back in 2007.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Kees Cook <kees@kernel.org>
Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/sysctl.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 202855befa8b..40a6ac6c9713 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -90,9 +90,7 @@ int proc_do_static_key(const struct ctl_table *table, int write, void *buffer,
 
 /*
  * Register a set of sysctl names by calling register_sysctl
- * with an initialised array of struct ctl_table's.  An entry with 
- * NULL procname terminates the table.  table->de will be
- * set up by the registration and need not be initialised in advance.
+ * with an initialised array of struct ctl_table's.
  *
  * sysctl names can be mirrored automatically under /proc/sys.  The
  * procname supplied controls /proc naming.
@@ -133,7 +131,7 @@ static inline void *proc_sys_poll_event(struct ctl_table_poll *poll)
 
 /* A sysctl table is an array of struct ctl_table: */
 struct ctl_table {
-	const char *procname;		/* Text ID for /proc/sys, or zero */
+	const char *procname;		/* Text ID for /proc/sys */
 	void *data;
 	int maxlen;
 	umode_t mode;
-- 
cgit v1.2.3


From 8dc6d81c6b2acc434b00c4585f0594739031c4e4 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 22 Oct 2024 15:18:34 +0200
Subject: debugfs: add small file operations for most files

As struct file_operations is really big, but (most) debugfs
files only use simple_open, read, write and perhaps seek, and
don't need anything else, this wastes a lot of space for NULL
pointers.

Add a struct debugfs_short_fops and some bookkeeping code in
debugfs so that users can use that with debugfs_create_file()
using _Generic to figure out which function to use.

Converting mac80211 to use it where possible saves quite a
bit of space:

1010127  205064    1220 1216411  128f9b net/mac80211/mac80211.ko (before)
 981199  205064    1220 1187483  121e9b net/mac80211/mac80211.ko (after)
-------
 -28928 = ~28KiB

With a marginal space cost in debugfs:

   8701	    550	     16	   9267	   2433	fs/debugfs/inode.o (before)
  25233	    325	     32	  25590	   63f6	fs/debugfs/file.o  (before)
   8914	    558	     16	   9488	   2510	fs/debugfs/inode.o (after)
  25380	    325	     32	  25737	   6489	fs/debugfs/file.o  (after)
---------------
   +360      +8

(All on x86-64)

A simple spatch suggests there are more than 300 instances,
not even counting the ones hidden in macros like in mac80211,
that could be trivially converted, for additional savings of
about 240 bytes for each.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/20241022151838.26f9925fb959.Ia80b55e934bbfc45ce0df42a3233d34b35508046@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 fs/debugfs/file.c       | 100 +++++++++++++++++++++++++++++++++---------------
 fs/debugfs/inode.c      |  63 +++++++++++++-----------------
 fs/debugfs/internal.h   |   6 +++
 include/linux/debugfs.h |  62 ++++++++++++++++++++++++++++--
 4 files changed, 160 insertions(+), 71 deletions(-)

(limited to 'include/linux')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 67299e8b734e..47dc96dfe386 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -100,8 +100,16 @@ int debugfs_file_get(struct dentry *dentry)
 		if (!fsd)
 			return -ENOMEM;
 
-		fsd->real_fops = (void *)((unsigned long)d_fsd &
-					~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
+		if ((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT) {
+			fsd->real_fops = NULL;
+			fsd->short_fops = (void *)((unsigned long)d_fsd &
+						~(DEBUGFS_FSDATA_IS_REAL_FOPS_BIT |
+						  DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT));
+		} else {
+			fsd->real_fops = (void *)((unsigned long)d_fsd &
+						~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
+			fsd->short_fops = NULL;
+		}
 		refcount_set(&fsd->active_users, 1);
 		init_completion(&fsd->active_users_drained);
 		INIT_LIST_HEAD(&fsd->cancellations);
@@ -241,9 +249,10 @@ static int debugfs_locked_down(struct inode *inode,
 {
 	if ((inode->i_mode & 07777 & ~0444) == 0 &&
 	    !(filp->f_mode & FMODE_WRITE) &&
-	    !real_fops->unlocked_ioctl &&
-	    !real_fops->compat_ioctl &&
-	    !real_fops->mmap)
+	    (!real_fops ||
+	     (!real_fops->unlocked_ioctl &&
+	      !real_fops->compat_ioctl &&
+	      !real_fops->mmap)))
 		return 0;
 
 	if (security_locked_down(LOCKDOWN_DEBUGFS))
@@ -316,19 +325,38 @@ static ret_type full_proxy_ ## name(proto)				\
 	return r;							\
 }
 
-FULL_PROXY_FUNC(llseek, loff_t, filp,
-		PROTO(struct file *filp, loff_t offset, int whence),
-		ARGS(filp, offset, whence));
+#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args)		\
+static ret_type full_proxy_ ## name(proto)				\
+{									\
+	struct dentry *dentry = F_DENTRY(filp);				\
+	struct debugfs_fsdata *fsd;					\
+	ret_type r;							\
+									\
+	r = debugfs_file_get(dentry);					\
+	if (unlikely(r))						\
+		return r;						\
+	fsd = dentry->d_fsdata;						\
+	if (fsd->real_fops)						\
+		r = fsd->real_fops->name(args);				\
+	else								\
+		r = fsd->short_fops->name(args);			\
+	debugfs_file_put(dentry);					\
+	return r;							\
+}
+
+FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp,
+		     PROTO(struct file *filp, loff_t offset, int whence),
+		     ARGS(filp, offset, whence));
 
-FULL_PROXY_FUNC(read, ssize_t, filp,
-		PROTO(struct file *filp, char __user *buf, size_t size,
-			loff_t *ppos),
-		ARGS(filp, buf, size, ppos));
+FULL_PROXY_FUNC_BOTH(read, ssize_t, filp,
+		     PROTO(struct file *filp, char __user *buf, size_t size,
+			   loff_t *ppos),
+		     ARGS(filp, buf, size, ppos));
 
-FULL_PROXY_FUNC(write, ssize_t, filp,
-		PROTO(struct file *filp, const char __user *buf, size_t size,
-			loff_t *ppos),
-		ARGS(filp, buf, size, ppos));
+FULL_PROXY_FUNC_BOTH(write, ssize_t, filp,
+		     PROTO(struct file *filp, const char __user *buf,
+			   size_t size, loff_t *ppos),
+		     ARGS(filp, buf, size, ppos));
 
 FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
 		PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
@@ -363,7 +391,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
 	 * not to leak any resources. Releasers must not assume that
 	 * ->i_private is still being meaningful here.
 	 */
-	if (real_fops->release)
+	if (real_fops && real_fops->release)
 		r = real_fops->release(inode, filp);
 
 	replace_fops(filp, d_inode(dentry)->i_fop);
@@ -373,39 +401,48 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
 }
 
 static void __full_proxy_fops_init(struct file_operations *proxy_fops,
-				const struct file_operations *real_fops)
+				   struct debugfs_fsdata *fsd)
 {
 	proxy_fops->release = full_proxy_release;
-	if (real_fops->llseek)
+
+	if ((fsd->real_fops && fsd->real_fops->llseek) ||
+	    (fsd->short_fops && fsd->short_fops->llseek))
 		proxy_fops->llseek = full_proxy_llseek;
-	if (real_fops->read)
+
+	if ((fsd->real_fops && fsd->real_fops->read) ||
+	    (fsd->short_fops && fsd->short_fops->read))
 		proxy_fops->read = full_proxy_read;
-	if (real_fops->write)
+
+	if ((fsd->real_fops && fsd->real_fops->write) ||
+	    (fsd->short_fops && fsd->short_fops->write))
 		proxy_fops->write = full_proxy_write;
-	if (real_fops->poll)
+
+	if (fsd->real_fops && fsd->real_fops->poll)
 		proxy_fops->poll = full_proxy_poll;
-	if (real_fops->unlocked_ioctl)
+
+	if (fsd->real_fops && fsd->real_fops->unlocked_ioctl)
 		proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl;
 }
 
 static int full_proxy_open(struct inode *inode, struct file *filp)
 {
 	struct dentry *dentry = F_DENTRY(filp);
-	const struct file_operations *real_fops = NULL;
+	const struct file_operations *real_fops;
 	struct file_operations *proxy_fops = NULL;
+	struct debugfs_fsdata *fsd;
 	int r;
 
 	r = debugfs_file_get(dentry);
 	if (r)
 		return r == -EIO ? -ENOENT : r;
 
-	real_fops = debugfs_real_fops(filp);
-
+	fsd = dentry->d_fsdata;
+	real_fops = fsd->real_fops;
 	r = debugfs_locked_down(inode, filp, real_fops);
 	if (r)
 		goto out;
 
-	if (!fops_get(real_fops)) {
+	if (real_fops && !fops_get(real_fops)) {
 #ifdef CONFIG_MODULES
 		if (real_fops->owner &&
 		    real_fops->owner->state == MODULE_STATE_GOING) {
@@ -426,11 +463,14 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
 		r = -ENOMEM;
 		goto free_proxy;
 	}
-	__full_proxy_fops_init(proxy_fops, real_fops);
+	__full_proxy_fops_init(proxy_fops, fsd);
 	replace_fops(filp, proxy_fops);
 
-	if (real_fops->open) {
-		r = real_fops->open(inode, filp);
+	if (!real_fops || real_fops->open) {
+		if (real_fops)
+			r = real_fops->open(inode, filp);
+		else
+			r = simple_open(inode, filp);
 		if (r) {
 			replace_fops(filp, d_inode(dentry)->i_fop);
 			goto free_proxy;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 66d9b3b4c588..38a9c7eb97e6 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -412,7 +412,7 @@ static struct dentry *end_creating(struct dentry *dentry)
 static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
 				struct dentry *parent, void *data,
 				const struct file_operations *proxy_fops,
-				const struct file_operations *real_fops)
+				const void *real_fops)
 {
 	struct dentry *dentry;
 	struct inode *inode;
@@ -450,49 +450,38 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
 	return end_creating(dentry);
 }
 
-/**
- * debugfs_create_file - create a file in the debugfs filesystem
- * @name: a pointer to a string containing the name of the file to create.
- * @mode: the permission that the file should have.
- * @parent: a pointer to the parent dentry for this file.  This should be a
- *          directory dentry if set.  If this parameter is NULL, then the
- *          file will be created in the root of the debugfs filesystem.
- * @data: a pointer to something that the caller will want to get to later
- *        on.  The inode.i_private pointer will point to this value on
- *        the open() call.
- * @fops: a pointer to a struct file_operations that should be used for
- *        this file.
- *
- * This is the basic "create a file" function for debugfs.  It allows for a
- * wide range of flexibility in creating a file, or a directory (if you want
- * to create a directory, the debugfs_create_dir() function is
- * recommended to be used instead.)
- *
- * This function will return a pointer to a dentry if it succeeds.  This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
- *
- * NOTE: it's expected that most callers should _ignore_ the errors returned
- * by this function. Other debugfs functions handle the fact that the "dentry"
- * passed to them could be an error and they don't crash in that case.
- * Drivers should generally work fine even if debugfs fails to init anyway.
- */
-struct dentry *debugfs_create_file(const char *name, umode_t mode,
-				   struct dentry *parent, void *data,
-				   const struct file_operations *fops)
+struct dentry *debugfs_create_file_full(const char *name, umode_t mode,
+					struct dentry *parent, void *data,
+					const struct file_operations *fops)
 {
+	if (WARN_ON((unsigned long)fops &
+		    (DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT |
+		     DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+		return ERR_PTR(-EINVAL);
 
 	return __debugfs_create_file(name, mode, parent, data,
 				fops ? &debugfs_full_proxy_file_operations :
 					&debugfs_noop_file_operations,
 				fops);
 }
-EXPORT_SYMBOL_GPL(debugfs_create_file);
+EXPORT_SYMBOL_GPL(debugfs_create_file_full);
+
+struct dentry *debugfs_create_file_short(const char *name, umode_t mode,
+					 struct dentry *parent, void *data,
+					 const struct debugfs_short_fops *fops)
+{
+	if (WARN_ON((unsigned long)fops &
+		    (DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT |
+		     DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+		return ERR_PTR(-EINVAL);
+
+	return __debugfs_create_file(name, mode, parent, data,
+				fops ? &debugfs_full_proxy_file_operations :
+					&debugfs_noop_file_operations,
+				(const void *)((unsigned long)fops |
+					       DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT));
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file_short);
 
 /**
  * debugfs_create_file_unsafe - create a file in the debugfs filesystem
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index dae80c2a469e..a3edfa4f0d8e 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -18,6 +18,7 @@ extern const struct file_operations debugfs_full_proxy_file_operations;
 
 struct debugfs_fsdata {
 	const struct file_operations *real_fops;
+	const struct debugfs_short_fops *short_fops;
 	union {
 		/* automount_fn is used when real_fops is NULL */
 		debugfs_automount_t automount;
@@ -39,6 +40,11 @@ struct debugfs_fsdata {
  * pointer gets its lowest bit set.
  */
 #define DEBUGFS_FSDATA_IS_REAL_FOPS_BIT BIT(0)
+/*
+ * A dentry's ->d_fsdata, when pointing to real fops, is with
+ * short fops instead of full fops.
+ */
+#define DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT BIT(1)
 
 /* Access BITS */
 #define DEBUGFS_ALLOW_API	BIT(0)
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 0928a6c8ae1e..59444b495d49 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -71,9 +71,63 @@ typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *);
 
 struct dentry *debugfs_lookup(const char *name, struct dentry *parent);
 
-struct dentry *debugfs_create_file(const char *name, umode_t mode,
-				   struct dentry *parent, void *data,
-				   const struct file_operations *fops);
+struct debugfs_short_fops {
+	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
+	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
+	loff_t (*llseek) (struct file *, loff_t, int);
+};
+
+struct dentry *debugfs_create_file_full(const char *name, umode_t mode,
+					struct dentry *parent, void *data,
+					const struct file_operations *fops);
+struct dentry *debugfs_create_file_short(const char *name, umode_t mode,
+					 struct dentry *parent, void *data,
+					 const struct debugfs_short_fops *fops);
+
+/**
+ * debugfs_create_file - create a file in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ *        on.  The inode.i_private pointer will point to this value on
+ *        the open() call.
+ * @fops: a pointer to a struct file_operations or struct debugfs_short_fops that
+ *        should be used for this file.
+ *
+ * This is the basic "create a file" function for debugfs.  It allows for a
+ * wide range of flexibility in creating a file, or a directory (if you want
+ * to create a directory, the debugfs_create_dir() function is
+ * recommended to be used instead.)
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, ERR_PTR(-ERROR) will be
+ * returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ *
+ * If fops points to a struct debugfs_short_fops, then simple_open() will be
+ * used for the open, and only read/write/llseek are supported and are proxied,
+ * so no module reference or release are needed.
+ *
+ * NOTE: it's expected that most callers should _ignore_ the errors returned
+ * by this function. Other debugfs functions handle the fact that the "dentry"
+ * passed to them could be an error and they don't crash in that case.
+ * Drivers should generally work fine even if debugfs fails to init anyway.
+ */
+#define debugfs_create_file(name, mode, parent, data, fops)			\
+	_Generic(fops,								\
+		 const struct file_operations *: debugfs_create_file_full,	\
+		 const struct debugfs_short_fops *: debugfs_create_file_short,	\
+		 struct file_operations *: debugfs_create_file_full,		\
+		 struct debugfs_short_fops *: debugfs_create_file_short)	\
+		(name, mode, parent, data, fops)
+
 struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops);
@@ -207,7 +261,7 @@ static inline struct dentry *debugfs_lookup(const char *name,
 
 static inline struct dentry *debugfs_create_file(const char *name, umode_t mode,
 					struct dentry *parent, void *data,
-					const struct file_operations *fops)
+					const void *fops)
 {
 	return ERR_PTR(-ENODEV);
 }
-- 
cgit v1.2.3


From 26ff1fb02991e1260481185bb5ccab1ee498d5e4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 16 Oct 2024 09:19:29 -0700
Subject: rcu: Delete unused rcu_gp_might_be_stalled() function

The rcu_gp_might_be_stalled() function is no longer used, so this commit
removes it.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/rcutiny.h |  1 -
 include/linux/rcutree.h |  1 -
 kernel/rcu/tree_stall.h | 30 ------------------------------
 3 files changed, 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 0ee270b3f5ed..fe42315f667f 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -165,7 +165,6 @@ static inline bool rcu_inkernel_boot_has_ended(void) { return true; }
 static inline bool rcu_is_watching(void) { return true; }
 static inline void rcu_momentary_eqs(void) { }
 static inline void kfree_rcu_scheduler_running(void) { }
-static inline bool rcu_gp_might_be_stalled(void) { return false; }
 
 /* Avoid RCU read-side critical sections leaking across. */
 static inline void rcu_all_qs(void) { barrier(); }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 90a684f94776..27d86d912781 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -40,7 +40,6 @@ void kvfree_rcu_barrier(void);
 void rcu_barrier(void);
 void rcu_momentary_eqs(void);
 void kfree_rcu_scheduler_running(void);
-bool rcu_gp_might_be_stalled(void);
 
 struct rcu_gp_oldstate {
 	unsigned long rgos_norm;
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 4432db6d0b99..d7cdd535e50b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -76,36 +76,6 @@ int rcu_jiffies_till_stall_check(void)
 }
 EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
 
-/**
- * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled?
- *
- * Returns @true if the current grace period is sufficiently old that
- * it is reasonable to assume that it might be stalled.  This can be
- * useful when deciding whether to allocate memory to enable RCU-mediated
- * freeing on the one hand or just invoking synchronize_rcu() on the other.
- * The latter is preferable when the grace period is stalled.
- *
- * Note that sampling of the .gp_start and .gp_seq fields must be done
- * carefully to avoid false positives at the beginnings and ends of
- * grace periods.
- */
-bool rcu_gp_might_be_stalled(void)
-{
-	unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV;
-	unsigned long j = jiffies;
-
-	if (d < RCU_STALL_MIGHT_MIN)
-		d = RCU_STALL_MIGHT_MIN;
-	smp_mb(); // jiffies before .gp_seq to avoid false positives.
-	if (!rcu_gp_in_progress())
-		return false;
-	// Long delays at this point avoids false positive, but a delay
-	// of ULONG_MAX/4 jiffies voids your no-false-positive warranty.
-	smp_mb(); // .gp_seq before second .gp_start
-	// And ditto here.
-	return !time_before(j, READ_ONCE(rcu_state.gp_start) + d);
-}
-
 /* Don't do RCU CPU stall warnings during long sysrq printouts. */
 void rcu_sysrq_start(void)
 {
-- 
cgit v1.2.3


From da09a9e0c3eab164af950be44ee6bdea8527c3e5 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 18 Oct 2024 22:22:51 +0200
Subject: uprobe: Add data pointer to consumer handlers

Adding data pointer to both entry and exit consumer handlers and all
its users. The functionality itself is coming in following change.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241018202252.693462-2-jolsa@kernel.org
---
 include/linux/uprobes.h                               |  4 ++--
 kernel/events/uprobes.c                               |  4 ++--
 kernel/trace/bpf_trace.c                              |  6 ++++--
 kernel/trace/trace_uprobe.c                           | 12 ++++++++----
 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c |  2 +-
 5 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 2b294bf1881f..bb265a632b91 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -37,10 +37,10 @@ struct uprobe_consumer {
 	 * for the current process. If filter() is omitted or returns true,
 	 * UPROBE_HANDLER_REMOVE is effectively ignored.
 	 */
-	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
+	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data);
 	int (*ret_handler)(struct uprobe_consumer *self,
 				unsigned long func,
-				struct pt_regs *regs);
+				struct pt_regs *regs, __u64 *data);
 	bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
 
 	struct list_head cons_node;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2a0059464383..6b44c386a5df 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2090,7 +2090,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 		int rc = 0;
 
 		if (uc->handler) {
-			rc = uc->handler(uc, regs);
+			rc = uc->handler(uc, regs, NULL);
 			WARN(rc & ~UPROBE_HANDLER_MASK,
 				"bad rc=0x%x from %ps()\n", rc, uc->handler);
 		}
@@ -2128,7 +2128,7 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
 	rcu_read_lock_trace();
 	list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
 		if (uc->ret_handler)
-			uc->ret_handler(uc, ri->func, regs);
+			uc->ret_handler(uc, ri->func, regs, NULL);
 	}
 	rcu_read_unlock_trace();
 }
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a582cd25ca87..fdab7ecd8dfa 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3244,7 +3244,8 @@ uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm)
 }
 
 static int
-uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
+uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs,
+			  __u64 *data)
 {
 	struct bpf_uprobe *uprobe;
 
@@ -3253,7 +3254,8 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
 }
 
 static int
-uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs)
+uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs,
+			      __u64 *data)
 {
 	struct bpf_uprobe *uprobe;
 
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c40531d2cbad..5895eabe3581 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -89,9 +89,11 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
 static int register_uprobe_event(struct trace_uprobe *tu);
 static int unregister_uprobe_event(struct trace_uprobe *tu);
 
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+			     __u64 *data);
 static int uretprobe_dispatcher(struct uprobe_consumer *con,
-				unsigned long func, struct pt_regs *regs);
+				unsigned long func, struct pt_regs *regs,
+				__u64 *data);
 
 #ifdef CONFIG_STACK_GROWSUP
 static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
@@ -1517,7 +1519,8 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
 	}
 }
 
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+			     __u64 *data)
 {
 	struct trace_uprobe *tu;
 	struct uprobe_dispatch_data udd;
@@ -1548,7 +1551,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
 }
 
 static int uretprobe_dispatcher(struct uprobe_consumer *con,
-				unsigned long func, struct pt_regs *regs)
+				unsigned long func, struct pt_regs *regs,
+				__u64 *data)
 {
 	struct trace_uprobe *tu;
 	struct uprobe_dispatch_data udd;
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 8835761d9a12..12005e3dc3e4 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -461,7 +461,7 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
 
 static int
 uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func,
-		   struct pt_regs *regs)
+		   struct pt_regs *regs, __u64 *data)
 
 {
 	regs->ax  = 0x12345678deadbeef;
-- 
cgit v1.2.3


From 4d756095d3994cb41393817dc696b458938a6bd0 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 18 Oct 2024 22:22:52 +0200
Subject: uprobe: Add support for session consumer

This change allows the uprobe consumer to behave as session which
means that 'handler' and 'ret_handler' callbacks are connected in
a way that allows to:

  - control execution of 'ret_handler' from 'handler' callback
  - share data between 'handler' and 'ret_handler' callbacks

The session concept fits to our common use case where we do filtering
on entry uprobe and based on the result we decide to run the return
uprobe (or not).

It's also convenient to share the data between session callbacks.

To achive this we are adding new return value the uprobe consumer
can return from 'handler' callback:

  UPROBE_HANDLER_IGNORE
  - Ignore 'ret_handler' callback for this consumer.

And store cookie and pass it to 'ret_handler' when consumer has both
'handler' and 'ret_handler' callbacks defined.

We store shared data in the return_consumer object array as part of
the return_instance object. This way the handle_uretprobe_chain can
find related return_consumer and its shared data.

We also store entry handler return value, for cases when there are
multiple consumers on single uprobe and some of them are ignored and
some of them not, in which case the return probe gets installed and
we need to have a way to find out which consumer needs to be ignored.

The tricky part is when consumer is registered 'after' the uprobe
entry handler is hit. In such case this consumer's 'ret_handler' gets
executed as well, but it won't have the proper data pointer set,
so we can filter it out.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241018202252.693462-3-jolsa@kernel.org
---
 include/linux/uprobes.h |  21 ++++++-
 kernel/events/uprobes.c | 148 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 139 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index bb265a632b91..dbaf04189548 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -23,8 +23,17 @@ struct inode;
 struct notifier_block;
 struct page;
 
+/*
+ * Allowed return values from uprobe consumer's handler callback
+ * with following meaning:
+ *
+ * UPROBE_HANDLER_REMOVE
+ * - Remove the uprobe breakpoint from current->mm.
+ * UPROBE_HANDLER_IGNORE
+ * - Ignore ret_handler callback for this consumer.
+ */
 #define UPROBE_HANDLER_REMOVE		1
-#define UPROBE_HANDLER_MASK		1
+#define UPROBE_HANDLER_IGNORE		2
 
 #define MAX_URETPROBE_DEPTH		64
 
@@ -44,6 +53,8 @@ struct uprobe_consumer {
 	bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
 
 	struct list_head cons_node;
+
+	__u64 id;	/* set when uprobe_consumer is registered */
 };
 
 #ifdef CONFIG_UPROBES
@@ -83,14 +94,22 @@ struct uprobe_task {
 	unsigned int			depth;
 };
 
+struct return_consumer {
+	__u64	cookie;
+	__u64	id;
+};
+
 struct return_instance {
 	struct uprobe		*uprobe;
 	unsigned long		func;
 	unsigned long		stack;		/* stack pointer */
 	unsigned long		orig_ret_vaddr; /* original return address */
 	bool			chained;	/* true, if instance is nested */
+	int			consumers_cnt;
 
 	struct return_instance	*next;		/* keep as stack */
+
+	struct return_consumer	consumers[] __counted_by(consumers_cnt);
 };
 
 enum rp_check {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6b44c386a5df..4ef4b51776eb 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -64,7 +64,7 @@ struct uprobe {
 	struct rcu_head		rcu;
 	loff_t			offset;
 	loff_t			ref_ctr_offset;
-	unsigned long		flags;
+	unsigned long		flags;		/* "unsigned long" so bitops work */
 
 	/*
 	 * The generic code assumes that it has two members of unknown type
@@ -823,8 +823,11 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
 
 static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
+	static atomic64_t id;
+
 	down_write(&uprobe->consumer_rwsem);
 	list_add_rcu(&uc->cons_node, &uprobe->consumers);
+	uc->id = (__u64) atomic64_inc_return(&id);
 	up_write(&uprobe->consumer_rwsem);
 }
 
@@ -1761,6 +1764,34 @@ static struct uprobe_task *get_utask(void)
 	return current->utask;
 }
 
+static size_t ri_size(int consumers_cnt)
+{
+	struct return_instance *ri;
+
+	return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt;
+}
+
+#define DEF_CNT 4
+
+static struct return_instance *alloc_return_instance(void)
+{
+	struct return_instance *ri;
+
+	ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL);
+	if (!ri)
+		return ZERO_SIZE_PTR;
+
+	ri->consumers_cnt = DEF_CNT;
+	return ri;
+}
+
+static struct return_instance *dup_return_instance(struct return_instance *old)
+{
+	size_t size = ri_size(old->consumers_cnt);
+
+	return kmemdup(old, size, GFP_KERNEL);
+}
+
 static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
 {
 	struct uprobe_task *n_utask;
@@ -1773,11 +1804,10 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
 
 	p = &n_utask->return_instances;
 	for (o = o_utask->return_instances; o; o = o->next) {
-		n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+		n = dup_return_instance(o);
 		if (!n)
 			return -ENOMEM;
 
-		*n = *o;
 		/*
 		 * uprobe's refcnt has to be positive at this point, kept by
 		 * utask->return_instances items; return_instances can't be
@@ -1870,35 +1900,31 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
 	utask->return_instances = ri;
 }
 
-static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
+			      struct return_instance *ri)
 {
 	struct uprobe_task *utask = current->utask;
 	unsigned long orig_ret_vaddr, trampoline_vaddr;
-	struct return_instance *ri;
 	bool chained;
 
 	if (!get_xol_area())
-		return;
+		goto free;
 
 	if (utask->depth >= MAX_URETPROBE_DEPTH) {
 		printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
 				" nestedness limit pid/tgid=%d/%d\n",
 				current->pid, current->tgid);
-		return;
+		goto free;
 	}
 
 	/* we need to bump refcount to store uprobe in utask */
 	if (!try_get_uprobe(uprobe))
-		return;
-
-	ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
-	if (!ri)
-		goto fail;
+		goto free;
 
 	trampoline_vaddr = uprobe_get_trampoline_vaddr();
 	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
 	if (orig_ret_vaddr == -1)
-		goto fail;
+		goto put;
 
 	/* drop the entries invalidated by longjmp() */
 	chained = (orig_ret_vaddr == trampoline_vaddr);
@@ -1916,7 +1942,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 			 * attack from user-space.
 			 */
 			uprobe_warn(current, "handle tail call");
-			goto fail;
+			goto put;
 		}
 		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
 	}
@@ -1931,9 +1957,10 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 	utask->return_instances = ri;
 
 	return;
-fail:
-	kfree(ri);
+put:
 	put_uprobe(uprobe);
+free:
+	kfree(ri);
 }
 
 /* Prepare to single-step probed instruction out of line. */
@@ -2077,34 +2104,90 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
 	return uprobe;
 }
 
+static struct return_instance*
+push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie)
+{
+	if (unlikely(ri == ZERO_SIZE_PTR))
+		return ri;
+
+	if (unlikely(idx >= ri->consumers_cnt)) {
+		struct return_instance *old_ri = ri;
+
+		ri->consumers_cnt += DEF_CNT;
+		ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL);
+		if (!ri) {
+			kfree(old_ri);
+			return ZERO_SIZE_PTR;
+		}
+	}
+
+	ri->consumers[idx].id = id;
+	ri->consumers[idx].cookie = cookie;
+	return ri;
+}
+
+static struct return_consumer *
+return_consumer_find(struct return_instance *ri, int *iter, int id)
+{
+	struct return_consumer *ric;
+	int idx = *iter;
+
+	for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) {
+		if (ric->id == id) {
+			*iter = idx + 1;
+			return ric;
+		}
+	}
+	return NULL;
+}
+
+static bool ignore_ret_handler(int rc)
+{
+	return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE;
+}
+
 static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 {
 	struct uprobe_consumer *uc;
-	int remove = UPROBE_HANDLER_REMOVE;
-	bool need_prep = false; /* prepare return uprobe, when needed */
-	bool has_consumers = false;
+	bool has_consumers = false, remove = true;
+	struct return_instance *ri = NULL;
+	int push_idx = 0;
 
 	current->utask->auprobe = &uprobe->arch;
 
 	list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+		bool session = uc->handler && uc->ret_handler;
+		__u64 cookie = 0;
 		int rc = 0;
 
 		if (uc->handler) {
-			rc = uc->handler(uc, regs, NULL);
-			WARN(rc & ~UPROBE_HANDLER_MASK,
+			rc = uc->handler(uc, regs, &cookie);
+			WARN(rc < 0 || rc > 2,
 				"bad rc=0x%x from %ps()\n", rc, uc->handler);
 		}
 
-		if (uc->ret_handler)
-			need_prep = true;
-
-		remove &= rc;
+		remove &= rc == UPROBE_HANDLER_REMOVE;
 		has_consumers = true;
+
+		if (!uc->ret_handler || ignore_ret_handler(rc))
+			continue;
+
+		if (!ri)
+			ri = alloc_return_instance();
+
+		if (session)
+			ri = push_consumer(ri, push_idx++, uc->id, cookie);
 	}
 	current->utask->auprobe = NULL;
 
-	if (need_prep && !remove)
-		prepare_uretprobe(uprobe, regs); /* put bp at return */
+	if (!ZERO_OR_NULL_PTR(ri)) {
+		/*
+		 * The push_idx value has the final number of return consumers,
+		 * and ri->consumers_cnt has number of allocated consumers.
+		 */
+		ri->consumers_cnt = push_idx;
+		prepare_uretprobe(uprobe, regs, ri);
+	}
 
 	if (remove && has_consumers) {
 		down_read(&uprobe->register_rwsem);
@@ -2123,12 +2206,19 @@ static void
 handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
 {
 	struct uprobe *uprobe = ri->uprobe;
+	struct return_consumer *ric;
 	struct uprobe_consumer *uc;
+	int ric_idx = 0;
 
 	rcu_read_lock_trace();
 	list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
-		if (uc->ret_handler)
-			uc->ret_handler(uc, ri->func, regs, NULL);
+		bool session = uc->handler && uc->ret_handler;
+
+		if (uc->ret_handler) {
+			ric = return_consumer_find(ri, &ric_idx, uc->id);
+			if (!session || ric)
+				uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL);
+		}
 	}
 	rcu_read_unlock_trace();
 }
-- 
cgit v1.2.3


From ee1251fc0c4e799a48025318f262739919deb977 Mon Sep 17 00:00:00 2001
From: Chen Ridong <chenridong@huawei.com>
Date: Tue, 22 Oct 2024 11:49:45 +0000
Subject: cgroup/freezer: Reduce redundant traversal for cgroup_freeze
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Whether a cgroup is frozen is determined solely by whether it is set to
to be frozen and whether its parent is frozen. Currently, when is cgroup
is frozen or unfrozen, it iterates through the entire subtree to freeze
or unfreeze its descentdants. However, this is unesessary for a cgroup
that does not change its effective frozen status. This path aims to skip
the subtree if its parent does not have a change in effective freeze.

For an example, subtree like, a-b-c-d-e-f-g, when a is frozen, the
entire tree is frozen. If we freeze b and c again, it is unesessary to
iterate d, e, f and g. So does that If we unfreeze b/c.

Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |  2 +-
 kernel/cgroup/freezer.c     | 30 ++++++++++++++----------------
 2 files changed, 15 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 0a80ef9191a6..1b20d2d8ef7c 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -398,7 +398,7 @@ struct cgroup_freezer_state {
 	bool freeze;
 
 	/* Should the cgroup actually be frozen? */
-	int e_freeze;
+	bool e_freeze;
 
 	/* Fields below are protected by css_set_lock */
 
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 617861a54793..188d5f2aeb5a 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -260,8 +260,10 @@ void cgroup_freezer_migrate_task(struct task_struct *task,
 void cgroup_freeze(struct cgroup *cgrp, bool freeze)
 {
 	struct cgroup_subsys_state *css;
+	struct cgroup *parent;
 	struct cgroup *dsct;
 	bool applied = false;
+	bool old_e;
 
 	lockdep_assert_held(&cgroup_mutex);
 
@@ -282,22 +284,18 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
 		if (cgroup_is_dead(dsct))
 			continue;
 
-		if (freeze) {
-			dsct->freezer.e_freeze++;
-			/*
-			 * Already frozen because of ancestor's settings?
-			 */
-			if (dsct->freezer.e_freeze > 1)
-				continue;
-		} else {
-			dsct->freezer.e_freeze--;
-			/*
-			 * Still frozen because of ancestor's settings?
-			 */
-			if (dsct->freezer.e_freeze > 0)
-				continue;
-
-			WARN_ON_ONCE(dsct->freezer.e_freeze < 0);
+		/*
+		 * e_freeze is affected by parent's e_freeze and dst's freeze.
+		 * If old e_freeze eq new e_freeze, no change, its children
+		 * will not be affected. So do nothing and skip the subtree
+		 */
+		old_e = dsct->freezer.e_freeze;
+		parent = cgroup_parent(dsct);
+		dsct->freezer.e_freeze = (dsct->freezer.freeze ||
+					  parent->freezer.e_freeze);
+		if (dsct->freezer.e_freeze == old_e) {
+			css = css_rightmost_descendant(css);
+			continue;
 		}
 
 		/*
-- 
cgit v1.2.3


From 04af8a399fa40310f831b4f1dc9f757085f41983 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 18 Oct 2024 17:47:48 +0300
Subject: PCI: Protect Link Control 2 Register with RMW locking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PCIe Bandwidth Controller performs RMW accesses the Link Control 2
Register which can occur concurrently to other sources of Link Control 2
Register writes. Therefore, add Link Control 2 Register among the PCI
Express Capability Registers that need RMW locking.

Link: https://lore.kernel.org/r/20241018144755.7875-3-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 Documentation/PCI/pciebus-howto.rst | 1 +
 include/linux/pci.h                 | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/PCI/pciebus-howto.rst b/Documentation/PCI/pciebus-howto.rst
index e48d01422efc..375d9ce171f6 100644
--- a/Documentation/PCI/pciebus-howto.rst
+++ b/Documentation/PCI/pciebus-howto.rst
@@ -221,6 +221,7 @@ a selected set of PCI Express Capability Registers:
 
 * Link Control Register
 * Root Control Register
+* Link Control 2 Register
 
 Any change to those registers should be performed using RMW accessors to
 avoid problems due to concurrent updates. For the up-to-date list of
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 573b4c4c2be6..be5ed534c39c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1274,6 +1274,7 @@ static inline int pcie_capability_clear_and_set_word(struct pci_dev *dev,
 {
 	switch (pos) {
 	case PCI_EXP_LNKCTL:
+	case PCI_EXP_LNKCTL2:
 	case PCI_EXP_RTCTL:
 		return pcie_capability_clear_and_set_word_locked(dev, pos,
 								 clear, set);
-- 
cgit v1.2.3


From 52e0874fc16bd26e9ea1871e30ffb2c6dff187cf Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 12 Aug 2024 12:39:02 +0200
Subject: locking/rt: Add sparse annotation PREEMPT_RT's sleeping locks.

The sleeping locks on PREEMPT_RT (rt_spin_lock() and friends) lack
sparse annotation. Therefore a missing spin_unlock() won't be spotted by
sparse in a PREEMPT_RT build while it is noticed on a !PREEMPT_RT build.

Add the __acquires/__releases macros to the lock/ unlock functions. The
trylock functions already use the __cond_lock() wrapper.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240812104200.2239232-2-bigeasy@linutronix.de
---
 include/linux/rwlock_rt.h   | 10 +++++-----
 include/linux/spinlock_rt.h |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
index 8544ff05e594..7d81fc6918ee 100644
--- a/include/linux/rwlock_rt.h
+++ b/include/linux/rwlock_rt.h
@@ -24,13 +24,13 @@ do {							\
 	__rt_rwlock_init(rwl, #rwl, &__key);		\
 } while (0)
 
-extern void rt_read_lock(rwlock_t *rwlock);
+extern void rt_read_lock(rwlock_t *rwlock)	__acquires(rwlock);
 extern int rt_read_trylock(rwlock_t *rwlock);
-extern void rt_read_unlock(rwlock_t *rwlock);
-extern void rt_write_lock(rwlock_t *rwlock);
-extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass);
+extern void rt_read_unlock(rwlock_t *rwlock)	__releases(rwlock);
+extern void rt_write_lock(rwlock_t *rwlock)	__acquires(rwlock);
+extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass)	__acquires(rwlock);
 extern int rt_write_trylock(rwlock_t *rwlock);
-extern void rt_write_unlock(rwlock_t *rwlock);
+extern void rt_write_unlock(rwlock_t *rwlock)	__releases(rwlock);
 
 static __always_inline void read_lock(rwlock_t *rwlock)
 {
diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
index 61c49b16f69a..babc3e028779 100644
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -32,10 +32,10 @@ do {								\
 	__rt_spin_lock_init(slock, #slock, &__key, true);	\
 } while (0)
 
-extern void rt_spin_lock(spinlock_t *lock);
-extern void rt_spin_lock_nested(spinlock_t *lock, int subclass);
-extern void rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock);
-extern void rt_spin_unlock(spinlock_t *lock);
+extern void rt_spin_lock(spinlock_t *lock) __acquires(lock);
+extern void rt_spin_lock_nested(spinlock_t *lock, int subclass)	__acquires(lock);
+extern void rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock) __acquires(lock);
+extern void rt_spin_unlock(spinlock_t *lock)	__releases(lock);
 extern void rt_spin_lock_unlock(spinlock_t *lock);
 extern int rt_spin_trylock_bh(spinlock_t *lock);
 extern int rt_spin_trylock(spinlock_t *lock);
-- 
cgit v1.2.3


From b1f01f9e54b1aaadb6740f86017e8fabdee77fe2 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 12 Aug 2024 12:39:03 +0200
Subject: locking/rt: Remove one __cond_lock() in RT's spin_trylock_irqsave()

spin_trylock_irqsave() has a __cond_lock() wrapper which points to
__spin_trylock_irqsave(). The function then invokes spin_trylock() which
has another __cond_lock() finally pointing to rt_spin_trylock().

The compiler has no problem to parse this but sparse does not recognise
that users of spin_trylock_irqsave() acquire a conditional lock and
complains.

Remove one layer of __cond_lock() so that sparse recognises conditional
locking.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240812104200.2239232-3-bigeasy@linutronix.de
---
 include/linux/spinlock_rt.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
index babc3e028779..f9f14e135be7 100644
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -132,7 +132,7 @@ static __always_inline void spin_unlock_irqrestore(spinlock_t *lock,
 #define spin_trylock_irq(lock)				\
 	__cond_lock(lock, rt_spin_trylock(lock))
 
-#define __spin_trylock_irqsave(lock, flags)		\
+#define spin_trylock_irqsave(lock, flags)		\
 ({							\
 	int __locked;					\
 							\
@@ -142,9 +142,6 @@ static __always_inline void spin_unlock_irqrestore(spinlock_t *lock,
 	__locked;					\
 })
 
-#define spin_trylock_irqsave(lock, flags)		\
-	__cond_lock(lock, __spin_trylock_irqsave(lock, flags))
-
 #define spin_is_contended(lock)		(((void)(lock), 0))
 
 static inline int spin_is_locked(spinlock_t *lock)
-- 
cgit v1.2.3


From 9a9f7e13952b2638bc57bc9b34e6bdd106509836 Mon Sep 17 00:00:00 2001
From: Victor Shih <victor.shih@genesyslogic.com.tw>
Date: Fri, 18 Oct 2024 18:53:18 +0800
Subject: mmc: core: Support UHS-II card control and access

Embed UHS-II access/control functionality into the MMC request
processing flow.

Signed-off-by: Jason Lai <jason.lai@genesyslogic.com.tw>
Signed-off-by: Victor Shih <victor.shih@genesyslogic.com.tw>
Message-ID: <20241018105333.4569-2-victorshihgli@gmail.com>
[Ulf: A couple of cleanups and fixed sd_uhs2_power_off()]
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/core.c    |    8 +-
 drivers/mmc/core/sd.c      |    4 +-
 drivers/mmc/core/sd.h      |    2 +
 drivers/mmc/core/sd_ops.c  |    9 +
 drivers/mmc/core/sd_ops.h  |    2 +
 drivers/mmc/core/sd_uhs2.c | 1083 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/mmc/core.h   |   17 +
 include/linux/mmc/host.h   |   15 +
 8 files changed, 1103 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 54ca9dc2114c..a499f3c59de5 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -354,6 +354,9 @@ int mmc_start_request(struct mmc_host *host, struct mmc_request *mrq)
 	if (err)
 		return err;
 
+	if (host->uhs2_sd_tran)
+		mmc_uhs2_prepare_cmd(host, mrq);
+
 	led_trigger_event(host->led, LED_FULL);
 	__mmc_start_request(host, mrq);
 
@@ -453,6 +456,9 @@ int mmc_cqe_start_req(struct mmc_host *host, struct mmc_request *mrq)
 	if (err)
 		goto out_err;
 
+	if (host->uhs2_sd_tran)
+		mmc_uhs2_prepare_cmd(host, mrq);
+
 	err = host->cqe_ops->cqe_request(host, mrq);
 	if (err)
 		goto out_err;
@@ -1135,7 +1141,7 @@ u32 mmc_select_voltage(struct mmc_host *host, u32 ocr)
 		return 0;
 	}
 
-	if (host->caps2 & MMC_CAP2_FULL_PWR_CYCLE) {
+	if (!mmc_card_uhs2(host) && host->caps2 & MMC_CAP2_FULL_PWR_CYCLE) {
 		bit = ffs(ocr) - 1;
 		ocr &= 3 << bit;
 		mmc_power_cycle(host, ocr);
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 918b86bf8bbb..cc757b850e79 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -200,7 +200,7 @@ static int mmc_decode_csd(struct mmc_card *card, bool is_sduc)
 /*
  * Given a 64-bit response, decode to our card SCR structure.
  */
-static int mmc_decode_scr(struct mmc_card *card)
+int mmc_decode_scr(struct mmc_card *card)
 {
 	struct sd_scr *scr = &card->scr;
 	unsigned int scr_struct;
@@ -903,7 +903,7 @@ int mmc_sd_get_csd(struct mmc_card *card, bool is_sduc)
 	return 0;
 }
 
-static int mmc_sd_get_ro(struct mmc_host *host)
+int mmc_sd_get_ro(struct mmc_host *host)
 {
 	int ro;
 
diff --git a/drivers/mmc/core/sd.h b/drivers/mmc/core/sd.h
index 7e8beface2ca..301dc34b8b63 100644
--- a/drivers/mmc/core/sd.h
+++ b/drivers/mmc/core/sd.h
@@ -11,6 +11,8 @@ struct mmc_card;
 
 int mmc_sd_get_cid(struct mmc_host *host, u32 ocr, u32 *cid, u32 *rocr);
 int mmc_sd_get_csd(struct mmc_card *card, bool is_sduc);
+int mmc_decode_scr(struct mmc_card *card);
+int mmc_sd_get_ro(struct mmc_host *host);
 void mmc_decode_cid(struct mmc_card *card);
 int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
 	bool reinit);
diff --git a/drivers/mmc/core/sd_ops.c b/drivers/mmc/core/sd_ops.c
index 50d1380e93b8..cd86463dd306 100644
--- a/drivers/mmc/core/sd_ops.c
+++ b/drivers/mmc/core/sd_ops.c
@@ -42,6 +42,15 @@ int mmc_app_cmd(struct mmc_host *host, struct mmc_card *card)
 	if (WARN_ON(card && card->host != host))
 		return -EINVAL;
 
+	/*
+	 * UHS2 packet has APP bit so only set APP_CMD flag here.
+	 * Will set the APP bit when assembling UHS2 packet.
+	 */
+	if (host->uhs2_sd_tran) {
+		host->uhs2_app_cmd = true;
+		return 0;
+	}
+
 	cmd.opcode = MMC_APP_CMD;
 
 	if (card) {
diff --git a/drivers/mmc/core/sd_ops.h b/drivers/mmc/core/sd_ops.h
index fd3f10b9cf86..8fffc1b29757 100644
--- a/drivers/mmc/core/sd_ops.h
+++ b/drivers/mmc/core/sd_ops.h
@@ -12,6 +12,7 @@
 
 struct mmc_card;
 struct mmc_host;
+struct mmc_request;
 
 int mmc_app_set_bus_width(struct mmc_card *card, int width);
 int mmc_send_app_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr);
@@ -22,6 +23,7 @@ int mmc_app_send_scr(struct mmc_card *card);
 int mmc_app_sd_status(struct mmc_card *card, void *ssr);
 int mmc_app_cmd(struct mmc_host *host, struct mmc_card *card);
 int mmc_send_ext_addr(struct mmc_host *host, u32 addr);
+void mmc_uhs2_prepare_cmd(struct mmc_host *host, struct mmc_request *mrq);
 
 #endif
 
diff --git a/drivers/mmc/core/sd_uhs2.c b/drivers/mmc/core/sd_uhs2.c
index 19d62d45e1ec..ddd2291ad7c4 100644
--- a/drivers/mmc/core/sd_uhs2.c
+++ b/drivers/mmc/core/sd_uhs2.c
@@ -1,23 +1,51 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2021 Linaro Ltd
- *
  * Author: Ulf Hansson <ulf.hansson@linaro.org>
  *
+ * Copyright (C) 2014 Intel Corp, All Rights Reserved.
+ * Author: Yi Sun <yi.y.sun@intel.com>
+ *
+ * Copyright (C) 2020 Genesys Logic, Inc.
+ * Authors: Ben Chuang <ben.chuang@genesyslogic.com.tw>
+ *
+ * Copyright (C) 2020 Linaro Limited
+ * Author: AKASHI Takahiro <takahiro.akashi@linaro.org>
+ *
+ * Copyright (C) 2022 Genesys Logic, Inc.
+ * Authors: Jason Lai <jason.lai@genesyslogic.com.tw>
+ *
+ * Copyright (C) 2023 Genesys Logic, Inc.
+ * Authors: Victor Shih <victor.shih@genesyslogic.com.tw>
+ *
  * Support for SD UHS-II cards
  */
 #include <linux/err.h>
+#include <linux/pm_runtime.h>
 
 #include <linux/mmc/host.h>
 #include <linux/mmc/card.h>
+#include <linux/mmc/mmc.h>
+#include <linux/mmc/sd.h>
+#include <linux/mmc/sd_uhs2.h>
 
+#include "card.h"
 #include "core.h"
 #include "bus.h"
 #include "sd.h"
+#include "sd_ops.h"
 #include "mmc_ops.h"
 
+#define UHS2_WAIT_CFG_COMPLETE_PERIOD_US  (1 * 1000)
+#define UHS2_WAIT_CFG_COMPLETE_TIMEOUT_MS 100
+
 static const unsigned int sd_uhs2_freqs[] = { 52000000, 26000000 };
 
+struct sd_uhs2_wait_active_state_data {
+	struct mmc_host *host;
+	struct mmc_command *cmd;
+};
+
 static int sd_uhs2_power_up(struct mmc_host *host)
 {
 	int err;
@@ -37,15 +65,23 @@ static int sd_uhs2_power_up(struct mmc_host *host)
 
 static int sd_uhs2_power_off(struct mmc_host *host)
 {
+	int err;
+
 	if (host->ios.power_mode == MMC_POWER_OFF)
 		return 0;
 
 	host->ios.vdd = 0;
 	host->ios.clock = 0;
-	host->ios.timing = MMC_TIMING_LEGACY;
 	host->ios.power_mode = MMC_POWER_OFF;
+	host->uhs2_sd_tran = false;
+
+	err = host->ops->uhs2_control(host, UHS2_SET_IOS);
+	if (err)
+		return err;
 
-	return host->ops->uhs2_control(host, UHS2_SET_IOS);
+	/* For consistency, let's restore the initial timing. */
+	host->ios.timing = MMC_TIMING_LEGACY;
+	return 0;
 }
 
 /*
@@ -55,7 +91,47 @@ static int sd_uhs2_power_off(struct mmc_host *host)
  */
 static int sd_uhs2_phy_init(struct mmc_host *host)
 {
-	return 0;
+	int err;
+
+	err = host->ops->uhs2_control(host, UHS2_PHY_INIT);
+	if (err) {
+		pr_err("%s: failed to initial phy for UHS-II!\n",
+		       mmc_hostname(host));
+	}
+
+	return err;
+}
+
+/*
+ * sd_uhs2_cmd_assemble() - build up UHS-II command packet which is embedded in
+ *                          mmc_command structure
+ * @cmd:	MMC command to executed
+ * @uhs2_cmd:	UHS2 command corresponded to MMC command
+ * @header:	Header field of UHS-II command cxpacket
+ * @arg:	Argument field of UHS-II command packet
+ * @payload:	Payload field of UHS-II command packet
+ * @plen:	Payload length
+ * @resp:	Response buffer is allocated by caller and it is used to keep
+ *              the response of CM-TRAN command. For SD-TRAN command, uhs2_resp
+ *              should be null and SD-TRAN command response should be stored in
+ *              resp of mmc_command.
+ * @resp_len:	Response buffer length
+ *
+ * The uhs2_command structure contains message packets which are transmited/
+ * received on UHS-II bus. This function fills in the contents of uhs2_command
+ * structure and embededs UHS2 command into mmc_command structure, which is used
+ * in legacy SD operation functions.
+ *
+ */
+static void sd_uhs2_cmd_assemble(struct mmc_command *cmd,
+				 struct uhs2_command *uhs2_cmd,
+				 u8 plen, u8 resp_len)
+{
+	uhs2_cmd->payload_len = plen * sizeof(u32);
+	uhs2_cmd->packet_len = uhs2_cmd->payload_len + 4;
+
+	cmd->uhs2_cmd = uhs2_cmd;
+	cmd->uhs2_cmd->uhs2_resp_len = resp_len;
 }
 
 /*
@@ -64,6 +140,83 @@ static int sd_uhs2_phy_init(struct mmc_host *host)
  */
 static int sd_uhs2_dev_init(struct mmc_host *host)
 {
+	struct mmc_command cmd = {0};
+	struct uhs2_command uhs2_cmd = {};
+	u32 cnt;
+	u32 dap, gap, resp_gap;
+	u8 gd = 0;
+	int err;
+
+	dap = host->uhs2_caps.dap;
+	gap = host->uhs2_caps.gap;
+
+	/*
+	 * Refer to UHS-II Addendum Version 1.02 Figure 6-21 to see DEVICE_INIT CCMD format.
+	 * Head:
+	 *      - Control Write(R/W=1) with 4-Byte payload(PLEN=01b).
+	 *      - IOADR = CMD_BASE + 002h
+	 * Payload:
+	 *      - bit [3:0]  : GAP(Group Allocated Power)
+	 *      - bit [7:4]  : GD(Group Descriptor)
+	 *      - bit [11]   : Complete Flag
+	 *      - bit [15:12]: DAP(Device Allocated Power)
+	 */
+	uhs2_cmd.header = UHS2_NATIVE_PACKET | UHS2_PACKET_TYPE_CCMD;
+	uhs2_cmd.arg = ((UHS2_DEV_CMD_DEVICE_INIT & 0xFF) << 8) |
+	       UHS2_NATIVE_CMD_WRITE |
+	       UHS2_NATIVE_CMD_PLEN_4B |
+	       (UHS2_DEV_CMD_DEVICE_INIT >> 8);
+
+	/*
+	 * Refer to UHS-II Addendum Version 1.02 section 6.3.1.
+	 * Max. time from DEVICE_INIT CCMD EOP reception on Device
+	 * Rx to its SOP transmission on Device Tx(Tfwd_init_cmd) is
+	 * 1 second.
+	 */
+	cmd.busy_timeout = 1000;
+
+	/*
+	 * Refer to UHS-II Addendum Version 1.02 section 6.2.6.3.
+	 * Let's retry the DEVICE_INIT command no more than 30 times.
+	 */
+	for (cnt = 0; cnt < 30; cnt++) {
+		uhs2_cmd.payload[0] = ((dap & 0xF) << 12) |
+				       UHS2_DEV_INIT_COMPLETE_FLAG |
+				       ((gd & 0xF) << 4) |
+				       (gap & 0xF);
+
+		sd_uhs2_cmd_assemble(&cmd, &uhs2_cmd, UHS2_DEV_INIT_PAYLOAD_LEN,
+				     UHS2_DEV_INIT_RESP_LEN);
+
+		err = mmc_wait_for_cmd(host, &cmd, 0);
+
+		if (err) {
+			pr_err("%s: %s: UHS2 CMD send fail, err= 0x%x!\n",
+			       mmc_hostname(host), __func__, err);
+			continue;
+		}
+
+		if (uhs2_cmd.uhs2_resp[3] != (UHS2_DEV_CMD_DEVICE_INIT & 0xFF)) {
+			pr_err("%s: DEVICE_INIT response is wrong!\n",
+			       mmc_hostname(host));
+			return -EIO;
+		}
+
+		if (uhs2_cmd.uhs2_resp[5] & 0x8) {
+			host->uhs2_caps.group_desc = gd;
+			return 0;
+		}
+		resp_gap = uhs2_cmd.uhs2_resp[4] & 0x0F;
+		if (gap == resp_gap)
+			gd++;
+	}
+
+	if (err) {
+		pr_err("%s: %s: UHS2 CMD send fail, err= 0x%x!\n",
+		       mmc_hostname(host), __func__, err);
+		return err;
+	}
+
 	return 0;
 }
 
@@ -74,6 +227,48 @@ static int sd_uhs2_dev_init(struct mmc_host *host)
  */
 static int sd_uhs2_enum(struct mmc_host *host, u32 *node_id)
 {
+	struct mmc_command cmd = {0};
+	struct uhs2_command uhs2_cmd = {};
+	u8 id_f = 0xF, id_l = 0x0;
+	int err;
+
+	/*
+	 * Refer to UHS-II Addendum Version 1.02 Figure 6-28 to see ENUMERATE CCMD format.
+	 * Header:
+	 *      - Control Write(R/W=1) with 4-Byte payload(PLEN=01b).
+	 *      - IOADR = CMD_BASE + 003h
+	 * Payload:
+	 *      - bit [3:0]: ID_L(Last Node ID)
+	 *      - bit [7:4]: ID_F(First Node ID)
+	 */
+	uhs2_cmd.header = UHS2_NATIVE_PACKET | UHS2_PACKET_TYPE_CCMD;
+	uhs2_cmd.arg = ((UHS2_DEV_CMD_ENUMERATE & 0xFF) << 8) |
+			UHS2_NATIVE_CMD_WRITE |
+			UHS2_NATIVE_CMD_PLEN_4B |
+			(UHS2_DEV_CMD_ENUMERATE >> 8);
+
+	uhs2_cmd.payload[0] = (id_f << 4) | id_l;
+	uhs2_cmd.payload[0] = cpu_to_be32(uhs2_cmd.payload[0]);
+
+	sd_uhs2_cmd_assemble(&cmd, &uhs2_cmd, UHS2_DEV_ENUM_PAYLOAD_LEN, UHS2_DEV_ENUM_RESP_LEN);
+
+	err = mmc_wait_for_cmd(host, &cmd, 0);
+	if (err) {
+		pr_err("%s: %s: UHS2 CMD send fail, err= 0x%x!\n",
+		       mmc_hostname(host), __func__, err);
+		return err;
+	}
+
+	if (uhs2_cmd.uhs2_resp[3] != (UHS2_DEV_CMD_ENUMERATE & 0xFF)) {
+		pr_err("%s: ENUMERATE response is wrong!\n",
+		       mmc_hostname(host));
+		return -EIO;
+	}
+
+	id_f = (uhs2_cmd.uhs2_resp[4] >> 4) & 0xF;
+	id_l = uhs2_cmd.uhs2_resp[4] & 0xF;
+	*node_id = id_f;
+
 	return 0;
 }
 
@@ -84,6 +279,180 @@ static int sd_uhs2_enum(struct mmc_host *host, u32 *node_id)
  */
 static int sd_uhs2_config_read(struct mmc_host *host, struct mmc_card *card)
 {
+	struct mmc_command cmd = {0};
+	struct uhs2_command uhs2_cmd = {};
+	u32 cap;
+	int err;
+
+	/*
+	 * Use Control Read CCMD to read Generic Capability from Configuration Register.
+	 * - Control Write(R/W=1) with 4-Byte payload(PLEN=01b).
+	 * - IOADR = Generic Capability Register(CFG_BASE + 000h)
+	 */
+	uhs2_cmd.header = UHS2_NATIVE_PACKET | UHS2_PACKET_TYPE_CCMD | card->uhs2_config.node_id;
+	uhs2_cmd.arg = ((UHS2_DEV_CONFIG_GEN_CAPS & 0xFF) << 8) |
+			UHS2_NATIVE_CMD_READ |
+			UHS2_NATIVE_CMD_PLEN_4B |
+			(UHS2_DEV_CONFIG_GEN_CAPS >> 8);
+
+	/*
+	 * There is no payload because per spec, there should be
+	 * no payload field for read CCMD.
+	 * Plen is set in arg. Per spec, plen for read CCMD
+	 * represents the len of read data which is assigned in payload
+	 * of following RES (p136).
+	 */
+	sd_uhs2_cmd_assemble(&cmd, &uhs2_cmd, 0, 0);
+
+	err = mmc_wait_for_cmd(host, &cmd, 0);
+	if (err) {
+		pr_err("%s: %s: UHS2 CMD send fail, err= 0x%x!\n",
+		       mmc_hostname(host), __func__, err);
+		return err;
+	}
+
+	/*
+	 * Generic Capability Register:
+	 * bit [7:0]  : Reserved
+	 * bit [13:8] : Device-Specific Number of Lanes and Functionality
+	 *              bit 8: 2L-HD
+	 *              bit 9: 2D-1U FD
+	 *              bit 10: 1D-2U FD
+	 *              bit 11: 2D-2U FD
+	 *              Others: Reserved
+	 * bit [14]   : DADR Length
+	 *              0: 4 bytes
+	 *              1: Reserved
+	 * bit [23:16]: Application Type
+	 *              bit 16: 0=Non-SD memory, 1=SD memory
+	 *              bit 17: 0=Non-SDIO, 1=SDIO
+	 *              bit 18: 0=Card, 1=Embedded
+	 * bit [63:24]: Reserved
+	 */
+	cap = cmd.resp[0];
+	card->uhs2_config.n_lanes =
+				(cap >> UHS2_DEV_CONFIG_N_LANES_POS) &
+				UHS2_DEV_CONFIG_N_LANES_MASK;
+	card->uhs2_config.dadr_len =
+				(cap >> UHS2_DEV_CONFIG_DADR_POS) &
+				UHS2_DEV_CONFIG_DADR_MASK;
+	card->uhs2_config.app_type =
+				(cap >> UHS2_DEV_CONFIG_APP_POS) &
+				UHS2_DEV_CONFIG_APP_MASK;
+
+	/*
+	 * Use Control Read CCMD to read PHY Capability from Configuration Register.
+	 * - Control Write(R/W=1) with 8-Byte payload(PLEN=10b).
+	 * - IOADR = PHY Capability Register(CFG_BASE + 002h)
+	 */
+	uhs2_cmd.arg = ((UHS2_DEV_CONFIG_PHY_CAPS & 0xFF) << 8) |
+			UHS2_NATIVE_CMD_READ |
+			UHS2_NATIVE_CMD_PLEN_8B |
+			(UHS2_DEV_CONFIG_PHY_CAPS >> 8);
+
+	sd_uhs2_cmd_assemble(&cmd, &uhs2_cmd, 0, 0);
+
+	err = mmc_wait_for_cmd(host, &cmd, 0);
+	if (err) {
+		pr_err("%s: %s: UHS2 CMD send fail, err= 0x%x!\n",
+		       mmc_hostname(host), __func__, err);
+		return err;
+	}
+
+	/*
+	 * PHY Capability Register:
+	 * bit [3:0]  : PHY Minor Revision
+	 * bit [5:4]  : PHY Major Revision
+	 * bit [15]   : Support Hibernate Mode
+	 *              0: Not support Hibernate Mode
+	 *              1: Support Hibernate Mode
+	 * bit [31:16]: Reserved
+	 * bit [35:32]: Device-Specific N_LSS_SYN
+	 * bit [39:36]: Device-Specific N_LSS_DIR
+	 * bit [63:40]: Reserved
+	 */
+	cap = cmd.resp[0];
+	card->uhs2_config.phy_minor_rev =
+				cap & UHS2_DEV_CONFIG_PHY_MINOR_MASK;
+	card->uhs2_config.phy_major_rev =
+				(cap >> UHS2_DEV_CONFIG_PHY_MAJOR_POS) &
+				 UHS2_DEV_CONFIG_PHY_MAJOR_MASK;
+	card->uhs2_config.can_hibernate =
+				(cap >> UHS2_DEV_CONFIG_CAN_HIBER_POS) &
+				 UHS2_DEV_CONFIG_CAN_HIBER_MASK;
+
+	cap = cmd.resp[1];
+	card->uhs2_config.n_lss_sync =
+				cap & UHS2_DEV_CONFIG_N_LSS_SYN_MASK;
+	card->uhs2_config.n_lss_dir =
+				(cap >> UHS2_DEV_CONFIG_N_LSS_DIR_POS) &
+				UHS2_DEV_CONFIG_N_LSS_DIR_MASK;
+	if (card->uhs2_config.n_lss_sync == 0)
+		card->uhs2_config.n_lss_sync = 16 << 2;
+	else
+		card->uhs2_config.n_lss_sync <<= 2;
+
+	if (card->uhs2_config.n_lss_dir == 0)
+		card->uhs2_config.n_lss_dir = 16 << 3;
+	else
+		card->uhs2_config.n_lss_dir <<= 3;
+
+	/*
+	 * Use Control Read CCMD to read LINK/TRAN Capability from Configuration Register.
+	 * - Control Write(R/W=1) with 8-Byte payload(PLEN=10b).
+	 * - IOADR = LINK/TRAN Capability Register(CFG_BASE + 004h)
+	 */
+	uhs2_cmd.arg = ((UHS2_DEV_CONFIG_LINK_TRAN_CAPS & 0xFF) << 8) |
+			UHS2_NATIVE_CMD_READ |
+			UHS2_NATIVE_CMD_PLEN_8B |
+			(UHS2_DEV_CONFIG_LINK_TRAN_CAPS >> 8);
+
+	sd_uhs2_cmd_assemble(&cmd, &uhs2_cmd, 0, 0);
+
+	err = mmc_wait_for_cmd(host, &cmd, 0);
+	if (err) {
+		pr_err("%s: %s: UHS2 CMD send fail, err= 0x%x!\n",
+		       mmc_hostname(host), __func__, err);
+		return err;
+	}
+
+	/*
+	 * LINK/TRAN Capability Register:
+	 * bit [3:0]  : LINK_TRAN Minor Revision
+	 * bit [5:4]  : LINK/TRAN Major Revision
+	 * bit [7:6]  : Reserved
+	 * bit [15:8] : Device-Specific N_FCU
+	 * bit [18:16]: Device Type
+	 *              001b=Host
+	 *              010b=Device
+	 *              011b=Reserved for CMD issuable Device
+	 * bit [19]   : Reserved
+	 * bit [31:20]: Device-Specific MAX_BLKLEN
+	 * bit [39:32]: Device-Specific N_DATA_GAP
+	 * bit [63:40]: Reserved
+	 */
+	cap = cmd.resp[0];
+	card->uhs2_config.link_minor_rev =
+				cap & UHS2_DEV_CONFIG_LT_MINOR_MASK;
+	card->uhs2_config.link_major_rev =
+				(cap >> UHS2_DEV_CONFIG_LT_MAJOR_POS) &
+				UHS2_DEV_CONFIG_LT_MAJOR_MASK;
+	card->uhs2_config.n_fcu =
+				(cap >> UHS2_DEV_CONFIG_N_FCU_POS) &
+				UHS2_DEV_CONFIG_N_FCU_MASK;
+	card->uhs2_config.dev_type =
+				(cap >> UHS2_DEV_CONFIG_DEV_TYPE_POS) &
+				UHS2_DEV_CONFIG_DEV_TYPE_MASK;
+	card->uhs2_config.maxblk_len =
+				(cap >> UHS2_DEV_CONFIG_MAX_BLK_LEN_POS) &
+				UHS2_DEV_CONFIG_MAX_BLK_LEN_MASK;
+
+	cap = cmd.resp[1];
+	card->uhs2_config.n_data_gap =
+				cap & UHS2_DEV_CONFIG_N_DATA_GAP_MASK;
+	if (card->uhs2_config.n_fcu == 0)
+		card->uhs2_config.n_fcu = 256;
+
 	return 0;
 }
 
@@ -98,18 +467,336 @@ static int sd_uhs2_config_read(struct mmc_host *host, struct mmc_card *card)
  */
 static int sd_uhs2_config_write(struct mmc_host *host, struct mmc_card *card)
 {
+	struct mmc_command cmd = {0};
+	struct uhs2_command uhs2_cmd = {};
+	u8 nMinDataGap;
+	int err;
+
+	/*
+	 * Use Control Write CCMD to set Generic Setting in Configuration Register.
+	 * - Control Write(R/W=1) with 8-Byte payload(PLEN=10b).
+	 * - IOADR = Generic Setting Register(CFG_BASE + 008h)
+	 * - Payload = New contents to be written to Generic Setting Register
+	 */
+	uhs2_cmd.header = UHS2_NATIVE_PACKET | UHS2_PACKET_TYPE_CCMD | card->uhs2_config.node_id;
+	uhs2_cmd.arg = ((UHS2_DEV_CONFIG_GEN_SET & 0xFF) << 8) |
+			UHS2_NATIVE_CMD_WRITE |
+			UHS2_NATIVE_CMD_PLEN_8B |
+			(UHS2_DEV_CONFIG_GEN_SET >> 8);
+
+	/*
+	 * Most UHS-II cards only support FD and 2L-HD mode. Other lane numbers
+	 * defined in UHS-II addendem Ver1.01 are optional.
+	 */
+	host->uhs2_caps.n_lanes_set = UHS2_DEV_CONFIG_GEN_SET_2L_FD_HD;
+	card->uhs2_config.n_lanes_set = UHS2_DEV_CONFIG_GEN_SET_2L_FD_HD;
+
+	uhs2_cmd.payload[0] = card->uhs2_config.n_lanes_set << UHS2_DEV_CONFIG_N_LANES_POS;
+	uhs2_cmd.payload[1] = 0;
+	uhs2_cmd.payload[0] = cpu_to_be32(uhs2_cmd.payload[0]);
+	uhs2_cmd.payload[1] = cpu_to_be32(uhs2_cmd.payload[1]);
+
+	/*
+	 * There is no payload because per spec, there should be
+	 * no payload field for read CCMD.
+	 * Plen is set in arg. Per spec, plen for read CCMD
+	 * represents the len of read data which is assigned in payload
+	 * of following RES (p136).
+	 */
+	sd_uhs2_cmd_assemble(&cmd, &uhs2_cmd, UHS2_CFG_WRITE_PAYLOAD_LEN, 0);
+
+	err = mmc_wait_for_cmd(host, &cmd, 0);
+	if (err) {
+		pr_err("%s: %s: UHS2 CMD send fail, err= 0x%x!\n",
+		       mmc_hostname(host), __func__, err);
+		return err;
+	}
+
+	/*
+	 * Use Control Write CCMD to set PHY Setting in Configuration Register.
+	 * - Control Write(R/W=1) with 8-Byte payload(PLEN=10b).
+	 * - IOADR = PHY Setting Register(CFG_BASE + 00Ah)
+	 * - Payload = New contents to be written to PHY Setting Register
+	 */
+	uhs2_cmd.arg = ((UHS2_DEV_CONFIG_PHY_SET & 0xFF) << 8) |
+			UHS2_NATIVE_CMD_WRITE |
+			UHS2_NATIVE_CMD_PLEN_8B |
+			(UHS2_DEV_CONFIG_PHY_SET >> 8);
+
+	if (host->uhs2_caps.speed_range == UHS2_DEV_CONFIG_PHY_SET_SPEED_B) {
+		if (card->uhs2_config.n_lanes == UHS2_DEV_CONFIG_2L_HD_FD &&
+		    host->uhs2_caps.n_lanes == UHS2_DEV_CONFIG_2L_HD_FD) {
+			/* Support HD */
+			host->ios.timing = MMC_TIMING_UHS2_SPEED_B_HD;
+			nMinDataGap = 1;
+		} else {
+			/* Only support 2L-FD so far */
+			host->ios.timing = MMC_TIMING_UHS2_SPEED_B;
+			nMinDataGap = 3;
+		}
+		card->uhs2_config.speed_range_set = UHS2_DEV_CONFIG_PHY_SET_SPEED_B;
+	} else {
+		if (card->uhs2_config.n_lanes == UHS2_DEV_CONFIG_2L_HD_FD &&
+		    host->uhs2_caps.n_lanes == UHS2_DEV_CONFIG_2L_HD_FD) {
+			/* Support HD */
+			host->ios.timing = MMC_TIMING_UHS2_SPEED_A_HD;
+			nMinDataGap = 1;
+		} else {
+			/* Only support 2L-FD so far */
+			host->ios.timing = MMC_TIMING_UHS2_SPEED_A;
+			nMinDataGap = 3;
+		}
+		card->uhs2_config.speed_range_set = UHS2_DEV_CONFIG_PHY_SET_SPEED_A;
+	}
+
+	uhs2_cmd.payload[0] =
+			card->uhs2_config.speed_range_set << UHS2_DEV_CONFIG_PHY_SET_SPEED_POS;
+
+	card->uhs2_config.n_lss_sync_set = (max(card->uhs2_config.n_lss_sync,
+						host->uhs2_caps.n_lss_sync) >> 2) &
+					   UHS2_DEV_CONFIG_N_LSS_SYN_MASK;
+	host->uhs2_caps.n_lss_sync_set = card->uhs2_config.n_lss_sync_set;
+
+	card->uhs2_config.n_lss_dir_set = (max(card->uhs2_config.n_lss_dir,
+					       host->uhs2_caps.n_lss_dir) >> 3) &
+					  UHS2_DEV_CONFIG_N_LSS_DIR_MASK;
+	host->uhs2_caps.n_lss_dir_set = card->uhs2_config.n_lss_dir_set;
+
+	uhs2_cmd.payload[1] = (card->uhs2_config.n_lss_dir_set << UHS2_DEV_CONFIG_N_LSS_DIR_POS) |
+			      card->uhs2_config.n_lss_sync_set;
+	uhs2_cmd.payload[0] = cpu_to_be32(uhs2_cmd.payload[0]);
+	uhs2_cmd.payload[1] = cpu_to_be32(uhs2_cmd.payload[1]);
+
+	memset(uhs2_cmd.uhs2_resp, 0, sizeof(uhs2_cmd.uhs2_resp));
+
+	sd_uhs2_cmd_assemble(&cmd, &uhs2_cmd, UHS2_CFG_WRITE_PAYLOAD_LEN,
+			     UHS2_CFG_WRITE_PHY_SET_RESP_LEN);
+
+	err = mmc_wait_for_cmd(host, &cmd, 0);
+	if (err) {
+		pr_err("%s: %s: UHS2 CMD send fail, err= 0x%x!\n",
+		       mmc_hostname(host), __func__, err);
+		return err;
+	}
+
+	if ((uhs2_cmd.uhs2_resp[2] & 0x80)) {
+		pr_err("%s: %s: UHS2 CMD not accepted, resp= 0x%x!\n",
+		       mmc_hostname(host), __func__, uhs2_cmd.uhs2_resp[2]);
+		return -EIO;
+	}
+
+	/*
+	 * Use Control Write CCMD to set LINK/TRAN Setting in Configuration Register.
+	 * - Control Write(R/W=1) with 8-Byte payload(PLEN=10b).
+	 * - IOADR = LINK/TRAN Setting Register(CFG_BASE + 00Ch)
+	 * - Payload = New contents to be written to LINK/TRAN Setting Register
+	 */
+	uhs2_cmd.arg = ((UHS2_DEV_CONFIG_LINK_TRAN_SET & 0xFF) << 8) |
+			UHS2_NATIVE_CMD_WRITE |
+			UHS2_NATIVE_CMD_PLEN_8B |
+			(UHS2_DEV_CONFIG_LINK_TRAN_SET >> 8);
+
+	if (card->uhs2_config.app_type == UHS2_DEV_CONFIG_APP_SD_MEM)
+		card->uhs2_config.maxblk_len_set = UHS2_DEV_CONFIG_LT_SET_MAX_BLK_LEN;
+	else
+		card->uhs2_config.maxblk_len_set = min(card->uhs2_config.maxblk_len,
+						       host->uhs2_caps.maxblk_len);
+	host->uhs2_caps.maxblk_len_set = card->uhs2_config.maxblk_len_set;
+
+	card->uhs2_config.n_fcu_set = min(card->uhs2_config.n_fcu, host->uhs2_caps.n_fcu);
+	host->uhs2_caps.n_fcu_set = card->uhs2_config.n_fcu_set;
+
+	card->uhs2_config.n_data_gap_set = max(nMinDataGap, card->uhs2_config.n_data_gap);
+	host->uhs2_caps.n_data_gap_set = card->uhs2_config.n_data_gap_set;
+
+	host->uhs2_caps.max_retry_set = 3;
+	card->uhs2_config.max_retry_set = host->uhs2_caps.max_retry_set;
+
+	uhs2_cmd.payload[0] =
+			(card->uhs2_config.maxblk_len_set << UHS2_DEV_CONFIG_MAX_BLK_LEN_POS) |
+			(card->uhs2_config.max_retry_set << UHS2_DEV_CONFIG_LT_SET_MAX_RETRY_POS) |
+			(card->uhs2_config.n_fcu_set << UHS2_DEV_CONFIG_N_FCU_POS);
+	uhs2_cmd.payload[1] = card->uhs2_config.n_data_gap_set;
+	uhs2_cmd.payload[0] = cpu_to_be32(uhs2_cmd.payload[0]);
+	uhs2_cmd.payload[1] = cpu_to_be32(uhs2_cmd.payload[1]);
+
+	sd_uhs2_cmd_assemble(&cmd, &uhs2_cmd, UHS2_CFG_WRITE_PAYLOAD_LEN, 0);
+
+	err = mmc_wait_for_cmd(host, &cmd, 0);
+	if (err) {
+		pr_err("%s: %s: UHS2 CMD send fail, err= 0x%x!\n",
+		       mmc_hostname(host), __func__, err);
+		return err;
+	}
+
+	/*
+	 * Use Control Write CCMD to set Config Completion(payload bit 63) in Generic Setting
+	 * Register.
+	 * Header:
+	 *      - Control Write(R/W=1) with 8-Byte payload(PLEN=10b).
+	 *      - IOADR = PGeneric Setting Register(CFG_BASE + 008h)
+	 * Payload:
+	 *      - bit [63]: Config Completion
+	 *
+	 * DLSM transits to Active state immediately when Config Completion is set to 1.
+	 */
+	uhs2_cmd.arg = ((UHS2_DEV_CONFIG_GEN_SET & 0xFF) << 8) |
+			UHS2_NATIVE_CMD_WRITE |
+			UHS2_NATIVE_CMD_PLEN_8B |
+			(UHS2_DEV_CONFIG_GEN_SET >> 8);
+
+	uhs2_cmd.payload[0] = 0;
+	uhs2_cmd.payload[1] = UHS2_DEV_CONFIG_GEN_SET_CFG_COMPLETE;
+	uhs2_cmd.payload[0] = cpu_to_be32(uhs2_cmd.payload[0]);
+	uhs2_cmd.payload[1] = cpu_to_be32(uhs2_cmd.payload[1]);
+
+	memset(uhs2_cmd.uhs2_resp, 0, sizeof(uhs2_cmd.uhs2_resp));
+	sd_uhs2_cmd_assemble(&cmd, &uhs2_cmd, UHS2_CFG_WRITE_PAYLOAD_LEN,
+			     UHS2_CFG_WRITE_GENERIC_SET_RESP_LEN);
+
+	err = mmc_wait_for_cmd(host, &cmd, 0);
+	if (err) {
+		pr_err("%s: %s: UHS2 CMD send fail, err= 0x%x!\n",
+		       mmc_hostname(host), __func__, err);
+		return err;
+	}
+
+	/* Set host Config Setting registers */
+	err = host->ops->uhs2_control(host, UHS2_SET_CONFIG);
+	if (err) {
+		pr_err("%s: %s: UHS2 SET_CONFIG fail!\n", mmc_hostname(host), __func__);
+		return err;
+	}
+
 	return 0;
 }
 
-/*
- * Initialize the UHS-II card through the SD-TRAN transport layer. This enables
- * commands/requests to be backwards compatible through the legacy SD protocol.
- * UHS-II cards has a specific power limit specified for VDD1/VDD2, that should
- * be set through a legacy CMD6. Note that, the power limit that becomes set,
- * survives a soft reset through the GO_DORMANT_STATE command.
- */
-static int sd_uhs2_legacy_init(struct mmc_host *host, struct mmc_card *card)
+static int sd_uhs2_go_dormant(struct mmc_host *host, u32 node_id)
+{
+	struct mmc_command cmd = {0};
+	struct uhs2_command uhs2_cmd = {};
+	int err;
+
+	/* Disable Normal INT */
+	err = host->ops->uhs2_control(host, UHS2_DISABLE_INT);
+	if (err) {
+		pr_err("%s: %s: UHS2 DISABLE_INT fail!\n",
+		       mmc_hostname(host), __func__);
+		return err;
+	}
+
+	/*
+	 * Refer to UHS-II Addendum Version 1.02 Figure 6-17 to see GO_DORMANT_STATE CCMD format.
+	 * Header:
+	 *      - Control Write(R/W=1) with 4-Byte payload(PLEN=01b).
+	 *      - IOADR = CMD_BASE + 001h
+	 * Payload:
+	 *      - bit [7]: HBR(Entry to Hibernate Mode)
+	 *                 1: Host intends to enter Hibernate mode during Dormant state.
+	 *                 The default setting is 0 because hibernate is currently not supported.
+	 */
+	uhs2_cmd.header = UHS2_NATIVE_PACKET | UHS2_PACKET_TYPE_CCMD | node_id;
+	uhs2_cmd.arg = ((UHS2_DEV_CMD_GO_DORMANT_STATE & 0xFF) << 8) |
+			UHS2_NATIVE_CMD_WRITE |
+			UHS2_NATIVE_CMD_PLEN_4B |
+			(UHS2_DEV_CMD_GO_DORMANT_STATE >> 8);
+
+	sd_uhs2_cmd_assemble(&cmd, &uhs2_cmd, UHS2_GO_DORMANT_PAYLOAD_LEN, 0);
+
+	err = mmc_wait_for_cmd(host, &cmd, 0);
+	if (err) {
+		pr_err("%s: %s: UHS2 CMD send fail, err= 0x%x!\n",
+		       mmc_hostname(host), __func__, err);
+		return err;
+	}
+
+	/* Check Dormant State in Present */
+	err = host->ops->uhs2_control(host, UHS2_CHECK_DORMANT);
+	if (err)
+		return err;
+
+	/* Disable UHS2 card clock */
+	err = host->ops->uhs2_control(host, UHS2_DISABLE_CLK);
+	if (err)
+		return err;
+
+	/* Restore sd clock */
+	mmc_delay(5);
+	err = host->ops->uhs2_control(host, UHS2_ENABLE_CLK);
+	if (err)
+		return err;
+
+	/* Enable Normal INT */
+	err = host->ops->uhs2_control(host, UHS2_ENABLE_INT);
+	if (err)
+		return err;
+
+	/* Detect UHS2 */
+	err = host->ops->uhs2_control(host, UHS2_PHY_INIT);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int sd_uhs2_wait_active_state_cb(void *cb_data, bool *busy)
 {
+	struct sd_uhs2_wait_active_state_data *data = cb_data;
+	struct mmc_host *host = data->host;
+	struct mmc_command *cmd = data->cmd;
+	int err;
+
+	err = mmc_wait_for_cmd(host, cmd, 0);
+	if (err)
+		return err;
+
+	if (cmd->resp[1] & UHS2_DEV_CONFIG_GEN_SET_CFG_COMPLETE)
+		*busy = false;
+	else
+		*busy = true;
+
+	return 0;
+}
+
+static int sd_uhs2_go_dormant_state(struct mmc_host *host, u32 node_id)
+{
+	struct mmc_command cmd = {0};
+	struct uhs2_command uhs2_cmd = {};
+	int err;
+	struct sd_uhs2_wait_active_state_data cb_data = {
+		.host = host,
+		.cmd = &cmd
+	};
+
+	err = sd_uhs2_go_dormant(host, node_id);
+	if (err) {
+		pr_err("%s: %s: UHS2 GO_DORMANT_STATE fail, err= 0x%x!\n",
+		       mmc_hostname(host), __func__, err);
+		return err;
+	}
+
+	/*
+	 * Use Control Read CCMD to check Config Completion(bit 63) in Generic Setting Register.
+	 * - Control Read(R/W=0) with 8-Byte payload(PLEN=10b).
+	 * - IOADR = Generic Setting Register(CFG_BASE + 008h)
+	 *
+	 * When UHS-II card been switched to new speed mode, it will set Config Completion to 1.
+	 */
+	uhs2_cmd.header = UHS2_NATIVE_PACKET | UHS2_PACKET_TYPE_CCMD | node_id;
+	uhs2_cmd.arg = ((UHS2_DEV_CONFIG_GEN_SET & 0xFF) << 8) |
+			UHS2_NATIVE_CMD_READ |
+			UHS2_NATIVE_CMD_PLEN_8B |
+			(UHS2_DEV_CONFIG_GEN_SET >> 8);
+
+	sd_uhs2_cmd_assemble(&cmd, &uhs2_cmd, 0, 0);
+	err = __mmc_poll_for_busy(host, UHS2_WAIT_CFG_COMPLETE_PERIOD_US,
+				  UHS2_WAIT_CFG_COMPLETE_TIMEOUT_MS,
+				  &sd_uhs2_wait_active_state_cb, &cb_data);
+	if (err) {
+		pr_err("%s: %s: Not switch to Active in 100 ms\n", mmc_hostname(host), __func__);
+		return err;
+	}
+
 	return 0;
 }
 
@@ -117,7 +804,7 @@ static int sd_uhs2_legacy_init(struct mmc_host *host, struct mmc_card *card)
  * Allocate the data structure for the mmc_card and run the UHS-II specific
  * initialization sequence.
  */
-static int sd_uhs2_init_card(struct mmc_host *host)
+static int sd_uhs2_init_card(struct mmc_host *host, struct mmc_card *oldcard)
 {
 	struct mmc_card *card;
 	u32 node_id = 0;
@@ -131,29 +818,204 @@ static int sd_uhs2_init_card(struct mmc_host *host)
 	if (err)
 		return err;
 
-	card = mmc_alloc_card(host, &sd_type);
-	if (IS_ERR(card))
-		return PTR_ERR(card);
+	if (oldcard) {
+		card = oldcard;
+	} else {
+		card = mmc_alloc_card(host, &sd_type);
+		if (IS_ERR(card))
+			return PTR_ERR(card);
+	}
 
 	card->uhs2_config.node_id = node_id;
 	card->type = MMC_TYPE_SD;
 
 	err = sd_uhs2_config_read(host, card);
 	if (err)
-		goto err;
+		return err;
 
 	err = sd_uhs2_config_write(host, card);
 	if (err)
-		goto err;
+		return err;
 
 	host->card = card;
+	/* If change speed to Range B, need to GO_DORMANT_STATE */
+	if (host->ios.timing == MMC_TIMING_UHS2_SPEED_B ||
+	    host->ios.timing == MMC_TIMING_UHS2_SPEED_B_HD) {
+		err = sd_uhs2_go_dormant_state(host, node_id);
+		if (err)
+			return err;
+	}
+
+	host->uhs2_sd_tran = true;
+
+	return 0;
+}
+
+/*
+ * Initialize the UHS-II card through the SD-TRAN transport layer. This enables
+ * commands/requests to be backwards compatible through the legacy SD protocol.
+ * UHS-II cards has a specific power limit specified for VDD1/VDD2, that should
+ * be set through a legacy CMD6. Note that, the power limit that becomes set,
+ * survives a soft reset through the GO_DORMANT_STATE command.
+ */
+static int sd_uhs2_legacy_init(struct mmc_host *host, struct mmc_card *card,
+			       struct mmc_card *oldcard)
+{
+	int err;
+	u32 cid[4];
+	u32 ocr;
+	u32 rocr;
+	u8  *status;
+	int ro;
+
+	/* Send CMD0 to reset SD card */
+	err = __mmc_go_idle(host);
+	if (err)
+		return err;
+
+	mmc_delay(1);
+
+	/* Send CMD8 to communicate SD interface operation condition */
+	err = mmc_send_if_cond(host, host->ocr_avail);
+	if (err) {
+		dev_warn(mmc_dev(host), "CMD8 error\n");
+		goto err;
+	}
+
+	/*
+	 * Probe SD card working voltage.
+	 */
+	err = mmc_send_app_op_cond(host, 0, &ocr);
+	if (err)
+		goto err;
+
+	card->ocr = ocr;
+
+	/*
+	 * Some SD cards claims an out of spec VDD voltage range. Let's treat
+	 * these bits as being in-valid and especially also bit7.
+	 */
+	ocr &= ~0x7FFF;
+	rocr = mmc_select_voltage(host, ocr);
+	/*
+	 * Some cards have zero value of rocr in UHS-II mode. Assign host's
+	 * ocr value to rocr.
+	 */
+	if (!rocr)
+		rocr = host->ocr_avail;
+
+	rocr |= (SD_OCR_CCS | SD_OCR_XPC);
+
+	/* Wait SD power on ready */
+	ocr = rocr;
+
+	err = mmc_send_app_op_cond(host, ocr, &rocr);
+	if (err)
+		goto err;
+
+	err = mmc_send_cid(host, cid);
+	if (err)
+		goto err;
+
+	if (oldcard) {
+		if (memcmp(cid, oldcard->raw_cid, sizeof(cid)) != 0) {
+			pr_debug("%s: Perhaps the card was replaced\n",
+				 mmc_hostname(host));
+			return -ENOENT;
+		}
+
+		card = oldcard;
+	} else {
+		memcpy(card->raw_cid, cid, sizeof(card->raw_cid));
+		mmc_decode_cid(card);
+	}
+
+	/*
+	 * For native busses:  get card RCA and quit open drain mode.
+	 */
+	err = mmc_send_relative_addr(host, &card->rca);
+	if (err)
+		goto err;
+
+	err = mmc_sd_get_csd(card, false);
+	if (err)
+		goto err;
+
+	/*
+	 * Select card, as all following commands rely on that.
+	 */
+	err = mmc_select_card(card);
+	if (err)
+		goto err;
+
+	/*
+	 * Fetch SCR from card.
+	 */
+	err = mmc_app_send_scr(card);
+	if (err)
+		goto err;
+
+	err = mmc_decode_scr(card);
+	if (err)
+		goto err;
+
+	/*
+	 * Switch to high power consumption mode.
+	 * Even switch failed, sd card can still work at lower power consumption mode, but
+	 * performance will be lower than high power consumption mode.
+	 */
+	status = kmalloc(64, GFP_KERNEL);
+	if (!status)
+		return -ENOMEM;
+
+	if (!(card->csd.cmdclass & CCC_SWITCH)) {
+		pr_warn("%s: card lacks mandatory switch function, performance might suffer\n",
+			mmc_hostname(card->host));
+	} else {
+		/*
+		 * Send CMD6 to set Maximum Power Consumption to get better
+		 * performance. Ignore errors and continue.
+		 */
+		err = mmc_sd_switch(card, 0, 3, SD4_SET_POWER_LIMIT_1_80W, status);
+		if (!err)
+			mmc_sd_switch(card, 1, 3, SD4_SET_POWER_LIMIT_1_80W, status);
+	}
+
+	/*
+	 * Check if read-only switch is active.
+	 */
+	ro = mmc_sd_get_ro(host);
+	if (ro < 0)
+		pr_warn("%s: host does not support read-only switch, assuming write-enable\n",
+			mmc_hostname(host));
+	else if (ro > 0)
+		mmc_card_set_readonly(card);
+
+	kfree(status);
 	return 0;
 
 err:
-	mmc_remove_card(card);
 	return err;
 }
 
+static int sd_uhs2_reinit(struct mmc_host *host)
+{
+	struct mmc_card *card = host->card;
+	int err;
+
+	sd_uhs2_power_up(host);
+
+	err = sd_uhs2_phy_init(host);
+	if (err)
+		return err;
+
+	err = sd_uhs2_init_card(host, card);
+	if (err)
+		return err;
+
+	return sd_uhs2_legacy_init(host, card, card);
+}
+
 static void sd_uhs2_remove(struct mmc_host *host)
 {
 	mmc_remove_card(host->card);
@@ -183,34 +1045,105 @@ static void sd_uhs2_detect(struct mmc_host *host)
 	}
 }
 
-static int sd_uhs2_suspend(struct mmc_host *host)
+static int _sd_uhs2_suspend(struct mmc_host *host)
 {
+	struct mmc_card *card = host->card;
+
+	mmc_claim_host(host);
+
+	if (mmc_card_suspended(card))
+		goto out;
+
+	sd_uhs2_power_off(host);
+	mmc_card_set_suspended(card);
+
+out:
+	mmc_release_host(host);
 	return 0;
 }
 
+/*
+ * Callback for suspend
+ */
+static int sd_uhs2_suspend(struct mmc_host *host)
+{
+	int err;
+
+	err = _sd_uhs2_suspend(host);
+	if (!err) {
+		pm_runtime_disable(&host->card->dev);
+		pm_runtime_set_suspended(&host->card->dev);
+	}
+
+	return err;
+}
+
+/*
+ * This function tries to determine if the same card is still present
+ * and, if so, restore all state to it.
+ */
+static int _mmc_sd_uhs2_resume(struct mmc_host *host)
+{
+	int err = 0;
+
+	mmc_claim_host(host);
+
+	if (!mmc_card_suspended(host->card))
+		goto out;
+
+	/* Power up UHS2 SD card and re-initialize it. */
+	err = sd_uhs2_reinit(host);
+	mmc_card_clr_suspended(host->card);
+
+out:
+	mmc_release_host(host);
+	return err;
+}
+
+/*
+ * Callback for resume
+ */
 static int sd_uhs2_resume(struct mmc_host *host)
 {
+	pm_runtime_enable(&host->card->dev);
 	return 0;
 }
 
+/*
+ * Callback for runtime_suspend.
+ */
 static int sd_uhs2_runtime_suspend(struct mmc_host *host)
 {
-	return 0;
+	int err;
+
+	if (!(host->caps & MMC_CAP_AGGRESSIVE_PM))
+		return 0;
+
+	err = _sd_uhs2_suspend(host);
+	if (err)
+		pr_err("%s: error %d doing aggressive suspend\n", mmc_hostname(host), err);
+
+	return err;
 }
 
 static int sd_uhs2_runtime_resume(struct mmc_host *host)
 {
-	return 0;
-}
+	int err;
 
-static int sd_uhs2_shutdown(struct mmc_host *host)
-{
-	return 0;
+	err = _mmc_sd_uhs2_resume(host);
+	if (err && err != -ENOMEDIUM)
+		pr_err("%s: error %d doing runtime resume\n", mmc_hostname(host), err);
+
+	return err;
 }
 
 static int sd_uhs2_hw_reset(struct mmc_host *host)
 {
-	return 0;
+	sd_uhs2_power_off(host);
+	/* Wait at least 1 ms according to SD spec */
+	mmc_delay(1);
+
+	return sd_uhs2_reinit(host);
 }
 
 static const struct mmc_bus_ops sd_uhs2_ops = {
@@ -221,7 +1154,7 @@ static const struct mmc_bus_ops sd_uhs2_ops = {
 	.resume = sd_uhs2_resume,
 	.runtime_suspend = sd_uhs2_runtime_suspend,
 	.runtime_resume = sd_uhs2_runtime_resume,
-	.shutdown = sd_uhs2_shutdown,
+	.shutdown = sd_uhs2_suspend,
 	.hw_reset = sd_uhs2_hw_reset,
 };
 
@@ -237,11 +1170,11 @@ static int sd_uhs2_attach(struct mmc_host *host)
 	if (err)
 		goto err;
 
-	err = sd_uhs2_init_card(host);
+	err = sd_uhs2_init_card(host, NULL);
 	if (err)
 		goto err;
 
-	err = sd_uhs2_legacy_init(host, host->card);
+	err = sd_uhs2_legacy_init(host, host->card, NULL);
 	if (err)
 		goto err;
 
@@ -254,21 +1187,31 @@ static int sd_uhs2_attach(struct mmc_host *host)
 		goto remove_card;
 
 	mmc_claim_host(host);
+
 	return 0;
 
 remove_card:
-	mmc_remove_card(host->card);
-	host->card = NULL;
+	sd_uhs2_remove(host);
 	mmc_claim_host(host);
-	mmc_detach_bus(host);
+
 err:
+	mmc_detach_bus(host);
 	sd_uhs2_power_off(host);
 	return err;
 }
 
+/**
+ * mmc_attach_sd_uhs2 - select UHS2 interface
+ * @host: MMC host
+ *
+ * Try to select UHS2 interface and initialize the bus for a given
+ * frequency, @freq.
+ *
+ * Return:	0 on success, non-zero error on failure
+ */
 int mmc_attach_sd_uhs2(struct mmc_host *host)
 {
-	int i, err = 0;
+	int i, err;
 
 	if (!(host->caps2 & MMC_CAP2_SD_UHS2))
 		return -EOPNOTSUPP;
@@ -292,3 +1235,75 @@ int mmc_attach_sd_uhs2(struct mmc_host *host)
 
 	return err;
 }
+
+/*
+ * mmc_uhs2_prepare_cmd - prepare for SD command packet
+ * @host:	MMC host
+ * @mrq:	MMC request
+ *
+ * Initialize and fill in a header and a payload of SD command packet.
+ * The caller should allocate uhs2_command in host->cmd->uhs2_cmd in
+ * advance.
+ *
+ * Return:	0 on success, non-zero error on failure
+ */
+void mmc_uhs2_prepare_cmd(struct mmc_host *host, struct mmc_request *mrq)
+{
+	struct mmc_command *cmd;
+	struct uhs2_command *uhs2_cmd;
+	u8 plen;
+
+	cmd = mrq->cmd;
+	cmd->uhs2_cmd = &mrq->uhs2_cmd;
+	uhs2_cmd = cmd->uhs2_cmd;
+	uhs2_cmd->header = host->card->uhs2_config.node_id;
+	if ((cmd->flags & MMC_CMD_MASK) == MMC_CMD_ADTC)
+		uhs2_cmd->header |= UHS2_PACKET_TYPE_DCMD;
+	else
+		uhs2_cmd->header |= UHS2_PACKET_TYPE_CCMD;
+
+	uhs2_cmd->arg = cmd->opcode << UHS2_SD_CMD_INDEX_POS;
+	if (host->uhs2_app_cmd) {
+		uhs2_cmd->arg |= UHS2_SD_CMD_APP;
+		host->uhs2_app_cmd = false;
+	}
+
+	/*
+	 * UHS-II Addendum 7.2.1.2
+	 * Host may set DM to 1 for DCMD which supports multi-block read/write regardless of
+	 * data transfer length (e.g., CMD18, CMD25). Otherwise, it shall not set DM to 1.
+	 * (e.g., CMD6, CMD17, CMD24). These rules are also applied to other multi-block read/write
+	 * commands defined in other Part of SD specifications (for example, Host may set DM to 1
+	 * for ACMD18 or ACMD25).
+	 */
+	if (mmc_op_multi(cmd->opcode))
+		cmd->uhs2_cmd->tmode_half_duplex = mmc_card_uhs2_hd_mode(host);
+	else
+		cmd->uhs2_cmd->tmode_half_duplex = 0;
+
+	uhs2_cmd = cmd->uhs2_cmd;
+	plen = 2; /* at the maximum */
+
+	if ((cmd->flags & MMC_CMD_MASK) == MMC_CMD_ADTC &&
+	    cmd->uhs2_cmd->tmode_half_duplex) {
+		if (mmc_card_uhs2_hd_mode(host))
+			uhs2_cmd->arg |= UHS2_DCMD_2L_HD_MODE;
+
+		uhs2_cmd->arg |= UHS2_DCMD_LM_TLEN_EXIST;
+
+		if (cmd->data->blocks == 1 &&
+		    cmd->data->blksz != 512 &&
+		    cmd->opcode != MMC_READ_SINGLE_BLOCK &&
+		    cmd->opcode != MMC_WRITE_BLOCK) {
+			uhs2_cmd->arg |= UHS2_DCMD_TLUM_BYTE_MODE;
+			uhs2_cmd->payload[1] = cpu_to_be32(cmd->data->blksz);
+		} else {
+			uhs2_cmd->payload[1] = cpu_to_be32(cmd->data->blocks);
+		}
+	} else {
+		plen = 1;
+	}
+
+	uhs2_cmd->payload[0] = cpu_to_be32(cmd->arg);
+	sd_uhs2_cmd_assemble(cmd, uhs2_cmd, plen, 0);
+}
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index a890a71288ef..56972bd78462 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -11,6 +11,20 @@
 struct mmc_data;
 struct mmc_request;
 
+#define UHS2_MAX_PAYLOAD_LEN 2
+#define UHS2_MAX_RESP_LEN 20
+
+struct uhs2_command {
+	u16	header;
+	u16	arg;
+	__be32	payload[UHS2_MAX_PAYLOAD_LEN];
+	u8	payload_len;
+	u8	packet_len;
+	u8	tmode_half_duplex;
+	u8	uhs2_resp[UHS2_MAX_RESP_LEN];	/* UHS2 native cmd resp */
+	u8	uhs2_resp_len;			/* UHS2 native cmd resp len */
+};
+
 struct mmc_command {
 	u32			opcode;
 	u32			arg;
@@ -97,6 +111,8 @@ struct mmc_command {
 	struct mmc_data		*data;		/* data segment associated with cmd */
 	struct mmc_request	*mrq;		/* associated request */
 
+	struct uhs2_command	*uhs2_cmd;	/* UHS2 command */
+
 	/* for SDUC */
 	bool has_ext_addr;
 	u8 ext_addr;
@@ -158,6 +174,7 @@ struct mmc_request {
 	const struct bio_crypt_ctx *crypto_ctx;
 	int			crypto_key_slot;
 #endif
+	struct uhs2_command	uhs2_cmd;
 };
 
 struct mmc_card;
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 0980d06ed419..f166d6611ddb 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -127,6 +127,13 @@ struct sd_uhs2_caps {
 };
 
 enum sd_uhs2_operation {
+	UHS2_PHY_INIT = 0,
+	UHS2_SET_CONFIG,
+	UHS2_ENABLE_INT,
+	UHS2_DISABLE_INT,
+	UHS2_ENABLE_CLK,
+	UHS2_DISABLE_CLK,
+	UHS2_CHECK_DORMANT,
 	UHS2_SET_IOS,
 };
 
@@ -453,6 +460,8 @@ struct mmc_host {
 #endif
 #define MMC_CAP2_ALT_GPT_TEGRA	(1 << 28)	/* Host with eMMC that has GPT entry at a non-standard location */
 
+	bool			uhs2_sd_tran;	/* UHS-II flag for SD_TRAN state */
+	bool			uhs2_app_cmd;	/* UHS-II flag for APP command */
 	struct sd_uhs2_caps	uhs2_caps;	/* Host UHS-II capabilities */
 
 	int			fixed_drv_type;	/* fixed driver type for non-removable media */
@@ -714,6 +723,12 @@ static inline void mmc_debugfs_err_stats_inc(struct mmc_host *host,
 	host->err_stats[stat] += 1;
 }
 
+static inline int mmc_card_uhs2_hd_mode(struct mmc_host *host)
+{
+	return host->ios.timing == MMC_TIMING_UHS2_SPEED_A_HD ||
+	       host->ios.timing == MMC_TIMING_UHS2_SPEED_B_HD;
+}
+
 int mmc_sd_switch(struct mmc_card *card, bool mode, int group,
 		u8 value, u8 *resp);
 int mmc_send_status(struct mmc_card *card, u32 *status);
-- 
cgit v1.2.3


From a5a98a786e5e31e85a69d30935254e8730734706 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 14 Oct 2024 16:59:46 +0200
Subject: thermal: core: Add and use cooling device guard

Add and use a special guard for cooling devices.

This allows quite a few error code paths to be simplified among
other things and brings in code size reduction for a good measure.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://patch.msgid.link/5837621.DvuYhMxLoT@rjwysocki.net
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
---
 drivers/thermal/gov_power_allocator.c | 23 ++++++++--------
 drivers/thermal/gov_step_wise.c       |  6 ++---
 drivers/thermal/thermal_core.c        | 17 ++++--------
 drivers/thermal/thermal_debugfs.c     | 25 ++++++++++-------
 drivers/thermal/thermal_helpers.c     | 19 ++++---------
 drivers/thermal/thermal_sysfs.c       | 51 +++++++++++++----------------------
 include/linux/thermal.h               |  3 +++
 7 files changed, 60 insertions(+), 84 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c
index bdae60001b2c..5cb03923fa8f 100644
--- a/drivers/thermal/gov_power_allocator.c
+++ b/drivers/thermal/gov_power_allocator.c
@@ -549,18 +549,17 @@ static void allow_maximum_power(struct thermal_zone_device *tz)
 		cdev = instance->cdev;
 
 		instance->target = 0;
-		mutex_lock(&cdev->lock);
-		/*
-		 * Call for updating the cooling devices local stats and avoid
-		 * periods of dozen of seconds when those have not been
-		 * maintained.
-		 */
-		cdev->ops->get_requested_power(cdev, &req_power);
-
-		if (params->update_cdevs)
-			__thermal_cdev_update(cdev);
-
-		mutex_unlock(&cdev->lock);
+		scoped_guard(cooling_dev, cdev) {
+			/*
+			 * Call for updating the cooling devices local stats and
+			 * avoid periods of dozen of seconds when those have not
+			 * been maintained.
+			 */
+			cdev->ops->get_requested_power(cdev, &req_power);
+
+			if (params->update_cdevs)
+				__thermal_cdev_update(cdev);
+		}
 	}
 }
 
diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c
index ea4bf88d37f3..d1bb59f1dfbd 100644
--- a/drivers/thermal/gov_step_wise.c
+++ b/drivers/thermal/gov_step_wise.c
@@ -97,9 +97,9 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz,
 
 		instance->initialized = true;
 
-		mutex_lock(&instance->cdev->lock);
-		instance->cdev->updated = false; /* cdev needs update */
-		mutex_unlock(&instance->cdev->lock);
+		scoped_guard(cooling_dev, instance->cdev) {
+			instance->cdev->updated = false; /* cdev needs update */
+		}
 	}
 }
 
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index ab0c7c6396c0..f98aa396787a 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -758,12 +758,10 @@ static int thermal_instance_add(struct thermal_instance *new_instance,
 
 	list_add_tail(&new_instance->trip_node, &td->thermal_instances);
 
-	mutex_lock(&cdev->lock);
+	guard(cooling_dev)(cdev);
 
 	list_add_tail(&new_instance->cdev_node, &cdev->thermal_instances);
 
-	mutex_unlock(&cdev->lock);
-
 	return 0;
 }
 
@@ -872,11 +870,9 @@ static void thermal_instance_delete(struct thermal_instance *instance)
 {
 	list_del(&instance->trip_node);
 
-	mutex_lock(&instance->cdev->lock);
+	guard(cooling_dev)(instance->cdev);
 
 	list_del(&instance->cdev_node);
-
-	mutex_unlock(&instance->cdev->lock);
 }
 
 /**
@@ -1239,10 +1235,10 @@ void thermal_cooling_device_update(struct thermal_cooling_device *cdev)
 	 * Update under the cdev lock to prevent the state from being set beyond
 	 * the new limit concurrently.
 	 */
-	mutex_lock(&cdev->lock);
+	guard(cooling_dev)(cdev);
 
 	if (cdev->ops->get_max_state(cdev, &cdev->max_state))
-		goto unlock;
+		return;
 
 	thermal_cooling_device_stats_reinit(cdev);
 
@@ -1269,12 +1265,9 @@ void thermal_cooling_device_update(struct thermal_cooling_device *cdev)
 	}
 
 	if (cdev->ops->get_cur_state(cdev, &state) || state > cdev->max_state)
-		goto unlock;
+		return;
 
 	thermal_cooling_device_stats_update(cdev, state);
-
-unlock:
-	mutex_unlock(&cdev->lock);
 }
 EXPORT_SYMBOL_GPL(thermal_cooling_device_update);
 
diff --git a/drivers/thermal/thermal_debugfs.c b/drivers/thermal/thermal_debugfs.c
index d67021aeb136..c800504c3cfe 100644
--- a/drivers/thermal/thermal_debugfs.c
+++ b/drivers/thermal/thermal_debugfs.c
@@ -516,6 +516,19 @@ void thermal_debug_cdev_add(struct thermal_cooling_device *cdev, int state)
 	cdev->debugfs = thermal_dbg;
 }
 
+static struct thermal_debugfs *thermal_debug_cdev_clear(struct thermal_cooling_device *cdev)
+{
+	struct thermal_debugfs *thermal_dbg;
+
+	guard(cooling_dev)(cdev);
+
+	thermal_dbg = cdev->debugfs;
+	if (thermal_dbg)
+		cdev->debugfs = NULL;
+
+	return thermal_dbg;
+}
+
 /**
  * thermal_debug_cdev_remove - Remove a cooling device debugfs entry
  *
@@ -527,17 +540,9 @@ void thermal_debug_cdev_remove(struct thermal_cooling_device *cdev)
 {
 	struct thermal_debugfs *thermal_dbg;
 
-	mutex_lock(&cdev->lock);
-
-	thermal_dbg = cdev->debugfs;
-	if (!thermal_dbg) {
-		mutex_unlock(&cdev->lock);
+	thermal_dbg = thermal_debug_cdev_clear(cdev);
+	if (!thermal_dbg)
 		return;
-	}
-
-	cdev->debugfs = NULL;
-
-	mutex_unlock(&cdev->lock);
 
 	mutex_lock(&thermal_dbg->lock);
 
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index 030a4e4bc824..b1152ad7acc9 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -58,17 +58,10 @@ bool thermal_trip_is_bound_to_cdev(struct thermal_zone_device *tz,
 				   const struct thermal_trip *trip,
 				   struct thermal_cooling_device *cdev)
 {
-	bool ret;
-
 	guard(thermal_zone)(tz);
+	guard(cooling_dev)(cdev);
 
-	mutex_lock(&cdev->lock);
-
-	ret = thermal_instance_present(tz, cdev, trip);
-
-	mutex_unlock(&cdev->lock);
-
-	return ret;
+	return thermal_instance_present(tz, cdev, trip);
 }
 EXPORT_SYMBOL_GPL(thermal_trip_is_bound_to_cdev);
 
@@ -197,12 +190,12 @@ void __thermal_cdev_update(struct thermal_cooling_device *cdev)
  */
 void thermal_cdev_update(struct thermal_cooling_device *cdev)
 {
-	mutex_lock(&cdev->lock);
+	guard(cooling_dev)(cdev);
+
 	if (!cdev->updated) {
 		__thermal_cdev_update(cdev);
 		cdev->updated = true;
 	}
-	mutex_unlock(&cdev->lock);
 }
 
 /**
@@ -211,11 +204,9 @@ void thermal_cdev_update(struct thermal_cooling_device *cdev)
  */
 void thermal_cdev_update_nocheck(struct thermal_cooling_device *cdev)
 {
-	mutex_lock(&cdev->lock);
+	guard(cooling_dev)(cdev);
 
 	__thermal_cdev_update(cdev);
-
-	mutex_unlock(&cdev->lock);
 }
 
 /**
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index 701607a953ff..24b9055a0b6c 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -544,14 +544,15 @@ cur_state_store(struct device *dev, struct device_attribute *attr,
 	if (state > cdev->max_state)
 		return -EINVAL;
 
-	mutex_lock(&cdev->lock);
+	guard(cooling_dev)(cdev);
 
 	result = cdev->ops->set_cur_state(cdev, state);
-	if (!result)
-		thermal_cooling_device_stats_update(cdev, state);
+	if (result)
+		return result;
+
+	thermal_cooling_device_stats_update(cdev, state);
 
-	mutex_unlock(&cdev->lock);
-	return result ? result : count;
+	return count;
 }
 
 static struct device_attribute
@@ -625,21 +626,18 @@ static ssize_t total_trans_show(struct device *dev,
 {
 	struct thermal_cooling_device *cdev = to_cooling_device(dev);
 	struct cooling_dev_stats *stats;
-	int ret = 0;
+	int ret;
 
-	mutex_lock(&cdev->lock);
+	guard(cooling_dev)(cdev);
 
 	stats = cdev->stats;
 	if (!stats)
-		goto unlock;
+		return 0;
 
 	spin_lock(&stats->lock);
 	ret = sprintf(buf, "%u\n", stats->total_trans);
 	spin_unlock(&stats->lock);
 
-unlock:
-	mutex_unlock(&cdev->lock);
-
 	return ret;
 }
 
@@ -652,11 +650,11 @@ time_in_state_ms_show(struct device *dev, struct device_attribute *attr,
 	ssize_t len = 0;
 	int i;
 
-	mutex_lock(&cdev->lock);
+	guard(cooling_dev)(cdev);
 
 	stats = cdev->stats;
 	if (!stats)
-		goto unlock;
+		return 0;
 
 	spin_lock(&stats->lock);
 
@@ -668,9 +666,6 @@ time_in_state_ms_show(struct device *dev, struct device_attribute *attr,
 	}
 	spin_unlock(&stats->lock);
 
-unlock:
-	mutex_unlock(&cdev->lock);
-
 	return len;
 }
 
@@ -682,11 +677,11 @@ reset_store(struct device *dev, struct device_attribute *attr, const char *buf,
 	struct cooling_dev_stats *stats;
 	int i, states;
 
-	mutex_lock(&cdev->lock);
+	guard(cooling_dev)(cdev);
 
 	stats = cdev->stats;
 	if (!stats)
-		goto unlock;
+		return count;
 
 	states = cdev->max_state + 1;
 
@@ -702,9 +697,6 @@ reset_store(struct device *dev, struct device_attribute *attr, const char *buf,
 
 	spin_unlock(&stats->lock);
 
-unlock:
-	mutex_unlock(&cdev->lock);
-
 	return count;
 }
 
@@ -716,13 +708,11 @@ static ssize_t trans_table_show(struct device *dev,
 	ssize_t len = 0;
 	int i, j;
 
-	mutex_lock(&cdev->lock);
+	guard(cooling_dev)(cdev);
 
 	stats = cdev->stats;
-	if (!stats) {
-		len = -ENODATA;
-		goto unlock;
-	}
+	if (!stats)
+		return -ENODATA;
 
 	len += snprintf(buf + len, PAGE_SIZE - len, " From  :    To\n");
 	len += snprintf(buf + len, PAGE_SIZE - len, "       : ");
@@ -731,10 +721,8 @@ static ssize_t trans_table_show(struct device *dev,
 			break;
 		len += snprintf(buf + len, PAGE_SIZE - len, "state%2u  ", i);
 	}
-	if (len >= PAGE_SIZE) {
-		len = PAGE_SIZE;
-		goto unlock;
-	}
+	if (len >= PAGE_SIZE)
+		return PAGE_SIZE;
 
 	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
 
@@ -760,9 +748,6 @@ static ssize_t trans_table_show(struct device *dev,
 		len = -EFBIG;
 	}
 
-unlock:
-	mutex_unlock(&cdev->lock);
-
 	return len;
 }
 
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index bcaa92732e14..754802478b96 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -140,6 +140,9 @@ struct thermal_cooling_device {
 #endif
 };
 
+DEFINE_GUARD(cooling_dev, struct thermal_cooling_device *, mutex_lock(&_T->lock),
+	     mutex_unlock(&_T->lock))
+
 /* Structure to define Thermal Zone parameters */
 struct thermal_zone_params {
 	const char *governor_name;
-- 
cgit v1.2.3


From a6021aa24f6417416d93318bbfa022ab229c33c8 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 11 Oct 2024 06:18:17 +0000
Subject: ACPI: EC: make EC support compile-time conditional

The embedded controller code is mainly used on x86 laptops and cannot
work without PC style I/O port access.

Make this a user-visible configuration option that is default enabled
on x86 but otherwise disabled, and that can never be enabled unless
CONFIG_HAS_IOPORT is also available.

The empty stubs in internal.h help ignore the EC code in configurations
that don't support it. In order to see those stubs, the sbshc code also
has to include this header and drop duplicate declarations.

All the direct callers of ec_read/ec_write already had an x86
dependency and now also need to depend on APCI_EC.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Hans de Goede <hdegoede@redhat.com>
Link: https://patch.msgid.link/20241011061948.3211423-1-arnd@kernel.org
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/Kconfig               | 11 ++++++++++-
 drivers/acpi/Makefile              |  2 +-
 drivers/acpi/internal.h            | 25 +++++++++++++++++++++++++
 drivers/acpi/sbshc.c               |  9 +--------
 drivers/char/Kconfig               |  1 +
 drivers/hwmon/Kconfig              |  3 ++-
 drivers/platform/x86/Kconfig       | 22 ++++++++++++----------
 drivers/platform/x86/dell/Kconfig  |  1 +
 drivers/platform/x86/hp/Kconfig    |  1 +
 drivers/platform/x86/intel/Kconfig |  2 +-
 include/linux/acpi.h               |  8 ++++++--
 11 files changed, 61 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index d67f63d93b2a..d65cd08ba8e1 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -132,8 +132,17 @@ config ACPI_REV_OVERRIDE_POSSIBLE
 	  makes it possible to force the kernel to return "5" as the supported
 	  ACPI revision via the "acpi_rev_override" command line switch.
 
+config ACPI_EC
+	bool "Embedded Controller"
+	depends on HAS_IOPORT
+	default X86
+	help
+	  This driver handles communication with the microcontroller
+	  on many x86 laptops and other machines.
+
 config ACPI_EC_DEBUGFS
 	tristate "EC read/write access through /sys/kernel/debug/ec"
+	depends on ACPI_EC
 	help
 	  Say N to disable Embedded Controller /sys/kernel/debug interface
 
@@ -433,7 +442,7 @@ config ACPI_HOTPLUG_IOAPIC
 
 config ACPI_SBS
 	tristate "Smart Battery System"
-	depends on X86
+	depends on X86 && ACPI_EC
 	select POWER_SUPPLY
 	help
 	  This driver supports the Smart Battery System, another
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 61ca4afe83dc..40208a0f5dfb 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -41,7 +41,7 @@ acpi-y				+= resource.o
 acpi-y				+= acpi_processor.o
 acpi-y				+= processor_core.o
 acpi-$(CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC) += processor_pdc.o
-acpi-y				+= ec.o
+acpi-$(CONFIG_ACPI_EC)		+= ec.o
 acpi-$(CONFIG_ACPI_DOCK)	+= dock.o
 acpi-$(CONFIG_PCI)		+= pci_root.o pci_link.o pci_irq.o
 obj-$(CONFIG_ACPI_MCFG)		+= pci_mcfg.o
diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index ced7dff9a5db..00910ccd7eda 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -215,6 +215,8 @@ extern struct acpi_ec *first_ec;
 /* External interfaces use first EC only, so remember */
 typedef int (*acpi_ec_query_func) (void *data);
 
+#ifdef CONFIG_ACPI_EC
+
 void acpi_ec_init(void);
 void acpi_ec_ecdt_probe(void);
 void acpi_ec_dsdt_probe(void);
@@ -231,6 +233,29 @@ void acpi_ec_flush_work(void);
 bool acpi_ec_dispatch_gpe(void);
 #endif
 
+#else
+
+static inline void acpi_ec_init(void) {}
+static inline void acpi_ec_ecdt_probe(void) {}
+static inline void acpi_ec_dsdt_probe(void) {}
+static inline void acpi_ec_block_transactions(void) {}
+static inline void acpi_ec_unblock_transactions(void) {}
+static inline int acpi_ec_add_query_handler(struct acpi_ec *ec, u8 query_bit,
+			      acpi_handle handle, acpi_ec_query_func func,
+			      void *data)
+{
+	return -ENXIO;
+}
+static inline void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit) {}
+static inline void acpi_ec_register_opregions(struct acpi_device *adev) {}
+
+static inline void acpi_ec_flush_work(void) {}
+static inline bool acpi_ec_dispatch_gpe(void)
+{
+	return false;
+}
+
+#endif
 
 /*--------------------------------------------------------------------------
                                   Suspend/Resume
diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c
index 16f2daaa2c45..2b63cd18cca2 100644
--- a/drivers/acpi/sbshc.c
+++ b/drivers/acpi/sbshc.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include "sbshc.h"
+#include "internal.h"
 
 #define ACPI_SMB_HC_CLASS	"smbus_host_ctl"
 #define ACPI_SMB_HC_DEVICE_NAME	"ACPI SMBus HC"
@@ -236,12 +237,6 @@ static int smbus_alarm(void *context)
 	return 0;
 }
 
-typedef int (*acpi_ec_query_func) (void *data);
-
-extern int acpi_ec_add_query_handler(struct acpi_ec *ec, u8 query_bit,
-			      acpi_handle handle, acpi_ec_query_func func,
-			      void *data);
-
 static int acpi_smbus_hc_add(struct acpi_device *device)
 {
 	int status;
@@ -278,8 +273,6 @@ static int acpi_smbus_hc_add(struct acpi_device *device)
 	return 0;
 }
 
-extern void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit);
-
 static void acpi_smbus_hc_remove(struct acpi_device *device)
 {
 	struct acpi_smb_hc *hc;
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 7c8dd0abcfdf..8fb33c90482f 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -238,6 +238,7 @@ config APPLICOM
 config SONYPI
 	tristate "Sony Vaio Programmable I/O Control Device support"
 	depends on X86_32 && PCI && INPUT
+	depends on ACPI_EC || !ACPI
 	help
 	  This driver enables access to the Sony Programmable I/O Control
 	  Device which can be found in many (all ?) Sony Vaio laptops.
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 08a3c863f80a..1e3a1f4d5c5e 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -1752,7 +1752,7 @@ source "drivers/hwmon/occ/Kconfig"
 
 config SENSORS_OXP
 	tristate "OneXPlayer EC fan control"
-	depends on ACPI
+	depends on ACPI_EC
 	depends on X86
 	help
 		If you say yes here you get support for fan readings and control over
@@ -2592,6 +2592,7 @@ config SENSORS_ASUS_WMI
 config SENSORS_ASUS_EC
 	tristate "ASUS EC Sensors"
 	depends on X86
+	depends on ACPI_EC
 	help
 	  If you say yes here you get support for the ACPI embedded controller
 	  hardware monitoring interface found in ASUS motherboards. The driver
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 3875abba5a79..0258dd879d64 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -52,6 +52,7 @@ config WMI_BMOF
 config HUAWEI_WMI
 	tristate "Huawei WMI laptop extras driver"
 	depends on ACPI_BATTERY
+	depends on ACPI_EC
 	depends on ACPI_WMI
 	depends on INPUT
 	select INPUT_SPARSEKMAP
@@ -147,7 +148,7 @@ config YT2_1380
 
 config ACERHDF
 	tristate "Acer Aspire One temperature and fan driver"
-	depends on ACPI && THERMAL
+	depends on ACPI_EC && THERMAL
 	select THERMAL_GOV_BANG_BANG
 	help
 	  This is a driver for Acer Aspire One netbooks. It allows to access
@@ -186,6 +187,7 @@ config ACER_WMI
 	depends on SERIO_I8042
 	depends on INPUT
 	depends on RFKILL || RFKILL = n
+	depends on ACPI_EC
 	depends on ACPI_WMI
 	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	depends on HWMON
@@ -334,7 +336,7 @@ config MERAKI_MX100
 
 config EEEPC_LAPTOP
 	tristate "Eee PC Hotkey Driver"
-	depends on ACPI
+	depends on ACPI_EC
 	depends on INPUT
 	depends on RFKILL || RFKILL = n
 	depends on ACPI_VIDEO || ACPI_VIDEO = n
@@ -503,7 +505,7 @@ config SENSORS_HDAPS
 
 config THINKPAD_ACPI
 	tristate "ThinkPad ACPI Laptop Extras"
-	depends on ACPI
+	depends on ACPI_EC
 	depends on ACPI_BATTERY
 	depends on INPUT
 	depends on RFKILL || RFKILL = n
@@ -682,7 +684,7 @@ config MEEGOPAD_ANX7428
 
 config MSI_EC
 	tristate "MSI EC Extras"
-	depends on ACPI
+	depends on ACPI_EC
 	depends on ACPI_BATTERY
 	help
 	  This driver allows various MSI laptops' functionalities to be
@@ -690,7 +692,7 @@ config MSI_EC
 
 config MSI_LAPTOP
 	tristate "MSI Laptop Extras"
-	depends on ACPI
+	depends on ACPI_EC
 	depends on BACKLIGHT_CLASS_DEVICE
 	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	depends on RFKILL
@@ -796,7 +798,7 @@ config SAMSUNG_LAPTOP
 
 config SAMSUNG_Q10
 	tristate "Samsung Q10 Extras"
-	depends on ACPI
+	depends on ACPI_EC
 	select BACKLIGHT_CLASS_DEVICE
 	help
 	  This driver provides support for backlight control on Samsung Q10
@@ -804,7 +806,7 @@ config SAMSUNG_Q10
 
 config ACPI_TOSHIBA
 	tristate "Toshiba Laptop Extras"
-	depends on ACPI
+	depends on ACPI_EC
 	depends on ACPI_BATTERY
 	depends on ACPI_WMI
 	select LEDS_CLASS
@@ -904,7 +906,7 @@ config ACPI_CMPC
 
 config COMPAL_LAPTOP
 	tristate "Compal (and others) Laptop Extras"
-	depends on ACPI
+	depends on ACPI_EC
 	depends on BACKLIGHT_CLASS_DEVICE
 	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	depends on RFKILL
@@ -949,7 +951,7 @@ config PANASONIC_LAPTOP
 
 config SONY_LAPTOP
 	tristate "Sony Laptop Extras"
-	depends on ACPI
+	depends on ACPI_EC
 	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	depends on BACKLIGHT_CLASS_DEVICE
 	depends on INPUT
@@ -972,7 +974,7 @@ config SONYPI_COMPAT
 
 config SYSTEM76_ACPI
 	tristate "System76 ACPI Driver"
-	depends on ACPI
+	depends on ACPI_EC
 	depends on ACPI_BATTERY
 	depends on HWMON
 	depends on INPUT
diff --git a/drivers/platform/x86/dell/Kconfig b/drivers/platform/x86/dell/Kconfig
index 68a49788a396..dc21227dd66e 100644
--- a/drivers/platform/x86/dell/Kconfig
+++ b/drivers/platform/x86/dell/Kconfig
@@ -194,6 +194,7 @@ config DELL_WMI
 config DELL_WMI_PRIVACY
 	bool "Dell WMI Hardware Privacy Support"
 	depends on DELL_WMI
+	depends on ACPI_EC
 	help
 	  This option adds integration with the "Dell Hardware Privacy"
 	  feature of Dell laptops to the dell-wmi driver.
diff --git a/drivers/platform/x86/hp/Kconfig b/drivers/platform/x86/hp/Kconfig
index d776761cd5fd..dd51491b9bcd 100644
--- a/drivers/platform/x86/hp/Kconfig
+++ b/drivers/platform/x86/hp/Kconfig
@@ -37,6 +37,7 @@ config HP_ACCEL
 config HP_WMI
 	tristate "HP WMI extras"
 	default m
+	depends on ACPI_EC
 	depends on ACPI_WMI
 	depends on INPUT
 	depends on RFKILL || RFKILL = n
diff --git a/drivers/platform/x86/intel/Kconfig b/drivers/platform/x86/intel/Kconfig
index ad50bbabec61..eb698dcb9af9 100644
--- a/drivers/platform/x86/intel/Kconfig
+++ b/drivers/platform/x86/intel/Kconfig
@@ -62,7 +62,7 @@ config INTEL_INT0002_VGPIO
 
 config INTEL_OAKTRAIL
 	tristate "Intel Oaktrail Platform Extras"
-	depends on ACPI
+	depends on ACPI_EC
 	depends on ACPI_VIDEO || ACPI_VIDEO=n
 	depends on RFKILL && BACKLIGHT_CLASS_DEVICE && ACPI
 	help
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4d5ee84c468b..7dd24acd9ffe 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1164,8 +1164,6 @@ int acpi_subsys_suspend_noirq(struct device *dev);
 int acpi_subsys_suspend(struct device *dev);
 int acpi_subsys_freeze(struct device *dev);
 int acpi_subsys_poweroff(struct device *dev);
-void acpi_ec_mark_gpe_for_wake(void);
-void acpi_ec_set_gpe_wake_mask(u8 action);
 int acpi_subsys_restore_early(struct device *dev);
 #else
 static inline int acpi_subsys_prepare(struct device *dev) { return 0; }
@@ -1176,6 +1174,12 @@ static inline int acpi_subsys_suspend(struct device *dev) { return 0; }
 static inline int acpi_subsys_freeze(struct device *dev) { return 0; }
 static inline int acpi_subsys_poweroff(struct device *dev) { return 0; }
 static inline int acpi_subsys_restore_early(struct device *dev) { return 0; }
+#endif
+
+#if defined(CONFIG_ACPI_EC) && defined(CONFIG_PM_SLEEP)
+void acpi_ec_mark_gpe_for_wake(void);
+void acpi_ec_set_gpe_wake_mask(u8 action);
+#else
 static inline void acpi_ec_mark_gpe_for_wake(void) {}
 static inline void acpi_ec_set_gpe_wake_mask(u8 action) {}
 #endif
-- 
cgit v1.2.3


From 1cb80d9e93f861018fabe81a69ea0ded20f5a2d0 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Wed, 23 Oct 2024 16:47:48 -0700
Subject: bpf: Support __uptr type tag in BTF
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces the "__uptr" type tag to BTF. It is to define
a pointer pointing to the user space memory. This patch adds BTF
logic to pass the "__uptr" type tag.

btf_find_kptr() is reused for the "__uptr" tag. The "__uptr" will only
be supported in the map_value of the task storage map. However,
btf_parse_struct_meta() also uses btf_find_kptr() but it is not
interested in "__uptr". This patch adds a "field_mask" argument
to btf_find_kptr() which will return BTF_FIELD_IGNORE if the
caller is not interested in a “__uptr” field.

btf_parse_kptr() is also reused to parse the uptr.
The btf_check_and_fixup_fields() is changed to do extra
checks on the uptr to ensure that its struct size is not larger
than PAGE_SIZE. It is not clear how a uptr pointing to a CO-RE
supported kernel struct will be used, so it is also not allowed now.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20241023234759.860539-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  |  5 +++++
 kernel/bpf/btf.c     | 34 +++++++++++++++++++++++++++++-----
 kernel/bpf/syscall.c |  2 ++
 3 files changed, 36 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0c216e71cec7..bb31bc6d0c4d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -203,6 +203,7 @@ enum btf_field_type {
 	BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD,
 	BPF_REFCOUNT   = (1 << 9),
 	BPF_WORKQUEUE  = (1 << 10),
+	BPF_UPTR       = (1 << 11),
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -322,6 +323,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 		return "kptr";
 	case BPF_KPTR_PERCPU:
 		return "percpu_kptr";
+	case BPF_UPTR:
+		return "uptr";
 	case BPF_LIST_HEAD:
 		return "bpf_list_head";
 	case BPF_LIST_NODE:
@@ -350,6 +353,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 	case BPF_KPTR_PERCPU:
+	case BPF_UPTR:
 		return sizeof(u64);
 	case BPF_LIST_HEAD:
 		return sizeof(struct bpf_list_head);
@@ -379,6 +383,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 	case BPF_KPTR_PERCPU:
+	case BPF_UPTR:
 		return __alignof__(u64);
 	case BPF_LIST_HEAD:
 		return __alignof__(struct bpf_list_head);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 13dd1fa1d1b9..76cafff2d99c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3334,7 +3334,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
 }
 
 static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
-			 u32 off, int sz, struct btf_field_info *info)
+			 u32 off, int sz, struct btf_field_info *info, u32 field_mask)
 {
 	enum btf_field_type type;
 	u32 res_id;
@@ -3358,9 +3358,14 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
 		type = BPF_KPTR_REF;
 	else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
 		type = BPF_KPTR_PERCPU;
+	else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off)))
+		type = BPF_UPTR;
 	else
 		return -EINVAL;
 
+	if (!(type & field_mask))
+		return BTF_FIELD_IGNORE;
+
 	/* Get the base type */
 	t = btf_type_skip_modifiers(btf, t->type, &res_id);
 	/* Only pointer to struct is allowed */
@@ -3502,7 +3507,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_
 	field_mask_test_name(BPF_REFCOUNT,  "bpf_refcount");
 
 	/* Only return BPF_KPTR when all other types with matchable names fail */
-	if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) {
+	if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
 		type = BPF_KPTR_REF;
 		goto end;
 	}
@@ -3535,6 +3540,7 @@ static int btf_repeat_fields(struct btf_field_info *info,
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
 		case BPF_KPTR_PERCPU:
+		case BPF_UPTR:
 		case BPF_LIST_HEAD:
 		case BPF_RB_ROOT:
 			break;
@@ -3661,8 +3667,9 @@ static int btf_find_field_one(const struct btf *btf,
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 	case BPF_KPTR_PERCPU:
+	case BPF_UPTR:
 		ret = btf_find_kptr(btf, var_type, off, sz,
-				    info_cnt ? &info[0] : &tmp);
+				    info_cnt ? &info[0] : &tmp, field_mask);
 		if (ret < 0)
 			return ret;
 		break;
@@ -3985,6 +3992,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
 		case BPF_KPTR_PERCPU:
+		case BPF_UPTR:
 			ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
 			if (ret < 0)
 				goto end;
@@ -4044,12 +4052,28 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 	 * Hence we only need to ensure that bpf_{list_head,rb_root} ownership
 	 * does not form cycles.
 	 */
-	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT))
+	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR)))
 		return 0;
 	for (i = 0; i < rec->cnt; i++) {
 		struct btf_struct_meta *meta;
+		const struct btf_type *t;
 		u32 btf_id;
 
+		if (rec->fields[i].type == BPF_UPTR) {
+			/* The uptr only supports pinning one page and cannot
+			 * point to a kernel struct
+			 */
+			if (btf_is_kernel(rec->fields[i].kptr.btf))
+				return -EINVAL;
+			t = btf_type_by_id(rec->fields[i].kptr.btf,
+					   rec->fields[i].kptr.btf_id);
+			if (!t->size)
+				return -EINVAL;
+			if (t->size > PAGE_SIZE)
+				return -E2BIG;
+			continue;
+		}
+
 		if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
 			continue;
 		btf_id = rec->fields[i].graph_root.value_btf_id;
@@ -5560,7 +5584,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 			goto free_aof;
 		}
 
-		ret = btf_find_kptr(btf, t, 0, 0, &tmp);
+		ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR);
 		if (ret != BTF_FIELD_FOUND)
 			continue;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4d04d4d9c1f3..2d2935d9c096 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -548,6 +548,7 @@ void btf_record_free(struct btf_record *rec)
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
 		case BPF_KPTR_PERCPU:
+		case BPF_UPTR:
 			if (rec->fields[i].kptr.module)
 				module_put(rec->fields[i].kptr.module);
 			if (btf_is_kernel(rec->fields[i].kptr.btf))
@@ -597,6 +598,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
 		case BPF_KPTR_PERCPU:
+		case BPF_UPTR:
 			if (btf_is_kernel(fields[i].kptr.btf))
 				btf_get(fields[i].kptr.btf);
 			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
-- 
cgit v1.2.3


From b9a5a07aeaa2a903fb1306eb422880b2fa5f937f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 23 Oct 2024 16:47:50 -0700
Subject: bpf: Add "bool swap_uptrs" arg to bpf_local_storage_update() and
 bpf_selem_alloc()

In a later patch, the task local storage will only accept uptr
from the syscall update_elem and will not accept uptr from
the bpf prog. The reason is the bpf prog does not have a way
to provide a valid user space address.

bpf_local_storage_update() and bpf_selem_alloc() are used by
both bpf prog bpf_task_storage_get(BPF_LOCAL_STORAGE_GET_F_CREATE)
and bpf syscall update_elem. "bool swap_uptrs" arg is added
to bpf_local_storage_update() and bpf_selem_alloc() to tell if
it is called by the bpf prog or by the bpf syscall. When
swap_uptrs==true, it is called by the syscall.

The arg is named (swap_)uptrs because the later patch will swap
the uptrs between the newly allocated selem and the user space
provided map_value. It will make error handling easier in case
map->ops->map_update_elem() fails and the caller can decide
if it needs to unpin the uptr in the user space provided
map_value or the bpf_local_storage_update() has already
taken the uptr ownership and will take care of unpinning it also.

Only swap_uptrs==false is passed now. The logic to handle
the true case will be added in a later patch.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20241023234759.860539-4-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h | 4 ++--
 kernel/bpf/bpf_cgrp_storage.c     | 4 ++--
 kernel/bpf/bpf_inode_storage.c    | 4 ++--
 kernel/bpf/bpf_local_storage.c    | 8 ++++----
 kernel/bpf/bpf_task_storage.c     | 4 ++--
 net/core/bpf_sk_storage.c         | 6 +++---
 6 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index dcddb0aef7d8..0c7216c065d5 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -181,7 +181,7 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
-		bool charge_mem, gfp_t gfp_flags);
+		bool charge_mem, bool swap_uptrs, gfp_t gfp_flags);
 
 void bpf_selem_free(struct bpf_local_storage_elem *selem,
 		    struct bpf_local_storage_map *smap,
@@ -195,7 +195,7 @@ bpf_local_storage_alloc(void *owner,
 
 struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
-			 void *value, u64 map_flags, gfp_t gfp_flags);
+			 void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags);
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map);
 
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 28efd0a3f220..20f05de92e9c 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -107,7 +107,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
 
 	bpf_cgrp_storage_lock();
 	sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
-					 value, map_flags, GFP_ATOMIC);
+					 value, map_flags, false, GFP_ATOMIC);
 	bpf_cgrp_storage_unlock();
 	cgroup_put(cgroup);
 	return PTR_ERR_OR_ZERO(sdata);
@@ -181,7 +181,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
 	if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
 	    (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
 		sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
-						 value, BPF_NOEXIST, gfp_flags);
+						 value, BPF_NOEXIST, false, gfp_flags);
 
 unlock:
 	bpf_cgrp_storage_unlock();
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 29da6d3838f6..44ccebc745e5 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -100,7 +100,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
 
 	sdata = bpf_local_storage_update(file_inode(fd_file(f)),
 					 (struct bpf_local_storage_map *)map,
-					 value, map_flags, GFP_ATOMIC);
+					 value, map_flags, false, GFP_ATOMIC);
 	return PTR_ERR_OR_ZERO(sdata);
 }
 
@@ -154,7 +154,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
 	if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
 		sdata = bpf_local_storage_update(
 			inode, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST, gfp_flags);
+			BPF_NOEXIST, false, gfp_flags);
 		return IS_ERR(sdata) ? (unsigned long)NULL :
 					     (unsigned long)sdata->data;
 	}
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index c938dea5ddbf..1cf772cb26eb 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -73,7 +73,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
 
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
-		void *value, bool charge_mem, gfp_t gfp_flags)
+		void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags)
 {
 	struct bpf_local_storage_elem *selem;
 
@@ -524,7 +524,7 @@ uncharge:
  */
 struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
-			 void *value, u64 map_flags, gfp_t gfp_flags)
+			 void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
 {
 	struct bpf_local_storage_data *old_sdata = NULL;
 	struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
@@ -550,7 +550,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		if (err)
 			return ERR_PTR(err);
 
-		selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+		selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
 		if (!selem)
 			return ERR_PTR(-ENOMEM);
 
@@ -584,7 +584,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	/* A lookup has just been done before and concluded a new selem is
 	 * needed. The chance of an unnecessary alloc is unlikely.
 	 */
-	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
 	if (!alloc_selem)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index adf6dfe0ba68..45dc3ca334d3 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -147,7 +147,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
 	bpf_task_storage_lock();
 	sdata = bpf_local_storage_update(
 		task, (struct bpf_local_storage_map *)map, value, map_flags,
-		GFP_ATOMIC);
+		false, GFP_ATOMIC);
 	bpf_task_storage_unlock();
 
 	err = PTR_ERR_OR_ZERO(sdata);
@@ -219,7 +219,7 @@ static void *__bpf_task_storage_get(struct bpf_map *map,
 	    (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
 		sdata = bpf_local_storage_update(
 			task, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST, gfp_flags);
+			BPF_NOEXIST, false, gfp_flags);
 		return IS_ERR(sdata) ? NULL : sdata->data;
 	}
 
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index bc01b3aa6b0f..2f4ed83a75ae 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -106,7 +106,7 @@ static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
 	if (sock) {
 		sdata = bpf_local_storage_update(
 			sock->sk, (struct bpf_local_storage_map *)map, value,
-			map_flags, GFP_ATOMIC);
+			map_flags, false, GFP_ATOMIC);
 		sockfd_put(sock);
 		return PTR_ERR_OR_ZERO(sdata);
 	}
@@ -137,7 +137,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
 {
 	struct bpf_local_storage_elem *copy_selem;
 
-	copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC);
+	copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC);
 	if (!copy_selem)
 		return NULL;
 
@@ -243,7 +243,7 @@ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 	    refcount_inc_not_zero(&sk->sk_refcnt)) {
 		sdata = bpf_local_storage_update(
 			sk, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST, gfp_flags);
+			BPF_NOEXIST, false, gfp_flags);
 		/* sk must be a fullsock (guaranteed by verifier),
 		 * so sock_gen_put() is unnecessary.
 		 */
-- 
cgit v1.2.3


From 5bd5bab76669b1e1551f03f5fcbc165f3fa8d269 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 23 Oct 2024 16:47:51 -0700
Subject: bpf: Postpone bpf_selem_free() in bpf_selem_unlink_storage_nolock()

In a later patch, bpf_selem_free() will call unpin_user_page()
through bpf_obj_free_fields(). unpin_user_page() may take spin_lock.
However, some bpf_selem_free() call paths have held a raw_spin_lock.
Like this:

raw_spin_lock_irqsave()
  bpf_selem_unlink_storage_nolock()
    bpf_selem_free()
      unpin_user_page()
        spin_lock()

To avoid spinlock nested in raw_spinlock, bpf_selem_free() should be
done after releasing the raw_spinlock. The "bool reuse_now" arg is
replaced with "struct hlist_head *free_selem_list" in
bpf_selem_unlink_storage_nolock(). The bpf_selem_unlink_storage_nolock()
will append the to-be-free selem at the free_selem_list. The caller of
bpf_selem_unlink_storage_nolock() will need to call the new
bpf_selem_free_list(free_selem_list, reuse_now) to free the selem
after releasing the raw_spinlock.

Note that the selem->snode cannot be reused for linking to
the free_selem_list because the selem->snode is protected by the
raw_spinlock that we want to avoid holding. A new
"struct hlist_node free_node;" is union-ized with
the rcu_head. Only the first one successfully
hlist_del_init_rcu(&selem->snode) will be able
to use the free_node. After succeeding hlist_del_init_rcu(&selem->snode),
the free_node and rcu_head usage is serialized such that they
can share the 16 bytes in a union.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20241023234759.860539-5-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  8 +++++++-
 kernel/bpf/bpf_local_storage.c    | 35 ++++++++++++++++++++++++++++++-----
 2 files changed, 37 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 0c7216c065d5..ab7244d8108f 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -77,7 +77,13 @@ struct bpf_local_storage_elem {
 	struct hlist_node map_node;	/* Linked to bpf_local_storage_map */
 	struct hlist_node snode;	/* Linked to bpf_local_storage */
 	struct bpf_local_storage __rcu *local_storage;
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		struct hlist_node free_node;	/* used to postpone
+						 * bpf_selem_free
+						 * after raw_spin_unlock
+						 */
+	};
 	/* 8 bytes hole */
 	/* The data is stored in another cacheline to minimize
 	 * the number of cachelines access during a cache hit.
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 1cf772cb26eb..09a67dff2336 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -246,13 +246,30 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
 	}
 }
 
+static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
+{
+	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage_map *smap;
+	struct hlist_node *n;
+
+	/* The "_safe" iteration is needed.
+	 * The loop is not removing the selem from the list
+	 * but bpf_selem_free will use the selem->rcu_head
+	 * which is union-ized with the selem->free_node.
+	 */
+	hlist_for_each_entry_safe(selem, n, list, free_node) {
+		smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+		bpf_selem_free(selem, smap, reuse_now);
+	}
+}
+
 /* local_storage->lock must be held and selem->local_storage == local_storage.
  * The caller must ensure selem->smap is still valid to be
  * dereferenced for its smap->elem_size and smap->cache_idx.
  */
 static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
 					    struct bpf_local_storage_elem *selem,
-					    bool uncharge_mem, bool reuse_now)
+					    bool uncharge_mem, struct hlist_head *free_selem_list)
 {
 	struct bpf_local_storage_map *smap;
 	bool free_local_storage;
@@ -296,7 +313,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	    SDATA(selem))
 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
-	bpf_selem_free(selem, smap, reuse_now);
+	hlist_add_head(&selem->free_node, free_selem_list);
 
 	if (rcu_access_pointer(local_storage->smap) == smap)
 		RCU_INIT_POINTER(local_storage->smap, NULL);
@@ -345,6 +362,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage *local_storage;
 	bool bpf_ma, free_local_storage = false;
+	HLIST_HEAD(selem_free_list);
 	unsigned long flags;
 
 	if (unlikely(!selem_linked_to_storage_lockless(selem)))
@@ -360,9 +378,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
 		free_local_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, true, reuse_now);
+			local_storage, selem, true, &selem_free_list);
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
+	bpf_selem_free_list(&selem_free_list, reuse_now);
+
 	if (free_local_storage)
 		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
 }
@@ -529,6 +549,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	struct bpf_local_storage_data *old_sdata = NULL;
 	struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
 	struct bpf_local_storage *local_storage;
+	HLIST_HEAD(old_selem_free_list);
 	unsigned long flags;
 	int err;
 
@@ -624,11 +645,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata) {
 		bpf_selem_unlink_map(SELEM(old_sdata));
 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
-						true, false);
+						true, &old_selem_free_list);
 	}
 
 unlock:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	bpf_selem_free_list(&old_selem_free_list, false);
 	if (alloc_selem) {
 		mem_uncharge(smap, owner, smap->elem_size);
 		bpf_selem_free(alloc_selem, smap, true);
@@ -706,6 +728,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage_elem *selem;
 	bool bpf_ma, free_storage = false;
+	HLIST_HEAD(free_selem_list);
 	struct hlist_node *n;
 	unsigned long flags;
 
@@ -734,10 +757,12 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 		 * of the loop will set the free_cgroup_storage to true.
 		 */
 		free_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, true, true);
+			local_storage, selem, true, &free_selem_list);
 	}
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
+	bpf_selem_free_list(&free_selem_list, true);
+
 	if (free_storage)
 		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
 }
-- 
cgit v1.2.3


From ba512b00e5efbf7e19cfb7fa9f66ce82669b7077 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 23 Oct 2024 16:47:53 -0700
Subject: bpf: Add uptr support in the map_value of the task local storage.

This patch adds uptr support in the map_value of the task local storage.

struct map_value {
	struct user_data __uptr *uptr;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct value_type);
} datamap SEC(".maps");

A new bpf_obj_pin_uptrs() is added to pin the user page and
also stores the kernel address back to the uptr for the
bpf prog to use later. It currently does not support
the uptr pointing to a user struct across two pages.
It also excludes PageHighMem support to keep it simple.
As of now, the 32bit bpf jit is missing other more crucial bpf
features. For example, many important bpf features depend on
bpf kfunc now but so far only one arch (x86-32) supports it
which was added by me as an example when kfunc was first
introduced to bpf.

The uptr can only be stored to the task local storage by the
syscall update_elem. Meaning the uptr will not be considered
if it is provided by the bpf prog through
bpf_task_storage_get(BPF_LOCAL_STORAGE_GET_F_CREATE).
This is enforced by only calling
bpf_local_storage_update(swap_uptrs==true) in
bpf_pid_task_storage_update_elem. Everywhere else will
have swap_uptrs==false.

This will pump down to bpf_selem_alloc(swap_uptrs==true). It is
the only case that bpf_selem_alloc() will take the uptr value when
updating the newly allocated selem. bpf_obj_swap_uptrs() is added
to swap the uptr between the SDATA(selem)->data and the user provided
map_value in "void *value". bpf_obj_swap_uptrs() makes the
SDATA(selem)->data takes the ownership of the uptr and the user space
provided map_value will have NULL in the uptr.

The bpf_obj_unpin_uptrs() is called after map->ops->map_update_elem()
returning error. If the map->ops->map_update_elem has reached
a state that the local storage has taken the uptr ownership,
the bpf_obj_unpin_uptrs() will be a no op because the uptr
is NULL. A "__"bpf_obj_unpin_uptrs is added to make this
error path unpin easier such that it does not have to check
the map->record is NULL or not.

BPF_F_LOCK is not supported when the map_value has uptr.
This can be revisited later if there is a use case. A similar
swap_uptrs idea can be considered.

The final bit is to do unpin_user_page in the bpf_obj_free_fields().
The earlier patch has ensured that the bpf_obj_free_fields() has
gone through the rcu gp when needed.

Cc: linux-mm@kvack.org
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Link: https://lore.kernel.org/r/20241023234759.860539-7-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  20 ++++++++
 kernel/bpf/bpf_local_storage.c |   7 ++-
 kernel/bpf/bpf_task_storage.c  |   5 +-
 kernel/bpf/syscall.c           | 106 +++++++++++++++++++++++++++++++++++++++--
 4 files changed, 131 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bb31bc6d0c4d..8888689aa917 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -424,6 +424,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 	case BPF_KPTR_PERCPU:
+	case BPF_UPTR:
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -512,6 +513,25 @@ static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src
 	bpf_obj_memcpy(map->record, dst, src, map->value_size, true);
 }
 
+static inline void bpf_obj_swap_uptrs(const struct btf_record *rec, void *dst, void *src)
+{
+	unsigned long *src_uptr, *dst_uptr;
+	const struct btf_field *field;
+	int i;
+
+	if (!btf_record_has_field(rec, BPF_UPTR))
+		return;
+
+	for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) {
+		if (field->type != BPF_UPTR)
+			continue;
+
+		src_uptr = src + field->offset;
+		dst_uptr = dst + field->offset;
+		swap(*src_uptr, *dst_uptr);
+	}
+}
+
 static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size)
 {
 	u32 curr_off = 0;
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index ca871be1c42d..7e6a0af0afc1 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -99,9 +99,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	}
 
 	if (selem) {
-		if (value)
+		if (value) {
+			/* No need to call check_and_init_map_value as memory is zero init */
 			copy_map_value(&smap->map, SDATA(selem)->data, value);
-		/* No need to call check_and_init_map_value as memory is zero init */
+			if (swap_uptrs)
+				bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
+		}
 		return selem;
 	}
 
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 45dc3ca334d3..09705f9988f3 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -129,6 +129,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
 	struct pid *pid;
 	int fd, err;
 
+	if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR))
+		return -EOPNOTSUPP;
+
 	fd = *(int *)key;
 	pid = pidfd_get_pid(fd, &f_flags);
 	if (IS_ERR(pid))
@@ -147,7 +150,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
 	bpf_task_storage_lock();
 	sdata = bpf_local_storage_update(
 		task, (struct bpf_local_storage_map *)map, value, map_flags,
-		false, GFP_ATOMIC);
+		true, GFP_ATOMIC);
 	bpf_task_storage_unlock();
 
 	err = PTR_ERR_OR_ZERO(sdata);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2d2935d9c096..426a52e5c7da 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -155,6 +155,89 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
 		synchronize_rcu();
 }
 
+static void unpin_uptr_kaddr(void *kaddr)
+{
+	if (kaddr)
+		unpin_user_page(virt_to_page(kaddr));
+}
+
+static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj)
+{
+	const struct btf_field *field;
+	void **uptr_addr;
+	int i;
+
+	for (i = 0, field = rec->fields; i < cnt; i++, field++) {
+		if (field->type != BPF_UPTR)
+			continue;
+
+		uptr_addr = obj + field->offset;
+		unpin_uptr_kaddr(*uptr_addr);
+	}
+}
+
+static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj)
+{
+	if (!btf_record_has_field(rec, BPF_UPTR))
+		return;
+
+	__bpf_obj_unpin_uptrs(rec, rec->cnt, obj);
+}
+
+static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
+{
+	const struct btf_field *field;
+	const struct btf_type *t;
+	unsigned long start, end;
+	struct page *page;
+	void **uptr_addr;
+	int i, err;
+
+	if (!btf_record_has_field(rec, BPF_UPTR))
+		return 0;
+
+	for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) {
+		if (field->type != BPF_UPTR)
+			continue;
+
+		uptr_addr = obj + field->offset;
+		start = *(unsigned long *)uptr_addr;
+		if (!start)
+			continue;
+
+		t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
+		/* t->size was checked for zero before */
+		if (check_add_overflow(start, t->size - 1, &end)) {
+			err = -EFAULT;
+			goto unpin_all;
+		}
+
+		/* The uptr's struct cannot span across two pages */
+		if ((start & PAGE_MASK) != (end & PAGE_MASK)) {
+			err = -EOPNOTSUPP;
+			goto unpin_all;
+		}
+
+		err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page);
+		if (err != 1)
+			goto unpin_all;
+
+		if (PageHighMem(page)) {
+			err = -EOPNOTSUPP;
+			unpin_user_page(page);
+			goto unpin_all;
+		}
+
+		*uptr_addr = page_address(page) + offset_in_page(start);
+	}
+
+	return 0;
+
+unpin_all:
+	__bpf_obj_unpin_uptrs(rec, i, obj);
+	return err;
+}
+
 static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
 				void *key, void *value, __u64 flags)
 {
@@ -199,9 +282,14 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
 		   map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
 		err = map->ops->map_push_elem(map, value, flags);
 	} else {
-		rcu_read_lock();
-		err = map->ops->map_update_elem(map, key, value, flags);
-		rcu_read_unlock();
+		err = bpf_obj_pin_uptrs(map->record, value);
+		if (!err) {
+			rcu_read_lock();
+			err = map->ops->map_update_elem(map, key, value, flags);
+			rcu_read_unlock();
+			if (err)
+				bpf_obj_unpin_uptrs(map->record, value);
+		}
 	}
 	bpf_enable_instrumentation();
 
@@ -716,6 +804,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 				field->kptr.dtor(xchgd_field);
 			}
 			break;
+		case BPF_UPTR:
+			/* The caller ensured that no one is using the uptr */
+			unpin_uptr_kaddr(*(void **)field_ptr);
+			break;
 		case BPF_LIST_HEAD:
 			if (WARN_ON_ONCE(rec->spin_lock_off < 0))
 				continue;
@@ -1107,7 +1199,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 
 	map->record = btf_parse_fields(btf, value_type,
 				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
-				       BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE,
+				       BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
 				       map->value_size);
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
@@ -1163,6 +1255,12 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 					goto free_map_tab;
 				}
 				break;
+			case BPF_UPTR:
+				if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) {
+					ret = -EOPNOTSUPP;
+					goto free_map_tab;
+				}
+				break;
 			case BPF_LIST_HEAD:
 			case BPF_RB_ROOT:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
-- 
cgit v1.2.3


From 02ac5f9d6caec96071103f7c62b5117526e47b64 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Thu, 24 Oct 2024 02:20:02 +0000
Subject: of: property: add of_graph_get_next_port()

We have endpoint base functions
	- of_graph_get_next_endpoint()
	- of_graph_get_endpoint_count()
	- for_each_endpoint_of_node()

Here, for_each_endpoint_of_node() loop finds each endpoints

	ports {
		port@0 {
(1)			endpoint {...};
		};
		port@1 {
(2)			endpoint {...};
		};
		...
	};

In above case, it finds endpoint as (1) -> (2) -> ...

Basically, user/driver knows which port is used for what, but not in
all cases. For example on flexible/generic driver case, how many ports
are used is not fixed.

For example Sound Generic Card driver which is very flexible/generic and
used from many venders can't know how many ports are used, and used for
what, because it depends on each vender SoC and/or its used board.

And more, the port can have multi endpoints. For example Generic Sound
Card case, it supports many type of connection between CPU / Codec, and
some of them uses multi endpoint in one port. see below.

	ports {
(A)		port@0 {
(1)			endpoint@0 {...};
(2)			endpoint@1 {...};
		};
(B)		port@1 {
(3)			endpoint {...};
		};
		...
	};

Generic Sound Card want to handle each connection via "port" base instead
of "endpoint" base. But, it is very difficult to handle each "port" via
existing for_each_endpoint_of_node(). Because getting each "port" via
of_get_parent() from each "endpoint" doesn't work. For example in above
case, both (1) (2) endpoint has same "port" (= A).

Add "port" base functions.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://lore.kernel.org/r/87ldyeb5t9.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 drivers/of/property.c    | 54 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/of_graph.h | 28 +++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/property.c b/drivers/of/property.c
index 33dd72cad4ab..6dd5461ea359 100644
--- a/drivers/of/property.c
+++ b/drivers/of/property.c
@@ -630,6 +630,43 @@ struct device_node *of_graph_get_port_by_id(struct device_node *parent, u32 id)
 }
 EXPORT_SYMBOL(of_graph_get_port_by_id);
 
+/**
+ * of_graph_get_next_port() - get next port node.
+ * @parent: pointer to the parent device node, or parent ports node
+ * @prev: previous port node, or NULL to get first
+ *
+ * Parent device node can be used as @parent whether device node has ports node
+ * or not. It will work same as ports@0 node.
+ *
+ * Return: A 'port' node pointer with refcount incremented. Refcount
+ * of the passed @prev node is decremented.
+ */
+struct device_node *of_graph_get_next_port(const struct device_node *parent,
+					   struct device_node *prev)
+{
+	if (!parent)
+		return NULL;
+
+	if (!prev) {
+		struct device_node *node __free(device_node) =
+			of_get_child_by_name(parent, "ports");
+
+		if (node)
+			parent = node;
+
+		return of_get_child_by_name(parent, "port");
+	}
+
+	do {
+		prev = of_get_next_child(parent, prev);
+		if (!prev)
+			break;
+	} while (!of_node_name_eq(prev, "port"));
+
+	return prev;
+}
+EXPORT_SYMBOL(of_graph_get_next_port);
+
 /**
  * of_graph_get_next_endpoint() - get next endpoint node
  * @parent: pointer to the parent device node
@@ -823,6 +860,23 @@ unsigned int of_graph_get_endpoint_count(const struct device_node *np)
 }
 EXPORT_SYMBOL(of_graph_get_endpoint_count);
 
+/**
+ * of_graph_get_port_count() - get the number of port in a device or ports node
+ * @np: pointer to the device or ports node
+ *
+ * Return: count of port of this device or ports node
+ */
+unsigned int of_graph_get_port_count(struct device_node *np)
+{
+	unsigned int num = 0;
+
+	for_each_of_graph_port(np, port)
+		num++;
+
+	return num;
+}
+EXPORT_SYMBOL(of_graph_get_port_count);
+
 /**
  * of_graph_get_remote_node() - get remote parent device_node for given port/endpoint
  * @node: pointer to parent device_node containing graph port/endpoint
diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h
index a4bea62bfa29..44518f3583a4 100644
--- a/include/linux/of_graph.h
+++ b/include/linux/of_graph.h
@@ -11,6 +11,7 @@
 #ifndef __LINUX_OF_GRAPH_H
 #define __LINUX_OF_GRAPH_H
 
+#include <linux/cleanup.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 
@@ -37,14 +38,29 @@ struct of_endpoint {
 	for (child = of_graph_get_next_endpoint(parent, NULL); child != NULL; \
 	     child = of_graph_get_next_endpoint(parent, child))
 
+/**
+ * for_each_of_graph_port - iterate over every port in a device or ports node
+ * @parent: parent device or ports node containing port
+ * @child: loop variable pointing to the current port node
+ *
+ * When breaking out of the loop, and continue to use the @child, you need to
+ * use return_ptr(@child) or no_free_ptr(@child) not to call __free() for it.
+ */
+#define for_each_of_graph_port(parent, child)			\
+	for (struct device_node *child __free(device_node) = of_graph_get_next_port(parent, NULL);\
+	     child != NULL; child = of_graph_get_next_port(parent, child))
+
 #ifdef CONFIG_OF
 bool of_graph_is_present(const struct device_node *node);
 int of_graph_parse_endpoint(const struct device_node *node,
 				struct of_endpoint *endpoint);
 unsigned int of_graph_get_endpoint_count(const struct device_node *np);
+unsigned int of_graph_get_port_count(struct device_node *np);
 struct device_node *of_graph_get_port_by_id(struct device_node *node, u32 id);
 struct device_node *of_graph_get_next_endpoint(const struct device_node *parent,
 					struct device_node *previous);
+struct device_node *of_graph_get_next_port(const struct device_node *parent,
+					   struct device_node *port);
 struct device_node *of_graph_get_endpoint_by_regs(
 		const struct device_node *parent, int port_reg, int reg);
 struct device_node *of_graph_get_remote_endpoint(
@@ -73,6 +89,11 @@ static inline unsigned int of_graph_get_endpoint_count(const struct device_node
 	return 0;
 }
 
+static inline unsigned int of_graph_get_port_count(struct device_node *np)
+{
+	return 0;
+}
+
 static inline struct device_node *of_graph_get_port_by_id(
 					struct device_node *node, u32 id)
 {
@@ -86,6 +107,13 @@ static inline struct device_node *of_graph_get_next_endpoint(
 	return NULL;
 }
 
+static inline struct device_node *of_graph_get_next_port(
+					const struct device_node *parent,
+					struct device_node *previous)
+{
+	return NULL;
+}
+
 static inline struct device_node *of_graph_get_endpoint_by_regs(
 		const struct device_node *parent, int port_reg, int reg)
 {
-- 
cgit v1.2.3


From 58fe47d6ac7413172e1fa88324fb1b4c0eb2c0a2 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Thu, 24 Oct 2024 02:20:06 +0000
Subject: of: property: add of_graph_get_next_port_endpoint()

We already have of_graph_get_next_endpoint(), but it is not
intuitive to use in some case.

(X)	node {
(Y)		ports {
(P0)			port@0 { endpoint { remote-endpoint = ...; };};
(P10)			port@1 { endpoint { remote-endpoint = ...; };
(P11)				 endpoint { remote-endpoint = ...; };};
(P2)			port@2 { endpoint { remote-endpoint = ...; };};
		};
	};

For example, if I want to handle port@1's 2 endpoints (= P10, P11),
I want to use like below

	P10 = of_graph_get_next_endpoint(port1, NULL);
	P11 = of_graph_get_next_endpoint(port1, P10);

But 1st one will be error, because of_graph_get_next_endpoint()
requested 1st parameter is "node" (X) or "ports" (Y), not but "port".
Below works well, but it will get P0

	P0 = of_graph_get_next_endpoint(node,  NULL);
	P0 = of_graph_get_next_endpoint(ports, NULL);

In other words, we can't handle P10/P11 directly via
of_graph_get_next_endpoint().

There is another non intuitive behavior on of_graph_get_next_endpoint().
In case of if I could get P10 pointer for some way, and if I want to
handle port@1 things by loop, I would like use it like below

	/*
	 * "ep" is now P10, and handle port1 things here,
	 * but we don't know how many endpoints port1 have.
	 *
	 * Because "ep" is non NULL now, we can use port1
	 * as of_graph_get_next_endpoint(port1, xxx)
	 */
	do {
		/* do something for port1 specific things here */
	} while (ep = of_graph_get_next_endpoint(port1, ep))

But it also not worked as I expected.
I expect it will be P10 -> P11 -> NULL,
but      it will be P10 -> P11 -> P2,    because
of_graph_get_next_endpoint() will fetch "endpoint" beyond the "port".
It is not useful for generic driver.

To handle endpoint more intuitive, create of_graph_get_next_port_endpoint()

	of_graph_get_next_port_endpoint(port1, NULL); // P10
	of_graph_get_next_port_endpoint(port1, P10);  // P11
	of_graph_get_next_port_endpoint(port1, P11);  // NULL

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://lore.kernel.org/r/87jzdyb5t5.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 drivers/of/property.c    | 27 +++++++++++++++++++++++++++
 include/linux/of_graph.h | 21 +++++++++++++++++++++
 2 files changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/property.c b/drivers/of/property.c
index 6dd5461ea359..ce0e08176fca 100644
--- a/drivers/of/property.c
+++ b/drivers/of/property.c
@@ -667,6 +667,33 @@ struct device_node *of_graph_get_next_port(const struct device_node *parent,
 }
 EXPORT_SYMBOL(of_graph_get_next_port);
 
+/**
+ * of_graph_get_next_port_endpoint() - get next endpoint node in port.
+ * If it reached to end of the port, it will return NULL.
+ * @port: pointer to the target port node
+ * @prev: previous endpoint node, or NULL to get first
+ *
+ * Return: An 'endpoint' node pointer with refcount incremented. Refcount
+ * of the passed @prev node is decremented.
+ */
+struct device_node *of_graph_get_next_port_endpoint(const struct device_node *port,
+						    struct device_node *prev)
+{
+	while (1) {
+		prev = of_get_next_child(port, prev);
+		if (!prev)
+			break;
+		if (WARN(!of_node_name_eq(prev, "endpoint"),
+			 "non endpoint node is used (%pOF)", prev))
+			continue;
+
+		break;
+	}
+
+	return prev;
+}
+EXPORT_SYMBOL(of_graph_get_next_port_endpoint);
+
 /**
  * of_graph_get_next_endpoint() - get next endpoint node
  * @parent: pointer to the parent device node
diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h
index 44518f3583a4..a692d9d979a6 100644
--- a/include/linux/of_graph.h
+++ b/include/linux/of_graph.h
@@ -50,6 +50,18 @@ struct of_endpoint {
 	for (struct device_node *child __free(device_node) = of_graph_get_next_port(parent, NULL);\
 	     child != NULL; child = of_graph_get_next_port(parent, child))
 
+/**
+ * for_each_of_graph_port_endpoint - iterate over every endpoint in a port node
+ * @parent: parent port node
+ * @child: loop variable pointing to the current endpoint node
+ *
+ * When breaking out of the loop, and continue to use the @child, you need to
+ * use return_ptr(@child) or no_free_ptr(@child) not to call __free() for it.
+ */
+#define for_each_of_graph_port_endpoint(parent, child)			\
+	for (struct device_node *child __free(device_node) = of_graph_get_next_port_endpoint(parent, NULL);\
+	     child != NULL; child = of_graph_get_next_port_endpoint(parent, child))
+
 #ifdef CONFIG_OF
 bool of_graph_is_present(const struct device_node *node);
 int of_graph_parse_endpoint(const struct device_node *node,
@@ -61,6 +73,8 @@ struct device_node *of_graph_get_next_endpoint(const struct device_node *parent,
 					struct device_node *previous);
 struct device_node *of_graph_get_next_port(const struct device_node *parent,
 					   struct device_node *port);
+struct device_node *of_graph_get_next_port_endpoint(const struct device_node *port,
+						    struct device_node *prev);
 struct device_node *of_graph_get_endpoint_by_regs(
 		const struct device_node *parent, int port_reg, int reg);
 struct device_node *of_graph_get_remote_endpoint(
@@ -114,6 +128,13 @@ static inline struct device_node *of_graph_get_next_port(
 	return NULL;
 }
 
+static inline struct device_node *of_graph_get_next_port_endpoint(
+					const struct device_node *parent,
+					struct device_node *previous)
+{
+	return NULL;
+}
+
 static inline struct device_node *of_graph_get_endpoint_by_regs(
 		const struct device_node *parent, int port_reg, int reg)
 {
-- 
cgit v1.2.3


From f730fd535fc51573f982fad629f2fc6b4a0cde2f Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 19 Aug 2024 09:41:15 +0200
Subject: cleanup: Remove address space of returned pointer

Guard functions in local_lock.h are defined using DEFINE_GUARD() and
DEFINE_LOCK_GUARD_1() macros having lock type defined as pointer in
the percpu address space. The functions, defined by these macros
return value in generic address space, causing:

cleanup.h:157:18: error: return from pointer to non-enclosed address space

and

cleanup.h:214:18: error: return from pointer to non-enclosed address space

when strict percpu checks are enabled.

Add explicit casts to remove address space of the returned pointer.

Found by GCC's named address space checks.

Fixes: e4ab322fbaaa ("cleanup: Add conditional guard support")
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20240819074124.143565-1-ubizjak@gmail.com
---
 include/linux/cleanup.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 038b2d523bf8..518bd1fd86fb 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -290,7 +290,7 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
 #define DEFINE_GUARD(_name, _type, _lock, _unlock) \
 	DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \
 	static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
-	{ return *_T; }
+	{ return (void *)(__force unsigned long)*_T; }
 
 #define DEFINE_GUARD_COND(_name, _ext, _condlock) \
 	EXTEND_CLASS(_name, _ext, \
@@ -347,7 +347,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T)	\
 									\
 static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T)	\
 {									\
-	return _T->lock;						\
+	return (void *)(__force unsigned long)_T->lock;			\
 }
 
 
-- 
cgit v1.2.3


From fcc22ac5baf06dd17193de44b60dbceea6461983 Mon Sep 17 00:00:00 2001
From: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Date: Fri, 18 Oct 2024 13:38:14 +0200
Subject: cleanup: Adjust scoped_guard() macros to avoid potential warning

Change scoped_guard() and scoped_cond_guard() macros to make reasoning
about them easier for static analysis tools (smatch, compiler
diagnostics), especially to enable them to tell if the given usage of
scoped_guard() is with a conditional lock class (interruptible-locks,
try-locks) or not (like simple mutex_lock()).

Add compile-time error if scoped_cond_guard() is used for non-conditional
lock class.

Beyond easier tooling and a little shrink reported by bloat-o-meter
this patch enables developer to write code like:

int foo(struct my_drv *adapter)
{
	scoped_guard(spinlock, &adapter->some_spinlock)
		return adapter->spinlock_protected_var;
}

Current scoped_guard() implementation does not support that,
due to compiler complaining:
error: control reaches end of non-void function [-Werror=return-type]

Technical stuff about the change:
scoped_guard() macro uses common idiom of using "for" statement to declare
a scoped variable. Unfortunately, current logic is too hard for compiler
diagnostics to be sure that there is exactly one loop step; fix that.

To make any loop so trivial that there is no above warning, it must not
depend on any non-const variable to tell if there are more steps. There is
no obvious solution for that in C, but one could use the compound
statement expression with "goto" jumping past the "loop", effectively
leaving only the subscope part of the loop semantics.

More impl details:
one more level of macro indirection is now needed to avoid duplicating
label names;
I didn't spot any other place that is using the
"for (...; goto label) if (0) label: break;" idiom, so it's not packed for
reuse beyond scoped_guard() family, what makes actual macros code cleaner.

There was also a need to introduce const true/false variable per lock
class, it is used to aid compiler diagnostics reasoning about "exactly
1 step" loops (note that converting that to function would undo the whole
benefit).

Big thanks to Andy Shevchenko for help on this patch, both internal and
public, ranging from whitespace/formatting, through commit message
clarifications, general improvements, ending with presenting alternative
approaches - all despite not even liking the idea.

Big thanks to Dmitry Torokhov for the idea of compile-time check for
scoped_cond_guard() (to use it only with conditional locsk), and general
improvements for the patch.

Big thanks to David Lechner for idea to cover also scoped_cond_guard().

Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Link: https://lkml.kernel.org/r/20241018113823.171256-1-przemyslaw.kitszel@intel.com
---
 include/linux/cleanup.h | 52 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 518bd1fd86fb..0cc66f8d28e7 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -285,14 +285,20 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
  *      similar to scoped_guard(), except it does fail when the lock
  *      acquire fails.
  *
+ *      Only for conditional locks.
  */
 
+#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond)	\
+static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
+
 #define DEFINE_GUARD(_name, _type, _lock, _unlock) \
+	__DEFINE_CLASS_IS_CONDITIONAL(_name, false); \
 	DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \
 	static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
 	{ return (void *)(__force unsigned long)*_T; }
 
 #define DEFINE_GUARD_COND(_name, _ext, _condlock) \
+	__DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \
 	EXTEND_CLASS(_name, _ext, \
 		     ({ void *_t = _T; if (_T && !(_condlock)) _t = NULL; _t; }), \
 		     class_##_name##_t _T) \
@@ -303,17 +309,40 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
 	CLASS(_name, __UNIQUE_ID(guard))
 
 #define __guard_ptr(_name) class_##_name##_lock_ptr
+#define __is_cond_ptr(_name) class_##_name##_is_conditional
 
-#define scoped_guard(_name, args...)					\
-	for (CLASS(_name, scope)(args),					\
-	     *done = NULL; __guard_ptr(_name)(&scope) && !done; done = (void *)1)
-
-#define scoped_cond_guard(_name, _fail, args...) \
-	for (CLASS(_name, scope)(args), \
-	     *done = NULL; !done; done = (void *)1) \
-		if (!__guard_ptr(_name)(&scope)) _fail; \
-		else
-
+/*
+ * Helper macro for scoped_guard().
+ *
+ * Note that the "!__is_cond_ptr(_name)" part of the condition ensures that
+ * compiler would be sure that for the unconditional locks the body of the
+ * loop (caller-provided code glued to the else clause) could not be skipped.
+ * It is needed because the other part - "__guard_ptr(_name)(&scope)" - is too
+ * hard to deduce (even if could be proven true for unconditional locks).
+ */
+#define __scoped_guard(_name, _label, args...)				\
+	for (CLASS(_name, scope)(args);					\
+	     __guard_ptr(_name)(&scope) || !__is_cond_ptr(_name);	\
+	     ({ goto _label; }))					\
+		if (0) {						\
+_label:									\
+			break;						\
+		} else
+
+#define scoped_guard(_name, args...)	\
+	__scoped_guard(_name, __UNIQUE_ID(label), args)
+
+#define __scoped_cond_guard(_name, _fail, _label, args...)		\
+	for (CLASS(_name, scope)(args); true; ({ goto _label; }))	\
+		if (!__guard_ptr(_name)(&scope)) {			\
+			BUILD_BUG_ON(!__is_cond_ptr(_name));		\
+			_fail;						\
+_label:									\
+			break;						\
+		} else
+
+#define scoped_cond_guard(_name, _fail, args...)	\
+	__scoped_cond_guard(_name, _fail, __UNIQUE_ID(label), args)
 /*
  * Additional helper macros for generating lock guards with types, either for
  * locks that don't have a native type (eg. RCU, preempt) or those that need a
@@ -369,14 +398,17 @@ static inline class_##_name##_t class_##_name##_constructor(void)	\
 }
 
 #define DEFINE_LOCK_GUARD_1(_name, _type, _lock, _unlock, ...)		\
+__DEFINE_CLASS_IS_CONDITIONAL(_name, false);				\
 __DEFINE_UNLOCK_GUARD(_name, _type, _unlock, __VA_ARGS__)		\
 __DEFINE_LOCK_GUARD_1(_name, _type, _lock)
 
 #define DEFINE_LOCK_GUARD_0(_name, _lock, _unlock, ...)			\
+__DEFINE_CLASS_IS_CONDITIONAL(_name, false);				\
 __DEFINE_UNLOCK_GUARD(_name, void, _unlock, __VA_ARGS__)		\
 __DEFINE_LOCK_GUARD_0(_name, _lock)
 
 #define DEFINE_LOCK_GUARD_1_COND(_name, _ext, _condlock)		\
+	__DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true);		\
 	EXTEND_CLASS(_name, _ext,					\
 		     ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\
 		        if (_T->lock && !(_condlock)) _T->lock = NULL;	\
-- 
cgit v1.2.3


From 36c2cf88808d47e926d11b98734f154fe4a9f50f Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Tue, 1 Oct 2024 17:30:18 -0500
Subject: cleanup: Add conditional guard helper

Add a new if_not_guard() macro to cleanup.h for handling
conditional guards such as mutext_trylock().

This is more ergonomic than scoped_guard() for most use cases.
Instead of hiding the error handling statement in the macro args, it
works like a normal if statement and allow the error path to be indented
while the normal code flow path is not indented. And it avoid unwanted
side-effect from hidden for loop in scoped_guard().

Signed-off-by: David Lechner <dlechner@baylibre.com>
Co-developed-by: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
Signed-off-by: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lkml.kernel.org/r/20241001-cleanup-if_not_cond_guard-v1-1-7753810b0f7a@baylibre.com
---
 include/linux/cleanup.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 0cc66f8d28e7..e859f79b9d2d 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -273,6 +273,12 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
  *	an anonymous instance of the (guard) class, not recommended for
  *	conditional locks.
  *
+ * if_not_guard(name, args...) { <error handling> }:
+ *	convenience macro for conditional guards that calls the statement that
+ *	follows only if the lock was not acquired (typically an error return).
+ *
+ *	Only for conditional locks.
+ *
  * scoped_guard (name, args...) { }:
  *	similar to CLASS(name, scope)(args), except the variable (with the
  *	explicit name 'scope') is declard in a for-loop such that its scope is
@@ -343,6 +349,15 @@ _label:									\
 
 #define scoped_cond_guard(_name, _fail, args...)	\
 	__scoped_cond_guard(_name, _fail, __UNIQUE_ID(label), args)
+
+#define __if_not_guard(_name, _id, args...)		\
+	BUILD_BUG_ON(!__is_cond_ptr(_name));		\
+	CLASS(_name, _id)(args);			\
+	if (!__guard_ptr(_name)(&_id))
+
+#define if_not_guard(_name, args...) \
+	__if_not_guard(_name, __UNIQUE_ID(guard), args)
+
 /*
  * Additional helper macros for generating lock guards with types, either for
  * locks that don't have a native type (eg. RCU, preempt) or those that need a
-- 
cgit v1.2.3


From ec24643bdd625971933451f22b8e33d364920f6e Mon Sep 17 00:00:00 2001
From: Vibhore Vardhan <vibhore@ti.com>
Date: Mon, 7 Oct 2024 08:08:56 +0200
Subject: firmware: ti_sci: Add system suspend and resume call

Introduce system suspend call that enables the ti_sci driver to support
low power mode when the user space issues a suspend to mem.

The following power management operations defined in the TISCI
Low Power Mode API [1] are implemented to support suspend and resume:

1) TISCI_MSG_PREPARE_SLEEP
Prepare the SOC for entering into a low power mode and
provide details to firmware about the state being entered.

2) TISCI_MSG_SET_IO_ISOLATION
Control the IO isolation for Low Power Mode.

Also, write a ti_sci_prepare_system_suspend call to be used in the driver
suspend handler to allow the system to identify the low power mode being
entered and if necessary, send TISCI_MSG_PREPARE_SLEEP with information
about the mode being entered.

Sysfw version >= 10.00.04 support LPM_DM_MANAGED capability [2], where
Device Mgr firmware now manages which low power mode is chosen. Going
forward, this is the default configuration supported for TI AM62 family
of devices. The state chosen by the DM can be influenced by sending
constraints using the new LPM constraint APIs.

In case the firmware does not support LPM_DM_MANAGED mode, the mode
selection logic can be extended as needed. If no suspend-to-RAM modes
are supported, return without taking any action.

We're using "pm_suspend_target_state" to map the kernel's target suspend
state to SysFW low power mode. Make sure this is available only when
CONFIG_SUSPEND is enabled.

Suspend has to be split into two parts, ti_sci_suspend() will send
the prepare sleep message to prepare suspend. ti_sci_suspend_noirq()
sets IO isolation which needs to be done as late as possible to avoid
any issues. On resume this has to be done as early as possible.

[1] https://software-dl.ti.com/tisci/esd/latest/2_tisci_msgs/pm/lpm.html

Co-developed-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Georgi Vlaev <g-vlaev@ti.com>
Signed-off-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Vibhore Vardhan <vibhore@ti.com>
Signed-off-by: Kevin Hilman <khilman@baylibre.com>
Tested-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Markus Schneider-Pargmann <msp@baylibre.com>
Tested-by: Roger Quadros <rogerq@kernel.org>
Link: https://lore.kernel.org/r/20241007-tisci-syssuspendresume-v13-3-ed54cd659a49@baylibre.com
Signed-off-by: Nishanth Menon <nm@ti.com>
---
 drivers/firmware/ti_sci.c              | 194 +++++++++++++++++++++++++++++++++
 drivers/firmware/ti_sci.h              |  45 +++++++-
 include/linux/soc/ti/ti_sci_protocol.h |   4 +
 3 files changed, 242 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c
index d53192607709..ebf6c0756ba4 100644
--- a/drivers/firmware/ti_sci.c
+++ b/drivers/firmware/ti_sci.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/soc/ti/ti-msgmgr.h>
 #include <linux/soc/ti/ti_sci_protocol.h>
+#include <linux/suspend.h>
 #include <linux/sys_soc.h>
 #include <linux/reboot.h>
 
@@ -1654,6 +1655,68 @@ fail:
 	return ret;
 }
 
+/**
+ * ti_sci_cmd_prepare_sleep() - Prepare system for system suspend
+ * @handle:		pointer to TI SCI handle
+ * @mode:		Device identifier
+ * @ctx_lo:		Low part of address for context save
+ * @ctx_hi:		High part of address for context save
+ * @debug_flags:	Debug flags to pass to firmware
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_prepare_sleep(const struct ti_sci_handle *handle, u8 mode,
+				    u32 ctx_lo, u32 ctx_hi, u32 debug_flags)
+{
+	struct ti_sci_info *info;
+	struct ti_sci_msg_req_prepare_sleep *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (!handle)
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PREPARE_SLEEP,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+
+	req = (struct ti_sci_msg_req_prepare_sleep *)xfer->xfer_buf;
+	req->mode = mode;
+	req->ctx_lo = ctx_lo;
+	req->ctx_hi = ctx_hi;
+	req->debug_flags = debug_flags;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+
+	if (!ti_sci_is_response_ack(resp)) {
+		dev_err(dev, "Failed to prepare sleep\n");
+		ret = -ENODEV;
+	}
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
 /**
  * ti_sci_msg_cmd_query_fw_caps() - Get the FW/SoC capabilities
  * @handle:		Pointer to TI SCI handle
@@ -1715,6 +1778,61 @@ fail:
 	return ret;
 }
 
+/**
+ * ti_sci_cmd_set_io_isolation() - Enable IO isolation in LPM
+ * @handle:		Pointer to TI SCI handle
+ * @state:		The desired state of the IO isolation
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_set_io_isolation(const struct ti_sci_handle *handle,
+				       u8 state)
+{
+	struct ti_sci_info *info;
+	struct ti_sci_msg_req_set_io_isolation *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (!handle)
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_SET_IO_ISOLATION,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_set_io_isolation *)xfer->xfer_buf;
+	req->state = state;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+
+	if (!ti_sci_is_response_ack(resp)) {
+		dev_err(dev, "Failed to set IO isolation\n");
+		ret = -ENODEV;
+	}
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
 static int ti_sci_cmd_core_reboot(const struct ti_sci_handle *handle)
 {
 	struct ti_sci_info *info;
@@ -3326,6 +3444,81 @@ static int tisci_reboot_handler(struct sys_off_data *data)
 	return NOTIFY_BAD;
 }
 
+static int ti_sci_prepare_system_suspend(struct ti_sci_info *info)
+{
+	/*
+	 * Map and validate the target Linux suspend state to TISCI LPM.
+	 * Default is to let Device Manager select the low power mode.
+	 */
+	switch (pm_suspend_target_state) {
+	case PM_SUSPEND_MEM:
+		if (info->fw_caps & MSG_FLAG_CAPS_LPM_DM_MANAGED) {
+			/*
+			 * For the DM_MANAGED mode the context is reserved for
+			 * internal use and can be 0
+			 */
+			return ti_sci_cmd_prepare_sleep(&info->handle,
+							TISCI_MSG_VALUE_SLEEP_MODE_DM_MANAGED,
+							0, 0, 0);
+		} else {
+			/* DM Managed is not supported by the firmware. */
+			dev_err(info->dev, "Suspend to memory is not supported by the firmware\n");
+			return -EOPNOTSUPP;
+		}
+		break;
+	default:
+		/*
+		 * Do not fail if we don't have action to take for a
+		 * specific suspend mode.
+		 */
+		return 0;
+	}
+}
+
+static int __maybe_unused ti_sci_suspend(struct device *dev)
+{
+	struct ti_sci_info *info = dev_get_drvdata(dev);
+	int ret;
+
+	ret = ti_sci_prepare_system_suspend(info);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int __maybe_unused ti_sci_suspend_noirq(struct device *dev)
+{
+	struct ti_sci_info *info = dev_get_drvdata(dev);
+	int ret = 0;
+
+	ret = ti_sci_cmd_set_io_isolation(&info->handle, TISCI_MSG_VALUE_IO_ENABLE);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int __maybe_unused ti_sci_resume_noirq(struct device *dev)
+{
+	struct ti_sci_info *info = dev_get_drvdata(dev);
+	int ret = 0;
+
+	ret = ti_sci_cmd_set_io_isolation(&info->handle, TISCI_MSG_VALUE_IO_DISABLE);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static const struct dev_pm_ops ti_sci_pm_ops = {
+#ifdef CONFIG_PM_SLEEP
+	.suspend = ti_sci_suspend,
+	.suspend_noirq = ti_sci_suspend_noirq,
+	.resume_noirq = ti_sci_resume_noirq,
+#endif
+};
+
 /* Description for K2G */
 static const struct ti_sci_desc ti_sci_pmmc_k2g_desc = {
 	.default_host_id = 2,
@@ -3494,6 +3687,7 @@ static struct platform_driver ti_sci_driver = {
 		   .name = "ti-sci",
 		   .of_match_table = of_match_ptr(ti_sci_of_match),
 		   .suppress_bind_attrs = true,
+		   .pm = &ti_sci_pm_ops,
 	},
 };
 module_platform_driver(ti_sci_driver);
diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h
index 73ca9503606b..8efe4d0e61fb 100644
--- a/drivers/firmware/ti_sci.h
+++ b/drivers/firmware/ti_sci.h
@@ -6,7 +6,7 @@
  * The system works in a message response protocol
  * See: https://software-dl.ti.com/tisci/esd/latest/index.html for details
  *
- * Copyright (C)  2015-2016 Texas Instruments Incorporated - https://www.ti.com/
+ * Copyright (C)  2015-2024 Texas Instruments Incorporated - https://www.ti.com/
  */
 
 #ifndef __TI_SCI_H
@@ -36,6 +36,10 @@
 #define TI_SCI_MSG_QUERY_CLOCK_FREQ	0x010d
 #define TI_SCI_MSG_GET_CLOCK_FREQ	0x010e
 
+/* Low Power Mode Requests */
+#define TI_SCI_MSG_PREPARE_SLEEP	0x0300
+#define TI_SCI_MSG_SET_IO_ISOLATION	0x0307
+
 /* Resource Management Requests */
 #define TI_SCI_MSG_GET_RESOURCE_RANGE	0x1500
 
@@ -567,6 +571,45 @@ struct ti_sci_msg_resp_get_clock_freq {
 	u64 freq_hz;
 } __packed;
 
+/**
+ * struct tisci_msg_req_prepare_sleep - Request for TISCI_MSG_PREPARE_SLEEP.
+ *
+ * @hdr				TISCI header to provide ACK/NAK flags to the host.
+ * @mode			Low power mode to enter.
+ * @ctx_lo			Low 32-bits of physical pointer to address to use for context save.
+ * @ctx_hi			High 32-bits of physical pointer to address to use for context save.
+ * @debug_flags			Flags that can be set to halt the sequence during suspend or
+ *				resume to allow JTAG connection and debug.
+ *
+ * This message is used as the first step of entering a low power mode. It
+ * allows configurable information, including which state to enter to be
+ * easily shared from the application, as this is a non-secure message and
+ * therefore can be sent by anyone.
+ */
+struct ti_sci_msg_req_prepare_sleep {
+	struct ti_sci_msg_hdr	hdr;
+
+#define TISCI_MSG_VALUE_SLEEP_MODE_DM_MANAGED				0xfd
+	u8			mode;
+	u32			ctx_lo;
+	u32			ctx_hi;
+	u32			debug_flags;
+} __packed;
+
+/**
+ * struct tisci_msg_set_io_isolation_req - Request for TI_SCI_MSG_SET_IO_ISOLATION.
+ *
+ * @hdr:	Generic header
+ * @state:	The deseared state of the IO isolation.
+ *
+ * This message is used to enable/disable IO isolation for low power modes.
+ * Response is generic ACK / NACK message.
+ */
+struct ti_sci_msg_req_set_io_isolation {
+	struct ti_sci_msg_hdr hdr;
+	u8 state;
+} __packed;
+
 #define TI_SCI_IRQ_SECONDARY_HOST_INVALID	0xff
 
 /**
diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h
index bd0d11af76c5..1f1871e23f76 100644
--- a/include/linux/soc/ti/ti_sci_protocol.h
+++ b/include/linux/soc/ti/ti_sci_protocol.h
@@ -195,6 +195,10 @@ struct ti_sci_clk_ops {
 			u64 *current_freq);
 };
 
+/* TISCI LPM IO isolation control values */
+#define TISCI_MSG_VALUE_IO_ENABLE			1
+#define TISCI_MSG_VALUE_IO_DISABLE			0
+
 /**
  * struct ti_sci_resource_desc - Description of TI SCI resource instance range.
  * @start:	Start index of the first resource range.
-- 
cgit v1.2.3


From 60357991f6b9d4bd4dc442a368da3f468cfa4903 Mon Sep 17 00:00:00 2001
From: Dave Gerlach <d-gerlach@ti.com>
Date: Mon, 7 Oct 2024 08:08:57 +0200
Subject: firmware: ti_sci: Introduce Power Management Ops

Introduce power management ops supported by the TISCI
Low Power Mode API [1].

1) TISCI_MSG_LPM_WAKE_REASON
Get which wake up source woke the SoC from Low Power Mode.
The wake up source IDs will be common for all K3 platforms.

2) TISCI_MSG_LPM_SET_DEVICE_CONSTRAINT
Set LPM constraint on behalf of a device. By setting a constraint, the
device ensures that it will not be powered off or reset in the selected
mode.

3) TISCI_MSG_LPM_SET_LATENCY_CONSTRAINT
Set LPM resume latency constraint. By setting a constraint, the host
ensures that the resume time from selected mode will be less than the
constraint value.

[1] https://software-dl.ti.com/tisci/esd/latest/2_tisci_msgs/pm/lpm.html

Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
[g-vlaev@ti.com: LPM_WAKE_REASON and IO_ISOLATION support]
Signed-off-by: Georgi Vlaev <g-vlaev@ti.com>
[a-kaur@ti.com: SET_DEVICE_CONSTRAINT support]
Signed-off-by: Akashdeep Kaur <a-kaur@ti.com>
[vibhore@ti.com: SET_LATENCY_CONSTRAINT support]
Signed-off-by: Vibhore Vardhan <vibhore@ti.com>
Signed-off-by: Kevin Hilman <khilman@baylibre.com>
Reviewed-by: Akashdeep Kaur <a-kaur@ti.com>
Tested-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Markus Schneider-Pargmann <msp@baylibre.com>
Tested-by: Kevin Hilman <khilman@baylibre.com>
Tested-by: Roger Quadros <rogerq@kernel.org>
Acked-by: Dhruva Gole <d-gole@ti.com>
Link: https://lore.kernel.org/r/20241007-tisci-syssuspendresume-v13-4-ed54cd659a49@baylibre.com
Signed-off-by: Nishanth Menon <nm@ti.com>
---
 drivers/firmware/ti_sci.c              | 198 +++++++++++++++++++++++++++++++++
 drivers/firmware/ti_sci.h              |  76 +++++++++++++
 include/linux/soc/ti/ti_sci_protocol.h |  26 +++++
 3 files changed, 300 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c
index ebf6c0756ba4..b67c35c93802 100644
--- a/drivers/firmware/ti_sci.c
+++ b/drivers/firmware/ti_sci.c
@@ -1833,6 +1833,186 @@ fail:
 	return ret;
 }
 
+/**
+ * ti_sci_msg_cmd_lpm_wake_reason() - Get the wakeup source from LPM
+ * @handle:		Pointer to TI SCI handle
+ * @source:		The wakeup source that woke the SoC from LPM
+ * @timestamp:		Timestamp of the wakeup event
+ * @pin:		The pin that has triggered wake up
+ * @mode:		The last entered low power mode
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_msg_cmd_lpm_wake_reason(const struct ti_sci_handle *handle,
+					  u32 *source, u64 *timestamp, u8 *pin, u8 *mode)
+{
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_msg_resp_lpm_wake_reason *resp;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (!handle)
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_LPM_WAKE_REASON,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(struct ti_sci_msg_hdr),
+				   sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_resp_lpm_wake_reason *)xfer->xfer_buf;
+
+	if (!ti_sci_is_response_ack(resp)) {
+		dev_err(dev, "Failed to get wake reason\n");
+		ret = -ENODEV;
+		goto fail;
+	}
+
+	if (source)
+		*source = resp->wake_source;
+	if (timestamp)
+		*timestamp = resp->wake_timestamp;
+	if (pin)
+		*pin = resp->wake_pin;
+	if (mode)
+		*mode = resp->mode;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_set_device_constraint() - Set LPM constraint on behalf of a device
+ * @handle:	pointer to TI SCI handle
+ * @id:	Device identifier
+ * @state:	The desired state of device constraint: set or clear
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_set_device_constraint(const struct ti_sci_handle *handle,
+					    u32 id, u8 state)
+{
+	struct ti_sci_info *info;
+	struct ti_sci_msg_req_lpm_set_device_constraint *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (!handle)
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_LPM_SET_DEVICE_CONSTRAINT,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_lpm_set_device_constraint *)xfer->xfer_buf;
+	req->id = id;
+	req->state = state;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+
+	if (!ti_sci_is_response_ack(resp)) {
+		dev_err(dev, "Failed to set device constraint\n");
+		ret = -ENODEV;
+	}
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_set_latency_constraint() - Set LPM resume latency constraint
+ * @handle:	pointer to TI SCI handle
+ * @latency:	maximum acceptable latency (in ms) to wake up from LPM
+ * @state:	The desired state of latency constraint: set or clear
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_set_latency_constraint(const struct ti_sci_handle *handle,
+					     u16 latency, u8 state)
+{
+	struct ti_sci_info *info;
+	struct ti_sci_msg_req_lpm_set_latency_constraint *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (!handle)
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_LPM_SET_LATENCY_CONSTRAINT,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_lpm_set_latency_constraint *)xfer->xfer_buf;
+	req->latency = latency;
+	req->state = state;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+
+	if (!ti_sci_is_response_ack(resp)) {
+		dev_err(dev, "Failed to set device constraint\n");
+		ret = -ENODEV;
+	}
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
 static int ti_sci_cmd_core_reboot(const struct ti_sci_handle *handle)
 {
 	struct ti_sci_info *info;
@@ -2975,6 +3155,7 @@ static void ti_sci_setup_ops(struct ti_sci_info *info)
 	struct ti_sci_core_ops *core_ops = &ops->core_ops;
 	struct ti_sci_dev_ops *dops = &ops->dev_ops;
 	struct ti_sci_clk_ops *cops = &ops->clk_ops;
+	struct ti_sci_pm_ops *pmops = &ops->pm_ops;
 	struct ti_sci_rm_core_ops *rm_core_ops = &ops->rm_core_ops;
 	struct ti_sci_rm_irq_ops *iops = &ops->rm_irq_ops;
 	struct ti_sci_rm_ringacc_ops *rops = &ops->rm_ring_ops;
@@ -3014,6 +3195,13 @@ static void ti_sci_setup_ops(struct ti_sci_info *info)
 	cops->set_freq = ti_sci_cmd_clk_set_freq;
 	cops->get_freq = ti_sci_cmd_clk_get_freq;
 
+	if (info->fw_caps & MSG_FLAG_CAPS_LPM_DM_MANAGED) {
+		pr_debug("detected DM managed LPM in fw_caps\n");
+		pmops->lpm_wake_reason = ti_sci_msg_cmd_lpm_wake_reason;
+		pmops->set_device_constraint = ti_sci_cmd_set_device_constraint;
+		pmops->set_latency_constraint = ti_sci_cmd_set_latency_constraint;
+	}
+
 	rm_core_ops->get_range = ti_sci_cmd_get_resource_range;
 	rm_core_ops->get_range_from_shost =
 				ti_sci_cmd_get_resource_range_from_shost;
@@ -3503,11 +3691,21 @@ static int __maybe_unused ti_sci_resume_noirq(struct device *dev)
 {
 	struct ti_sci_info *info = dev_get_drvdata(dev);
 	int ret = 0;
+	u32 source;
+	u64 time;
+	u8 pin;
+	u8 mode;
 
 	ret = ti_sci_cmd_set_io_isolation(&info->handle, TISCI_MSG_VALUE_IO_DISABLE);
 	if (ret)
 		return ret;
 
+	ret = ti_sci_msg_cmd_lpm_wake_reason(&info->handle, &source, &time, &pin, &mode);
+	/* Do not fail to resume on error as the wake reason is not critical */
+	if (!ret)
+		dev_info(dev, "ti_sci: wakeup source:0x%x, pin:0x%x, mode:0x%x\n",
+			 source, pin, mode);
+
 	return 0;
 }
 
diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h
index 8efe4d0e61fb..053387d7baa0 100644
--- a/drivers/firmware/ti_sci.h
+++ b/drivers/firmware/ti_sci.h
@@ -38,7 +38,10 @@
 
 /* Low Power Mode Requests */
 #define TI_SCI_MSG_PREPARE_SLEEP	0x0300
+#define TI_SCI_MSG_LPM_WAKE_REASON	0x0306
 #define TI_SCI_MSG_SET_IO_ISOLATION	0x0307
+#define TI_SCI_MSG_LPM_SET_DEVICE_CONSTRAINT	0x0309
+#define TI_SCI_MSG_LPM_SET_LATENCY_CONSTRAINT	0x030A
 
 /* Resource Management Requests */
 #define TI_SCI_MSG_GET_RESOURCE_RANGE	0x1500
@@ -610,6 +613,79 @@ struct ti_sci_msg_req_set_io_isolation {
 	u8 state;
 } __packed;
 
+/**
+ * struct ti_sci_msg_resp_lpm_wake_reason - Response for TI_SCI_MSG_LPM_WAKE_REASON.
+ *
+ * @hdr:		Generic header.
+ * @wake_source:	The wake up source that woke soc from LPM.
+ * @wake_timestamp:	Timestamp at which soc woke.
+ * @wake_pin: The pin that has triggered wake up.
+ * @mode: The last entered low power mode.
+ * @rsvd:	Reserved for future use.
+ *
+ * Response to a generic message with message type TI_SCI_MSG_LPM_WAKE_REASON,
+ * used to query the wake up source, pin and entered low power mode.
+ */
+struct ti_sci_msg_resp_lpm_wake_reason {
+	struct ti_sci_msg_hdr hdr;
+	u32 wake_source;
+	u64 wake_timestamp;
+	u8 wake_pin;
+	u8 mode;
+	u32 rsvd[2];
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_lpm_set_device_constraint - Request for
+ * TISCI_MSG_LPM_SET_DEVICE_CONSTRAINT.
+ *
+ * @hdr:	TISCI header to provide ACK/NAK flags to the host.
+ * @id:	Device ID of device whose constraint has to be modified.
+ * @state:	The desired state of device constraint: set or clear.
+ * @rsvd:	Reserved for future use.
+ *
+ * This message is used by host to set constraint on the device. This can be
+ * sent anytime after boot before prepare sleep message. Any device can set a
+ * constraint on the low power mode that the SoC can enter. It allows
+ * configurable information to be easily shared from the application, as this
+ * is a non-secure message and therefore can be sent by anyone. By setting a
+ * constraint, the device ensures that it will not be powered off or reset in
+ * the selected mode. Note: Access Restriction: Exclusivity flag of Device will
+ * be honored. If some other host already has constraint on this device ID,
+ * NACK will be returned.
+ */
+struct ti_sci_msg_req_lpm_set_device_constraint {
+	struct ti_sci_msg_hdr hdr;
+	u32 id;
+	u8 state;
+	u32 rsvd[2];
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_lpm_set_latency_constraint - Request for
+ * TISCI_MSG_LPM_SET_LATENCY_CONSTRAINT.
+ *
+ * @hdr:	TISCI header to provide ACK/NAK flags to the host.
+ * @wkup_latency:	The maximum acceptable latency to wake up from low power mode
+ *			in milliseconds. The deeper the state, the higher the latency.
+ * @state:	The desired state of wakeup latency constraint: set or clear.
+ * @rsvd:	Reserved for future use.
+ *
+ * This message is used by host to set wakeup latency from low power mode. This can
+ * be sent anytime after boot before prepare sleep message, and can be sent after
+ * current low power mode is exited. Any device can set a constraint on the low power
+ * mode that the SoC can enter. It allows configurable information to be easily shared
+ * from the application, as this is a non-secure message and therefore can be sent by
+ * anyone. By setting a wakeup latency constraint, the host ensures that the resume time
+ * from selected low power mode will be less than the constraint value.
+ */
+struct ti_sci_msg_req_lpm_set_latency_constraint {
+	struct ti_sci_msg_hdr hdr;
+	u16 latency;
+	u8 state;
+	u32 rsvd;
+} __packed;
+
 #define TI_SCI_IRQ_SECONDARY_HOST_INVALID	0xff
 
 /**
diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h
index 1f1871e23f76..fd104b666836 100644
--- a/include/linux/soc/ti/ti_sci_protocol.h
+++ b/include/linux/soc/ti/ti_sci_protocol.h
@@ -199,6 +199,31 @@ struct ti_sci_clk_ops {
 #define TISCI_MSG_VALUE_IO_ENABLE			1
 #define TISCI_MSG_VALUE_IO_DISABLE			0
 
+/* TISCI LPM constraint state values */
+#define TISCI_MSG_CONSTRAINT_SET			1
+#define TISCI_MSG_CONSTRAINT_CLR			0
+
+/**
+ * struct ti_sci_pm_ops - Low Power Mode (LPM) control operations
+ * @lpm_wake_reason: Get the wake up source that woke the SoC from LPM
+ *		- source: The wake up source that woke soc from LPM.
+ *		- timestamp: Timestamp at which soc woke.
+ * @set_device_constraint: Set LPM constraint on behalf of a device
+ *		- id: Device Identifier
+ *		- state: The desired state of device constraint: set or clear.
+ * @set_latency_constraint: Set LPM resume latency constraint
+ *		- latency: maximum acceptable latency to wake up from low power mode
+ *		- state: The desired state of latency constraint: set or clear.
+ */
+struct ti_sci_pm_ops {
+	int (*lpm_wake_reason)(const struct ti_sci_handle *handle,
+			       u32 *source, u64 *timestamp, u8 *pin, u8 *mode);
+	int (*set_device_constraint)(const struct ti_sci_handle *handle,
+				     u32 id, u8 state);
+	int (*set_latency_constraint)(const struct ti_sci_handle *handle,
+				      u16 latency, u8 state);
+};
+
 /**
  * struct ti_sci_resource_desc - Description of TI SCI resource instance range.
  * @start:	Start index of the first resource range.
@@ -543,6 +568,7 @@ struct ti_sci_ops {
 	struct ti_sci_core_ops core_ops;
 	struct ti_sci_dev_ops dev_ops;
 	struct ti_sci_clk_ops clk_ops;
+	struct ti_sci_pm_ops pm_ops;
 	struct ti_sci_rm_core_ops rm_core_ops;
 	struct ti_sci_rm_irq_ops rm_irq_ops;
 	struct ti_sci_rm_ringacc_ops rm_ring_ops;
-- 
cgit v1.2.3


From 037bc38b298c9a8de64f84b253c0b472228bbb10 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:03 -0700
Subject: KVM: Drop KVM_ERR_PTR_BAD_PAGE and instead return NULL to indicate an
 error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove KVM_ERR_PTR_BAD_PAGE and instead return NULL, as "bad page" is just
a leftover bit of weirdness from days of old when KVM stuffed a "bad" page
into the guest instead of actually handling missing pages.  See commit
cea7bb21280e ("KVM: MMU: Make gfn_to_page() always safe").

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-2-seanjc@google.com>
---
 arch/powerpc/kvm/book3s_pr.c          |  2 +-
 arch/powerpc/kvm/book3s_xive_native.c |  2 +-
 arch/s390/kvm/vsie.c                  |  2 +-
 arch/x86/kvm/lapic.c                  |  2 +-
 include/linux/kvm_host.h              |  7 -------
 virt/kvm/kvm_main.c                   | 15 ++++++---------
 6 files changed, 10 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 7b8ae509328f..d7721297b9b6 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -645,7 +645,7 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
 	int i;
 
 	hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
-	if (is_error_page(hpage))
+	if (!hpage)
 		return;
 
 	hpage_offset = pte->raddr & ~PAGE_MASK;
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 6e2ebbd8aaac..d9bf1bc3ff61 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -654,7 +654,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
 	}
 
 	page = gfn_to_page(kvm, gfn);
-	if (is_error_page(page)) {
+	if (!page) {
 		srcu_read_unlock(&kvm->srcu, srcu_idx);
 		pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
 		return -EINVAL;
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 89cafea4c41f..763a070f5955 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -661,7 +661,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
 	struct page *page;
 
 	page = gfn_to_page(kvm, gpa_to_gfn(gpa));
-	if (is_error_page(page))
+	if (!page)
 		return -EINVAL;
 	*hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK);
 	return 0;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2098dc689088..20526e4d6c62 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2664,7 +2664,7 @@ int kvm_alloc_apic_access_page(struct kvm *kvm)
 	}
 
 	page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
-	if (is_error_page(page)) {
+	if (!page) {
 		ret = -EFAULT;
 		goto out;
 	}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 45be36e5285f..0b280e5bad0a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -153,13 +153,6 @@ static inline bool kvm_is_error_gpa(gpa_t gpa)
 	return gpa == INVALID_GPA;
 }
 
-#define KVM_ERR_PTR_BAD_PAGE	(ERR_PTR(-ENOENT))
-
-static inline bool is_error_page(struct page *page)
-{
-	return IS_ERR(page);
-}
-
 #define KVM_REQUEST_MASK           GENMASK(7,0)
 #define KVM_REQUEST_NO_WAKEUP      BIT(8)
 #define KVM_REQUEST_WAIT           BIT(9)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 141db5b79cd4..5fc95504affe 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3066,19 +3066,14 @@ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
  */
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
-	struct page *page;
 	kvm_pfn_t pfn;
 
 	pfn = gfn_to_pfn(kvm, gfn);
 
 	if (is_error_noslot_pfn(pfn))
-		return KVM_ERR_PTR_BAD_PAGE;
-
-	page = kvm_pfn_to_refcounted_page(pfn);
-	if (!page)
-		return KVM_ERR_PTR_BAD_PAGE;
+		return NULL;
 
-	return page;
+	return kvm_pfn_to_refcounted_page(pfn);
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
@@ -3172,7 +3167,8 @@ static void kvm_set_page_accessed(struct page *page)
 
 void kvm_release_page_clean(struct page *page)
 {
-	WARN_ON(is_error_page(page));
+	if (WARN_ON(!page))
+		return;
 
 	kvm_set_page_accessed(page);
 	put_page(page);
@@ -3196,7 +3192,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
 
 void kvm_release_page_dirty(struct page *page)
 {
-	WARN_ON(is_error_page(page));
+	if (WARN_ON(!page))
+		return;
 
 	kvm_set_page_dirty(page);
 	kvm_release_page_clean(page);
-- 
cgit v1.2.3


From 3af91068b7e10dba438f70eae94d877f20842fa1 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:05 -0700
Subject: KVM: Add kvm_release_page_unused() API to put pages that KVM never
 consumes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an API to release an unused page, i.e. to put a page without marking
it accessed or dirty.  The API will be used when KVM faults-in a page but
bails before installing the guest mapping (and other similar flows).

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-4-seanjc@google.com>
---
 include/linux/kvm_host.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0b280e5bad0a..8de9acb0b35e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1216,6 +1216,15 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable);
 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
 				      bool *writable);
+
+static inline void kvm_release_page_unused(struct page *page)
+{
+	if (!page)
+		return;
+
+	put_page(page);
+}
+
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 
-- 
cgit v1.2.3


From 6419bc52072b928acb764766968c672d5fede802 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:13 -0700
Subject: KVM: Rename gfn_to_page_many_atomic() to kvm_prefetch_pages()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename gfn_to_page_many_atomic() to kvm_prefetch_pages() to try and
communicate its true purpose, as the "atomic" aspect is essentially a
side effect of the fact that x86 uses the API while holding mmu_lock.
E.g. even if mmu_lock weren't held, KVM wouldn't want to fault-in pages,
as the goal is to opportunistically grab surrounding pages that have
already been accessed and/or dirtied by the host, and to do so quickly.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-12-seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c         | 2 +-
 arch/x86/kvm/mmu/paging_tmpl.h | 2 +-
 include/linux/kvm_host.h       | 4 ++--
 virt/kvm/kvm_main.c            | 6 +++---
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 993eeba32487..37c2f8d11e05 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2965,7 +2965,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 	if (!slot)
 		return -1;
 
-	ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
+	ret = kvm_prefetch_pages(slot, gfn, pages, end - start);
 	if (ret <= 0)
 		return -1;
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 36b2607280f0..143b7e9f26dc 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -549,7 +549,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	if (!slot)
 		return false;
 
-	if (gfn_to_page_many_atomic(slot, gfn, &page, 1) != 1)
+	if (kvm_prefetch_pages(slot, gfn, &page, 1) != 1)
 		return false;
 
 	mmu_set_spte(vcpu, slot, spte, pte_access, gfn, page_to_pfn(page), NULL);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8de9acb0b35e..6a3976c1a218 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1207,8 +1207,8 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm);
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 				   struct kvm_memory_slot *slot);
 
-int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
-			    struct page **pages, int nr_pages);
+int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+		       struct page **pages, int nr_pages);
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e7561ca96a09..aa7ae0f0f90e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3041,8 +3041,8 @@ kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
-int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
-			    struct page **pages, int nr_pages)
+int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+		       struct page **pages, int nr_pages)
 {
 	unsigned long addr;
 	gfn_t entry = 0;
@@ -3056,7 +3056,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
 
 	return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
 }
-EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+EXPORT_SYMBOL_GPL(kvm_prefetch_pages);
 
 /*
  * Do not use this helper unless you are absolutely certain the gfn _must_ be
-- 
cgit v1.2.3


From e2d2ca71ac03c748dbc44e0dd7dc1557befb1ab6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:14 -0700
Subject: KVM: Drop @atomic param from gfn=>pfn and hva=>pfn APIs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop @atomic from the myriad "to_pfn" APIs now that all callers pass
"false", and remove a comment blurb about KVM running only the "GUP fast"
part in atomic context.

No functional change intended.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-13-seanjc@google.com>
---
 Documentation/virt/kvm/locking.rst     |  4 ++--
 arch/arm64/kvm/mmu.c                   |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c    |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c |  2 +-
 arch/x86/kvm/mmu/mmu.c                 | 12 ++++++------
 include/linux/kvm_host.h               |  3 +--
 virt/kvm/kvm_main.c                    | 33 ++++++++-------------------------
 virt/kvm/kvm_mm.h                      |  4 ++--
 virt/kvm/pfncache.c                    |  2 +-
 9 files changed, 23 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 693090bfc66d..f463ac42ac7a 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -135,8 +135,8 @@ We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap.
 For direct sp, we can easily avoid it since the spte of direct sp is fixed
 to gfn.  For indirect sp, we disabled fast page fault for simplicity.
 
-A solution for indirect sp could be to pin the gfn, for example via
-gfn_to_pfn_memslot_atomic, before the cmpxchg.  After the pinning:
+A solution for indirect sp could be to pin the gfn before the cmpxchg.  After
+the pinning:
 
 - We have held the refcount of pfn; that means the pfn can not be freed and
   be reused for another gfn.
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 0f7658aefa1a..9fbc79fad292 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1570,7 +1570,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	mmap_read_unlock(current->mm);
 
-	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
+	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
 				   write_fault, &writable, NULL);
 	if (pfn == KVM_PFN_ERR_HWPOISON) {
 		kvm_send_hwpoison_signal(hva, vma_shift);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 1b51b1c4713b..8cd02ca4b1b8 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -613,7 +613,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
 		write_ok = true;
 	} else {
 		/* Call KVM generic code to do the slow-path check */
-		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
+		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
 					   writing, &write_ok, NULL);
 		if (is_error_noslot_pfn(pfn))
 			return -EFAULT;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 408d98f8a514..26a969e935e3 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -852,7 +852,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 		unsigned long pfn;
 
 		/* Call KVM generic code to do the slow-path check */
-		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
+		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
 					   writing, upgrade_p, NULL);
 		if (is_error_noslot_pfn(pfn))
 			return -EFAULT;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 37c2f8d11e05..e5e0bf7593e7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4387,9 +4387,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		return kvm_faultin_pfn_private(vcpu, fault);
 
 	async = false;
-	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false,
-					  &async, fault->write,
-					  &fault->map_writable, &fault->hva);
+	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, &async,
+					  fault->write, &fault->map_writable,
+					  &fault->hva);
 	if (!async)
 		return RET_PF_CONTINUE; /* *pfn has correct page already */
 
@@ -4409,9 +4409,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	 * to wait for IO.  Note, gup always bails if it is unable to quickly
 	 * get a page and a fatal signal, i.e. SIGKILL, is pending.
 	 */
-	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true,
-					  NULL, fault->write,
-					  &fault->map_writable, &fault->hva);
+	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, NULL,
+					  fault->write, &fault->map_writable,
+					  &fault->hva);
 	return RET_PF_CONTINUE;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6a3976c1a218..32e23e05a8c3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1232,9 +1232,8 @@ kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable);
 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn);
-kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
-			       bool atomic, bool interruptible, bool *async,
+			       bool interruptible, bool *async,
 			       bool write_fault, bool *writable, hva_t *hva);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aa7ae0f0f90e..c7506eb23086 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2756,8 +2756,7 @@ static inline int check_user_page_hwpoison(unsigned long addr)
 
 /*
  * The fast path to get the writable pfn which will be stored in @pfn,
- * true indicates success, otherwise false is returned.  It's also the
- * only part that runs if we can in atomic context.
+ * true indicates success, otherwise false is returned.
  */
 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
 			    bool *writable, kvm_pfn_t *pfn)
@@ -2922,7 +2921,6 @@ out:
 /*
  * Pin guest page in memory and return its pfn.
  * @addr: host virtual address which maps memory to the guest
- * @atomic: whether this function is forbidden from sleeping
  * @interruptible: whether the process can be interrupted by non-fatal signals
  * @async: whether this function need to wait IO complete if the
  *         host page is not in the memory
@@ -2934,22 +2932,16 @@ out:
  * 2): @write_fault = false && @writable, @writable will tell the caller
  *     whether the mapping is writable.
  */
-kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
-		     bool *async, bool write_fault, bool *writable)
+kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async,
+		     bool write_fault, bool *writable)
 {
 	struct vm_area_struct *vma;
 	kvm_pfn_t pfn;
 	int npages, r;
 
-	/* we can do it either atomically or asynchronously, not both */
-	BUG_ON(atomic && async);
-
 	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
 		return pfn;
 
-	if (atomic)
-		return KVM_PFN_ERR_FAULT;
-
 	npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
 				 writable, &pfn);
 	if (npages == 1)
@@ -2986,7 +2978,7 @@ exit:
 }
 
 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
-			       bool atomic, bool interruptible, bool *async,
+			       bool interruptible, bool *async,
 			       bool write_fault, bool *writable, hva_t *hva)
 {
 	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
@@ -3008,33 +3000,24 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
 		writable = NULL;
 	}
 
-	return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
-			  writable);
+	return hva_to_pfn(addr, interruptible, async, write_fault, writable);
 }
 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
 
 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable)
 {
-	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
-				    NULL, write_fault, writable, NULL);
+	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
+				    write_fault, writable, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
-				    NULL, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
-kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
-{
-	return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
-				    NULL, NULL);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
-
 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
 	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 715f19669d01..a3fa86f60d6c 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -20,8 +20,8 @@
 #define KVM_MMU_UNLOCK(kvm)		spin_unlock(&(kvm)->mmu_lock)
 #endif /* KVM_HAVE_MMU_RWLOCK */
 
-kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
-		     bool *async, bool write_fault, bool *writable);
+kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async,
+		     bool write_fault, bool *writable);
 
 #ifdef CONFIG_HAVE_KVM_PFNCACHE
 void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index f0039efb9e1e..58c706a610e5 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -198,7 +198,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		}
 
 		/* We always request a writeable mapping */
-		new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL);
+		new_pfn = hva_to_pfn(gpc->uhva, false, NULL, true, NULL);
 		if (is_error_noslot_pfn(new_pfn))
 			goto out_error;
 
-- 
cgit v1.2.3


From 6769d1bcd3509ad2d2ee04da122c465a11a165b4 Mon Sep 17 00:00:00 2001
From: David Stevens <stevensd@chromium.org>
Date: Thu, 10 Oct 2024 11:23:18 -0700
Subject: KVM: Replace "async" pointer in gfn=>pfn with "no_wait" and error
 code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a pfn error code to communicate that hva_to_pfn() failed because I/O
was needed and disallowed, and convert @async to a constant @no_wait
boolean.  This will allow eliminating the @no_wait param by having callers
pass in FOLL_NOWAIT along with other FOLL_* flags.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: David Stevens <stevensd@chromium.org>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-17-seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c   | 18 +++++++++++-------
 include/linux/kvm_host.h |  3 ++-
 virt/kvm/kvm_main.c      | 27 ++++++++++++++-------------
 virt/kvm/kvm_mm.h        |  2 +-
 virt/kvm/pfncache.c      |  4 ++--
 5 files changed, 30 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e5e0bf7593e7..cae81209f9ed 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4381,17 +4381,21 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
 
 static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
-	bool async;
-
 	if (fault->is_private)
 		return kvm_faultin_pfn_private(vcpu, fault);
 
-	async = false;
-	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, &async,
+	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true,
 					  fault->write, &fault->map_writable,
 					  &fault->hva);
-	if (!async)
-		return RET_PF_CONTINUE; /* *pfn has correct page already */
+
+	/*
+	 * If resolving the page failed because I/O is needed to fault-in the
+	 * page, then either set up an asynchronous #PF to do the I/O, or if
+	 * doing an async #PF isn't possible, retry with I/O allowed.  All
+	 * other failures are terminal, i.e. retrying won't help.
+	 */
+	if (fault->pfn != KVM_PFN_ERR_NEEDS_IO)
+		return RET_PF_CONTINUE;
 
 	if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
 		trace_kvm_try_async_get_page(fault->addr, fault->gfn);
@@ -4409,7 +4413,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	 * to wait for IO.  Note, gup always bails if it is unable to quickly
 	 * get a page and a fatal signal, i.e. SIGKILL, is pending.
 	 */
-	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, NULL,
+	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, true,
 					  fault->write, &fault->map_writable,
 					  &fault->hva);
 	return RET_PF_CONTINUE;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 32e23e05a8c3..dc15a9a64408 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -97,6 +97,7 @@
 #define KVM_PFN_ERR_HWPOISON	(KVM_PFN_ERR_MASK + 1)
 #define KVM_PFN_ERR_RO_FAULT	(KVM_PFN_ERR_MASK + 2)
 #define KVM_PFN_ERR_SIGPENDING	(KVM_PFN_ERR_MASK + 3)
+#define KVM_PFN_ERR_NEEDS_IO	(KVM_PFN_ERR_MASK + 4)
 
 /*
  * error pfns indicate that the gfn is in slot but faild to
@@ -1233,7 +1234,7 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable);
 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
-			       bool interruptible, bool *async,
+			       bool interruptible, bool no_wait,
 			       bool write_fault, bool *writable, hva_t *hva);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f5b7fd653341..d7a72278c033 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2778,7 +2778,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
  * The slow path to get the pfn of the specified host virtual address,
  * 1 indicates success, -errno is returned if error is detected.
  */
-static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
+static int hva_to_pfn_slow(unsigned long addr, bool no_wait, bool write_fault,
 			   bool interruptible, bool *writable, kvm_pfn_t *pfn)
 {
 	/*
@@ -2801,7 +2801,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 
 	if (write_fault)
 		flags |= FOLL_WRITE;
-	if (async)
+	if (no_wait)
 		flags |= FOLL_NOWAIT;
 	if (interruptible)
 		flags |= FOLL_INTERRUPTIBLE;
@@ -2912,8 +2912,8 @@ out:
  * Pin guest page in memory and return its pfn.
  * @addr: host virtual address which maps memory to the guest
  * @interruptible: whether the process can be interrupted by non-fatal signals
- * @async: whether this function need to wait IO complete if the
- *         host page is not in the memory
+ * @no_wait: whether or not this function need to wait IO complete if the
+ *	     host page is not in the memory
  * @write_fault: whether we should get a writable host page
  * @writable: whether it allows to map a writable host page for !@write_fault
  *
@@ -2922,7 +2922,7 @@ out:
  * 2): @write_fault = false && @writable, @writable will tell the caller
  *     whether the mapping is writable.
  */
-kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async,
+kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool no_wait,
 		     bool write_fault, bool *writable)
 {
 	struct vm_area_struct *vma;
@@ -2934,7 +2934,7 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async,
 	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
 		return pfn;
 
-	npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
+	npages = hva_to_pfn_slow(addr, no_wait, write_fault, interruptible,
 				 writable, &pfn);
 	if (npages == 1)
 		return pfn;
@@ -2956,16 +2956,17 @@ retry:
 		if (r < 0)
 			pfn = KVM_PFN_ERR_FAULT;
 	} else {
-		if (async && vma_is_valid(vma, write_fault))
-			*async = true;
-		pfn = KVM_PFN_ERR_FAULT;
+		if (no_wait && vma_is_valid(vma, write_fault))
+			pfn = KVM_PFN_ERR_NEEDS_IO;
+		else
+			pfn = KVM_PFN_ERR_FAULT;
 	}
 	mmap_read_unlock(current->mm);
 	return pfn;
 }
 
 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
-			       bool interruptible, bool *async,
+			       bool interruptible, bool no_wait,
 			       bool write_fault, bool *writable, hva_t *hva)
 {
 	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
@@ -2987,21 +2988,21 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
 		writable = NULL;
 	}
 
-	return hva_to_pfn(addr, interruptible, async, write_fault, writable);
+	return hva_to_pfn(addr, interruptible, no_wait, write_fault, writable);
 }
 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
 
 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable)
 {
-	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
+	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
 				    write_fault, writable, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, false, false, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index a3fa86f60d6c..51f3fee4ca3f 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -20,7 +20,7 @@
 #define KVM_MMU_UNLOCK(kvm)		spin_unlock(&(kvm)->mmu_lock)
 #endif /* KVM_HAVE_MMU_RWLOCK */
 
-kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async,
+kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool no_wait,
 		     bool write_fault, bool *writable);
 
 #ifdef CONFIG_HAVE_KVM_PFNCACHE
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 58c706a610e5..32dc61f48c81 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -197,8 +197,8 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 			cond_resched();
 		}
 
-		/* We always request a writeable mapping */
-		new_pfn = hva_to_pfn(gpc->uhva, false, NULL, true, NULL);
+		/* We always request a writable mapping */
+		new_pfn = hva_to_pfn(gpc->uhva, false, false, true, NULL);
 		if (is_error_noslot_pfn(new_pfn))
 			goto out_error;
 
-- 
cgit v1.2.3


From cccefb0a0d3b4f7b41b1921538087dd7031876ac Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:20 -0700
Subject: KVM: Drop unused "hva" pointer from __gfn_to_pfn_memslot()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop @hva from __gfn_to_pfn_memslot() now that all callers pass NULL.

No functional change intended.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-19-seanjc@google.com>
---
 arch/arm64/kvm/mmu.c                   | 2 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c    | 2 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +-
 arch/x86/kvm/mmu/mmu.c                 | 6 ++----
 include/linux/kvm_host.h               | 2 +-
 virt/kvm/kvm_main.c                    | 9 +++------
 6 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 9fbc79fad292..246c820379ec 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1571,7 +1571,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	mmap_read_unlock(current->mm);
 
 	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-				   write_fault, &writable, NULL);
+				   write_fault, &writable);
 	if (pfn == KVM_PFN_ERR_HWPOISON) {
 		kvm_send_hwpoison_signal(hva, vma_shift);
 		return 0;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 8cd02ca4b1b8..2f1d58984b41 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -614,7 +614,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
 	} else {
 		/* Call KVM generic code to do the slow-path check */
 		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-					   writing, &write_ok, NULL);
+					   writing, &write_ok);
 		if (is_error_noslot_pfn(pfn))
 			return -EFAULT;
 		page = NULL;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 26a969e935e3..8304b6f8fe45 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -853,7 +853,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 
 		/* Call KVM generic code to do the slow-path check */
 		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-					   writing, upgrade_p, NULL);
+					   writing, upgrade_p);
 		if (is_error_noslot_pfn(pfn))
 			return -EFAULT;
 		page = NULL;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d4e21071b78e..688202ac50c2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4384,8 +4384,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		return kvm_faultin_pfn_private(vcpu, fault);
 
 	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true,
-					  fault->write, &fault->map_writable,
-					  NULL);
+					  fault->write, &fault->map_writable);
 
 	/*
 	 * If resolving the page failed because I/O is needed to fault-in the
@@ -4413,8 +4412,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	 * get a page and a fatal signal, i.e. SIGKILL, is pending.
 	 */
 	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, true,
-					  fault->write, &fault->map_writable,
-					  NULL);
+					  fault->write, &fault->map_writable);
 	return RET_PF_CONTINUE;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index dc15a9a64408..2c9eb472f059 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1235,7 +1235,7 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool interruptible, bool no_wait,
-			       bool write_fault, bool *writable, hva_t *hva);
+			       bool write_fault, bool *writable);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn);
 void kvm_release_pfn_dirty(kvm_pfn_t pfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d7a72278c033..9de915a56bd5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2967,13 +2967,10 @@ retry:
 
 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool interruptible, bool no_wait,
-			       bool write_fault, bool *writable, hva_t *hva)
+			       bool write_fault, bool *writable)
 {
 	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
-	if (hva)
-		*hva = addr;
-
 	if (kvm_is_error_hva(addr)) {
 		if (writable)
 			*writable = false;
@@ -2996,13 +2993,13 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable)
 {
 	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
-				    write_fault, writable, NULL);
+				    write_fault, writable);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, false, false, true, NULL, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, false, false, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
-- 
cgit v1.2.3


From ef7db98e477f4b379fac3131bf94c33774c4a211 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:24 -0700
Subject: KVM: Use NULL for struct page pointer to indicate mremapped memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop yet another unnecessary magic page value from KVM, as there's zero
reason to use a poisoned pointer to indicate "no page".  If KVM uses a
NULL page pointer, the kernel will explode just as quickly as if KVM uses
a poisoned pointer.  Never mind the fact that such usage would be a
blatant and egregious KVM bug.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-23-seanjc@google.com>
---
 include/linux/kvm_host.h | 4 ----
 virt/kvm/kvm_main.c      | 4 ++--
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2c9eb472f059..cd6f5cc1930f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -273,16 +273,12 @@ enum {
 	READING_SHADOW_PAGE_TABLES,
 };
 
-#define KVM_UNMAPPED_PAGE	((void *) 0x500 + POISON_POINTER_DELTA)
-
 struct kvm_host_map {
 	/*
 	 * Only valid if the 'pfn' is managed by the host kernel (i.e. There is
 	 * a 'struct page' for it. When using mem= kernel parameter some memory
 	 * can be used as guest memory but they are not managed by host
 	 * kernel).
-	 * If 'pfn' is not managed by the host kernel, this field is
-	 * initialized to KVM_UNMAPPED_PAGE.
 	 */
 	struct page *page;
 	void *hva;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4074f49eb3f1..c20386a8aa3e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3061,7 +3061,7 @@ void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
 
 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
 {
-	map->page = KVM_UNMAPPED_PAGE;
+	map->page = NULL;
 	map->hva = NULL;
 	map->gfn = gfn;
 
@@ -3087,7 +3087,7 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
 	if (!map->hva)
 		return;
 
-	if (map->page != KVM_UNMAPPED_PAGE)
+	if (map->page)
 		kunmap(map->page);
 #ifdef CONFIG_HAS_IOMEM
 	else
-- 
cgit v1.2.3


From 2ff072ba7ad2deb1c3b2d231faa0ac3a25e2451b Mon Sep 17 00:00:00 2001
From: David Stevens <stevensd@chromium.org>
Date: Thu, 10 Oct 2024 11:23:32 -0700
Subject: KVM: Migrate kvm_vcpu_map() to kvm_follow_pfn()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Migrate kvm_vcpu_map() to kvm_follow_pfn(), and have it track whether or
not the map holds a refcounted struct page.  Precisely tracking struct
page references will eventually allow removing kvm_pfn_to_refcounted_page()
and its various wrappers.

Signed-off-by: David Stevens <stevensd@chromium.org>
[sean: use a pointer instead of a boolean]
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-31-seanjc@google.com>
---
 include/linux/kvm_host.h |  2 +-
 virt/kvm/kvm_main.c      | 26 ++++++++++++++++----------
 2 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cd6f5cc1930f..35e1beb017dd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -280,6 +280,7 @@ struct kvm_host_map {
 	 * can be used as guest memory but they are not managed by host
 	 * kernel).
 	 */
+	struct page *refcounted_page;
 	struct page *page;
 	void *hva;
 	kvm_pfn_t pfn;
@@ -1238,7 +1239,6 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn);
 void kvm_set_pfn_dirty(kvm_pfn_t pfn);
 void kvm_set_pfn_accessed(kvm_pfn_t pfn);
 
-void kvm_release_pfn(kvm_pfn_t pfn, bool dirty);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 			int len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 64888257e301..842a5d5f3120 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3087,21 +3087,21 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
-void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
-{
-	if (dirty)
-		kvm_release_pfn_dirty(pfn);
-	else
-		kvm_release_pfn_clean(pfn);
-}
-
 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
 {
+	struct kvm_follow_pfn kfp = {
+		.slot = gfn_to_memslot(vcpu->kvm, gfn),
+		.gfn = gfn,
+		.flags = FOLL_WRITE,
+		.refcounted_page = &map->refcounted_page,
+	};
+
+	map->refcounted_page = NULL;
 	map->page = NULL;
 	map->hva = NULL;
 	map->gfn = gfn;
 
-	map->pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	map->pfn = kvm_follow_pfn(&kfp);
 	if (is_error_noslot_pfn(map->pfn))
 		return -EINVAL;
 
@@ -3133,10 +3133,16 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
 	if (dirty)
 		kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
 
-	kvm_release_pfn(map->pfn, dirty);
+	if (map->refcounted_page) {
+		if (dirty)
+			kvm_release_page_dirty(map->refcounted_page);
+		else
+			kvm_release_page_clean(map->refcounted_page);
+	}
 
 	map->hva = NULL;
 	map->page = NULL;
+	map->refcounted_page = NULL;
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
 
-- 
cgit v1.2.3


From 2bcb52a3602bf4cbc55d8fb4da00c930f83d7789 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:33 -0700
Subject: KVM: Pin (as in FOLL_PIN) pages during kvm_vcpu_map()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pin, as in FOLL_PIN, pages when mapping them for direct access by KVM.
As per Documentation/core-api/pin_user_pages.rst, writing to a page that
was gotten via FOLL_GET is explicitly disallowed.

  Correct (uses FOLL_PIN calls):
      pin_user_pages()
      write to the data within the pages
      unpin_user_pages()

  INCORRECT (uses FOLL_GET calls):
      get_user_pages()
      write to the data within the pages
      put_page()

Unfortunately, FOLL_PIN is a "private" flag, and so kvm_follow_pfn must
use a one-off bool instead of being able to piggyback the "flags" field.

Link: https://lwn.net/Articles/930667
Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-32-seanjc@google.com>
---
 include/linux/kvm_host.h |  2 +-
 virt/kvm/kvm_main.c      | 54 ++++++++++++++++++++++++++++++++++--------------
 virt/kvm/kvm_mm.h        |  7 +++++++
 3 files changed, 47 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 35e1beb017dd..b4c541fa5a1f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -280,7 +280,7 @@ struct kvm_host_map {
 	 * can be used as guest memory but they are not managed by host
 	 * kernel).
 	 */
-	struct page *refcounted_page;
+	struct page *pinned_page;
 	struct page *page;
 	void *hva;
 	kvm_pfn_t pfn;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 842a5d5f3120..0d59f47f099e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2814,9 +2814,12 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
 	 */
 	if (map) {
 		pfn = map->pfn;
-		page = kvm_pfn_to_refcounted_page(pfn);
-		if (page && !get_page_unless_zero(page))
-			return KVM_PFN_ERR_FAULT;
+
+		if (!kfp->pin) {
+			page = kvm_pfn_to_refcounted_page(pfn);
+			if (page && !get_page_unless_zero(page))
+				return KVM_PFN_ERR_FAULT;
+		}
 	} else {
 		pfn = page_to_pfn(page);
 	}
@@ -2834,16 +2837,24 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
 static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
 {
 	struct page *page;
+	bool r;
 
 	/*
-	 * Fast pin a writable pfn only if it is a write fault request
-	 * or the caller allows to map a writable pfn for a read fault
-	 * request.
+	 * Try the fast-only path when the caller wants to pin/get the page for
+	 * writing.  If the caller only wants to read the page, KVM must go
+	 * down the full, slow path in order to avoid racing an operation that
+	 * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing
+	 * at the old, read-only page while mm/ points at a new, writable page.
 	 */
 	if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable))
 		return false;
 
-	if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page)) {
+	if (kfp->pin)
+		r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1;
+	else
+		r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page);
+
+	if (r) {
 		*pfn = kvm_resolve_pfn(kfp, page, NULL, true);
 		return true;
 	}
@@ -2872,10 +2883,21 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
 	struct page *page, *wpage;
 	int npages;
 
-	npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags);
+	if (kfp->pin)
+		npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags);
+	else
+		npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags);
 	if (npages != 1)
 		return npages;
 
+	/*
+	 * Pinning is mutually exclusive with opportunistically mapping a read
+	 * fault as writable, as KVM should never pin pages when mapping memory
+	 * into the guest (pinning is only for direct accesses from KVM).
+	 */
+	if (WARN_ON_ONCE(kfp->map_writable && kfp->pin))
+		goto out;
+
 	/* map read fault as writable if possible */
 	if (!(flags & FOLL_WRITE) && kfp->map_writable &&
 	    get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) {
@@ -2884,6 +2906,7 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
 		flags |= FOLL_WRITE;
 	}
 
+out:
 	*pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE);
 	return npages;
 }
@@ -3093,10 +3116,11 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
 		.slot = gfn_to_memslot(vcpu->kvm, gfn),
 		.gfn = gfn,
 		.flags = FOLL_WRITE,
-		.refcounted_page = &map->refcounted_page,
+		.refcounted_page = &map->pinned_page,
+		.pin = true,
 	};
 
-	map->refcounted_page = NULL;
+	map->pinned_page = NULL;
 	map->page = NULL;
 	map->hva = NULL;
 	map->gfn = gfn;
@@ -3133,16 +3157,16 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
 	if (dirty)
 		kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
 
-	if (map->refcounted_page) {
+	if (map->pinned_page) {
 		if (dirty)
-			kvm_release_page_dirty(map->refcounted_page);
-		else
-			kvm_release_page_clean(map->refcounted_page);
+			kvm_set_page_dirty(map->pinned_page);
+		kvm_set_page_accessed(map->pinned_page);
+		unpin_user_page(map->pinned_page);
 	}
 
 	map->hva = NULL;
 	map->page = NULL;
-	map->refcounted_page = NULL;
+	map->pinned_page = NULL;
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
 
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index d3ac1ba8ba66..acef3f5c582a 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -30,6 +30,13 @@ struct kvm_follow_pfn {
 	/* FOLL_* flags modifying lookup behavior, e.g. FOLL_WRITE. */
 	unsigned int flags;
 
+	/*
+	 * Pin the page (effectively FOLL_PIN, which is an mm/ internal flag).
+	 * The page *must* be pinned if KVM will write to the page via a kernel
+	 * mapping, e.g. via kmap(), mremap(), etc.
+	 */
+	bool pin;
+
 	/*
 	 * If non-NULL, try to get a writable mapping even for a read fault.
 	 * Set to true if a writable mapping was obtained.
-- 
cgit v1.2.3


From 365e319208442a0807a96e9ea4d0b1fa338f1929 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:35 -0700
Subject: KVM: Pass in write/dirty to kvm_vcpu_map(), not kvm_vcpu_unmap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that all kvm_vcpu_{,un}map() users pass "true" for @dirty, have them
pass "true" as a @writable param to kvm_vcpu_map(), and thus create a
read-only mapping when possible.

Note, creating read-only mappings can be theoretically slower, as they
don't play nice with fast GUP due to the need to break CoW before mapping
the underlying PFN.  But practically speaking, creating a mapping isn't
a super hot path, and getting a writable mapping for reading is weird and
confusing.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-34-seanjc@google.com>
---
 arch/x86/kvm/svm/nested.c |  4 ++--
 arch/x86/kvm/svm/sev.c    |  2 +-
 arch/x86/kvm/svm/svm.c    |  8 ++++----
 arch/x86/kvm/vmx/nested.c | 16 ++++++++--------
 include/linux/kvm_host.h  | 20 ++++++++++++++++++--
 virt/kvm/kvm_main.c       | 12 +++++++-----
 6 files changed, 40 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index cf84103ce38b..b708bdf7eaff 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -926,7 +926,7 @@ out_exit_err:
 	nested_svm_vmexit(svm);
 
 out:
-	kvm_vcpu_unmap(vcpu, &map, true);
+	kvm_vcpu_unmap(vcpu, &map);
 
 	return ret;
 }
@@ -1130,7 +1130,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 				       vmcb12->control.exit_int_info_err,
 				       KVM_ISA_SVM);
 
-	kvm_vcpu_unmap(vcpu, &map, true);
+	kvm_vcpu_unmap(vcpu, &map);
 
 	nested_svm_transition_tlb_flush(vcpu);
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0b851ef937f2..4557ff3804ae 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3468,7 +3468,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
 
 	sev_es_sync_to_ghcb(svm);
 
-	kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true);
+	kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map);
 	svm->sev_es.ghcb = NULL;
 }
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9df3e1e5ae81..c1e29307826b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2299,7 +2299,7 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
 		svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
 	}
 
-	kvm_vcpu_unmap(vcpu, &map, true);
+	kvm_vcpu_unmap(vcpu, &map);
 
 	return ret;
 }
@@ -4714,7 +4714,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
 	svm_copy_vmrun_state(map_save.hva + 0x400,
 			     &svm->vmcb01.ptr->save);
 
-	kvm_vcpu_unmap(vcpu, &map_save, true);
+	kvm_vcpu_unmap(vcpu, &map_save);
 	return 0;
 }
 
@@ -4774,9 +4774,9 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
 	svm->nested.nested_run_pending = 1;
 
 unmap_save:
-	kvm_vcpu_unmap(vcpu, &map_save, true);
+	kvm_vcpu_unmap(vcpu, &map_save);
 unmap_map:
-	kvm_vcpu_unmap(vcpu, &map, true);
+	kvm_vcpu_unmap(vcpu, &map);
 	return ret;
 }
 
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ff83b56fe2fa..259fe445e695 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -231,7 +231,7 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
 	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
+	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
 	vmx->nested.hv_evmcs = NULL;
 	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
 
@@ -318,9 +318,9 @@ static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, true);
-	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
-	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
+	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
+	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
 	vmx->nested.pi_desc = NULL;
 }
 
@@ -624,7 +624,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	int msr;
 	unsigned long *msr_bitmap_l1;
 	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
-	struct kvm_host_map msr_bitmap_map;
+	struct kvm_host_map map;
 
 	/* Nothing to do if the MSR bitmap is not in use.  */
 	if (!cpu_has_vmx_msr_bitmap() ||
@@ -647,10 +647,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 			return true;
 	}
 
-	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &msr_bitmap_map))
+	if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
 		return false;
 
-	msr_bitmap_l1 = (unsigned long *)msr_bitmap_map.hva;
+	msr_bitmap_l1 = (unsigned long *)map.hva;
 
 	/*
 	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
@@ -714,7 +714,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
 					 MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
 
-	kvm_vcpu_unmap(vcpu, &msr_bitmap_map, false);
+	kvm_vcpu_unmap(vcpu, &map);
 
 	vmx->nested.force_msr_bitmap_recalc = false;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b4c541fa5a1f..101dbf2be1ce 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -285,6 +285,7 @@ struct kvm_host_map {
 	void *hva;
 	kvm_pfn_t pfn;
 	kvm_pfn_t gfn;
+	bool writable;
 };
 
 /*
@@ -1311,8 +1312,23 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
 
 struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);
-int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
-void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
+
+int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map,
+		   bool writable);
+void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map);
+
+static inline int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa,
+			       struct kvm_host_map *map)
+{
+	return __kvm_vcpu_map(vcpu, gpa, map, true);
+}
+
+static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa,
+					struct kvm_host_map *map)
+{
+	return __kvm_vcpu_map(vcpu, gpa, map, false);
+}
+
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0d59f47f099e..baa741c2b81c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3110,7 +3110,8 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
-int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
+int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
+		   bool writable)
 {
 	struct kvm_follow_pfn kfp = {
 		.slot = gfn_to_memslot(vcpu->kvm, gfn),
@@ -3124,6 +3125,7 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
 	map->page = NULL;
 	map->hva = NULL;
 	map->gfn = gfn;
+	map->writable = writable;
 
 	map->pfn = kvm_follow_pfn(&kfp);
 	if (is_error_noslot_pfn(map->pfn))
@@ -3140,9 +3142,9 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
 
 	return map->hva ? 0 : -EFAULT;
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_map);
+EXPORT_SYMBOL_GPL(__kvm_vcpu_map);
 
-void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
+void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map)
 {
 	if (!map->hva)
 		return;
@@ -3154,11 +3156,11 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
 		memunmap(map->hva);
 #endif
 
-	if (dirty)
+	if (map->writable)
 		kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
 
 	if (map->pinned_page) {
-		if (dirty)
+		if (map->writable)
 			kvm_set_page_dirty(map->pinned_page);
 		kvm_set_page_accessed(map->pinned_page);
 		unpin_user_page(map->pinned_page);
-- 
cgit v1.2.3


From 21dd877060d49f1f57901f929189653fc42ac37a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:44 -0700
Subject: KVM: Move declarations of memslot accessors up in kvm_host.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the memslot lookup helpers further up in kvm_host.h so that they can
be used by inlined "to pfn" wrappers.

No functional change intended.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-43-seanjc@google.com>
---
 include/linux/kvm_host.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 101dbf2be1ce..8d35a36e7707 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1168,6 +1168,10 @@ static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_
 	     kvm_memslot_iter_is_valid(iter, end);			\
 	     kvm_memslot_iter_next(iter))
 
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
+struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
+struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);
+
 /*
  * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations:
  * - create a new memory slot
@@ -1303,16 +1307,12 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 })
 
 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
 void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
 
-struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
-struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);
-
 int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map,
 		   bool writable);
 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map);
-- 
cgit v1.2.3


From 1c7b627e930624dd64ee906df554c8f2bad628ff Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:45 -0700
Subject: KVM: Add kvm_faultin_pfn() to specifically service guest page faults
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a new dedicated API, kvm_faultin_pfn(), for servicing guest page
faults, i.e. for getting pages/pfns that will be mapped into the guest via
an mmu_notifier-protected KVM MMU.  Keep struct kvm_follow_pfn buried in
internal code, as having __kvm_faultin_pfn() take "out" params is actually
cleaner for several architectures, e.g. it allows the caller to have its
own "page fault" structure without having to marshal data to/from
kvm_follow_pfn.

Long term, common KVM would ideally provide a kvm_page_fault structure, a
la x86's struct of the same name.  But all architectures need to be
converted to a common API before that can happen.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-44-seanjc@google.com>
---
 include/linux/kvm_host.h | 12 ++++++++++++
 virt/kvm/kvm_main.c      | 22 ++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8d35a36e7707..a63b0325d3e2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1231,6 +1231,18 @@ static inline void kvm_release_page_unused(struct page *page)
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 
+kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn,
+			    unsigned int foll, bool *writable,
+			    struct page **refcounted_page);
+
+static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+					bool write, bool *writable,
+					struct page **refcounted_page)
+{
+	return __kvm_faultin_pfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn,
+				 write ? FOLL_WRITE : 0, writable, refcounted_page);
+}
+
 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index becf640e369c..f79745d6500c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3092,6 +3092,28 @@ kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
+kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn,
+			    unsigned int foll, bool *writable,
+			    struct page **refcounted_page)
+{
+	struct kvm_follow_pfn kfp = {
+		.slot = slot,
+		.gfn = gfn,
+		.flags = foll,
+		.map_writable = writable,
+		.refcounted_page = refcounted_page,
+	};
+
+	if (WARN_ON_ONCE(!writable || !refcounted_page))
+		return KVM_PFN_ERR_FAULT;
+
+	*writable = false;
+	*refcounted_page = NULL;
+
+	return kvm_follow_pfn(&kfp);
+}
+EXPORT_SYMBOL_GPL(__kvm_faultin_pfn);
+
 int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
 		       struct page **pages, int nr_pages)
 {
-- 
cgit v1.2.3


From 1fbee5b01a0fd27db571eed757682a7c20045107 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:48 -0700
Subject: KVM: guest_memfd: Provide "struct page" as output from
 kvm_gmem_get_pfn()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Provide the "struct page" associated with a guest_memfd pfn as an output
from __kvm_gmem_get_pfn() so that KVM guest page fault handlers can
directly put the page instead of having to rely on
kvm_pfn_to_refcounted_page().

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-47-seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c   |  2 +-
 arch/x86/kvm/svm/sev.c   | 10 ++++++----
 include/linux/kvm_host.h |  6 ++++--
 virt/kvm/guest_memfd.c   |  8 ++++++--
 4 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2bea2d20c571..c657c3c449c8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4407,7 +4407,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
 	}
 
 	r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
-			     &max_order);
+			     &fault->refcounted_page, &max_order);
 	if (r) {
 		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
 		return r;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 4557ff3804ae..c6c852485900 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3849,6 +3849,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
 	if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) {
 		gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
 		struct kvm_memory_slot *slot;
+		struct page *page;
 		kvm_pfn_t pfn;
 
 		slot = gfn_to_memslot(vcpu->kvm, gfn);
@@ -3859,7 +3860,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
 		 * The new VMSA will be private memory guest memory, so
 		 * retrieve the PFN from the gmem backend.
 		 */
-		if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL))
+		if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
 			return -EINVAL;
 
 		/*
@@ -3888,7 +3889,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
 		 * changes then care should be taken to ensure
 		 * svm->sev_es.vmsa is pinned through some other means.
 		 */
-		kvm_release_pfn_clean(pfn);
+		kvm_release_page_clean(page);
 	}
 
 	/*
@@ -4688,6 +4689,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
 	struct kvm_memory_slot *slot;
 	struct kvm *kvm = vcpu->kvm;
 	int order, rmp_level, ret;
+	struct page *page;
 	bool assigned;
 	kvm_pfn_t pfn;
 	gfn_t gfn;
@@ -4714,7 +4716,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
 		return;
 	}
 
-	ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order);
+	ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order);
 	if (ret) {
 		pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
 				    gpa);
@@ -4772,7 +4774,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
 out:
 	trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
 out_no_trace:
-	put_page(pfn_to_page(pfn));
+	kvm_release_page_unused(page);
 }
 
 static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a63b0325d3e2..6efdc00b4254 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2487,11 +2487,13 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
 
 #ifdef CONFIG_KVM_PRIVATE_MEM
 int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
-		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
+		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
+		     int *max_order);
 #else
 static inline int kvm_gmem_get_pfn(struct kvm *kvm,
 				   struct kvm_memory_slot *slot, gfn_t gfn,
-				   kvm_pfn_t *pfn, int *max_order)
+				   kvm_pfn_t *pfn, struct page **page,
+				   int *max_order)
 {
 	KVM_BUG_ON(1, kvm);
 	return -EIO;
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 8a878e57c5d4..47a9f68f7b24 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -594,7 +594,8 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
 }
 
 int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
-		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
+		     int *max_order)
 {
 	pgoff_t index = kvm_gmem_get_index(slot, gfn);
 	struct file *file = kvm_gmem_get_file(slot);
@@ -615,7 +616,10 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
 
 	folio_unlock(folio);
-	if (r < 0)
+
+	if (!r)
+		*page = folio_file_page(folio, index);
+	else
 		folio_put(folio);
 
 out:
-- 
cgit v1.2.3


From dc06193532af4ba88ed20daeef88f22b053ebb91 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:23:51 -0700
Subject: KVM: Move x86's API to release a faultin page to common KVM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move KVM x86's helper that "finishes" the faultin process to common KVM
so that the logic can be shared across all architectures.  Note, not all
architectures implement a fast page fault path, but the gist of the
comment applies to all architectures.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-50-seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c   | 24 ++----------------------
 include/linux/kvm_host.h | 26 ++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f3a4ed6afec2..0aae8e63566c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4377,28 +4377,8 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
 static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
 				      struct kvm_page_fault *fault, int r)
 {
-	lockdep_assert_once(lockdep_is_held(&vcpu->kvm->mmu_lock) ||
-			    r == RET_PF_RETRY);
-
-	if (!fault->refcounted_page)
-		return;
-
-	/*
-	 * If the page that KVM got from the *primary MMU* is writable, and KVM
-	 * installed or reused a SPTE, mark the page/folio dirty.  Note, this
-	 * may mark a folio dirty even if KVM created a read-only SPTE, e.g. if
-	 * the GFN is write-protected.  Folios can't be safely marked dirty
-	 * outside of mmu_lock as doing so could race with writeback on the
-	 * folio.  As a result, KVM can't mark folios dirty in the fast page
-	 * fault handler, and so KVM must (somewhat) speculatively mark the
-	 * folio dirty if KVM could locklessly make the SPTE writable.
-	 */
-	if (r == RET_PF_RETRY)
-		kvm_release_page_unused(fault->refcounted_page);
-	else if (!fault->map_writable)
-		kvm_release_page_clean(fault->refcounted_page);
-	else
-		kvm_release_page_dirty(fault->refcounted_page);
+	kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page,
+				 r == RET_PF_RETRY, fault->map_writable);
 }
 
 static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6efdc00b4254..3e06393e5f1e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1231,6 +1231,32 @@ static inline void kvm_release_page_unused(struct page *page)
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 
+static inline void kvm_release_faultin_page(struct kvm *kvm, struct page *page,
+					    bool unused, bool dirty)
+{
+	lockdep_assert_once(lockdep_is_held(&kvm->mmu_lock) || unused);
+
+	if (!page)
+		return;
+
+	/*
+	 * If the page that KVM got from the *primary MMU* is writable, and KVM
+	 * installed or reused a SPTE, mark the page/folio dirty.  Note, this
+	 * may mark a folio dirty even if KVM created a read-only SPTE, e.g. if
+	 * the GFN is write-protected.  Folios can't be safely marked dirty
+	 * outside of mmu_lock as doing so could race with writeback on the
+	 * folio.  As a result, KVM can't mark folios dirty in the fast page
+	 * fault handler, and so KVM must (somewhat) speculatively mark the
+	 * folio dirty if KVM could locklessly make the SPTE writable.
+	 */
+	if (unused)
+		kvm_release_page_unused(page);
+	else if (dirty)
+		kvm_release_page_dirty(page);
+	else
+		kvm_release_page_clean(page);
+}
+
 kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn,
 			    unsigned int foll, bool *writable,
 			    struct page **refcounted_page);
-- 
cgit v1.2.3


From f42e289a2095f61755e6ca5fd1370d441bf589d5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:24:18 -0700
Subject: KVM: Add support for read-only usage of gfn_to_page()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rework gfn_to_page() to support read-only accesses so that it can be used
by arm64 to get MTE tags out of guest memory.

Opportunistically rewrite the comment to be even more stern about using
gfn_to_page(), as there are very few scenarios where requiring a struct
page is actually the right thing to do (though there are such scenarios).
Add a FIXME to call out that KVM probably should be pinning pages, not
just getting pages.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-77-seanjc@google.com>
---
 include/linux/kvm_host.h |  7 ++++++-
 virt/kvm/kvm_main.c      | 15 ++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3e06393e5f1e..96cf9e5660c3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1213,7 +1213,12 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
 		       struct page **pages, int nr_pages);
 
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write);
+static inline struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+{
+	return __gfn_to_page(kvm, gfn, true);
+}
+
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable);
 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5a424598610f..1c11a05a97af 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3132,25 +3132,26 @@ int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
 EXPORT_SYMBOL_GPL(kvm_prefetch_pages);
 
 /*
- * Do not use this helper unless you are absolutely certain the gfn _must_ be
- * backed by 'struct page'.  A valid example is if the backing memslot is
- * controlled by KVM.  Note, if the returned page is valid, it's refcount has
- * been elevated by gfn_to_pfn().
+ * Don't use this API unless you are absolutely, positively certain that KVM
+ * needs to get a struct page, e.g. to pin the page for firmware DMA.
+ *
+ * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate
+ *	  its refcount.
  */
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write)
 {
 	struct page *refcounted_page = NULL;
 	struct kvm_follow_pfn kfp = {
 		.slot = gfn_to_memslot(kvm, gfn),
 		.gfn = gfn,
-		.flags = FOLL_WRITE,
+		.flags = write ? FOLL_WRITE : 0,
 		.refcounted_page = &refcounted_page,
 	};
 
 	(void)kvm_follow_pfn(&kfp);
 	return refcounted_page;
 }
-EXPORT_SYMBOL_GPL(gfn_to_page);
+EXPORT_SYMBOL_GPL(__gfn_to_page);
 
 int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
 		   bool writable)
-- 
cgit v1.2.3


From 06cdaff80e50e3fb74e5e3101e1d5d7aa8b68da6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:24:21 -0700
Subject: KVM: Drop gfn_to_pfn() APIs now that all users are gone
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop gfn_to_pfn() and all its variants now that all users are gone.

No functional change intended.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-80-seanjc@google.com>
---
 include/linux/kvm_host.h |  8 --------
 virt/kvm/kvm_main.c      | 53 ------------------------------------------------
 2 files changed, 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 96cf9e5660c3..4a1eaa40a215 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1274,14 +1274,6 @@ static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 				 write ? FOLL_WRITE : 0, writable, refcounted_page);
 }
 
-kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
-kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
-		      bool *writable);
-kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn);
-kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
-			       bool interruptible, bool no_wait,
-			       bool write_fault, bool *writable);
-
 void kvm_release_pfn_clean(kvm_pfn_t pfn);
 void kvm_release_pfn_dirty(kvm_pfn_t pfn);
 void kvm_set_pfn_dirty(kvm_pfn_t pfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1c11a05a97af..b2c8d429442d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3039,59 +3039,6 @@ static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp)
 	return hva_to_pfn(kfp);
 }
 
-kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
-			       bool interruptible, bool no_wait,
-			       bool write_fault, bool *writable)
-{
-	struct kvm_follow_pfn kfp = {
-		.slot = slot,
-		.gfn = gfn,
-		.map_writable = writable,
-	};
-
-	if (write_fault)
-		kfp.flags |= FOLL_WRITE;
-	if (no_wait)
-		kfp.flags |= FOLL_NOWAIT;
-	if (interruptible)
-		kfp.flags |= FOLL_INTERRUPTIBLE;
-
-	return kvm_follow_pfn(&kfp);
-}
-EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
-
-kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
-		      bool *writable)
-{
-	struct kvm_follow_pfn kfp = {
-		.slot = gfn_to_memslot(kvm, gfn),
-		.gfn = gfn,
-		.flags = write_fault ? FOLL_WRITE : 0,
-		.map_writable = writable,
-	};
-
-	return kvm_follow_pfn(&kfp);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
-
-kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
-{
-	struct kvm_follow_pfn kfp = {
-		.slot = slot,
-		.gfn = gfn,
-		.flags = FOLL_WRITE,
-	};
-
-	return kvm_follow_pfn(&kfp);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
-
-kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn);
-
 kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn,
 			    unsigned int foll, bool *writable,
 			    struct page **refcounted_page)
-- 
cgit v1.2.3


From 93b7da404f5b0b02a4211bbb784889f001d27953 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:24:26 -0700
Subject: KVM: Drop APIs that manipulate "struct page" via pfns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove all kvm_{release,set}_pfn_*() APIs now that all users are gone.

No functional change intended.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-85-seanjc@google.com>
---
 include/linux/kvm_host.h |  5 -----
 virt/kvm/kvm_main.c      | 55 ------------------------------------------------
 2 files changed, 60 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4a1eaa40a215..d045f8310a48 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1274,11 +1274,6 @@ static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 				 write ? FOLL_WRITE : 0, writable, refcounted_page);
 }
 
-void kvm_release_pfn_clean(kvm_pfn_t pfn);
-void kvm_release_pfn_dirty(kvm_pfn_t pfn);
-void kvm_set_pfn_dirty(kvm_pfn_t pfn);
-void kvm_set_pfn_accessed(kvm_pfn_t pfn);
-
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 			int len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a483da96f4be..396ca14f18f3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3164,61 +3164,6 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
 
-void kvm_release_pfn_clean(kvm_pfn_t pfn)
-{
-	struct page *page;
-
-	if (is_error_noslot_pfn(pfn))
-		return;
-
-	page = kvm_pfn_to_refcounted_page(pfn);
-	if (!page)
-		return;
-
-	kvm_release_page_clean(page);
-}
-EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
-
-void kvm_release_pfn_dirty(kvm_pfn_t pfn)
-{
-	struct page *page;
-
-	if (is_error_noslot_pfn(pfn))
-		return;
-
-	page = kvm_pfn_to_refcounted_page(pfn);
-	if (!page)
-		return;
-
-	kvm_release_page_dirty(page);
-}
-EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
-
-/*
- * Note, checking for an error/noslot pfn is the caller's responsibility when
- * directly marking a page dirty/accessed.  Unlike the "release" helpers, the
- * "set" helpers are not to be used when the pfn might point at garbage.
- */
-void kvm_set_pfn_dirty(kvm_pfn_t pfn)
-{
-	if (WARN_ON(is_error_noslot_pfn(pfn)))
-		return;
-
-	if (pfn_valid(pfn))
-		kvm_set_page_dirty(pfn_to_page(pfn));
-}
-EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
-
-void kvm_set_pfn_accessed(kvm_pfn_t pfn)
-{
-	if (WARN_ON(is_error_noslot_pfn(pfn)))
-		return;
-
-	if (pfn_valid(pfn))
-		kvm_set_page_accessed(pfn_to_page(pfn));
-}
-EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
-
 static int next_segment(unsigned long len, int offset)
 {
 	if (len > PAGE_SIZE - offset)
-- 
cgit v1.2.3


From 8b15c3764c05ed8766709711d2054d96349dee8e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 10 Oct 2024 11:24:27 -0700
Subject: KVM: Don't grab reference on VM_MIXEDMAP pfns that have a "struct
 page"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that KVM no longer relies on an ugly heuristic to find its struct page
references, i.e. now that KVM can't get false positives on VM_MIXEDMAP
pfns, remove KVM's hack to elevate the refcount for pfns that happen to
have a valid struct page.  In addition to removing a long-standing wart
in KVM, this allows KVM to map non-refcounted struct page memory into the
guest, e.g. for exposing GPU TTM buffers to KVM guests.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-86-seanjc@google.com>
---
 include/linux/kvm_host.h |  3 --
 virt/kvm/kvm_main.c      | 75 ++----------------------------------------------
 2 files changed, 2 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d045f8310a48..02f0206fd2dc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1730,9 +1730,6 @@ void kvm_arch_sync_events(struct kvm *kvm);
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 
-struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn);
-bool kvm_is_zone_device_page(struct page *page);
-
 struct kvm_irq_ack_notifier {
 	struct hlist_node link;
 	unsigned gsi;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 396ca14f18f3..b1b10dc408a0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -160,52 +160,6 @@ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
 {
 }
 
-bool kvm_is_zone_device_page(struct page *page)
-{
-	/*
-	 * The metadata used by is_zone_device_page() to determine whether or
-	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
-	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
-	 * page_count() is zero to help detect bad usage of this helper.
-	 */
-	if (WARN_ON_ONCE(!page_count(page)))
-		return false;
-
-	return is_zone_device_page(page);
-}
-
-/*
- * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
- * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
- * is likely incomplete, it has been compiled purely through people wanting to
- * back guest with a certain type of memory and encountering issues.
- */
-struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
-{
-	struct page *page;
-
-	if (!pfn_valid(pfn))
-		return NULL;
-
-	page = pfn_to_page(pfn);
-	if (!PageReserved(page))
-		return page;
-
-	/* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
-	if (is_zero_pfn(pfn))
-		return page;
-
-	/*
-	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
-	 * perspective they are "normal" pages, albeit with slightly different
-	 * usage rules.
-	 */
-	if (kvm_is_zone_device_page(page))
-		return page;
-
-	return NULL;
-}
-
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
@@ -2804,35 +2758,10 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
 	if (kfp->map_writable)
 		*kfp->map_writable = writable;
 
-	/*
-	 * FIXME: Remove this once KVM no longer blindly calls put_page() on
-	 *	  every pfn that points at a struct page.
-	 *
-	 * Get a reference for follow_pte() pfns if they happen to point at a
-	 * struct page, as KVM will ultimately call kvm_release_pfn_clean() on
-	 * the returned pfn, i.e. KVM expects to have a reference.
-	 *
-	 * Certain IO or PFNMAP mappings can be backed with valid struct pages,
-	 * but be allocated without refcounting, e.g. tail pages of
-	 * non-compound higher order allocations.  Grabbing and putting a
-	 * reference to such pages would cause KVM to prematurely free a page
-	 * it doesn't own (KVM gets and puts the one and only reference).
-	 * Don't allow those pages until the FIXME is resolved.
-	 *
-	 * Don't grab a reference for pins, callers that pin pages are required
-	 * to check refcounted_page, i.e. must not blindly release the pfn.
-	 */
-	if (map) {
+	if (map)
 		pfn = map->pfn;
-
-		if (!kfp->pin) {
-			page = kvm_pfn_to_refcounted_page(pfn);
-			if (page && !get_page_unless_zero(page))
-				return KVM_PFN_ERR_FAULT;
-		}
-	} else {
+	else
 		pfn = page_to_pfn(page);
-	}
 
 	*kfp->refcounted_page = page;
 
-- 
cgit v1.2.3


From 6860d28ccb2390b4eeda32ab2ce7eb10f71921e1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 15 Oct 2024 12:08:39 +0200
Subject: timekeeping: Reorder struct timekeeper

struct timekeeper is ordered suboptimal vs. cachelines. The layout,
including the preceding seqcount (see struct tk_core in timekeeper.c) is:

 cacheline 0:   seqcount, tkr_mono
 cacheline 1:   tkr_raw, xtime_sec
 cacheline 2:   ktime_sec ... tai_offset, internal variables
 cacheline 3:	next_leap_ktime, raw_sec, internal variables
 cacheline 4:	internal variables

So any access to via ktime_get*() except for access to CLOCK_MONOTONIC_RAW
will use either cachelines 0 + 1 or cachelines 0 + 2. Access to
CLOCK_MONOTONIC_RAW uses cachelines 0 + 1 + 3.

Reorder the members so that the result is more efficient:

 cacheline 0:   seqcount, tkr_mono
 cacheline 1:   xtime_sec, ktime_sec ... tai_offset
 cacheline 2:	tkr_raw, raw_sec
 cacheline 3:	internal variables
 cacheline 4:	internal variables

That means ktime_get*() will access cacheline 0 + 1 and CLOCK_MONOTONIC_RAW
access will use cachelines 0 + 2.

Update kernel-doc and fix formatting issues while at it. Also fix a typo
in struct tk_read_base kernel-doc.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20241015100839.12702-1-anna-maria@linutronix.de
---
 include/linux/timekeeper_internal.h | 106 ++++++++++++++++++++++--------------
 1 file changed, 65 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 902c20ef495a..a3b6380a7777 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -26,7 +26,7 @@
  * occupies a single 64byte cache line.
  *
  * The struct is separate from struct timekeeper as it is also used
- * for a fast NMI safe accessors.
+ * for the fast NMI safe accessors.
  *
  * @base_real is for the fast NMI safe accessor to allow reading clock
  * realtime from any context.
@@ -44,33 +44,41 @@ struct tk_read_base {
 
 /**
  * struct timekeeper - Structure holding internal timekeeping values.
- * @tkr_mono:		The readout base structure for CLOCK_MONOTONIC
- * @tkr_raw:		The readout base structure for CLOCK_MONOTONIC_RAW
- * @xtime_sec:		Current CLOCK_REALTIME time in seconds
- * @ktime_sec:		Current CLOCK_MONOTONIC time in seconds
- * @wall_to_monotonic:	CLOCK_REALTIME to CLOCK_MONOTONIC offset
- * @offs_real:		Offset clock monotonic -> clock realtime
- * @offs_boot:		Offset clock monotonic -> clock boottime
- * @offs_tai:		Offset clock monotonic -> clock tai
- * @tai_offset:		The current UTC to TAI offset in seconds
- * @clock_was_set_seq:	The sequence number of clock was set events
- * @cs_was_changed_seq:	The sequence number of clocksource change events
- * @next_leap_ktime:	CLOCK_MONOTONIC time value of a pending leap-second
- * @raw_sec:		CLOCK_MONOTONIC_RAW  time in seconds
- * @monotonic_to_boot:	CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
- * @cycle_interval:	Number of clock cycles in one NTP interval
- * @xtime_interval:	Number of clock shifted nano seconds in one NTP
- *			interval.
- * @xtime_remainder:	Shifted nano seconds left over when rounding
- *			@cycle_interval
- * @raw_interval:	Shifted raw nano seconds accumulated per NTP interval.
- * @ntp_error:		Difference between accumulated time and NTP time in ntp
- *			shifted nano seconds.
- * @ntp_error_shift:	Shift conversion between clock shifted nano seconds and
- *			ntp shifted nano seconds.
- * @last_warning:	Warning ratelimiter (DEBUG_TIMEKEEPING)
- * @underflow_seen:	Underflow warning flag (DEBUG_TIMEKEEPING)
- * @overflow_seen:	Overflow warning flag (DEBUG_TIMEKEEPING)
+ * @tkr_mono:			The readout base structure for CLOCK_MONOTONIC
+ * @xtime_sec:			Current CLOCK_REALTIME time in seconds
+ * @ktime_sec:			Current CLOCK_MONOTONIC time in seconds
+ * @wall_to_monotonic:		CLOCK_REALTIME to CLOCK_MONOTONIC offset
+ * @offs_real:			Offset clock monotonic -> clock realtime
+ * @offs_boot:			Offset clock monotonic -> clock boottime
+ * @offs_tai:			Offset clock monotonic -> clock tai
+ * @tai_offset:			The current UTC to TAI offset in seconds
+ * @tkr_raw:			The readout base structure for CLOCK_MONOTONIC_RAW
+ * @raw_sec:			CLOCK_MONOTONIC_RAW  time in seconds
+ * @clock_was_set_seq:		The sequence number of clock was set events
+ * @cs_was_changed_seq:		The sequence number of clocksource change events
+ * @monotonic_to_boot:		CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
+ * @cycle_interval:		Number of clock cycles in one NTP interval
+ * @xtime_interval:		Number of clock shifted nano seconds in one NTP
+ *				interval.
+ * @xtime_remainder:		Shifted nano seconds left over when rounding
+ *				@cycle_interval
+ * @raw_interval:		Shifted raw nano seconds accumulated per NTP interval.
+ * @next_leap_ktime:		CLOCK_MONOTONIC time value of a pending leap-second
+ * @ntp_tick:			The ntp_tick_length() value currently being
+ *				used. This cached copy ensures we consistently
+ *				apply the tick length for an entire tick, as
+ *				ntp_tick_length may change mid-tick, and we don't
+ *				want to apply that new value to the tick in
+ *				progress.
+ * @ntp_error:			Difference between accumulated time and NTP time in ntp
+ *				shifted nano seconds.
+ * @ntp_error_shift:		Shift conversion between clock shifted nano seconds and
+ *				ntp shifted nano seconds.
+ * @ntp_err_mult:		Multiplication factor for scaled math conversion
+ * @skip_second_overflow:	Flag used to avoid updating NTP twice with same second
+ * @last_warning:		Warning ratelimiter (DEBUG_TIMEKEEPING)
+ * @underflow_seen:		Underflow warning flag (DEBUG_TIMEKEEPING)
+ * @overflow_seen:		Overflow warning flag (DEBUG_TIMEKEEPING)
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
  * we need to add to xtime (or xtime corrected for sub jiffy times)
@@ -88,10 +96,28 @@ struct tk_read_base {
  *
  * @monotonic_to_boottime is a timespec64 representation of @offs_boot to
  * accelerate the VDSO update for CLOCK_BOOTTIME.
+ *
+ * The cacheline ordering of the structure is optimized for in kernel usage of
+ * the ktime_get() and ktime_get_ts64() family of time accessors. Struct
+ * timekeeper is prepended in the core timekeeping code with a sequence count,
+ * which results in the following cacheline layout:
+ *
+ * 0:	seqcount, tkr_mono
+ * 1:	xtime_sec ... tai_offset
+ * 2:	tkr_raw, raw_sec
+ * 3,4: Internal variables
+ *
+ * Cacheline 0,1 contain the data which is used for accessing
+ * CLOCK_MONOTONIC/REALTIME/BOOTTIME/TAI, while cacheline 2 contains the
+ * data for accessing CLOCK_MONOTONIC_RAW.  Cacheline 3,4 are internal
+ * variables which are only accessed during timekeeper updates once per
+ * tick.
  */
 struct timekeeper {
+	/* Cacheline 0 (together with prepended seqcount of timekeeper core): */
 	struct tk_read_base	tkr_mono;
-	struct tk_read_base	tkr_raw;
+
+	/* Cacheline 1: */
 	u64			xtime_sec;
 	unsigned long		ktime_sec;
 	struct timespec64	wall_to_monotonic;
@@ -99,31 +125,29 @@ struct timekeeper {
 	ktime_t			offs_boot;
 	ktime_t			offs_tai;
 	s32			tai_offset;
+
+	/* Cacheline 2: */
+	struct tk_read_base	tkr_raw;
+	u64			raw_sec;
+
+	/* Cachline 3 and 4 (timekeeping internal variables): */
 	unsigned int		clock_was_set_seq;
 	u8			cs_was_changed_seq;
-	ktime_t			next_leap_ktime;
-	u64			raw_sec;
+
 	struct timespec64	monotonic_to_boot;
 
-	/* The following members are for timekeeping internal use */
 	u64			cycle_interval;
 	u64			xtime_interval;
 	s64			xtime_remainder;
 	u64			raw_interval;
-	/* The ntp_tick_length() value currently being used.
-	 * This cached copy ensures we consistently apply the tick
-	 * length for an entire tick, as ntp_tick_length may change
-	 * mid-tick, and we don't want to apply that new value to
-	 * the tick in progress.
-	 */
+
+	ktime_t			next_leap_ktime;
 	u64			ntp_tick;
-	/* Difference between accumulated time and NTP time in ntp
-	 * shifted nano seconds. */
 	s64			ntp_error;
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
-	/* Flag used to avoid updating NTP twice with same second */
 	u32			skip_second_overflow;
+
 #ifdef CONFIG_DEBUG_TIMEKEEPING
 	long			last_warning;
 	/*
-- 
cgit v1.2.3


From 92b043fd995a63a57aae29ff85a39b6f30cd440c Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Fri, 25 Oct 2024 13:01:41 +0200
Subject: time: Fix references to _msecs_to_jiffies() handling of values

The details about the handling of the "normal" values were moved
to the _msecs_to_jiffies() helpers in commit ca42aaf0c861 ("time:
Refactor msecs_to_jiffies"). However, the same commit still mentioned
__msecs_to_jiffies() in the added documentation.

Thus point to _msecs_to_jiffies() instead.

Fixes: ca42aaf0c861 ("time: Refactor msecs_to_jiffies")
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20241025110141.157205-2-ojeda@kernel.org
---
 include/linux/jiffies.h | 2 +-
 kernel/time/time.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 1220f0fbe5bf..5d21dacd62bc 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -502,7 +502,7 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m)
  * - all other values are converted to jiffies by either multiplying
  *   the input value by a factor or dividing it with a factor and
  *   handling any 32-bit overflows.
- *   for the details see __msecs_to_jiffies()
+ *   for the details see _msecs_to_jiffies()
  *
  * msecs_to_jiffies() checks for the passed in value being a constant
  * via __builtin_constant_p() allowing gcc to eliminate most of the
diff --git a/kernel/time/time.c b/kernel/time/time.c
index b1809a1b0827..1b69caa87480 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -556,7 +556,7 @@ EXPORT_SYMBOL(ns_to_timespec64);
  * - all other values are converted to jiffies by either multiplying
  *   the input value by a factor or dividing it with a factor and
  *   handling any 32-bit overflows.
- *   for the details see __msecs_to_jiffies()
+ *   for the details see _msecs_to_jiffies()
  *
  * msecs_to_jiffies() checks for the passed in value being a constant
  * via __builtin_constant_p() allowing gcc to eliminate most of the
-- 
cgit v1.2.3


From 8acdd0e7bfadda6b5103f2960d293581954454ed Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 25 Oct 2024 08:37:18 +0800
Subject: blk-mq: add non_owner variant of start_freeze/unfreeze queue APIs

Add non_owner variant of start_freeze/unfreeze queue APIs, so that the
caller knows that what they are doing, and we can skip lockdep support
for non_owner variant in per-call level.

Prepare for supporting lockdep for freezing/unfreezing queue.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20241025003722.3630252-2-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 20 ++++++++++++++++++++
 include/linux/blk-mq.h |  2 ++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index aae9a112c913..770276815507 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -196,6 +196,26 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
+/*
+ * non_owner variant of blk_freeze_queue_start
+ *
+ * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen
+ * by the same task.  This is fragile and should not be used if at all
+ * possible.
+ */
+void blk_freeze_queue_start_non_owner(struct request_queue *q)
+{
+	blk_freeze_queue_start(q);
+}
+EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
+
+/* non_owner variant of blk_mq_unfreeze_queue */
+void blk_mq_unfreeze_queue_non_owner(struct request_queue *q)
+{
+	__blk_mq_unfreeze_queue(q, false);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner);
+
 /*
  * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
  * mpt3sas driver such that this function can be removed.
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 59e9adf815a4..2035fad3131f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -919,6 +919,8 @@ void blk_freeze_queue_start(struct request_queue *q);
 void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 				     unsigned long timeout);
+void blk_mq_unfreeze_queue_non_owner(struct request_queue *q);
+void blk_freeze_queue_start_non_owner(struct request_queue *q);
 
 void blk_mq_map_queues(struct blk_mq_queue_map *qmap);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
-- 
cgit v1.2.3


From f1be1788a32e8fa63416ad4518bbd1a85a825c9d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 25 Oct 2024 08:37:20 +0800
Subject: block: model freeze & enter queue as lock for supporting lockdep

Recently we got several deadlock report[1][2][3] caused by
blk_mq_freeze_queue and blk_enter_queue().

Turns out the two are just like acquiring read/write lock, so model them
as read/write lock for supporting lockdep:

1) model q->q_usage_counter as two locks(io and queue lock)

- queue lock covers sync with blk_enter_queue()

- io lock covers sync with bio_enter_queue()

2) make the lockdep class/key as per-queue:

- different subsystem has very different lock use pattern, shared lock
 class causes false positive easily

- freeze_queue degrades to no lock in case that disk state becomes DEAD
  because bio_enter_queue() won't be blocked any more

- freeze_queue degrades to no lock in case that request queue becomes dying
  because blk_enter_queue() won't be blocked any more

3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock
- it is exclusive lock, so dependency with blk_enter_queue() is covered

- it is trylock because blk_mq_freeze_queue() are allowed to run
  concurrently

4) model blk_enter_queue() & bio_enter_queue() as acquire_read()
- nested blk_enter_queue() are allowed

- dependency with blk_mq_freeze_queue() is covered

- blk_queue_exit() is often called from other contexts(such as irq), and
it can't be annotated as lock_release(), so simply do it in
blk_enter_queue(), this way still covered cases as many as possible

With lockdep support, such kind of reports may be reported asap and
needn't wait until the real deadlock is triggered.

For example, lockdep report can be triggered in the report[3] with this
patch applied.

[1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler'
https://bugzilla.kernel.org/show_bug.cgi?id=219166

[2] del_gendisk() vs blk_queue_enter() race condition
https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/

[3] queue_freeze & queue_enter deadlock in scsi
https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20241025003722.3630252-4-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 18 ++++++++++++++++--
 block/blk-mq.c         | 26 ++++++++++++++++++++++----
 block/blk.h            | 29 ++++++++++++++++++++++++++---
 block/genhd.c          | 15 +++++++++++----
 include/linux/blkdev.h |  6 ++++++
 5 files changed, 81 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index bc5e8c5eaac9..09d10bb95fda 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -261,6 +261,8 @@ static void blk_free_queue(struct request_queue *q)
 		blk_mq_release(q);
 
 	ida_free(&blk_queue_ida, q->id);
+	lockdep_unregister_key(&q->io_lock_cls_key);
+	lockdep_unregister_key(&q->q_lock_cls_key);
 	call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
 
@@ -278,18 +280,20 @@ void blk_put_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_put_queue);
 
-void blk_queue_start_drain(struct request_queue *q)
+bool blk_queue_start_drain(struct request_queue *q)
 {
 	/*
 	 * When queue DYING flag is set, we need to block new req
 	 * entering queue, so we call blk_freeze_queue_start() to
 	 * prevent I/O from crossing blk_queue_enter().
 	 */
-	blk_freeze_queue_start(q);
+	bool freeze = __blk_freeze_queue_start(q);
 	if (queue_is_mq(q))
 		blk_mq_wake_waiters(q);
 	/* Make blk_queue_enter() reexamine the DYING flag. */
 	wake_up_all(&q->mq_freeze_wq);
+
+	return freeze;
 }
 
 /**
@@ -321,6 +325,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 			return -ENODEV;
 	}
 
+	rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_);
+	rwsem_release(&q->q_lockdep_map, _RET_IP_);
 	return 0;
 }
 
@@ -352,6 +358,8 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio)
 			goto dead;
 	}
 
+	rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+	rwsem_release(&q->io_lockdep_map, _RET_IP_);
 	return 0;
 dead:
 	bio_io_error(bio);
@@ -441,6 +449,12 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
 				PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
 	if (error)
 		goto fail_stats;
+	lockdep_register_key(&q->io_lock_cls_key);
+	lockdep_register_key(&q->q_lock_cls_key);
+	lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)",
+			 &q->io_lock_cls_key, 0);
+	lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)",
+			 &q->q_lock_cls_key, 0);
 
 	q->nr_requests = BLKDEV_DEFAULT_RQ;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 770276815507..4ae7eb335fbd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -120,17 +120,29 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
 	inflight[1] = mi.inflight[1];
 }
 
-void blk_freeze_queue_start(struct request_queue *q)
+bool __blk_freeze_queue_start(struct request_queue *q)
 {
+	int freeze;
+
 	mutex_lock(&q->mq_freeze_lock);
 	if (++q->mq_freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
 		mutex_unlock(&q->mq_freeze_lock);
 		if (queue_is_mq(q))
 			blk_mq_run_hw_queues(q, false);
+		freeze = true;
 	} else {
 		mutex_unlock(&q->mq_freeze_lock);
+		freeze = false;
 	}
+
+	return freeze;
+}
+
+void blk_freeze_queue_start(struct request_queue *q)
+{
+	if (__blk_freeze_queue_start(q))
+		blk_freeze_acquire_lock(q, false, false);
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 
@@ -176,8 +188,10 @@ void blk_mq_freeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
+bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
 {
+	int unfreeze = false;
+
 	mutex_lock(&q->mq_freeze_lock);
 	if (force_atomic)
 		q->q_usage_counter.data->force_atomic = true;
@@ -186,13 +200,17 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
 	if (!q->mq_freeze_depth) {
 		percpu_ref_resurrect(&q->q_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
+		unfreeze = true;
 	}
 	mutex_unlock(&q->mq_freeze_lock);
+
+	return unfreeze;
 }
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	__blk_mq_unfreeze_queue(q, false);
+	if (__blk_mq_unfreeze_queue(q, false))
+		blk_unfreeze_release_lock(q, false, false);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
@@ -205,7 +223,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
  */
 void blk_freeze_queue_start_non_owner(struct request_queue *q)
 {
-	blk_freeze_queue_start(q);
+	__blk_freeze_queue_start(q);
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
 
diff --git a/block/blk.h b/block/blk.h
index 8fddaf6eae49..63d5df0dc29c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -4,6 +4,7 @@
 
 #include <linux/bio-integrity.h>
 #include <linux/blk-crypto.h>
+#include <linux/lockdep.h>
 #include <linux/memblock.h>	/* for max_pfn/max_low_pfn */
 #include <linux/sched/sysctl.h>
 #include <linux/timekeeping.h>
@@ -35,8 +36,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
 void blk_free_flush_queue(struct blk_flush_queue *q);
 
 void blk_freeze_queue(struct request_queue *q);
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
-void blk_queue_start_drain(struct request_queue *q);
+bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
+bool blk_queue_start_drain(struct request_queue *q);
+bool __blk_freeze_queue_start(struct request_queue *q);
 int __bio_queue_enter(struct request_queue *q, struct bio *bio);
 void submit_bio_noacct_nocheck(struct bio *bio);
 void bio_await_chain(struct bio *bio);
@@ -69,8 +71,11 @@ static inline int bio_queue_enter(struct bio *bio)
 {
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
-	if (blk_try_enter_queue(q, false))
+	if (blk_try_enter_queue(q, false)) {
+		rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+		rwsem_release(&q->io_lockdep_map, _RET_IP_);
 		return 0;
+	}
 	return __bio_queue_enter(q, bio);
 }
 
@@ -724,4 +729,22 @@ void blk_integrity_verify(struct bio *bio);
 void blk_integrity_prepare(struct request *rq);
 void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);
 
+static inline void blk_freeze_acquire_lock(struct request_queue *q, bool
+		disk_dead, bool queue_dying)
+{
+	if (!disk_dead)
+		rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_);
+	if (!queue_dying)
+		rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_);
+}
+
+static inline void blk_unfreeze_release_lock(struct request_queue *q, bool
+		disk_dead, bool queue_dying)
+{
+	if (!queue_dying)
+		rwsem_release(&q->q_lockdep_map, _RET_IP_);
+	if (!disk_dead)
+		rwsem_release(&q->io_lockdep_map, _RET_IP_);
+}
+
 #endif /* BLK_INTERNAL_H */
diff --git a/block/genhd.c b/block/genhd.c
index bc30eee7ab16..dfee66146bd1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -601,13 +601,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
 	rcu_read_unlock();
 }
 
-static void __blk_mark_disk_dead(struct gendisk *disk)
+static bool __blk_mark_disk_dead(struct gendisk *disk)
 {
 	/*
 	 * Fail any new I/O.
 	 */
 	if (test_and_set_bit(GD_DEAD, &disk->state))
-		return;
+		return false;
 
 	if (test_bit(GD_OWNS_QUEUE, &disk->state))
 		blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue);
@@ -620,7 +620,7 @@ static void __blk_mark_disk_dead(struct gendisk *disk)
 	/*
 	 * Prevent new I/O from crossing bio_queue_enter().
 	 */
-	blk_queue_start_drain(disk->queue);
+	return blk_queue_start_drain(disk->queue);
 }
 
 /**
@@ -661,6 +661,7 @@ void del_gendisk(struct gendisk *disk)
 	struct request_queue *q = disk->queue;
 	struct block_device *part;
 	unsigned long idx;
+	bool start_drain, queue_dying;
 
 	might_sleep();
 
@@ -688,7 +689,10 @@ void del_gendisk(struct gendisk *disk)
 	 * Drop all partitions now that the disk is marked dead.
 	 */
 	mutex_lock(&disk->open_mutex);
-	__blk_mark_disk_dead(disk);
+	start_drain = __blk_mark_disk_dead(disk);
+	queue_dying = blk_queue_dying(q);
+	if (start_drain)
+		blk_freeze_acquire_lock(q, true, queue_dying);
 	xa_for_each_start(&disk->part_tbl, idx, part, 1)
 		drop_partition(part);
 	mutex_unlock(&disk->open_mutex);
@@ -745,6 +749,9 @@ void del_gendisk(struct gendisk *disk)
 		if (queue_is_mq(q))
 			blk_mq_exit_queue(q);
 	}
+
+	if (start_drain)
+		blk_unfreeze_release_lock(q, true, queue_dying);
 }
 EXPORT_SYMBOL(del_gendisk);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 55bec14fe55f..d0a52ed05e60 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -25,6 +25,7 @@
 #include <linux/uuid.h>
 #include <linux/xarray.h>
 #include <linux/file.h>
+#include <linux/lockdep.h>
 
 struct module;
 struct request_queue;
@@ -474,6 +475,11 @@ struct request_queue {
 	struct xarray		hctx_table;
 
 	struct percpu_ref	q_usage_counter;
+	struct lock_class_key	io_lock_cls_key;
+	struct lockdep_map	io_lockdep_map;
+
+	struct lock_class_key	q_lock_cls_key;
+	struct lockdep_map	q_lockdep_map;
 
 	struct request		*last_merge;
 
-- 
cgit v1.2.3


From 30dac24e14b52e1787572d1d4e06eeabe8a63630 Mon Sep 17 00:00:00 2001
From: Pankaj Raghav <p.raghav@samsung.com>
Date: Thu, 26 Sep 2024 16:01:21 +0200
Subject: fs/writeback: convert wbc_account_cgroup_owner to take a folio

Most of the callers of wbc_account_cgroup_owner() are converting a folio
to page before calling the function. wbc_account_cgroup_owner() is
converting the page back to a folio to call mem_cgroup_css_from_folio().

Convert wbc_account_cgroup_owner() to take a folio instead of a page,
and convert all callers to pass a folio directly except f2fs.

Convert the page to folio for all the callers from f2fs as they were the
only callers calling wbc_account_cgroup_owner() with a page. As f2fs is
already in the process of converting to folios, these call sites might
also soon be calling wbc_account_cgroup_owner() with a folio directly in
the future.

No functional changes. Only compile tested.

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Link: https://lore.kernel.org/r/20240926140121.203821-1-kernel@pankajraghav.com
Acked-by: David Sterba <dsterba@suse.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/admin-guide/cgroup-v2.rst | 2 +-
 fs/btrfs/extent_io.c                    | 7 +++----
 fs/btrfs/inode.c                        | 2 +-
 fs/buffer.c                             | 4 ++--
 fs/ext4/page-io.c                       | 2 +-
 fs/f2fs/data.c                          | 9 ++++++---
 fs/fs-writeback.c                       | 8 +++-----
 fs/iomap/buffered-io.c                  | 2 +-
 fs/mpage.c                              | 2 +-
 include/linux/writeback.h               | 4 ++--
 10 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 69af2173555f..064012ea6f36 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2945,7 +2945,7 @@ following two functions.
 	a queue (device) has been associated with the bio and
 	before submission.
 
-  wbc_account_cgroup_owner(@wbc, @page, @bytes)
+  wbc_account_cgroup_owner(@wbc, @folio, @bytes)
 	Should be called for each data segment being written out.
 	While this function doesn't care exactly when it's called
 	during the writeback session, it's the easiest and most
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 39c9677c47d5..4667d1e034e0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -785,7 +785,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
 		}
 
 		if (bio_ctrl->wbc)
-			wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page,
+			wbc_account_cgroup_owner(bio_ctrl->wbc, folio,
 						 len);
 
 		size -= len;
@@ -1707,7 +1707,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
 		ret = bio_add_folio(&bbio->bio, folio, eb->len,
 				    eb->start - folio_pos(folio));
 		ASSERT(ret);
-		wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
+		wbc_account_cgroup_owner(wbc, folio, eb->len);
 		folio_unlock(folio);
 	} else {
 		int num_folios = num_extent_folios(eb);
@@ -1721,8 +1721,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
 			folio_start_writeback(folio);
 			ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
 			ASSERT(ret);
-			wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
-						 eb->folio_size);
+			wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
 			wbc->nr_to_write -= folio_nr_pages(folio);
 			folio_unlock(folio);
 		}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index edac499fd83d..eb64f04755c2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1729,7 +1729,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
 			 * need full accuracy.  Just account the whole thing
 			 * against the first page.
 			 */
-			wbc_account_cgroup_owner(wbc, &locked_folio->page,
+			wbc_account_cgroup_owner(wbc, locked_folio,
 						 cur_end - start);
 			async_chunk[i].locked_folio = locked_folio;
 			locked_folio = NULL;
diff --git a/fs/buffer.c b/fs/buffer.c
index 1fc9a50def0b..32bd0f4c4223 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2803,7 +2803,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_write_hint = write_hint;
 
-	__bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+	bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
 
 	bio->bi_end_io = end_bio_bh_io_sync;
 	bio->bi_private = bh;
@@ -2813,7 +2813,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
 
 	if (wbc) {
 		wbc_init_bio(wbc, bio);
-		wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
+		wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
 	}
 
 	submit_bio(bio);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index ad5543866d21..b7b9261fec3b 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -421,7 +421,7 @@ submit_and_retry:
 		io_submit_init_bio(io, bh);
 	if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
 		goto submit_and_retry;
-	wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
+	wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
 	io->io_next_block++;
 }
 
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 94f7b084f601..e3ce763cce18 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 	}
 
 	if (fio->io_wbc && !is_read_io(fio->op))
-		wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+		wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+					 PAGE_SIZE);
 
 	inc_page_count(fio->sbi, is_read_io(fio->op) ?
 			__read_io_type(page) : WB_DATA_TYPE(fio->page, false));
@@ -911,7 +912,8 @@ alloc_new:
 	}
 
 	if (fio->io_wbc)
-		wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+		wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+					 PAGE_SIZE);
 
 	inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
 
@@ -1011,7 +1013,8 @@ alloc_new:
 	}
 
 	if (fio->io_wbc)
-		wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+		wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+					 PAGE_SIZE);
 
 	io->last_block_in_bio = fio->new_blkaddr;
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d8bec3c1bb1f..2391b09f4ced 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -890,17 +890,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode);
 /**
  * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
  * @wbc: writeback_control of the writeback in progress
- * @page: page being written out
+ * @folio: folio being written out
  * @bytes: number of bytes being written out
  *
- * @bytes from @page are about to written out during the writeback
+ * @bytes from @folio are about to written out during the writeback
  * controlled by @wbc.  Keep the book for foreign inode detection.  See
  * wbc_detach_inode().
  */
-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
 			      size_t bytes)
 {
-	struct folio *folio;
 	struct cgroup_subsys_state *css;
 	int id;
 
@@ -913,7 +912,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 	if (!wbc->wb || wbc->no_cgroup_owner)
 		return;
 
-	folio = page_folio(page);
 	css = mem_cgroup_css_from_folio(folio);
 	/* dead cgroups shouldn't contribute to inode ownership arbitration */
 	if (!(css->flags & CSS_ONLINE))
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 11ea747228ae..3d1fae7d3a64 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1833,7 +1833,7 @@ new_ioend:
 	if (ifs)
 		atomic_add(len, &ifs->write_bytes_pending);
 	wpc->ioend->io_size += len;
-	wbc_account_cgroup_owner(wbc, &folio->page, len);
+	wbc_account_cgroup_owner(wbc, folio, len);
 	return 0;
 }
 
diff --git a/fs/mpage.c b/fs/mpage.c
index b5b5ddf9d513..82aecf372743 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -606,7 +606,7 @@ alloc_new:
 	 * the confused fail path above (OOM) will be very confused when
 	 * it finds all bh marked clean (i.e. it will not write anything)
 	 */
-	wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
+	wbc_account_cgroup_owner(wbc, folio, folio_size(folio));
 	length = first_unmapped << blkbits;
 	if (!bio_add_folio(bio, folio, length, 0)) {
 		bio = mpage_bio_submit_write(bio);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d6db822e4bb3..641a057e0413 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -217,7 +217,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 				 struct inode *inode)
 	__releases(&inode->i_lock);
 void wbc_detach_inode(struct writeback_control *wbc);
-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
 			      size_t bytes);
 int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
 			   enum wb_reason reason, struct wb_completion *done);
@@ -324,7 +324,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
 }
 
 static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
-					    struct page *page, size_t bytes)
+					    struct folio *folio, size_t bytes)
 {
 }
 
-- 
cgit v1.2.3


From 0e152beb5aa1ccdac9aae9fa570a9e039aff7a03 Mon Sep 17 00:00:00 2001
From: André Almeida <andrealmeid@igalia.com>
Date: Mon, 21 Oct 2024 13:37:17 -0300
Subject: libfs: Create the helper function generic_ci_validate_strict_name()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create a helper function for filesystems do the checks required for
casefold directories and strict encoding.

Suggested-by: Gabriel Krisman Bertazi <krisman@suse.de>
Reviewed-by: Gabriel Krisman Bertazi <gabriel@krisman.be>
Signed-off-by: André Almeida <andrealmeid@igalia.com>
Link: https://lore.kernel.org/r/20241021-tonyk-tmpfs-v8-1-f443d5814194@igalia.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3559446279c1..403ee5d54c60 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -45,6 +45,7 @@
 #include <linux/slab.h>
 #include <linux/maple_tree.h>
 #include <linux/rw_hint.h>
+#include <linux/unicode.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -3456,6 +3457,50 @@ extern int generic_ci_match(const struct inode *parent,
 			    const struct qstr *folded_name,
 			    const u8 *de_name, u32 de_name_len);
 
+#if IS_ENABLED(CONFIG_UNICODE)
+/**
+ * generic_ci_validate_strict_name - Check if a given name is suitable
+ * for a directory
+ *
+ * This functions checks if the proposed filename is valid for the
+ * parent directory. That means that only valid UTF-8 filenames will be
+ * accepted for casefold directories from filesystems created with the
+ * strict encoding flag.  That also means that any name will be
+ * accepted for directories that doesn't have casefold enabled, or
+ * aren't being strict with the encoding.
+ *
+ * @dir: inode of the directory where the new file will be created
+ * @name: name of the new file
+ *
+ * Return:
+ * * True if the filename is suitable for this directory. It can be
+ * true if a given name is not suitable for a strict encoding
+ * directory, but the directory being used isn't strict
+ * * False if the filename isn't suitable for this directory. This only
+ * happens when a directory is casefolded and the filesystem is strict
+ * about its encoding.
+ */
+static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
+{
+	if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb))
+		return true;
+
+	/*
+	 * A casefold dir must have a encoding set, unless the filesystem
+	 * is corrupted
+	 */
+	if (WARN_ON_ONCE(!dir->i_sb->s_encoding))
+		return true;
+
+	return !utf8_validate(dir->i_sb->s_encoding, name);
+}
+#else
+static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
+{
+	return true;
+}
+#endif
+
 static inline bool sb_has_encoding(const struct super_block *sb)
 {
 #if IS_ENABLED(CONFIG_UNICODE)
-- 
cgit v1.2.3


From 04dad6c6d37d741bad9946a92171bfa637e989f0 Mon Sep 17 00:00:00 2001
From: André Almeida <andrealmeid@igalia.com>
Date: Mon, 21 Oct 2024 13:37:19 -0300
Subject: unicode: Export latest available UTF-8 version number
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export latest available UTF-8 version number so filesystems can easily
load the newest one.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
Link: https://lore.kernel.org/r/20241021-tonyk-tmpfs-v8-3-f443d5814194@igalia.com
Acked-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/unicode/utf8-selftest.c | 3 ---
 include/linux/unicode.h    | 2 ++
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
index 600e15efe9ed..5ddaf27b21a6 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/utf8-selftest.c
@@ -17,9 +17,6 @@
 static unsigned int failed_tests;
 static unsigned int total_tests;
 
-/* Tests will be based on this version. */
-#define UTF8_LATEST	UNICODE_AGE(12, 1, 0)
-
 #define _test(cond, func, line, fmt, ...) do {				\
 		total_tests++;						\
 		if (!cond) {						\
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index 4d39e6e11a95..0c0ab04e84ee 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -16,6 +16,8 @@ struct utf8data_table;
 	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
 	 ((unsigned int)(REV)))
 
+#define UTF8_LATEST        UNICODE_AGE(12, 1, 0)
+
 static inline u8 unicode_major(unsigned int age)
 {
 	return (age >> UNICODE_MAJ_SHIFT) & 0xff;
-- 
cgit v1.2.3


From 142fa60f61f93805471012f24e029af6d113c5cc Mon Sep 17 00:00:00 2001
From: André Almeida <andrealmeid@igalia.com>
Date: Mon, 21 Oct 2024 13:37:20 -0300
Subject: unicode: Recreate utf8_parse_version()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All filesystems that currently support UTF-8 casefold can fetch the
UTF-8 version from the filesystem metadata stored on disk. They can get
the data stored and directly match it to a integer, so they can skip the
string parsing step, which motivated the removal of this function in the
first place.

However, for tmpfs, the only way to tell the kernel which UTF-8 version
we are about to use is via mount options, using a string. Re-introduce
utf8_parse_version() to be used by tmpfs.

This version differs from the original by skipping the intermediate step
of copying the version string to an auxiliary string before calling
match_token(). This versions calls match_token() in the argument string.
The paramenters are simpler now as well.

utf8_parse_version() was created by 9d53690f0d4 ("unicode: implement
higher level API for string handling") and later removed by 49bd03cc7e9
("unicode: pass a UNICODE_AGE() tripple to utf8_load").

Signed-off-by: André Almeida <andrealmeid@igalia.com>
Link: https://lore.kernel.org/r/20241021-tonyk-tmpfs-v8-4-f443d5814194@igalia.com
Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/unicode/utf8-core.c  | 26 ++++++++++++++++++++++++++
 include/linux/unicode.h |  2 ++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index 8395066341a4..7f7cb14e01ce 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -214,3 +214,29 @@ void utf8_unload(struct unicode_map *um)
 }
 EXPORT_SYMBOL(utf8_unload);
 
+/**
+ * utf8_parse_version - Parse a UTF-8 version number from a string
+ *
+ * @version: input string
+ *
+ * Returns the parsed version on success, negative code on error
+ */
+int utf8_parse_version(char *version)
+{
+	substring_t args[3];
+	unsigned int maj, min, rev;
+	static const struct match_token token[] = {
+		{1, "%d.%d.%d"},
+		{0, NULL}
+	};
+
+	if (match_token(version, token, args) != 1)
+		return -EINVAL;
+
+	if (match_int(&args[0], &maj) || match_int(&args[1], &min) ||
+	    match_int(&args[2], &rev))
+		return -EINVAL;
+
+	return UNICODE_AGE(maj, min, rev);
+}
+EXPORT_SYMBOL(utf8_parse_version);
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index 0c0ab04e84ee..5e6b212a2aed 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -78,4 +78,6 @@ int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
 struct unicode_map *utf8_load(unsigned int version);
 void utf8_unload(struct unicode_map *um);
 
+int utf8_parse_version(char *version);
+
 #endif /* _LINUX_UNICODE_H */
-- 
cgit v1.2.3


From 458532c8dfeb24edd5e07467605a6484a728e5c2 Mon Sep 17 00:00:00 2001
From: André Almeida <andrealmeid@igalia.com>
Date: Mon, 21 Oct 2024 13:37:21 -0300
Subject: libfs: Export generic_ci_ dentry functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export generic_ci_ dentry functions so they can be used by
case-insensitive filesystems that need something more custom than the
default one set by `struct generic_ci_dentry_ops`.

Reviewed-by: Gabriel Krisman Bertazi <gabriel@krisman.be>
Signed-off-by: André Almeida <andrealmeid@igalia.com>
Link: https://lore.kernel.org/r/20241021-tonyk-tmpfs-v8-5-f443d5814194@igalia.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/libfs.c         | 8 +++++---
 include/linux/fs.h | 4 ++++
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/libfs.c b/fs/libfs.c
index 46966fd8bcf9..7b290404c5f9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1791,8 +1791,8 @@ bool is_empty_dir_inode(struct inode *inode)
  *
  * Return: 0 if names match, 1 if mismatch, or -ERRNO
  */
-static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
-				const char *str, const struct qstr *name)
+int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+			 const char *str, const struct qstr *name)
 {
 	const struct dentry *parent;
 	const struct inode *dir;
@@ -1835,6 +1835,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
 
 	return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
 }
+EXPORT_SYMBOL(generic_ci_d_compare);
 
 /**
  * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
@@ -1843,7 +1844,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
  *
  * Return: 0 if hash was successful or unchanged, and -EINVAL on error
  */
-static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
+int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
 {
 	const struct inode *dir = READ_ONCE(dentry->d_inode);
 	struct super_block *sb = dentry->d_sb;
@@ -1858,6 +1859,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
 		return -EINVAL;
 	return 0;
 }
+EXPORT_SYMBOL(generic_ci_d_hash);
 
 static const struct dentry_operations generic_ci_dentry_ops = {
 	.d_hash = generic_ci_d_hash,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 403ee5d54c60..b277369672a1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3458,6 +3458,10 @@ extern int generic_ci_match(const struct inode *parent,
 			    const u8 *de_name, u32 de_name_len);
 
 #if IS_ENABLED(CONFIG_UNICODE)
+int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
+int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+			 const char *str, const struct qstr *name);
+
 /**
  * generic_ci_validate_strict_name - Check if a given name is suitable
  * for a directory
-- 
cgit v1.2.3


From 5cd9aecbc72c07e8b83e900078669a7d0bc5f98f Mon Sep 17 00:00:00 2001
From: André Almeida <andrealmeid@igalia.com>
Date: Mon, 21 Oct 2024 13:37:23 -0300
Subject: tmpfs: Add flag FS_CASEFOLD_FL support for tmpfs dirs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enable setting flag FS_CASEFOLD_FL for tmpfs directories, when tmpfs is
mounted with casefold support. A special check is need for this flag,
since it can't be set for non-empty directories.

Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Reviewed-by: Gabriel Krisman Bertazi <gabriel@krisman.be>
Signed-off-by: André Almeida <andrealmeid@igalia.com>
Link: https://lore.kernel.org/r/20241021-tonyk-tmpfs-v8-7-f443d5814194@igalia.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/shmem_fs.h |  6 ++---
 mm/shmem.c               | 70 +++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 67 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 515a9a6a3c6f..018da28c01e7 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -42,10 +42,10 @@ struct shmem_inode_info {
 	struct inode		vfs_inode;
 };
 
-#define SHMEM_FL_USER_VISIBLE		FS_FL_USER_VISIBLE
+#define SHMEM_FL_USER_VISIBLE		(FS_FL_USER_VISIBLE | FS_CASEFOLD_FL)
 #define SHMEM_FL_USER_MODIFIABLE \
-	(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL)
-#define SHMEM_FL_INHERITED		(FS_NODUMP_FL | FS_NOATIME_FL)
+	(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL)
+#define SHMEM_FL_INHERITED		(FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL)
 
 struct shmem_quota_limits {
 	qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */
diff --git a/mm/shmem.c b/mm/shmem.c
index f26488ff3d6a..ea01628e4434 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2757,13 +2757,62 @@ static int shmem_file_open(struct inode *inode, struct file *file)
 #ifdef CONFIG_TMPFS_XATTR
 static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
 
+#if IS_ENABLED(CONFIG_UNICODE)
+/*
+ * shmem_inode_casefold_flags - Deal with casefold file attribute flag
+ *
+ * The casefold file attribute needs some special checks. I can just be added to
+ * an empty dir, and can't be removed from a non-empty dir.
+ */
+static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
+				      struct dentry *dentry, unsigned int *i_flags)
+{
+	unsigned int old = inode->i_flags;
+	struct super_block *sb = inode->i_sb;
+
+	if (fsflags & FS_CASEFOLD_FL) {
+		if (!(old & S_CASEFOLD)) {
+			if (!sb->s_encoding)
+				return -EOPNOTSUPP;
+
+			if (!S_ISDIR(inode->i_mode))
+				return -ENOTDIR;
+
+			if (dentry && !simple_empty(dentry))
+				return -ENOTEMPTY;
+		}
+
+		*i_flags = *i_flags | S_CASEFOLD;
+	} else if (old & S_CASEFOLD) {
+		if (dentry && !simple_empty(dentry))
+			return -ENOTEMPTY;
+	}
+
+	return 0;
+}
+#else
+static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
+				      struct dentry *dentry, unsigned int *i_flags)
+{
+	if (fsflags & FS_CASEFOLD_FL)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+#endif
+
 /*
  * chattr's fsflags are unrelated to extended attributes,
  * but tmpfs has chosen to enable them under the same config option.
  */
-static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
 {
 	unsigned int i_flags = 0;
+	int ret;
+
+	ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags);
+	if (ret)
+		return ret;
 
 	if (fsflags & FS_NOATIME_FL)
 		i_flags |= S_NOATIME;
@@ -2774,10 +2823,12 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
 	/*
 	 * But FS_NODUMP_FL does not require any action in i_flags.
 	 */
-	inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
+	inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD);
+
+	return 0;
 }
 #else
-static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
 {
 }
 #define shmem_initxattrs NULL
@@ -2824,7 +2875,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
 	info->fsflags = (dir == NULL) ? 0 :
 		SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
 	if (info->fsflags)
-		shmem_set_inode_flags(inode, info->fsflags);
+		shmem_set_inode_flags(inode, info->fsflags, NULL);
 	INIT_LIST_HEAD(&info->shrinklist);
 	INIT_LIST_HEAD(&info->swaplist);
 	simple_xattrs_init(&info->xattrs);
@@ -3931,16 +3982,23 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap,
 {
 	struct inode *inode = d_inode(dentry);
 	struct shmem_inode_info *info = SHMEM_I(inode);
+	int ret, flags;
 
 	if (fileattr_has_fsx(fa))
 		return -EOPNOTSUPP;
 	if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
 		return -EOPNOTSUPP;
 
-	info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
+	flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
 		(fa->flags & SHMEM_FL_USER_MODIFIABLE);
 
-	shmem_set_inode_flags(inode, info->fsflags);
+	ret = shmem_set_inode_flags(inode, flags, dentry);
+
+	if (ret)
+		return ret;
+
+	info->fsflags = flags;
+
 	inode_set_ctime_current(inode);
 	inode_inc_iversion(inode);
 	return 0;
-- 
cgit v1.2.3


From a2ad1b8101a36c927bcbd1317475b9576c352155 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 25 Oct 2024 06:11:51 -0700
Subject: mm/gup: Add folio_add_pins()

Export a function that adds pins to an already-pinned huge-page folio.
This allows any range of small pages within the folio to be unpinned later.
For example, pages pinned via memfd_pin_folios and modified by
folio_add_pins could be unpinned via unpin_user_page(s).

Link: https://patch.msgid.link/r/1729861919-234514-2-git-send-email-steven.sistare@oracle.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/mm.h |  1 +
 mm/gup.c           | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecf63d2b0582..f9de33ac2add 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2524,6 +2524,7 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
 		      struct folio **folios, unsigned int max_folios,
 		      pgoff_t *offset);
+int folio_add_pins(struct folio *folio, unsigned int pins);
 
 int get_user_pages_fast(unsigned long start, int nr_pages,
 			unsigned int gup_flags, struct page **pages);
diff --git a/mm/gup.c b/mm/gup.c
index a82890b46a36..4ac3e567e08e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -3717,3 +3717,27 @@ err:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(memfd_pin_folios);
+
+/**
+ * folio_add_pins() - add pins to an already-pinned folio
+ * @folio: the folio to add more pins to
+ * @pins: number of pins to add
+ *
+ * Try to add more pins to an already-pinned folio. The semantics
+ * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot
+ * be changed.
+ *
+ * This function is helpful when having obtained a pin on a large folio
+ * using memfd_pin_folios(), but wanting to logically unpin parts
+ * (e.g., individual pages) of the folio later, for example, using
+ * unpin_user_page_range_dirty_lock().
+ *
+ * This is not the right interface to initially pin a folio.
+ */
+int folio_add_pins(struct folio *folio, unsigned int pins)
+{
+	VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));
+
+	return try_grab_folio(folio, pins, FOLL_PIN);
+}
+EXPORT_SYMBOL_GPL(folio_add_pins);
-- 
cgit v1.2.3


From 66418687ac895717dc2f6ddffe24cf9b74cd0d3e Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 10 Oct 2024 10:24:42 -0500
Subject: kernel/range: Const-ify range_contains parameters

range_contains() does not modify the range values.  David suggested it
is safer to keep those parameters as const.[1]

Make range parameters const

Link: https://lore.kernel.org/all/20241008161032.GB1609@twin.jikos.cz/ [1]
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Link: https://patch.msgid.link/20241010-const-range-v1-1-afb6e4bfd8ce@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 include/linux/range.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/range.h b/include/linux/range.h
index 6ad0b73cb7ad..7dc5e835e079 100644
--- a/include/linux/range.h
+++ b/include/linux/range.h
@@ -13,7 +13,8 @@ static inline u64 range_len(const struct range *range)
 	return range->end - range->start + 1;
 }
 
-static inline bool range_contains(struct range *r1, struct range *r2)
+static inline bool range_contains(const struct range *r1,
+				  const struct range *r2)
 {
 	return r1->start <= r2->start && r1->end >= r2->end;
 }
-- 
cgit v1.2.3


From 0bbff9ed81654d5f06bfca484681756ee407f924 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Wed, 2 Oct 2024 13:43:24 -0500
Subject: perf/arm_pmuv3: Add PMUv3.9 per counter EL0 access control

Armv8.9/9.4 PMUv3.9 adds per counter EL0 access controls. Per counter
access is enabled with the UEN bit in PMUSERENR_EL1 register. Individual
counters are enabled/disabled in the PMUACR_EL1 register. When UEN is
set, the CR/ER bits control EL0 write access and must be set to disable
write access.

With the access controls, the clearing of unused counters can be
skipped.

KVM also configures PMUSERENR_EL1 in order to trap to EL2. UEN does not
need to be set for it since only PMUv3.5 is exposed to guests.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://lore.kernel.org/r/20241002184326.1105499-1-robh@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm/include/asm/arm_pmuv3.h   |  6 ++++++
 arch/arm64/include/asm/arm_pmuv3.h | 10 ++++++++++
 arch/arm64/tools/sysreg            |  8 ++++++++
 drivers/perf/arm_pmuv3.c           | 29 +++++++++++++++++++----------
 include/linux/perf/arm_pmuv3.h     |  1 +
 5 files changed, 44 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h
index f63ba8986b24..d242b5e1ca0d 100644
--- a/arch/arm/include/asm/arm_pmuv3.h
+++ b/arch/arm/include/asm/arm_pmuv3.h
@@ -231,6 +231,7 @@ static inline void kvm_vcpu_pmu_resync_el0(void) {}
 #define ARMV8_PMU_DFR_VER_V3P1      0x4
 #define ARMV8_PMU_DFR_VER_V3P4      0x5
 #define ARMV8_PMU_DFR_VER_V3P5      0x6
+#define ARMV8_PMU_DFR_VER_V3P9      0x9
 #define ARMV8_PMU_DFR_VER_IMP_DEF   0xF
 
 static inline bool pmuv3_implemented(int pmuver)
@@ -249,6 +250,11 @@ static inline bool is_pmuv3p5(int pmuver)
 	return pmuver >= ARMV8_PMU_DFR_VER_V3P5;
 }
 
+static inline bool is_pmuv3p9(int pmuver)
+{
+	return pmuver >= ARMV8_PMU_DFR_VER_V3P9;
+}
+
 static inline u64 read_pmceid0(void)
 {
 	u64 val = read_sysreg(PMCEID0);
diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h
index 468a049bc63b..8a777dec8d88 100644
--- a/arch/arm64/include/asm/arm_pmuv3.h
+++ b/arch/arm64/include/asm/arm_pmuv3.h
@@ -152,6 +152,11 @@ static inline void write_pmuserenr(u32 val)
 	write_sysreg(val, pmuserenr_el0);
 }
 
+static inline void write_pmuacr(u64 val)
+{
+	write_sysreg_s(val, SYS_PMUACR_EL1);
+}
+
 static inline u64 read_pmceid0(void)
 {
 	return read_sysreg(pmceid0_el0);
@@ -178,4 +183,9 @@ static inline bool is_pmuv3p5(int pmuver)
 	return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5;
 }
 
+static inline bool is_pmuv3p9(int pmuver)
+{
+	return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P9;
+}
+
 #endif
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 8d637ac4b7c6..74fb5af91d4f 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1238,6 +1238,7 @@ UnsignedEnum	11:8	PMUVer
 	0b0110	V3P5
 	0b0111	V3P7
 	0b1000	V3P8
+	0b1001	V3P9
 	0b1111	IMP_DEF
 EndEnum
 UnsignedEnum	7:4	TraceVer
@@ -2178,6 +2179,13 @@ Field	4	P
 Field	3:0	ALIGN
 EndSysreg
 
+Sysreg	PMUACR_EL1	3	0	9	14	4
+Res0	63:33
+Field	32	F0
+Field	31	C
+Field	30:0	P
+EndSysreg
+
 Sysreg	PMSELR_EL0	3	3	9	12	5
 Res0	63:5
 Field	4:0	SEL
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 0afe02f879b4..bb93d32b86ea 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -770,18 +770,27 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
 	int i;
 	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
 
-	/* Clear any unused counters to avoid leaking their contents */
-	for_each_andnot_bit(i, cpu_pmu->cntr_mask, cpuc->used_mask,
-			    ARMPMU_MAX_HWEVENTS) {
-		if (i == ARMV8_PMU_CYCLE_IDX)
-			write_pmccntr(0);
-		else if (i == ARMV8_PMU_INSTR_IDX)
-			write_pmicntr(0);
-		else
-			armv8pmu_write_evcntr(i, 0);
+	if (is_pmuv3p9(cpu_pmu->pmuver)) {
+		u64 mask = 0;
+		for_each_set_bit(i, cpuc->used_mask, ARMPMU_MAX_HWEVENTS) {
+			if (armv8pmu_event_has_user_read(cpuc->events[i]))
+				mask |= BIT(i);
+		}
+		write_pmuacr(mask);
+	} else {
+		/* Clear any unused counters to avoid leaking their contents */
+		for_each_andnot_bit(i, cpu_pmu->cntr_mask, cpuc->used_mask,
+				    ARMPMU_MAX_HWEVENTS) {
+			if (i == ARMV8_PMU_CYCLE_IDX)
+				write_pmccntr(0);
+			else if (i == ARMV8_PMU_INSTR_IDX)
+				write_pmicntr(0);
+			else
+				armv8pmu_write_evcntr(i, 0);
+		}
 	}
 
-	update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR);
+	update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_UEN);
 }
 
 static void armv8pmu_enable_event(struct perf_event *event)
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 3372c1b56486..d698efba28a2 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -257,6 +257,7 @@
 #define ARMV8_PMU_USERENR_SW	(1 << 1) /* PMSWINC can be written at EL0 */
 #define ARMV8_PMU_USERENR_CR	(1 << 2) /* Cycle counter can be read at EL0 */
 #define ARMV8_PMU_USERENR_ER	(1 << 3) /* Event counter can be read at EL0 */
+#define ARMV8_PMU_USERENR_UEN	(1 << 4) /* Fine grained per counter access at EL0 */
 /* Mask for writable bits */
 #define ARMV8_PMU_USERENR_MASK	(ARMV8_PMU_USERENR_EN | ARMV8_PMU_USERENR_SW | \
 				 ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_ER)
-- 
cgit v1.2.3


From e1dce56443a4a18978fe39ee4af663e5b6b31422 Mon Sep 17 00:00:00 2001
From: Gowthami Thiagarajan <gthiagarajan@marvell.com>
Date: Mon, 28 Oct 2024 11:23:09 +0530
Subject: perf/marvell: Marvell PEM performance monitor support

PCI Express Interface PMU includes various performance counters
to monitor the data that is transmitted over the PCIe link. The
counters track various inbound and outbound transactions which
includes separate counters for posted/non-posted/completion TLPs.
Also, inbound and outbound memory read requests along with their
latencies can also be monitored. Address Translation Services(ATS)events
such as ATS Translation, ATS Page Request, ATS Invalidation along with
their corresponding latencies are also supported.

The performance counters are 64 bits wide.

For instance,
perf stat -e ib_tlp_pr <workload>
tracks the inbound posted TLPs for the workload.

Co-developed-by: Linu Cherian <lcherian@marvell.com>
Signed-off-by: Linu Cherian <lcherian@marvell.com>
Signed-off-by: Gowthami Thiagarajan <gthiagarajan@marvell.com>
Link: https://lore.kernel.org/r/20241028055309.17893-1-gthiagarajan@marvell.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 Documentation/admin-guide/perf/index.rst        |   1 +
 Documentation/admin-guide/perf/mrvl-pem-pmu.rst |  56 ++++
 MAINTAINERS                                     |   6 +
 drivers/perf/Kconfig                            |   7 +
 drivers/perf/Makefile                           |   1 +
 drivers/perf/marvell_pem_pmu.c                  | 425 ++++++++++++++++++++++++
 include/linux/cpuhotplug.h                      |   1 +
 7 files changed, 497 insertions(+)
 create mode 100644 Documentation/admin-guide/perf/mrvl-pem-pmu.rst
 create mode 100644 drivers/perf/marvell_pem_pmu.c

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
index 8502bc174640..a58bd3f7e190 100644
--- a/Documentation/admin-guide/perf/index.rst
+++ b/Documentation/admin-guide/perf/index.rst
@@ -26,3 +26,4 @@ Performance monitor support
    meson-ddr-pmu
    cxl
    ampere_cspmu
+   mrvl-pem-pmu
diff --git a/Documentation/admin-guide/perf/mrvl-pem-pmu.rst b/Documentation/admin-guide/perf/mrvl-pem-pmu.rst
new file mode 100644
index 000000000000..c39007149b97
--- /dev/null
+++ b/Documentation/admin-guide/perf/mrvl-pem-pmu.rst
@@ -0,0 +1,56 @@
+=================================================================
+Marvell Odyssey PEM Performance Monitoring Unit (PMU UNCORE)
+=================================================================
+
+The PCI Express Interface Units(PEM) are associated with a corresponding
+monitoring unit. This includes performance counters to track various
+characteristics of the data that is transmitted over the PCIe link.
+
+The counters track inbound and outbound transactions which
+includes separate counters for posted/non-posted/completion TLPs.
+Also, inbound and outbound memory read requests along with their
+latencies can also be monitored. Address Translation Services(ATS)events
+such as ATS Translation, ATS Page Request, ATS Invalidation along with
+their corresponding latencies are also tracked.
+
+There are separate 64 bit counters to measure posted/non-posted/completion
+tlps in inbound and outbound transactions. ATS events are measured by
+different counters.
+
+The PMU driver exposes the available events and format options under sysfs,
+/sys/bus/event_source/devices/mrvl_pcie_rc_pmu_<>/events/
+/sys/bus/event_source/devices/mrvl_pcie_rc_pmu_<>/format/
+
+Examples::
+
+  # perf list | grep mrvl_pcie_rc_pmu
+  mrvl_pcie_rc_pmu_<>/ats_inv/             [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ats_inv_latency/     [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ats_pri/             [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ats_pri_latency/     [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ats_trans/           [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ats_trans_latency/   [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ib_inflight/         [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ib_reads/            [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ib_req_no_ro_ebus/   [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ib_req_no_ro_ncb/    [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ib_tlp_cpl_partid/   [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_cpl_partid/ [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_npr/   [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_pr/    [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ib_tlp_npr/          [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ib_tlp_pr/           [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ob_inflight_partid/  [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ob_merges_cpl_partid/ [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ob_merges_npr_partid/ [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ob_merges_pr_partid/ [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ob_reads_partid/     [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ob_tlp_cpl_partid/   [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_cpl_partid/ [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_npr_partid/ [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_pr_partid/ [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ob_tlp_npr_partid/   [Kernel PMU event]
+  mrvl_pcie_rc_pmu_<>/ob_tlp_pr_partid/    [Kernel PMU event]
+
+
+  # perf stat -e ib_inflight,ib_reads,ib_req_no_ro_ebus,ib_req_no_ro_ncb <workload>
diff --git a/MAINTAINERS b/MAINTAINERS
index 7ad507f49324..aaf2cfbf1d7c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13845,6 +13845,12 @@ S:	Supported
 F:	Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst
 F:	drivers/net/ethernet/marvell/octeontx2/af/
 
+MARVELL PEM PMU DRIVER
+M:	Linu Cherian <lcherian@marvell.com>
+M:	Gowthami Thiagarajan <gthiagarajan@marvell.com>
+S:	Supported
+F:	drivers/perf/marvell_pem_pmu.c
+
 MARVELL PRESTERA ETHERNET SWITCH DRIVER
 M:	Taras Chornyi <taras.chornyi@plvision.eu>
 S:	Supported
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index bab8ba64162f..4e268de351c4 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -284,4 +284,11 @@ config CXL_PMU
 
 	  If unsure say 'm'.
 
+config MARVELL_PEM_PMU
+	tristate "MARVELL PEM PMU Support"
+	depends on ARCH_THUNDER || (COMPILE_TEST && 64BIT)
+	help
+	  Enable support for PCIe Interface performance monitoring
+	  on Marvell platform.
+
 endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 8268f38e42c5..de71d2574857 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
 obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
 obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
 obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
+obj-$(CONFIG_MARVELL_PEM_PMU) += marvell_pem_pmu.o
 obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
 obj-$(CONFIG_ALIBABA_UNCORE_DRW_PMU) += alibaba_uncore_drw_pmu.o
 obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
diff --git a/drivers/perf/marvell_pem_pmu.c b/drivers/perf/marvell_pem_pmu.c
new file mode 100644
index 000000000000..29fbcd1848e4
--- /dev/null
+++ b/drivers/perf/marvell_pem_pmu.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Marvell PEM(PCIe RC) Performance Monitor Driver
+ *
+ * Copyright (C) 2024 Marvell.
+ */
+
+#include <linux/acpi.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+
+/*
+ * Each of these events maps to a free running 64 bit counter
+ * with no event control, but can be reset.
+ */
+enum pem_events {
+	IB_TLP_NPR,
+	IB_TLP_PR,
+	IB_TLP_CPL,
+	IB_TLP_DWORDS_NPR,
+	IB_TLP_DWORDS_PR,
+	IB_TLP_DWORDS_CPL,
+	IB_INFLIGHT,
+	IB_READS,
+	IB_REQ_NO_RO_NCB,
+	IB_REQ_NO_RO_EBUS,
+	OB_TLP_NPR,
+	OB_TLP_PR,
+	OB_TLP_CPL,
+	OB_TLP_DWORDS_NPR,
+	OB_TLP_DWORDS_PR,
+	OB_TLP_DWORDS_CPL,
+	OB_INFLIGHT,
+	OB_READS,
+	OB_MERGES_NPR,
+	OB_MERGES_PR,
+	OB_MERGES_CPL,
+	ATS_TRANS,
+	ATS_TRANS_LATENCY,
+	ATS_PRI,
+	ATS_PRI_LATENCY,
+	ATS_INV,
+	ATS_INV_LATENCY,
+	PEM_EVENTIDS_MAX
+};
+
+static u64 eventid_to_offset_table[] = {
+	[IB_TLP_NPR]	     = 0x0,
+	[IB_TLP_PR]	     = 0x8,
+	[IB_TLP_CPL]	     = 0x10,
+	[IB_TLP_DWORDS_NPR]  = 0x100,
+	[IB_TLP_DWORDS_PR]   = 0x108,
+	[IB_TLP_DWORDS_CPL]  = 0x110,
+	[IB_INFLIGHT]	     = 0x200,
+	[IB_READS]	     = 0x300,
+	[IB_REQ_NO_RO_NCB]   = 0x400,
+	[IB_REQ_NO_RO_EBUS]  = 0x408,
+	[OB_TLP_NPR]         = 0x500,
+	[OB_TLP_PR]          = 0x508,
+	[OB_TLP_CPL]         = 0x510,
+	[OB_TLP_DWORDS_NPR]  = 0x600,
+	[OB_TLP_DWORDS_PR]   = 0x608,
+	[OB_TLP_DWORDS_CPL]  = 0x610,
+	[OB_INFLIGHT]        = 0x700,
+	[OB_READS]	     = 0x800,
+	[OB_MERGES_NPR]      = 0x900,
+	[OB_MERGES_PR]       = 0x908,
+	[OB_MERGES_CPL]      = 0x910,
+	[ATS_TRANS]          = 0x2D18,
+	[ATS_TRANS_LATENCY]  = 0x2D20,
+	[ATS_PRI]            = 0x2D28,
+	[ATS_PRI_LATENCY]    = 0x2D30,
+	[ATS_INV]            = 0x2D38,
+	[ATS_INV_LATENCY]    = 0x2D40,
+};
+
+struct pem_pmu {
+	struct pmu pmu;
+	void __iomem *base;
+	unsigned int cpu;
+	struct	device *dev;
+	struct hlist_node node;
+};
+
+#define to_pem_pmu(p)	container_of(p, struct pem_pmu, pmu)
+
+static int eventid_to_offset(int eventid)
+{
+	return eventid_to_offset_table[eventid];
+}
+
+/* Events */
+static ssize_t pem_pmu_event_show(struct device *dev,
+				  struct device_attribute *attr,
+				  char *page)
+{
+	struct perf_pmu_events_attr *pmu_attr;
+
+	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+	return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+#define PEM_EVENT_ATTR(_name, _id)					\
+	(&((struct perf_pmu_events_attr[]) {				\
+	{ .attr = __ATTR(_name, 0444, pem_pmu_event_show, NULL),	\
+		.id = _id, }						\
+	})[0].attr.attr)
+
+static struct attribute *pem_perf_events_attrs[] = {
+	PEM_EVENT_ATTR(ib_tlp_npr, IB_TLP_NPR),
+	PEM_EVENT_ATTR(ib_tlp_pr, IB_TLP_PR),
+	PEM_EVENT_ATTR(ib_tlp_cpl_partid, IB_TLP_CPL),
+	PEM_EVENT_ATTR(ib_tlp_dwords_npr, IB_TLP_DWORDS_NPR),
+	PEM_EVENT_ATTR(ib_tlp_dwords_pr, IB_TLP_DWORDS_PR),
+	PEM_EVENT_ATTR(ib_tlp_dwords_cpl_partid, IB_TLP_DWORDS_CPL),
+	PEM_EVENT_ATTR(ib_inflight, IB_INFLIGHT),
+	PEM_EVENT_ATTR(ib_reads, IB_READS),
+	PEM_EVENT_ATTR(ib_req_no_ro_ncb, IB_REQ_NO_RO_NCB),
+	PEM_EVENT_ATTR(ib_req_no_ro_ebus, IB_REQ_NO_RO_EBUS),
+	PEM_EVENT_ATTR(ob_tlp_npr_partid, OB_TLP_NPR),
+	PEM_EVENT_ATTR(ob_tlp_pr_partid, OB_TLP_PR),
+	PEM_EVENT_ATTR(ob_tlp_cpl_partid, OB_TLP_CPL),
+	PEM_EVENT_ATTR(ob_tlp_dwords_npr_partid, OB_TLP_DWORDS_NPR),
+	PEM_EVENT_ATTR(ob_tlp_dwords_pr_partid, OB_TLP_DWORDS_PR),
+	PEM_EVENT_ATTR(ob_tlp_dwords_cpl_partid, OB_TLP_DWORDS_CPL),
+	PEM_EVENT_ATTR(ob_inflight_partid, OB_INFLIGHT),
+	PEM_EVENT_ATTR(ob_reads_partid, OB_READS),
+	PEM_EVENT_ATTR(ob_merges_npr_partid, OB_MERGES_NPR),
+	PEM_EVENT_ATTR(ob_merges_pr_partid, OB_MERGES_PR),
+	PEM_EVENT_ATTR(ob_merges_cpl_partid, OB_MERGES_CPL),
+	PEM_EVENT_ATTR(ats_trans, ATS_TRANS),
+	PEM_EVENT_ATTR(ats_trans_latency, ATS_TRANS_LATENCY),
+	PEM_EVENT_ATTR(ats_pri, ATS_PRI),
+	PEM_EVENT_ATTR(ats_pri_latency, ATS_PRI_LATENCY),
+	PEM_EVENT_ATTR(ats_inv, ATS_INV),
+	PEM_EVENT_ATTR(ats_inv_latency, ATS_INV_LATENCY),
+	NULL
+};
+
+static struct attribute_group pem_perf_events_attr_group = {
+	.name = "events",
+	.attrs = pem_perf_events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-5");
+
+static struct attribute *pem_perf_format_attrs[] = {
+	&format_attr_event.attr,
+	NULL
+};
+
+static struct attribute_group pem_perf_format_attr_group = {
+	.name = "format",
+	.attrs = pem_perf_format_attrs,
+};
+
+/* cpumask */
+static ssize_t pem_perf_cpumask_show(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct pem_pmu *pmu = dev_get_drvdata(dev);
+
+	return cpumap_print_to_pagebuf(true, buf, cpumask_of(pmu->cpu));
+}
+
+static struct device_attribute pem_perf_cpumask_attr =
+	__ATTR(cpumask, 0444, pem_perf_cpumask_show, NULL);
+
+static struct attribute *pem_perf_cpumask_attrs[] = {
+	&pem_perf_cpumask_attr.attr,
+	NULL
+};
+
+static struct attribute_group pem_perf_cpumask_attr_group = {
+	.attrs = pem_perf_cpumask_attrs,
+};
+
+static const struct attribute_group *pem_perf_attr_groups[] = {
+	&pem_perf_events_attr_group,
+	&pem_perf_cpumask_attr_group,
+	&pem_perf_format_attr_group,
+	NULL
+};
+
+static int pem_perf_event_init(struct perf_event *event)
+{
+	struct pem_pmu *pmu = to_pem_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_event *sibling;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (event->attr.config >= PEM_EVENTIDS_MAX)
+		return -EINVAL;
+
+	if (is_sampling_event(event) ||
+	    event->attach_state & PERF_ATTACH_TASK) {
+		return -EOPNOTSUPP;
+	}
+
+	if (event->cpu < 0)
+		return -EOPNOTSUPP;
+
+	/*  We must NOT create groups containing mixed PMUs */
+	if (event->group_leader->pmu != event->pmu &&
+	    !is_software_event(event->group_leader))
+		return -EINVAL;
+
+	for_each_sibling_event(sibling, event->group_leader) {
+		if (sibling->pmu != event->pmu &&
+		    !is_software_event(sibling))
+			return -EINVAL;
+	}
+	/*
+	 * Set ownership of event to one CPU, same event can not be observed
+	 * on multiple cpus at same time.
+	 */
+	event->cpu = pmu->cpu;
+	hwc->idx = -1;
+	return 0;
+}
+
+static u64 pem_perf_read_counter(struct pem_pmu *pmu,
+				 struct perf_event *event, int eventid)
+{
+	return readq_relaxed(pmu->base + eventid_to_offset(eventid));
+}
+
+static void pem_perf_event_update(struct perf_event *event)
+{
+	struct pem_pmu *pmu = to_pem_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u64 prev_count, new_count;
+
+	do {
+		prev_count = local64_read(&hwc->prev_count);
+		new_count = pem_perf_read_counter(pmu, event, hwc->idx);
+	} while (local64_xchg(&hwc->prev_count, new_count) != prev_count);
+
+	local64_add((new_count - prev_count), &event->count);
+}
+
+static void pem_perf_event_start(struct perf_event *event, int flags)
+{
+	struct pem_pmu *pmu = to_pem_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int eventid = hwc->idx;
+
+	/*
+	 * All counters are free-running and associated with
+	 * a fixed event to track in Hardware
+	 */
+	local64_set(&hwc->prev_count,
+		    pem_perf_read_counter(pmu, event, eventid));
+
+	hwc->state = 0;
+}
+
+static int pem_perf_event_add(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	hwc->idx = event->attr.config;
+	if (WARN_ON_ONCE(hwc->idx >= PEM_EVENTIDS_MAX))
+		return -EINVAL;
+	hwc->state |= PERF_HES_STOPPED;
+
+	if (flags & PERF_EF_START)
+		pem_perf_event_start(event, flags);
+
+	return 0;
+}
+
+static void pem_perf_event_stop(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (flags & PERF_EF_UPDATE)
+		pem_perf_event_update(event);
+
+	hwc->state |= PERF_HES_STOPPED;
+}
+
+static void pem_perf_event_del(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	pem_perf_event_stop(event, PERF_EF_UPDATE);
+	hwc->idx = -1;
+}
+
+static int pem_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct pem_pmu *pmu = hlist_entry_safe(node, struct pem_pmu, node);
+	unsigned int target;
+
+	if (cpu != pmu->cpu)
+		return 0;
+
+	target = cpumask_any_but(cpu_online_mask, cpu);
+	if (target >= nr_cpu_ids)
+		return 0;
+
+	perf_pmu_migrate_context(&pmu->pmu, cpu, target);
+	pmu->cpu = target;
+	return 0;
+}
+
+static int pem_perf_probe(struct platform_device *pdev)
+{
+	struct pem_pmu *pem_pmu;
+	struct resource *res;
+	void __iomem *base;
+	char *name;
+	int ret;
+
+	pem_pmu = devm_kzalloc(&pdev->dev, sizeof(*pem_pmu), GFP_KERNEL);
+	if (!pem_pmu)
+		return -ENOMEM;
+
+	pem_pmu->dev = &pdev->dev;
+	platform_set_drvdata(pdev, pem_pmu);
+
+	base = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	pem_pmu->base = base;
+
+	pem_pmu->pmu = (struct pmu) {
+		.module	      = THIS_MODULE,
+		.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+		.task_ctx_nr = perf_invalid_context,
+		.attr_groups = pem_perf_attr_groups,
+		.event_init  = pem_perf_event_init,
+		.add	     = pem_perf_event_add,
+		.del	     = pem_perf_event_del,
+		.start	     = pem_perf_event_start,
+		.stop	     = pem_perf_event_stop,
+		.read	     = pem_perf_event_update,
+	};
+
+	/* Choose this cpu to collect perf data */
+	pem_pmu->cpu = raw_smp_processor_id();
+
+	name = devm_kasprintf(pem_pmu->dev, GFP_KERNEL, "mrvl_pcie_rc_pmu_%llx",
+			      res->start);
+	if (!name)
+		return -ENOMEM;
+
+	cpuhp_state_add_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE,
+					 &pem_pmu->node);
+
+	ret = perf_pmu_register(&pem_pmu->pmu, name, -1);
+	if (ret)
+		goto error;
+
+	return 0;
+error:
+	cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE,
+					    &pem_pmu->node);
+	return ret;
+}
+
+static void pem_perf_remove(struct platform_device *pdev)
+{
+	struct pem_pmu *pem_pmu = platform_get_drvdata(pdev);
+
+	cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE,
+					    &pem_pmu->node);
+
+	perf_pmu_unregister(&pem_pmu->pmu);
+}
+
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id pem_pmu_acpi_match[] = {
+	{"MRVL000E", 0},
+	{}
+};
+MODULE_DEVICE_TABLE(acpi, pem_pmu_acpi_match);
+#endif
+
+static struct platform_driver pem_pmu_driver = {
+	.driver	= {
+		.name   = "pem-pmu",
+		.acpi_match_table = ACPI_PTR(pem_pmu_acpi_match),
+		.suppress_bind_attrs = true,
+	},
+	.probe		= pem_perf_probe,
+	.remove		= pem_perf_remove,
+};
+
+static int __init pem_pmu_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE,
+				      "perf/marvell/pem:online", NULL,
+				       pem_pmu_offline_cpu);
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&pem_pmu_driver);
+	if (ret)
+		cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE);
+	return ret;
+}
+
+static void __exit pem_pmu_exit(void)
+{
+	platform_driver_unregister(&pem_pmu_driver);
+	cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE);
+}
+
+module_init(pem_pmu_init);
+module_exit(pem_pmu_exit);
+
+MODULE_DESCRIPTION("Marvell PEM Perf driver");
+MODULE_AUTHOR("Gowthami Thiagarajan <gthiagarajan@marvell.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 2361ed4d2b15..61d9a66d1807 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -227,6 +227,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_ARM_APM_XGENE_ONLINE,
 	CPUHP_AP_PERF_ARM_CAVIUM_TX2_UNCORE_ONLINE,
 	CPUHP_AP_PERF_ARM_MARVELL_CN10K_DDR_ONLINE,
+	CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE,
 	CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
 	CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
 	CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
-- 
cgit v1.2.3


From dc60de4eb0a431bde94e5abe55be6f646cdb435d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 24 Oct 2024 22:04:54 +0300
Subject: iio: acpi: Add iio_get_acpi_device_name_and_data() helper function

A few drivers duplicate the code to retrieve ACPI device instance name.
Some of them want an associated driver data as well.

In order of deduplication introduce the common helper functions.

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20241024191200.229894-6-andriy.shevchenko@linux.intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-acpi.c | 38 +++++++++++++++++++++++++++++++++++++-
 include/linux/iio/iio.h         | 10 ++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-acpi.c b/drivers/iio/industrialio-acpi.c
index 1e46908f9534..d67a43843799 100644
--- a/drivers/iio/industrialio-acpi.c
+++ b/drivers/iio/industrialio-acpi.c
@@ -2,7 +2,8 @@
 /* IIO ACPI helper functions */
 
 #include <linux/acpi.h>
-#include <linux/dev_printk.h>
+#include <linux/device.h>
+#include <linux/export.h>
 #include <linux/iio/iio.h>
 #include <linux/sprintf.h>
 
@@ -87,3 +88,38 @@ out_free_buffer:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iio_read_acpi_mount_matrix);
+
+/**
+ * iio_get_acpi_device_name_and_data() - Return ACPI device instance name and driver data
+ * @dev:		Device structure
+ * @data:		Optional pointer to return driver data
+ *
+ * When device was enumerated by ACPI ID matching, the user might
+ * want to set description for the physical chip. In such cases
+ * the ACPI device instance name might be used. This call may be
+ * performed to retrieve this information.
+ *
+ * NOTE: This helper function exists only for backward compatibility,
+ * do not use in a new code!
+ *
+ * Returns: ACPI device instance name or %NULL.
+ */
+const char *iio_get_acpi_device_name_and_data(struct device *dev, const void **data)
+{
+	const struct acpi_device_id *id;
+	acpi_handle handle;
+
+	handle = ACPI_HANDLE(dev);
+	if (!handle)
+		return NULL;
+
+	id = acpi_match_device(dev->driver->acpi_match_table, dev);
+	if (!id)
+		return NULL;
+
+	if (data)
+		*data = (const void *)id->driver_data;
+
+	return dev_name(dev);
+}
+EXPORT_SYMBOL_GPL(iio_get_acpi_device_name_and_data);
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 3a9b57187a95..445d6666a291 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -831,6 +831,7 @@ int iio_device_resume_triggering(struct iio_dev *indio_dev);
 bool iio_read_acpi_mount_matrix(struct device *dev,
 				struct iio_mount_matrix *orientation,
 				char *acpi_method);
+const char *iio_get_acpi_device_name_and_data(struct device *dev, const void **data);
 #else
 static inline bool iio_read_acpi_mount_matrix(struct device *dev,
 					      struct iio_mount_matrix *orientation,
@@ -838,7 +839,16 @@ static inline bool iio_read_acpi_mount_matrix(struct device *dev,
 {
 	return false;
 }
+static inline const char *
+iio_get_acpi_device_name_and_data(struct device *dev, const void **data)
+{
+	return NULL;
+}
 #endif
+static inline const char *iio_get_acpi_device_name(struct device *dev)
+{
+	return iio_get_acpi_device_name_and_data(dev, NULL);
+}
 
 /**
  * iio_get_current_scan_type - Get the current scan type for a channel
-- 
cgit v1.2.3


From 4261974701851630951e9ab31f0de4ade0faea53 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Fri, 25 Oct 2024 19:46:55 -0500
Subject: printf: Add print format (%pra) for struct range

The use of struct range in the CXL subsystem is growing.  In particular,
the addition of Dynamic Capacity devices uses struct range in a number
of places which are reported in debug and error messages.

To wit requiring the printing of the start/end fields in each print
became cumbersome.  Dan Williams mentions in [1] that it might be time
to have a print specifier for struct range similar to struct resource.

A few alternatives were considered including '%par', '%r', and '%pn'.
%pra follows that struct range is similar to struct resource (%p[rR])
but needs to be different.  Based on discussions with Petr and Andy
'%pra' was chosen.[2]

Andy also suggested to keep the range prints similar to struct resource
though combined code.  Add hex_range() to handle printing for both
pointer types.

Finally introduce DEFINE_RANGE() as a parallel to DEFINE_RES_*() and use
it in the tests.

Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-doc@vger.kernel.org
Cc: open list <linux-kernel@vger.kernel.org>
Link: https://lore.kernel.org/all/663922b475e50_d54d72945b@dwillia2-xfh.jf.intel.com.notmuch/ [1]
Link: https://lore.kernel.org/all/66cea3bf3332f_f937b29424@iweiny-mobl.notmuch/ [2]
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20241025-cxl-pra-v2-3-123a825daba2@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 Documentation/core-api/printk-formats.rst | 13 +++++++
 include/linux/range.h                     |  6 ++++
 lib/test_printf.c                         | 17 +++++++++
 lib/vsprintf.c                            | 57 +++++++++++++++++++++++++++----
 4 files changed, 87 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst
index 552f51046cf3..ecccc0473da9 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -236,6 +236,19 @@ width of the CPU data path.
 
 Passed by reference.
 
+Struct Range
+------------
+
+::
+
+	%pra    [range 0x0000000060000000-0x000000006fffffff] or
+		[range 0x0000000060000000]
+
+For printing struct range.  struct range holds an arbitrary range of u64
+values.  If start is equal to end only print the start value.
+
+Passed by reference.
+
 DMA address types dma_addr_t
 ----------------------------
 
diff --git a/include/linux/range.h b/include/linux/range.h
index 6ad0b73cb7ad..1358d4b1807a 100644
--- a/include/linux/range.h
+++ b/include/linux/range.h
@@ -31,4 +31,10 @@ int clean_sort_range(struct range *range, int az);
 
 void sort_range(struct range *range, int nr_range);
 
+#define DEFINE_RANGE(_start, _end)		\
+(struct range) {				\
+		.start = (_start),		\
+		.end = (_end),			\
+	}
+
 #endif
diff --git a/lib/test_printf.c b/lib/test_printf.c
index 5afdf5efc627..59dbe4f9a4cb 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -432,6 +432,22 @@ struct_resource(void)
 	     "%pR", &test_resource);
 }
 
+static void __init
+struct_range(void)
+{
+	struct range test_range = DEFINE_RANGE(0xc0ffee00ba5eba11,
+					       0xc0ffee00ba5eba11);
+	test("[range 0xc0ffee00ba5eba11]", "%pra", &test_range);
+
+	test_range = DEFINE_RANGE(0xc0ffee, 0xba5eba11);
+	test("[range 0x0000000000c0ffee-0x00000000ba5eba11]",
+	     "%pra", &test_range);
+
+	test_range = DEFINE_RANGE(0xba5eba11, 0xc0ffee);
+	test("[range 0x00000000ba5eba11-0x0000000000c0ffee]",
+	     "%pra", &test_range);
+}
+
 static void __init
 addr(void)
 {
@@ -807,6 +823,7 @@ test_pointer(void)
 	symbol_ptr();
 	kernel_ptr();
 	struct_resource();
+	struct_range();
 	addr();
 	escaped_str();
 	hex_string();
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index c5e2ec9303c5..6ac02bbb7df1 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1039,6 +1039,20 @@ static const struct printf_spec default_dec04_spec = {
 	.flags = ZEROPAD,
 };
 
+static noinline_for_stack
+char *hex_range(char *buf, char *end, u64 start_val, u64 end_val,
+		struct printf_spec spec)
+{
+	buf = number(buf, end, start_val, spec);
+	if (start_val == end_val)
+		return buf;
+
+	if (buf < end)
+		*buf = '-';
+	++buf;
+	return number(buf, end, end_val, spec);
+}
+
 static noinline_for_stack
 char *resource_string(char *buf, char *end, struct resource *res,
 		      struct printf_spec spec, const char *fmt)
@@ -1115,11 +1129,7 @@ char *resource_string(char *buf, char *end, struct resource *res,
 		p = string_nocheck(p, pend, "size ", str_spec);
 		p = number(p, pend, resource_size(res), *specp);
 	} else {
-		p = number(p, pend, res->start, *specp);
-		if (res->start != res->end) {
-			*p++ = '-';
-			p = number(p, pend, res->end, *specp);
-		}
+		p = hex_range(p, pend, res->start, res->end, *specp);
 	}
 	if (decode) {
 		if (res->flags & IORESOURCE_MEM_64)
@@ -1140,6 +1150,31 @@ char *resource_string(char *buf, char *end, struct resource *res,
 	return string_nocheck(buf, end, sym, spec);
 }
 
+static noinline_for_stack
+char *range_string(char *buf, char *end, const struct range *range,
+		   struct printf_spec spec, const char *fmt)
+{
+	char sym[sizeof("[range 0x0123456789abcdef-0x0123456789abcdef]")];
+	char *p = sym, *pend = sym + sizeof(sym);
+
+	struct printf_spec range_spec = {
+		.field_width = 2 + 2 * sizeof(range->start), /* 0x + 2 * 8 */
+		.flags = SPECIAL | SMALL | ZEROPAD,
+		.base = 16,
+		.precision = -1,
+	};
+
+	if (check_pointer(&buf, end, range, spec))
+		return buf;
+
+	p = string_nocheck(p, pend, "[range ", default_str_spec);
+	p = hex_range(p, pend, range->start, range->end, range_spec);
+	*p++ = ']';
+	*p = '\0';
+
+	return string_nocheck(buf, end, sym, spec);
+}
+
 static noinline_for_stack
 char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
 		 const char *fmt)
@@ -2229,6 +2264,15 @@ char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode,
 	return widen_string(buf, buf - buf_start, end, spec);
 }
 
+static noinline_for_stack
+char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr,
+			struct printf_spec spec)
+{
+	if (*fmt == 'r' && fmt[1] == 'a')
+		return range_string(buf, end, ptr, spec, fmt);
+	return resource_string(buf, end, ptr, spec, fmt);
+}
+
 int __init no_hash_pointers_enable(char *str)
 {
 	if (no_hash_pointers)
@@ -2277,6 +2321,7 @@ char *rust_fmt_argument(char *buf, char *end, void *ptr);
  * - 'Bb' as above with module build ID (for use in backtraces)
  * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref]
  * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201]
+ * - 'ra' For struct ranges, e.g., [range 0x0000000000000000 - 0x00000000000000ff]
  * - 'b[l]' For a bitmap, the number of bits is determined by the field
  *       width which must be explicitly specified either as part of the
  *       format string '%32b[l]' or through '%*b[l]', [l] selects
@@ -2401,7 +2446,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 		return symbol_string(buf, end, ptr, spec, fmt);
 	case 'R':
 	case 'r':
-		return resource_string(buf, end, ptr, spec, fmt);
+		return resource_or_range(fmt, buf, end, ptr, spec);
 	case 'h':
 		return hex_string(buf, end, ptr, spec, fmt);
 	case 'b':
-- 
cgit v1.2.3


From 7c7e6c8924e7bf98db4e5b2edb202842003c00c2 Mon Sep 17 00:00:00 2001
From: Niklas Schnelle <schnelle@linux.ibm.com>
Date: Thu, 24 Oct 2024 19:54:43 +0200
Subject: tty: serial: handle HAS_IOPORT dependencies

In a future patch HAS_IOPORT=n will disable inb()/outb() and friends at
compile time. We thus need to add HAS_IOPORT as dependency for those
drivers using them unconditionally. Some 8250 serial drivers support
MMIO only use, so fence only the parts requiring I/O ports and print an
error message if a device can't be supported with the current
configuration.

Co-developed-by: Arnd Bergmann <arnd@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Maciej W. Rozycki <macro@orcam.me.uk>
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/tty/Kconfig                   |  4 ++--
 drivers/tty/serial/8250/8250_early.c  |  4 ++++
 drivers/tty/serial/8250/8250_pci.c    | 40 +++++++++++++++++++++++++++++++++++
 drivers/tty/serial/8250/8250_pcilib.c | 12 ++++++++++-
 drivers/tty/serial/8250/8250_pcilib.h |  2 ++
 drivers/tty/serial/8250/8250_port.c   | 27 +++++++++++++++++++----
 drivers/tty/serial/8250/Kconfig       |  4 ++--
 drivers/tty/serial/Kconfig            |  2 +-
 include/linux/serial_core.h           |  4 ++++
 9 files changed, 89 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index a45d423ad10f..63a494d36a1f 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -220,7 +220,7 @@ config MOXA_INTELLIO
 
 config MOXA_SMARTIO
 	tristate "Moxa SmartIO support v. 2.0"
-	depends on SERIAL_NONSTANDARD && PCI
+	depends on SERIAL_NONSTANDARD && PCI && HAS_IOPORT
 	help
 	  Say Y here if you have a Moxa SmartIO multiport serial card and/or
 	  want to help develop a new version of this driver.
@@ -302,7 +302,7 @@ config GOLDFISH_TTY_EARLY_CONSOLE
 
 config IPWIRELESS
 	tristate "IPWireless 3G UMTS PCMCIA card support"
-	depends on PCMCIA && NETDEVICES
+	depends on PCMCIA && NETDEVICES && HAS_IOPORT
 	select PPP
 	help
 	  This is a driver for 3G UMTS PCMCIA card from IPWireless company. In
diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c
index 6176083d0341..842422921765 100644
--- a/drivers/tty/serial/8250/8250_early.c
+++ b/drivers/tty/serial/8250/8250_early.c
@@ -46,8 +46,10 @@ static unsigned int serial8250_early_in(struct uart_port *port, int offset)
 		return readl(port->membase + offset);
 	case UPIO_MEM32BE:
 		return ioread32be(port->membase + offset);
+#ifdef CONFIG_HAS_IOPORT
 	case UPIO_PORT:
 		return inb(port->iobase + offset);
+#endif
 	default:
 		return 0;
 	}
@@ -70,9 +72,11 @@ static void serial8250_early_out(struct uart_port *port, int offset, int value)
 	case UPIO_MEM32BE:
 		iowrite32be(value, port->membase + offset);
 		break;
+#ifdef CONFIG_HAS_IOPORT
 	case UPIO_PORT:
 		outb(value, port->iobase + offset);
 		break;
+#endif
 	}
 }
 
diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index 6709b6a5f301..7d7a6d62c09c 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -964,6 +964,9 @@ static int pci_ite887x_init(struct pci_dev *dev)
 	struct resource *iobase = NULL;
 	u32 miscr, uartbar, ioport;
 
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(dev);
+
 	/* search for the base-ioport */
 	for (i = 0; i < ARRAY_SIZE(inta_addr); i++) {
 		iobase = request_region(inta_addr[i], ITE_887x_IOSIZE,
@@ -1514,6 +1517,9 @@ static int pci_quatech_init(struct pci_dev *dev)
 	const struct pci_device_id *match;
 	bool amcc = false;
 
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(dev);
+
 	match = pci_match_id(quatech_cards, dev);
 	if (match)
 		amcc = match->driver_data;
@@ -1538,6 +1544,9 @@ static int pci_quatech_setup(struct serial_private *priv,
 		  const struct pciserial_board *board,
 		  struct uart_8250_port *port, int idx)
 {
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(priv->dev);
+
 	/* Needed by pci_quatech calls below */
 	port->port.iobase = pci_resource_start(priv->dev, FL_GET_BASE(board->flags));
 	/* Set up the clocking */
@@ -1655,6 +1664,9 @@ static int pci_fintek_setup(struct serial_private *priv,
 	u8 config_base;
 	u16 iobase;
 
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(pdev);
+
 	config_base = 0x40 + 0x08 * idx;
 
 	/* Get the io address from configuration space */
@@ -1686,6 +1698,9 @@ static int pci_fintek_init(struct pci_dev *dev)
 	u8 config_base;
 	struct serial_private *priv = pci_get_drvdata(dev);
 
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(dev);
+
 	if (!(pci_resource_flags(dev, 5) & IORESOURCE_IO) ||
 			!(pci_resource_flags(dev, 4) & IORESOURCE_IO) ||
 			!(pci_resource_flags(dev, 3) & IORESOURCE_IO))
@@ -1864,6 +1879,9 @@ static int kt_serial_setup(struct serial_private *priv,
 			   const struct pciserial_board *board,
 			   struct uart_8250_port *port, int idx)
 {
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(priv->dev);
+
 	port->port.flags |= UPF_BUG_THRE;
 	port->port.serial_in = kt_serial_in;
 	port->port.handle_break = kt_handle_break;
@@ -1884,6 +1902,9 @@ pci_wch_ch353_setup(struct serial_private *priv,
 		    const struct pciserial_board *board,
 		    struct uart_8250_port *port, int idx)
 {
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(priv->dev);
+
 	port->port.flags |= UPF_FIXED_TYPE;
 	port->port.type = PORT_16550A;
 	return pci_default_setup(priv, board, port, idx);
@@ -1894,6 +1915,9 @@ pci_wch_ch355_setup(struct serial_private *priv,
 		const struct pciserial_board *board,
 		struct uart_8250_port *port, int idx)
 {
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(priv->dev);
+
 	port->port.flags |= UPF_FIXED_TYPE;
 	port->port.type = PORT_16550A;
 	return pci_default_setup(priv, board, port, idx);
@@ -1904,6 +1928,9 @@ pci_wch_ch38x_setup(struct serial_private *priv,
 		    const struct pciserial_board *board,
 		    struct uart_8250_port *port, int idx)
 {
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(priv->dev);
+
 	port->port.flags |= UPF_FIXED_TYPE;
 	port->port.type = PORT_16850;
 	return pci_default_setup(priv, board, port, idx);
@@ -1918,6 +1945,8 @@ static int pci_wch_ch38x_init(struct pci_dev *dev)
 	int max_port;
 	unsigned long iobase;
 
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(dev);
 
 	switch (dev->device) {
 	case 0x3853: /* 8 ports */
@@ -1937,6 +1966,11 @@ static void pci_wch_ch38x_exit(struct pci_dev *dev)
 {
 	unsigned long iobase;
 
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT)) {
+		serial_8250_warn_need_ioport(dev);
+		return;
+	}
+
 	iobase = pci_resource_start(dev, 0);
 	outb(0x0, iobase + CH384_XINT_ENABLE_REG);
 }
@@ -2052,6 +2086,9 @@ static int pci_moxa_init(struct pci_dev *dev)
 	unsigned int i, num_ports = moxa_get_nports(device);
 	u8 val, init_mode = MOXA_RS232;
 
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(dev);
+
 	if (!(pci_moxa_supported_rs(dev) & MOXA_SUPP_RS232)) {
 		init_mode = MOXA_RS422;
 	}
@@ -2084,6 +2121,9 @@ pci_moxa_setup(struct serial_private *priv,
 	unsigned int bar = FL_GET_BASE(board->flags);
 	int offset;
 
+	if (!IS_ENABLED(CONFIG_HAS_IOPORT))
+		return serial_8250_warn_need_ioport(priv->dev);
+
 	if (board->num_ports == 4 && idx == 3)
 		offset = 7 * board->uart_offset;
 	else
diff --git a/drivers/tty/serial/8250/8250_pcilib.c b/drivers/tty/serial/8250/8250_pcilib.c
index ea906d721b2c..ec4d04177802 100644
--- a/drivers/tty/serial/8250/8250_pcilib.c
+++ b/drivers/tty/serial/8250/8250_pcilib.c
@@ -12,6 +12,14 @@
 #include "8250.h"
 #include "8250_pcilib.h"
 
+int serial_8250_warn_need_ioport(struct pci_dev *dev)
+{
+	dev_warn(&dev->dev,
+		 "Serial port not supported because of missing I/O resource\n");
+
+	return -ENXIO;
+}
+
 int serial8250_pci_setup_port(struct pci_dev *dev, struct uart_8250_port *port,
 		   u8 bar, unsigned int offset, int regshift)
 {
@@ -27,12 +35,14 @@ int serial8250_pci_setup_port(struct pci_dev *dev, struct uart_8250_port *port,
 		port->port.mapbase = pci_resource_start(dev, bar) + offset;
 		port->port.membase = pcim_iomap_table(dev)[bar] + offset;
 		port->port.regshift = regshift;
-	} else {
+	} else if (IS_ENABLED(CONFIG_HAS_IOPORT)) {
 		port->port.iotype = UPIO_PORT;
 		port->port.iobase = pci_resource_start(dev, bar) + offset;
 		port->port.mapbase = 0;
 		port->port.membase = NULL;
 		port->port.regshift = 0;
+	} else {
+		return serial_8250_warn_need_ioport(dev);
 	}
 	return 0;
 }
diff --git a/drivers/tty/serial/8250/8250_pcilib.h b/drivers/tty/serial/8250/8250_pcilib.h
index 1aaf1b50ce9c..16a274574cde 100644
--- a/drivers/tty/serial/8250/8250_pcilib.h
+++ b/drivers/tty/serial/8250/8250_pcilib.h
@@ -13,3 +13,5 @@ struct uart_8250_port;
 
 int serial8250_pci_setup_port(struct pci_dev *dev, struct uart_8250_port *port, u8 bar,
 		   unsigned int offset, int regshift);
+
+int serial_8250_warn_need_ioport(struct pci_dev *dev);
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 3509af7dc52b..91369f542b0b 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -338,6 +338,7 @@ static void default_serial_dl_write(struct uart_8250_port *up, u32 value)
 	serial_out(up, UART_DLM, value >> 8 & 0xff);
 }
 
+#ifdef CONFIG_HAS_IOPORT
 static unsigned int hub6_serial_in(struct uart_port *p, int offset)
 {
 	offset = offset << p->regshift;
@@ -351,6 +352,7 @@ static void hub6_serial_out(struct uart_port *p, int offset, int value)
 	outb(p->hub6 - 1 + offset, p->iobase);
 	outb(value, p->iobase + 1);
 }
+#endif /* CONFIG_HAS_IOPORT */
 
 static unsigned int mem_serial_in(struct uart_port *p, int offset)
 {
@@ -400,6 +402,7 @@ static unsigned int mem32be_serial_in(struct uart_port *p, int offset)
 	return ioread32be(p->membase + offset);
 }
 
+#ifdef CONFIG_HAS_IOPORT
 static unsigned int io_serial_in(struct uart_port *p, int offset)
 {
 	offset = offset << p->regshift;
@@ -411,6 +414,15 @@ static void io_serial_out(struct uart_port *p, int offset, int value)
 	offset = offset << p->regshift;
 	outb(value, p->iobase + offset);
 }
+#endif
+static unsigned int no_serial_in(struct uart_port *p, int offset)
+{
+	return (unsigned int)-1;
+}
+
+static void no_serial_out(struct uart_port *p, int offset, int value)
+{
+}
 
 static int serial8250_default_handle_irq(struct uart_port *port);
 
@@ -422,10 +434,12 @@ static void set_io_from_upio(struct uart_port *p)
 	up->dl_write = default_serial_dl_write;
 
 	switch (p->iotype) {
+#ifdef CONFIG_HAS_IOPORT
 	case UPIO_HUB6:
 		p->serial_in = hub6_serial_in;
 		p->serial_out = hub6_serial_out;
 		break;
+#endif
 
 	case UPIO_MEM:
 		p->serial_in = mem_serial_in;
@@ -446,11 +460,16 @@ static void set_io_from_upio(struct uart_port *p)
 		p->serial_in = mem32be_serial_in;
 		p->serial_out = mem32be_serial_out;
 		break;
-
-	default:
+#ifdef CONFIG_HAS_IOPORT
+	case UPIO_PORT:
 		p->serial_in = io_serial_in;
 		p->serial_out = io_serial_out;
 		break;
+#endif
+	default:
+		WARN(1, "Unsupported UART type %x\n", p->iotype);
+		p->serial_in = no_serial_in;
+		p->serial_out = no_serial_out;
 	}
 	/* Remember loaded iotype */
 	up->cur_iotype = p->iotype;
@@ -1174,7 +1193,7 @@ static void autoconfig(struct uart_8250_port *up)
 		 */
 		scratch = serial_in(up, UART_IER);
 		serial_out(up, UART_IER, 0);
-#ifdef __i386__
+#if defined(__i386__) && defined(CONFIG_HAS_IOPORT)
 		outb(0xff, 0x080);
 #endif
 		/*
@@ -1183,7 +1202,7 @@ static void autoconfig(struct uart_8250_port *up)
 		 */
 		scratch2 = serial_in(up, UART_IER) & UART_IER_ALL_INTR;
 		serial_out(up, UART_IER, UART_IER_ALL_INTR);
-#ifdef __i386__
+#if defined(__i386__) && defined(CONFIG_HAS_IOPORT)
 		outb(0, 0x080);
 #endif
 		scratch3 = serial_in(up, UART_IER) & UART_IER_ALL_INTR;
diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index 47ff50763c04..1eb20350fcf4 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -72,7 +72,7 @@ config SERIAL_8250_16550A_VARIANTS
 
 config SERIAL_8250_FINTEK
 	bool "Support for Fintek variants"
-	depends on SERIAL_8250
+	depends on SERIAL_8250 && HAS_IOPORT
 	help
 	  Selecting this option will add support for the RS232 and RS485
 	  capabilities of the Fintek F81216A LPC to 4 UART as well similar
@@ -163,7 +163,7 @@ config SERIAL_8250_HP300
 
 config SERIAL_8250_CS
 	tristate "8250/16550 PCMCIA device support"
-	depends on PCMCIA && SERIAL_8250
+	depends on PCMCIA && SERIAL_8250 && HAS_IOPORT
 	help
 	  Say Y here to enable support for 16-bit PCMCIA serial devices,
 	  including serial port cards, modems, and the modem functions of
diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index 28e4beeabf8f..45f0f779fbf9 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -877,7 +877,7 @@ config SERIAL_TXX9_STDSERIAL
 
 config SERIAL_JSM
 	tristate "Digi International NEO and Classic PCI Support"
-	depends on PCI
+	depends on PCI && HAS_IOPORT
 	select SERIAL_CORE
 	help
 	  This is a driver for Digi International's Neo and Classic series
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 4ab65874a850..743b4afaad4c 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -505,7 +505,11 @@ struct uart_port {
 	 * The remaining bits are serial-core specific and not modifiable by
 	 * userspace.
 	 */
+#ifdef CONFIG_HAS_IOPORT
 #define UPF_FOURPORT		((__force upf_t) ASYNC_FOURPORT       /* 1  */ )
+#else
+#define UPF_FOURPORT		0
+#endif
 #define UPF_SAK			((__force upf_t) ASYNC_SAK            /* 2  */ )
 #define UPF_SPD_HI		((__force upf_t) ASYNC_SPD_HI         /* 4  */ )
 #define UPF_SPD_VHI		((__force upf_t) ASYNC_SPD_VHI        /* 5  */ )
-- 
cgit v1.2.3


From f390525d27bc03dc5925293cbd9f22329648b248 Mon Sep 17 00:00:00 2001
From: Dingyan Li <18500469033@163.com>
Date: Sun, 20 Oct 2024 15:47:21 +0800
Subject: usb: storage: fix wrong comments for struct bulk_cb_wrap

In the flags, direction is in bit 7 instead of bit 0 based
on the specification.

Signed-off-by: Dingyan Li <18500469033@163.com>
Link: https://lore.kernel.org/r/20241020074721.26905-1-18500469033@163.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/storage.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/storage.h b/include/linux/usb/storage.h
index 2827ce72e502..8539956bc2be 100644
--- a/include/linux/usb/storage.h
+++ b/include/linux/usb/storage.h
@@ -53,7 +53,7 @@ struct bulk_cb_wrap {
 	__le32	Signature;		/* contains 'USBC' */
 	__u32	Tag;			/* unique per command id */
 	__le32	DataTransferLength;	/* size of data */
-	__u8	Flags;			/* direction in bit 0 */
+	__u8	Flags;			/* direction in bit 7 */
 	__u8	Lun;			/* LUN normally 0 */
 	__u8	Length;			/* length of the CDB */
 	__u8	CDB[16];		/* max command */
-- 
cgit v1.2.3


From 4fd06a5358e0d888d1bf23d274971ea7d1f45aad Mon Sep 17 00:00:00 2001
From: David Dai <davidai@google.com>
Date: Wed, 18 Sep 2024 17:08:33 -0700
Subject: cpufreq: add virtual-cpufreq driver

Introduce a virtualized cpufreq driver for guest kernels to improve
performance and power of workloads within VMs.

This driver does two main things:

1. Sends the frequency of vCPUs as a hint to the host. The host uses the
hint to schedule the vCPU threads and decide physical CPU frequency.

2. If a VM does not support a virtualized FIE(like AMUs), it queries the
host CPU frequency by reading a MMIO region of a virtual cpufreq device
to update the guest's frequency scaling factor periodically. This enables
accurate Per-Entity Load Tracking for tasks running in the guest.

Co-developed-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: David Dai <davidai@google.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/Kconfig           |  14 ++
 drivers/cpufreq/Makefile          |   1 +
 drivers/cpufreq/virtual-cpufreq.c | 333 ++++++++++++++++++++++++++++++++++++++
 include/linux/arch_topology.h     |   1 +
 4 files changed, 349 insertions(+)
 create mode 100644 drivers/cpufreq/virtual-cpufreq.c

(limited to 'include/linux')

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 2561b215432a..92a83a9bb2e1 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -217,6 +217,20 @@ config CPUFREQ_DT
 
 	  If in doubt, say N.
 
+config CPUFREQ_VIRT
+	tristate "Virtual cpufreq driver"
+	depends on GENERIC_ARCH_TOPOLOGY
+	help
+	  This adds a virtualized cpufreq driver for guest kernels that
+	  read/writes to a MMIO region for a virtualized cpufreq device to
+	  communicate with the host. It sends performance requests to the host
+	  which gets used as a hint to schedule vCPU threads and select CPU
+	  frequency. If a VM does not support a virtualized FIE such as AMUs,
+	  it updates the frequency scaling factor by polling host CPU frequency
+	  to enable accurate Per-Entity Load Tracking for tasks running in the guest.
+
+	  If in doubt, say N.
+
 config CPUFREQ_DT_PLATDEV
 	tristate "Generic DT based cpufreq platdev driver"
 	depends on OF
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 0f184031dd12..10d7d6e55da8 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_CPU_FREQ_GOV_ATTR_SET)	+= cpufreq_governor_attr_set.o
 
 obj-$(CONFIG_CPUFREQ_DT)		+= cpufreq-dt.o
 obj-$(CONFIG_CPUFREQ_DT_PLATDEV)	+= cpufreq-dt-platdev.o
+obj-$(CONFIG_CPUFREQ_VIRT)		+= virtual-cpufreq.o
 
 # Traces
 CFLAGS_amd-pstate-trace.o               := -I$(src)
diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c
new file mode 100644
index 000000000000..a050b3a6737f
--- /dev/null
+++ b/drivers/cpufreq/virtual-cpufreq.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 Google LLC
+ */
+
+#include <linux/arch_topology.h>
+#include <linux/cpufreq.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+/*
+ * CPU0..CPUn
+ * +-------------+-------------------------------+--------+-------+
+ * | Register    | Description                   | Offset |   Len |
+ * +-------------+-------------------------------+--------+-------+
+ * | cur_perf    | read this register to get     |    0x0 |   0x4 |
+ * |             | the current perf (integer val |        |       |
+ * |             | representing perf relative to |        |       |
+ * |             | max performance)              |        |       |
+ * |             | that vCPU is running at       |        |       |
+ * +-------------+-------------------------------+--------+-------+
+ * | set_perf    | write to this register to set |    0x4 |   0x4 |
+ * |             | perf value of the vCPU        |        |       |
+ * +-------------+-------------------------------+--------+-------+
+ * | perftbl_len | number of entries in perf     |    0x8 |   0x4 |
+ * |             | table. A single entry in the  |        |       |
+ * |             | perf table denotes no table   |        |       |
+ * |             | and the entry contains        |        |       |
+ * |             | the maximum perf value        |        |       |
+ * |             | that this vCPU supports.      |        |       |
+ * |             | The guest can request any     |        |       |
+ * |             | value between 1 and max perf  |        |       |
+ * |             | when perftbls are not used.   |        |       |
+ * +---------------------------------------------+--------+-------+
+ * | perftbl_sel | write to this register to     |    0xc |   0x4 |
+ * |             | select perf table entry to    |        |       |
+ * |             | read from                     |        |       |
+ * +---------------------------------------------+--------+-------+
+ * | perftbl_rd  | read this register to get     |   0x10 |   0x4 |
+ * |             | perf value of the selected    |        |       |
+ * |             | entry based on perftbl_sel    |        |       |
+ * +---------------------------------------------+--------+-------+
+ * | perf_domain | performance domain number     |   0x14 |   0x4 |
+ * |             | that this vCPU belongs to.    |        |       |
+ * |             | vCPUs sharing the same perf   |        |       |
+ * |             | domain number are part of the |        |       |
+ * |             | same performance domain.      |        |       |
+ * +-------------+-------------------------------+--------+-------+
+ */
+
+#define REG_CUR_PERF_STATE_OFFSET 0x0
+#define REG_SET_PERF_STATE_OFFSET 0x4
+#define REG_PERFTBL_LEN_OFFSET 0x8
+#define REG_PERFTBL_SEL_OFFSET 0xc
+#define REG_PERFTBL_RD_OFFSET 0x10
+#define REG_PERF_DOMAIN_OFFSET 0x14
+#define PER_CPU_OFFSET 0x1000
+
+#define PERFTBL_MAX_ENTRIES 64U
+
+static void __iomem *base;
+static DEFINE_PER_CPU(u32, perftbl_num_entries);
+
+static void virt_scale_freq_tick(void)
+{
+	int cpu = smp_processor_id();
+	u32 max_freq = (u32)cpufreq_get_hw_max_freq(cpu);
+	u64 cur_freq;
+	unsigned long scale;
+
+	cur_freq = (u64)readl_relaxed(base + cpu * PER_CPU_OFFSET
+			+ REG_CUR_PERF_STATE_OFFSET);
+
+	cur_freq <<= SCHED_CAPACITY_SHIFT;
+	scale = (unsigned long)div_u64(cur_freq, max_freq);
+	scale = min(scale, SCHED_CAPACITY_SCALE);
+
+	this_cpu_write(arch_freq_scale, scale);
+}
+
+static struct scale_freq_data virt_sfd = {
+	.source = SCALE_FREQ_SOURCE_VIRT,
+	.set_freq_scale = virt_scale_freq_tick,
+};
+
+static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy,
+					  unsigned int target_freq)
+{
+	writel_relaxed(target_freq,
+		       base + policy->cpu * PER_CPU_OFFSET + REG_SET_PERF_STATE_OFFSET);
+	return 0;
+}
+
+static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy,
+					     unsigned int target_freq)
+{
+	virt_cpufreq_set_perf(policy, target_freq);
+	return target_freq;
+}
+
+static u32 virt_cpufreq_get_perftbl_entry(int cpu, u32 idx)
+{
+	writel_relaxed(idx, base + cpu * PER_CPU_OFFSET +
+		       REG_PERFTBL_SEL_OFFSET);
+	return readl_relaxed(base + cpu * PER_CPU_OFFSET +
+			     REG_PERFTBL_RD_OFFSET);
+}
+
+static int virt_cpufreq_target(struct cpufreq_policy *policy,
+			       unsigned int target_freq,
+			       unsigned int relation)
+{
+	struct cpufreq_freqs freqs;
+	int ret = 0;
+
+	freqs.old = policy->cur;
+	freqs.new = target_freq;
+
+	cpufreq_freq_transition_begin(policy, &freqs);
+	ret = virt_cpufreq_set_perf(policy, target_freq);
+	cpufreq_freq_transition_end(policy, &freqs, ret != 0);
+
+	return ret;
+}
+
+static int virt_cpufreq_get_sharing_cpus(struct cpufreq_policy *policy)
+{
+	u32 cur_perf_domain, perf_domain;
+	struct device *cpu_dev;
+	int cpu;
+
+	cur_perf_domain = readl_relaxed(base + policy->cpu *
+					PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET);
+
+	for_each_possible_cpu(cpu) {
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev)
+			continue;
+
+		perf_domain = readl_relaxed(base + cpu *
+					    PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET);
+
+		if (perf_domain == cur_perf_domain)
+			cpumask_set_cpu(cpu, policy->cpus);
+	}
+
+	return 0;
+}
+
+static int virt_cpufreq_get_freq_info(struct cpufreq_policy *policy)
+{
+	struct cpufreq_frequency_table *table;
+	u32 num_perftbl_entries, idx;
+
+	num_perftbl_entries = per_cpu(perftbl_num_entries, policy->cpu);
+
+	if (num_perftbl_entries == 1) {
+		policy->cpuinfo.min_freq = 1;
+		policy->cpuinfo.max_freq = virt_cpufreq_get_perftbl_entry(policy->cpu, 0);
+
+		policy->min = policy->cpuinfo.min_freq;
+		policy->max = policy->cpuinfo.max_freq;
+
+		policy->cur = policy->max;
+		return 0;
+	}
+
+	table = kcalloc(num_perftbl_entries + 1, sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return -ENOMEM;
+
+	for (idx = 0; idx < num_perftbl_entries; idx++)
+		table[idx].frequency = virt_cpufreq_get_perftbl_entry(policy->cpu, idx);
+
+	table[idx].frequency = CPUFREQ_TABLE_END;
+	policy->freq_table = table;
+
+	return 0;
+}
+
+static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+	struct device *cpu_dev;
+	int ret;
+
+	cpu_dev = get_cpu_device(policy->cpu);
+	if (!cpu_dev)
+		return -ENODEV;
+
+	ret = virt_cpufreq_get_freq_info(policy);
+	if (ret) {
+		dev_warn(cpu_dev, "failed to get cpufreq info\n");
+		return ret;
+	}
+
+	ret = virt_cpufreq_get_sharing_cpus(policy);
+	if (ret) {
+		dev_warn(cpu_dev, "failed to get sharing cpumask\n");
+		return ret;
+	}
+
+	/*
+	 * To simplify and improve latency of handling frequency requests on
+	 * the host side, this ensures that the vCPU thread triggering the MMIO
+	 * abort is the same thread whose performance constraints (Ex. uclamp
+	 * settings) need to be updated. This simplifies the VMM (Virtual
+	 * Machine Manager) having to find the correct vCPU thread and/or
+	 * facing permission issues when configuring other threads.
+	 */
+	policy->dvfs_possible_from_any_cpu = false;
+	policy->fast_switch_possible = true;
+
+	/*
+	 * Using the default SCALE_FREQ_SOURCE_CPUFREQ is insufficient since
+	 * the actual physical CPU frequency may not match requested frequency
+	 * from the vCPU thread due to frequency update latencies or other
+	 * inputs to the physical CPU frequency selection. This additional FIE
+	 * source allows for more accurate freq_scale updates and only takes
+	 * effect if another FIE source such as AMUs have not been registered.
+	 */
+	topology_set_scale_freq_source(&virt_sfd, policy->cpus);
+
+	return 0;
+}
+
+static void virt_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+	topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_VIRT, policy->related_cpus);
+	kfree(policy->freq_table);
+}
+
+static int virt_cpufreq_online(struct cpufreq_policy *policy)
+{
+	/* Nothing to restore. */
+	return 0;
+}
+
+static int virt_cpufreq_offline(struct cpufreq_policy *policy)
+{
+	/* Dummy offline() to avoid exit() being called and freeing resources. */
+	return 0;
+}
+
+static int virt_cpufreq_verify_policy(struct cpufreq_policy_data *policy)
+{
+	if (policy->freq_table)
+		return cpufreq_frequency_table_verify(policy, policy->freq_table);
+
+	cpufreq_verify_within_cpu_limits(policy);
+	return 0;
+}
+
+static struct cpufreq_driver cpufreq_virt_driver = {
+	.name		= "virt-cpufreq",
+	.init		= virt_cpufreq_cpu_init,
+	.exit		= virt_cpufreq_cpu_exit,
+	.online         = virt_cpufreq_online,
+	.offline        = virt_cpufreq_offline,
+	.verify		= virt_cpufreq_verify_policy,
+	.target		= virt_cpufreq_target,
+	.fast_switch	= virt_cpufreq_fast_switch,
+	.attr		= cpufreq_generic_attr,
+};
+
+static int virt_cpufreq_driver_probe(struct platform_device *pdev)
+{
+	u32 num_perftbl_entries;
+	int ret, cpu;
+
+	base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	for_each_possible_cpu(cpu) {
+		num_perftbl_entries = readl_relaxed(base + cpu * PER_CPU_OFFSET +
+						    REG_PERFTBL_LEN_OFFSET);
+
+		if (!num_perftbl_entries || num_perftbl_entries > PERFTBL_MAX_ENTRIES)
+			return -ENODEV;
+
+		per_cpu(perftbl_num_entries, cpu) = num_perftbl_entries;
+	}
+
+	ret = cpufreq_register_driver(&cpufreq_virt_driver);
+	if (ret) {
+		dev_err(&pdev->dev, "Virtual CPUFreq driver failed to register: %d\n", ret);
+		return ret;
+	}
+
+	dev_dbg(&pdev->dev, "Virtual CPUFreq driver initialized\n");
+	return 0;
+}
+
+static void virt_cpufreq_driver_remove(struct platform_device *pdev)
+{
+	cpufreq_unregister_driver(&cpufreq_virt_driver);
+}
+
+static const struct of_device_id virt_cpufreq_match[] = {
+	{ .compatible = "qemu,virtual-cpufreq", .data = NULL},
+	{}
+};
+MODULE_DEVICE_TABLE(of, virt_cpufreq_match);
+
+static struct platform_driver virt_cpufreq_driver = {
+	.probe = virt_cpufreq_driver_probe,
+	.remove = virt_cpufreq_driver_remove,
+	.driver = {
+		.name = "virt-cpufreq",
+		.of_match_table = virt_cpufreq_match,
+	},
+};
+
+static int __init virt_cpufreq_init(void)
+{
+	return platform_driver_register(&virt_cpufreq_driver);
+}
+postcore_initcall(virt_cpufreq_init);
+
+static void __exit virt_cpufreq_exit(void)
+{
+	platform_driver_unregister(&virt_cpufreq_driver);
+}
+module_exit(virt_cpufreq_exit);
+
+MODULE_DESCRIPTION("Virtual cpufreq driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index b721f360d759..d5d848849408 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -49,6 +49,7 @@ enum scale_freq_source {
 	SCALE_FREQ_SOURCE_CPUFREQ = 0,
 	SCALE_FREQ_SOURCE_ARCH,
 	SCALE_FREQ_SOURCE_CPPC,
+	SCALE_FREQ_SOURCE_VIRT,
 };
 
 struct scale_freq_data {
-- 
cgit v1.2.3


From 5935b8377a0f3b2401e4e487336ed90fe6b9254d Mon Sep 17 00:00:00 2001
From: Sui Jingfeng <sui.jingfeng@linux.dev>
Date: Wed, 23 Oct 2024 11:25:44 +0800
Subject: dma-mapping: remove an outdated comment from dma-map-ops.h

The "/* CONFIG_ARCH_HAS_DMA_COHERENCE_H */" was an description about the
ARCH_HAS_DMA_COHERENCE_H configure option, but it has been removed since
the dma_default_coherent variable was lifted from the mips architecture
to the driver core. Therefore it doesn't match any compile guard now.
Just remove it.

Fixes: 6d4e9a8efe3d ("driver core: lift dma_default_coherent into common code")
Signed-off-by: Sui Jingfeng <sui.jingfeng@linux.dev>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-map-ops.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index b7773201414c..e172522cd936 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -242,7 +242,7 @@ static inline bool dev_is_dma_coherent(struct device *dev)
 {
 	return true;
 }
-#endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */
+#endif
 
 static inline void dma_reset_need_sync(struct device *dev)
 {
-- 
cgit v1.2.3


From be164349e173a8e71cd76f17c7ed720813b8d69b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 13 Oct 2024 07:19:48 +0200
Subject: dma-mapping: drop unneeded includes from dma-mapping.h

Back in the day a lot of logic was implemented inline in dma-mapping.h and
needed various includes.  Move of this has long been moved out of line,
so we can drop various includes to improve kernel rebuild times.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/powerpc/platforms/pseries/svm.c | 1 +
 include/linux/dma-mapping.h          | 4 ----
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
index 3b4045d508ec..384c9dc1899a 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -8,6 +8,7 @@
 
 #include <linux/mm.h>
 #include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
 #include <linux/cc_platform.h>
 #include <asm/machdep.h>
 #include <asm/svm.h>
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 1524da363734..b79925b1c433 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -2,15 +2,11 @@
 #ifndef _LINUX_DMA_MAPPING_H
 #define _LINUX_DMA_MAPPING_H
 
-#include <linux/cache.h>
-#include <linux/sizes.h>
-#include <linux/string.h>
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/dma-direction.h>
 #include <linux/scatterlist.h>
 #include <linux/bug.h>
-#include <linux/mem_encrypt.h>
 
 /**
  * List of possible attributes associated with a DMA mapping. The semantics
-- 
cgit v1.2.3


From 69e5a17511f654db0fe058d22719cef008c9faf1 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Fri, 18 Oct 2024 14:01:19 -0300
Subject: iommu: Remove useless flush from
 iommu_create_device_direct_mappings()

These days iommu_map() does not require external flushing, it always
internally handles any required flushes. Since
iommu_create_device_direct_mappings() only calls iommu_map(), remove
the extra call.

Since this is the last call site for iommu_flush_iotlb_all() remove it
too.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/0-v1-bb6c694e1b07+a29e1-iommu_no_flush_all_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c |  4 ----
 include/linux/iommu.h | 10 ----------
 2 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 4c29a4bc368b..e5d5b4fd322a 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1141,10 +1141,6 @@ map_end:
 		}
 
 	}
-
-	if (!list_empty(&mappings) && iommu_is_dma_domain(domain))
-		iommu_flush_iotlb_all(domain);
-
 out:
 	iommu_put_resv_regions(dev, &mappings);
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 62d1b85c80d3..03f9fa833db9 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -852,12 +852,6 @@ void iommu_set_dma_strict(void);
 extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev,
 			      unsigned long iova, int flags);
 
-static inline void iommu_flush_iotlb_all(struct iommu_domain *domain)
-{
-	if (domain->ops->flush_iotlb_all)
-		domain->ops->flush_iotlb_all(domain);
-}
-
 static inline void iommu_iotlb_sync(struct iommu_domain *domain,
 				  struct iommu_iotlb_gather *iotlb_gather)
 {
@@ -1141,10 +1135,6 @@ static inline ssize_t iommu_map_sg(struct iommu_domain *domain,
 	return -ENODEV;
 }
 
-static inline void iommu_flush_iotlb_all(struct iommu_domain *domain)
-{
-}
-
 static inline void iommu_iotlb_sync(struct iommu_domain *domain,
 				  struct iommu_iotlb_gather *iotlb_gather)
 {
-- 
cgit v1.2.3


From f6440fcc9c7b7aaad2e52830ab83fa71990f76ff Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Wed, 9 Oct 2024 12:11:47 +0800
Subject: iommu: Remove iommu_domain_alloc()

The iommu_domain_alloc() interface is no longer used in the tree anymore.
Remove it to avoid dead code.

There is increasing demand for supporting multiple IOMMU drivers, and this
is the last bus-based thing standing in the way of that.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20241009041147.28391-5-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 36 ------------------------------------
 include/linux/iommu.h |  6 ------
 2 files changed, 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index e5d5b4fd322a..fe967b5fd094 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1965,42 +1965,6 @@ __iommu_group_domain_alloc(struct iommu_group *group, unsigned int type)
 	return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type);
 }
 
-static int __iommu_domain_alloc_dev(struct device *dev, void *data)
-{
-	const struct iommu_ops **ops = data;
-
-	if (!dev_has_iommu(dev))
-		return 0;
-
-	if (WARN_ONCE(*ops && *ops != dev_iommu_ops(dev),
-		      "Multiple IOMMU drivers present for bus %s, which the public IOMMU API can't fully support yet. You will still need to disable one or more for this to work, sorry!\n",
-		      dev_bus_name(dev)))
-		return -EBUSY;
-
-	*ops = dev_iommu_ops(dev);
-	return 0;
-}
-
-/*
- * The iommu ops in bus has been retired. Do not use this interface in
- * new drivers.
- */
-struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus)
-{
-	const struct iommu_ops *ops = NULL;
-	int err = bus_for_each_dev(bus, NULL, &ops, __iommu_domain_alloc_dev);
-	struct iommu_domain *domain;
-
-	if (err || !ops)
-		return NULL;
-
-	domain = __iommu_domain_alloc(ops, NULL, IOMMU_DOMAIN_UNMANAGED);
-	if (IS_ERR(domain))
-		return NULL;
-	return domain;
-}
-EXPORT_SYMBOL_GPL(iommu_domain_alloc);
-
 /**
  * iommu_paging_domain_alloc() - Allocate a paging domain
  * @dev: device for which the domain is allocated
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 03f9fa833db9..d93c87f0c003 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -787,7 +787,6 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
 extern int bus_iommu_probe(const struct bus_type *bus);
 extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap);
 extern bool iommu_group_has_isolated_msi(struct iommu_group *group);
-extern struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus);
 struct iommu_domain *iommu_paging_domain_alloc(struct device *dev);
 extern void iommu_domain_free(struct iommu_domain *domain);
 extern int iommu_attach_device(struct iommu_domain *domain,
@@ -1079,11 +1078,6 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
 	return false;
 }
 
-static inline struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus)
-{
-	return NULL;
-}
-
 static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev)
 {
 	return ERR_PTR(-ENODEV);
-- 
cgit v1.2.3


From 20858d4ebb423826b7a978d54cde195f51d87e20 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@ziepe.ca>
Date: Mon, 28 Oct 2024 09:38:00 +0000
Subject: iommu: Introduce iommu_paging_domain_alloc_flags()

Currently drivers calls iommu_paging_domain_alloc(dev) to get an
UNMANAGED domain. This is not sufficient to support PASID with
UNMANAGED domain as some HW like AMD requires certain page table type
to support PASIDs.

Also the domain_alloc_paging op only passes device as param for domain
allocation. This is not sufficient for AMD driver to decide the right
page table.

Instead of extending ops->domain_alloc_paging() it was decided to
enhance ops->domain_alloc_user() so that caller can pass various
additional flags.

Hence add iommu_paging_domain_alloc_flags() API which takes flags as
parameter. Caller can pass additional parameter to indicate type of
domain required, etc. iommu_paging_domain_alloc_flags() internally calls
appropriate callback function to allocate a domain.

Signed-off-by: Jason Gunthorpe <jgg@ziepe.ca>
[Added description - Vasant]
Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20241028093810.5901-3-vasant.hegde@amd.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 32 +++++++++++++++++++++++++++-----
 include/linux/iommu.h | 14 +++++++++++---
 2 files changed, 38 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index cd91c0eabb34..d82aa6563d84 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1962,20 +1962,42 @@ __iommu_group_domain_alloc(struct iommu_group *group, unsigned int type)
 }
 
 /**
- * iommu_paging_domain_alloc() - Allocate a paging domain
+ * iommu_paging_domain_alloc_flags() - Allocate a paging domain
  * @dev: device for which the domain is allocated
+ * @flags: Enum of iommufd_hwpt_alloc_flags
  *
  * Allocate a paging domain which will be managed by a kernel driver. Return
- * allocated domain if successful, or a ERR pointer for failure.
+ * allocated domain if successful, or an ERR pointer for failure.
  */
-struct iommu_domain *iommu_paging_domain_alloc(struct device *dev)
+struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev,
+						     unsigned int flags)
 {
+	const struct iommu_ops *ops;
+	struct iommu_domain *domain;
+
 	if (!dev_has_iommu(dev))
 		return ERR_PTR(-ENODEV);
 
-	return __iommu_domain_alloc(dev_iommu_ops(dev), dev, IOMMU_DOMAIN_UNMANAGED);
+	ops = dev_iommu_ops(dev);
+
+	if (ops->domain_alloc_paging && !flags)
+		domain = ops->domain_alloc_paging(dev);
+	else if (ops->domain_alloc_user)
+		domain = ops->domain_alloc_user(dev, flags, NULL, NULL);
+	else if (ops->domain_alloc && !flags)
+		domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED);
+	else
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (IS_ERR(domain))
+		return domain;
+	if (!domain)
+		return ERR_PTR(-ENOMEM);
+
+	iommu_domain_init(domain, IOMMU_DOMAIN_UNMANAGED, ops);
+	return domain;
 }
-EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc);
+EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags);
 
 void iommu_domain_free(struct iommu_domain *domain)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d93c87f0c003..aa78d911fdda 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -511,8 +511,6 @@ static inline int __iommu_copy_struct_from_user_array(
  *                the caller iommu_domain_alloc() returns.
  * @domain_alloc_user: Allocate an iommu domain corresponding to the input
  *                     parameters as defined in include/uapi/linux/iommufd.h.
- *                     Unlike @domain_alloc, it is called only by IOMMUFD and
- *                     must fully initialize the new domain before return.
  *                     Upon success, if the @user_data is valid and the @parent
  *                     points to a kernel-managed domain, the new domain must be
  *                     IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be
@@ -787,7 +785,11 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
 extern int bus_iommu_probe(const struct bus_type *bus);
 extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap);
 extern bool iommu_group_has_isolated_msi(struct iommu_group *group);
-struct iommu_domain *iommu_paging_domain_alloc(struct device *dev);
+struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, unsigned int flags);
+static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev)
+{
+	return iommu_paging_domain_alloc_flags(dev, 0);
+}
 extern void iommu_domain_free(struct iommu_domain *domain);
 extern int iommu_attach_device(struct iommu_domain *domain,
 			       struct device *dev);
@@ -1078,6 +1080,12 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
 	return false;
 }
 
+struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev,
+						     unsigned int flags)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev)
 {
 	return ERR_PTR(-ENODEV);
-- 
cgit v1.2.3


From b6da940130579769a42605b2c7f529b6f14ef1f8 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 9 Oct 2024 16:29:37 +0200
Subject: mm, slab: add kerneldocs for common SLAB_ flags

We have many SLAB_ flags but many are used only internally, by kunit
tests or debugging subsystems cooperating with slab, or are set
according to slab_debug boot parameter.

Create kerneldocs for the commonly used flags that may be passed to
kmem_cache_create(). SLAB_TYPESAFE_BY_RCU already had a detailed
description, so turn it to a kerneldoc. Add some details for
SLAB_ACCOUNT, SLAB_RECLAIM_ACCOUNT and SLAB_HWCACHE_ALIGN. Reference
them from the __kmem_cache_create_args() kerneldoc.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 60 +++++++++++++++++++++++++++++++++++-----------------
 mm/slab_common.c     | 14 +++++++++++-
 2 files changed, 54 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index b35e2db7eb0e..49e9fb93e864 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -77,7 +77,17 @@ enum _slab_flag_bits {
 #define SLAB_POISON		__SLAB_FLAG_BIT(_SLAB_POISON)
 /* Indicate a kmalloc slab */
 #define SLAB_KMALLOC		__SLAB_FLAG_BIT(_SLAB_KMALLOC)
-/* Align objs on cache lines */
+/**
+ * define SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
+ *
+ * Sufficiently large objects are aligned on cache line boundary. For object
+ * size smaller than a half of cache line size, the alignment is on the half of
+ * cache line size. In general, if object size is smaller than 1/2^n of cache
+ * line size, the alignment is adjusted to 1/2^n.
+ *
+ * If explicit alignment is also requested by the respective
+ * &struct kmem_cache_args field, the greater of both is alignments is applied.
+ */
 #define SLAB_HWCACHE_ALIGN	__SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN)
 /* Use GFP_DMA memory */
 #define SLAB_CACHE_DMA		__SLAB_FLAG_BIT(_SLAB_CACHE_DMA)
@@ -87,8 +97,8 @@ enum _slab_flag_bits {
 #define SLAB_STORE_USER		__SLAB_FLAG_BIT(_SLAB_STORE_USER)
 /* Panic if kmem_cache_create() fails */
 #define SLAB_PANIC		__SLAB_FLAG_BIT(_SLAB_PANIC)
-/*
- * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
+/**
+ * define SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
  *
  * This delays freeing the SLAB page by a grace period, it does _NOT_
  * delay object freeing. This means that if you do kmem_cache_free()
@@ -99,20 +109,22 @@ enum _slab_flag_bits {
  * stays valid, the trick to using this is relying on an independent
  * object validation pass. Something like:
  *
- * begin:
- *  rcu_read_lock();
- *  obj = lockless_lookup(key);
- *  if (obj) {
- *    if (!try_get_ref(obj)) // might fail for free objects
- *      rcu_read_unlock();
- *      goto begin;
+ * ::
+ *
+ *  begin:
+ *   rcu_read_lock();
+ *   obj = lockless_lookup(key);
+ *   if (obj) {
+ *     if (!try_get_ref(obj)) // might fail for free objects
+ *       rcu_read_unlock();
+ *       goto begin;
  *
- *    if (obj->key != key) { // not the object we expected
- *      put_ref(obj);
- *      rcu_read_unlock();
- *      goto begin;
- *    }
- *  }
+ *     if (obj->key != key) { // not the object we expected
+ *       put_ref(obj);
+ *       rcu_read_unlock();
+ *       goto begin;
+ *     }
+ *   }
  *  rcu_read_unlock();
  *
  * This is useful if we need to approach a kernel structure obliquely,
@@ -137,7 +149,6 @@ enum _slab_flag_bits {
  *
  * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
  */
-/* Defer freeing slabs to RCU */
 #define SLAB_TYPESAFE_BY_RCU	__SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU)
 /* Trace allocations and frees */
 #define SLAB_TRACE		__SLAB_FLAG_BIT(_SLAB_TRACE)
@@ -170,7 +181,12 @@ enum _slab_flag_bits {
 #else
 # define SLAB_FAILSLAB		__SLAB_FLAG_UNUSED
 #endif
-/* Account to memcg */
+/**
+ * define SLAB_ACCOUNT - Account allocations to memcg.
+ *
+ * All object allocations from this cache will be memcg accounted, regardless of
+ * __GFP_ACCOUNT being or not being passed to individual allocations.
+ */
 #ifdef CONFIG_MEMCG
 # define SLAB_ACCOUNT		__SLAB_FLAG_BIT(_SLAB_ACCOUNT)
 #else
@@ -197,7 +213,13 @@ enum _slab_flag_bits {
 #endif
 
 /* The following flags affect the page allocator grouping pages by mobility */
-/* Objects are reclaimable */
+/**
+ * define SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
+ *
+ * Use this flag for caches that have an associated shrinker. As a result, slab
+ * pages are allocated with __GFP_RECLAIMABLE, which affects grouping pages by
+ * mobility, and are accounted in SReclaimable counter in /proc/meminfo
+ */
 #ifndef CONFIG_SLUB_TINY
 #define SLAB_RECLAIM_ACCOUNT	__SLAB_FLAG_BIT(_SLAB_RECLAIM_ACCOUNT)
 #else
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 9e22d1266f6a..62878edb0a81 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -254,11 +254,23 @@ out:
  * @object_size: The size of objects to be created in this cache.
  * @args: Additional arguments for the cache creation (see
  *        &struct kmem_cache_args).
- * @flags: See %SLAB_* flags for an explanation of individual @flags.
+ * @flags: See the desriptions of individual flags. The common ones are listed
+ *         in the description below.
  *
  * Not to be called directly, use the kmem_cache_create() wrapper with the same
  * parameters.
  *
+ * Commonly used @flags:
+ *
+ * &SLAB_ACCOUNT - Account allocations to memcg.
+ *
+ * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
+ *
+ * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
+ *
+ * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed
+ * by a grace period - see the full description before using.
+ *
  * Context: Cannot be called within a interrupt, but can be interrupted.
  *
  * Return: a pointer to the cache on success, NULL on failure.
-- 
cgit v1.2.3


From 68f99be287a59d50a9ad231d523f7e578f8bd28a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Oct 2024 10:42:00 +0200
Subject: signal: Confine POSIX_TIMERS properly

Move the itimer rearming out of the signal code and consolidate all posix
timer related functions in the signal code under one ifdef.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241001083835.314100569@linutronix.de
---
 include/linux/posix-timers.h |   5 +-
 kernel/signal.c              | 125 +++++++++++++++----------------------------
 kernel/time/itimer.c         |  22 +++++++-
 kernel/time/posix-timers.c   |  15 +++++-
 4 files changed, 81 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 453691710839..670bf03a56ef 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -100,6 +100,8 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 {
 	pct->bases[CPUCLOCK_SCHED].nextevt = runtime;
 }
+void posixtimer_rearm_itimer(struct task_struct *p);
+void posixtimer_rearm(struct kernel_siginfo *info);
 
 /* Init task static initializer */
 #define INIT_CPU_TIMERBASE(b) {						\
@@ -122,6 +124,8 @@ struct cpu_timer { };
 static inline void posix_cputimers_init(struct posix_cputimers *pct) { }
 static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
 					      u64 cpu_limit) { }
+static inline void posixtimer_rearm_itimer(struct task_struct *p) { }
+static inline void posixtimer_rearm(struct kernel_siginfo *info) { }
 #endif
 
 #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
@@ -196,5 +200,4 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
 
 int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
 
-void posixtimer_rearm(struct kernel_siginfo *info);
 #endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 4344860ffcac..b65cc1853a09 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -478,42 +478,6 @@ void flush_signals(struct task_struct *t)
 }
 EXPORT_SYMBOL(flush_signals);
 
-#ifdef CONFIG_POSIX_TIMERS
-static void __flush_itimer_signals(struct sigpending *pending)
-{
-	sigset_t signal, retain;
-	struct sigqueue *q, *n;
-
-	signal = pending->signal;
-	sigemptyset(&retain);
-
-	list_for_each_entry_safe(q, n, &pending->list, list) {
-		int sig = q->info.si_signo;
-
-		if (likely(q->info.si_code != SI_TIMER)) {
-			sigaddset(&retain, sig);
-		} else {
-			sigdelset(&signal, sig);
-			list_del_init(&q->list);
-			__sigqueue_free(q);
-		}
-	}
-
-	sigorsets(&pending->signal, &signal, &retain);
-}
-
-void flush_itimer_signals(void)
-{
-	struct task_struct *tsk = current;
-	unsigned long flags;
-
-	spin_lock_irqsave(&tsk->sighand->siglock, flags);
-	__flush_itimer_signals(&tsk->pending);
-	__flush_itimer_signals(&tsk->signal->shared_pending);
-	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
-}
-#endif
-
 void ignore_signals(struct task_struct *t)
 {
 	int i;
@@ -636,31 +600,9 @@ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
 		*type = PIDTYPE_TGID;
 		signr = __dequeue_signal(&tsk->signal->shared_pending,
 					 mask, info, &resched_timer);
-#ifdef CONFIG_POSIX_TIMERS
-		/*
-		 * itimer signal ?
-		 *
-		 * itimers are process shared and we restart periodic
-		 * itimers in the signal delivery path to prevent DoS
-		 * attacks in the high resolution timer case. This is
-		 * compliant with the old way of self-restarting
-		 * itimers, as the SIGALRM is a legacy signal and only
-		 * queued once. Changing the restart behaviour to
-		 * restart the timer in the signal dequeue path is
-		 * reducing the timer noise on heavy loaded !highres
-		 * systems too.
-		 */
-		if (unlikely(signr == SIGALRM)) {
-			struct hrtimer *tmr = &tsk->signal->real_timer;
-
-			if (!hrtimer_is_queued(tmr) &&
-			    tsk->signal->it_real_incr != 0) {
-				hrtimer_forward(tmr, tmr->base->get_time(),
-						tsk->signal->it_real_incr);
-				hrtimer_restart(tmr);
-			}
-		}
-#endif
+
+		if (unlikely(signr == SIGALRM))
+			posixtimer_rearm_itimer(tsk);
 	}
 
 	recalc_sigpending();
@@ -682,22 +624,12 @@ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
 		 */
 		current->jobctl |= JOBCTL_STOP_DEQUEUED;
 	}
-#ifdef CONFIG_POSIX_TIMERS
-	if (resched_timer) {
-		/*
-		 * Release the siglock to ensure proper locking order
-		 * of timer locks outside of siglocks.  Note, we leave
-		 * irqs disabled here, since the posix-timers code is
-		 * about to disable them again anyway.
-		 */
-		spin_unlock(&tsk->sighand->siglock);
-		posixtimer_rearm(info);
-		spin_lock(&tsk->sighand->siglock);
 
-		/* Don't expose the si_sys_private value to userspace */
-		info->si_sys_private = 0;
+	if (IS_ENABLED(CONFIG_POSIX_TIMERS)) {
+		if (unlikely(resched_timer))
+			posixtimer_rearm(info);
 	}
-#endif
+
 	return signr;
 }
 EXPORT_SYMBOL_GPL(dequeue_signal);
@@ -1922,15 +1854,43 @@ int kill_pid(struct pid *pid, int sig, int priv)
 }
 EXPORT_SYMBOL(kill_pid);
 
+#ifdef CONFIG_POSIX_TIMERS
 /*
- * These functions support sending signals using preallocated sigqueue
- * structures.  This is needed "because realtime applications cannot
- * afford to lose notifications of asynchronous events, like timer
- * expirations or I/O completions".  In the case of POSIX Timers
- * we allocate the sigqueue structure from the timer_create.  If this
- * allocation fails we are able to report the failure to the application
- * with an EAGAIN error.
+ * These functions handle POSIX timer signals. POSIX timers use
+ * preallocated sigqueue structs for sending signals.
  */
+static void __flush_itimer_signals(struct sigpending *pending)
+{
+	sigset_t signal, retain;
+	struct sigqueue *q, *n;
+
+	signal = pending->signal;
+	sigemptyset(&retain);
+
+	list_for_each_entry_safe(q, n, &pending->list, list) {
+		int sig = q->info.si_signo;
+
+		if (likely(q->info.si_code != SI_TIMER)) {
+			sigaddset(&retain, sig);
+		} else {
+			sigdelset(&signal, sig);
+			list_del_init(&q->list);
+			__sigqueue_free(q);
+		}
+	}
+
+	sigorsets(&pending->signal, &signal, &retain);
+}
+
+void flush_itimer_signals(void)
+{
+	struct task_struct *tsk = current;
+
+	guard(spinlock_irqsave)(&tsk->sighand->siglock);
+	__flush_itimer_signals(&tsk->pending);
+	__flush_itimer_signals(&tsk->signal->shared_pending);
+}
+
 struct sigqueue *sigqueue_alloc(void)
 {
 	return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC);
@@ -2027,6 +1987,7 @@ ret:
 	rcu_read_unlock();
 	return ret;
 }
+#endif /* CONFIG_POSIX_TIMERS */
 
 void do_notify_pidfd(struct task_struct *task)
 {
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 00629e658ca1..876d389b2e21 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -151,7 +151,27 @@ COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
 #endif
 
 /*
- * The timer is automagically restarted, when interval != 0
+ * Invoked from dequeue_signal() when SIG_ALRM is delivered.
+ *
+ * Restart the ITIMER_REAL timer if it is armed as periodic timer.  Doing
+ * this in the signal delivery path instead of self rearming prevents a DoS
+ * with small increments in the high reolution timer case and reduces timer
+ * noise in general.
+ */
+void posixtimer_rearm_itimer(struct task_struct *tsk)
+{
+	struct hrtimer *tmr = &tsk->signal->real_timer;
+
+	if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) {
+		hrtimer_forward(tmr, tmr->base->get_time(),
+				tsk->signal->it_real_incr);
+		hrtimer_restart(tmr);
+	}
+}
+
+/*
+ * Interval timers are restarted in the signal delivery path.  See
+ * posixtimer_rearm_itimer().
  */
 enum hrtimer_restart it_real_fn(struct hrtimer *timer)
 {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index fc40dacabe78..d461a32b7260 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -251,7 +251,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
 
 /*
  * This function is called from the signal delivery code if
- * info->si_sys_private is not zero, which indicates that the timer has to
+ * info::si_sys_private is not zero, which indicates that the timer has to
  * be rearmed. Restart the timer and update info::si_overrun.
  */
 void posixtimer_rearm(struct kernel_siginfo *info)
@@ -259,9 +259,15 @@ void posixtimer_rearm(struct kernel_siginfo *info)
 	struct k_itimer *timr;
 	unsigned long flags;
 
+	/*
+	 * Release siglock to ensure proper locking order versus
+	 * timr::it_lock. Keep interrupts disabled.
+	 */
+	spin_unlock(&current->sighand->siglock);
+
 	timr = lock_timer(info->si_tid, &flags);
 	if (!timr)
-		return;
+		goto out;
 
 	if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) {
 		timr->kclock->timer_rearm(timr);
@@ -275,6 +281,11 @@ void posixtimer_rearm(struct kernel_siginfo *info)
 	}
 
 	unlock_timer(timr, flags);
+out:
+	spin_lock(&current->sighand->siglock);
+
+	/* Don't expose the si_sys_private value to userspace */
+	info->si_sys_private = 0;
 }
 
 int posix_timer_queue_signal(struct k_itimer *timr)
-- 
cgit v1.2.3


From 4febce44cfebcb490b196d5d10ae9f403ca4c956 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Oct 2024 10:42:03 +0200
Subject: posix-timers: Cure si_sys_private race

The si_sys_private member of the siginfo which is embedded in the
preallocated sigqueue is used by the posix timer code to decide whether a
timer must be reprogrammed on signal delivery.

The handling of this is racy as a long standing comment in that code
documents. It is modified with the timer lock held, but without sighand
lock being held. The actual signal delivery code checks for it under
sighand lock without holding the timer lock.

Hand the new value to send_sigqueue() as argument and store it with sighand
lock held. This is an intermediate change to address this issue.

The arguments to this function will be cleanup in subsequent changes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241001083835.434338954@linutronix.de
---
 include/linux/sched/signal.h |  2 +-
 kernel/signal.c              | 10 +++++++++-
 kernel/time/posix-timers.c   | 15 +--------------
 3 files changed, 11 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index c8ed09ac29ac..bd9f569231d9 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -340,7 +340,7 @@ extern int send_sig(int, struct task_struct *, int);
 extern int zap_other_threads(struct task_struct *p);
 extern struct sigqueue *sigqueue_alloc(void);
 extern void sigqueue_free(struct sigqueue *);
-extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type);
+extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type, int si_private);
 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
 
 static inline void clear_notify_signal(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index f420c430b24a..1563c83ff224 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1919,7 +1919,7 @@ void sigqueue_free(struct sigqueue *q)
 		__sigqueue_free(q);
 }
 
-int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
+int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type, int si_private)
 {
 	int sig = q->info.si_signo;
 	struct sigpending *pending;
@@ -1954,6 +1954,14 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
 	if (!likely(lock_task_sighand(t, &flags)))
 		goto ret;
 
+	/*
+	 * Update @q::info::si_sys_private for posix timer signals with
+	 * sighand locked to prevent a race against dequeue_signal() which
+	 * decides based on si_sys_private whether to invoke
+	 * posixtimer_rearm() or not.
+	 */
+	q->info.si_sys_private = si_private;
+
 	ret = 1; /* the signal is ignored */
 	result = TRACE_SIGNAL_IGNORED;
 	if (!prepare_signal(sig, t, false))
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index d461a32b7260..05af074285fa 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -299,21 +299,8 @@ int posix_timer_queue_signal(struct k_itimer *timr)
 	if (timr->it_interval)
 		si_private = ++timr->it_requeue_pending;
 
-	/*
-	 * FIXME: if ->sigq is queued we can race with
-	 * dequeue_signal()->posixtimer_rearm().
-	 *
-	 * If dequeue_signal() sees the "right" value of
-	 * si_sys_private it calls posixtimer_rearm().
-	 * We re-queue ->sigq and drop ->it_lock().
-	 * posixtimer_rearm() locks the timer
-	 * and re-schedules it while ->sigq is pending.
-	 * Not really bad, but not that we want.
-	 */
-	timr->sigq->info.si_sys_private = si_private;
-
 	type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID;
-	ret = send_sigqueue(timr->sigq, timr->it_pid, type);
+	ret = send_sigqueue(timr->sigq, timr->it_pid, type, si_private);
 	/* If we failed to send the signal the timer stops. */
 	return ret > 0;
 }
-- 
cgit v1.2.3


From c775ea28d4e23f5e58b6953645ef90c1b27a8e83 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Oct 2024 10:42:04 +0200
Subject: signal: Allow POSIX timer signals to be dropped

In case that a timer was reprogrammed or deleted an already pending signal
is obsolete. Right now such signals are kept around and eventually
delivered. While POSIX is blury about this:

 - "The effect of disarming or resetting a timer with pending expiration
    notifications is unspecified."

 - "The disposition of pending signals for the deleted timer is
    unspecified."

it is reasonable in both cases to expect that pending signals are discarded
as they have no meaning anymore.

Prepare the signal code to allow dropping posix timer signals.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241001083835.494416923@linutronix.de
---
 include/linux/posix-timers.h | 5 +++--
 kernel/signal.c              | 7 ++++---
 kernel/time/posix-timers.c   | 3 ++-
 3 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 670bf03a56ef..4ab49e5c42af 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -100,8 +100,9 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 {
 	pct->bases[CPUCLOCK_SCHED].nextevt = runtime;
 }
+
 void posixtimer_rearm_itimer(struct task_struct *p);
-void posixtimer_rearm(struct kernel_siginfo *info);
+bool posixtimer_deliver_signal(struct kernel_siginfo *info);
 
 /* Init task static initializer */
 #define INIT_CPU_TIMERBASE(b) {						\
@@ -125,7 +126,7 @@ static inline void posix_cputimers_init(struct posix_cputimers *pct) { }
 static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
 					      u64 cpu_limit) { }
 static inline void posixtimer_rearm_itimer(struct task_struct *p) { }
-static inline void posixtimer_rearm(struct kernel_siginfo *info) { }
+static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info) { return false; }
 #endif
 
 #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
diff --git a/kernel/signal.c b/kernel/signal.c
index 1563c83ff224..df34aa47181e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -594,6 +594,7 @@ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
 
 	lockdep_assert_held(&tsk->sighand->siglock);
 
+again:
 	*type = PIDTYPE_PID;
 	signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
 	if (!signr) {
@@ -625,9 +626,9 @@ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
 		current->jobctl |= JOBCTL_STOP_DEQUEUED;
 	}
 
-	if (IS_ENABLED(CONFIG_POSIX_TIMERS)) {
-		if (unlikely(resched_timer))
-			posixtimer_rearm(info);
+	if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(resched_timer)) {
+		if (!posixtimer_deliver_signal(info))
+			goto again;
 	}
 
 	return signr;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 05af074285fa..dd0b1dff54d9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -254,7 +254,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
  * info::si_sys_private is not zero, which indicates that the timer has to
  * be rearmed. Restart the timer and update info::si_overrun.
  */
-void posixtimer_rearm(struct kernel_siginfo *info)
+bool posixtimer_deliver_signal(struct kernel_siginfo *info)
 {
 	struct k_itimer *timr;
 	unsigned long flags;
@@ -286,6 +286,7 @@ out:
 
 	/* Don't expose the si_sys_private value to userspace */
 	info->si_sys_private = 0;
+	return true;
 }
 
 int posix_timer_queue_signal(struct k_itimer *timr)
-- 
cgit v1.2.3


From cd1e93aedab7f749760a33e9e094381973b1120e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Oct 2024 10:42:07 +0200
Subject: posix-timers: Rename k_itimer:: It_requeue_pending

Prepare for using this struct member to do a proper reprogramming and
deletion accounting so that stale signals can be dropped.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241001083835.611997737@linutronix.de
---
 include/linux/posix-timers.h   |  5 ++---
 kernel/time/alarmtimer.c       |  2 +-
 kernel/time/posix-cpu-timers.c |  4 ++--
 kernel/time/posix-timers.c     | 12 ++++++------
 4 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 4ab49e5c42af..253d106fac2c 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -150,8 +150,7 @@ static inline void posix_cputimers_init_work(void) { }
  * @it_active:		Marker that timer is active
  * @it_overrun:		The overrun counter for pending signals
  * @it_overrun_last:	The overrun at the time of the last delivered signal
- * @it_requeue_pending:	Indicator that timer waits for being requeued on
- *			signal delivery
+ * @it_signal_seq:	Sequence count to control signal delivery
  * @it_sigev_notify:	The notify word of sigevent struct for signal delivery
  * @it_interval:	The interval for periodic timers
  * @it_signal:		Pointer to the creators signal struct
@@ -172,7 +171,7 @@ struct k_itimer {
 	int			it_active;
 	s64			it_overrun;
 	s64			it_overrun_last;
-	int			it_requeue_pending;
+	unsigned int		it_signal_seq;
 	int			it_sigev_notify;
 	ktime_t			it_interval;
 	struct signal_struct	*it_signal;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8bf888641694..75f844385070 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -584,7 +584,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
 		 * small intervals cannot starve the system.
 		 */
 		ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true);
-		++ptr->it_requeue_pending;
+		++ptr->it_signal_seq;
 		ptr->it_active = 1;
 		result = ALARMTIMER_RESTART;
 	}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 6bcee4704059..993243b5be98 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -608,7 +608,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
 		 * ticking in case the signal is deliverable next time.
 		 */
 		posix_cpu_timer_rearm(timer);
-		++timer->it_requeue_pending;
+		++timer->it_signal_seq;
 	}
 }
 
@@ -745,7 +745,7 @@ static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *i
 	 *  - Timers which expired, but the signal has not yet been
 	 *    delivered
 	 */
-	if (iv && ((timer->it_requeue_pending & REQUEUE_PENDING) || sigev_none))
+	if (iv && ((timer->it_signal_seq & REQUEUE_PENDING) || sigev_none))
 		expires = bump_cpu_timer(timer, now);
 	else
 		expires = cpu_timer_getexpires(&timer->it.cpu);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 22e1d6bf349b..fd321fcc3f6c 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -269,13 +269,13 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info)
 	if (!timr)
 		goto out;
 
-	if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) {
+	if (timr->it_interval && timr->it_signal_seq == info->si_sys_private) {
 		timr->kclock->timer_rearm(timr);
 
 		timr->it_active = 1;
 		timr->it_overrun_last = timr->it_overrun;
 		timr->it_overrun = -1LL;
-		++timr->it_requeue_pending;
+		++timr->it_signal_seq;
 
 		info->si_overrun = timer_overrun_to_int(timr, info->si_overrun);
 	}
@@ -299,7 +299,7 @@ int posix_timer_queue_signal(struct k_itimer *timr)
 
 	timr->it_active = 0;
 	if (timr->it_interval)
-		si_private = ++timr->it_requeue_pending;
+		si_private = ++timr->it_signal_seq;
 
 	type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID;
 	ret = send_sigqueue(timr->sigq, timr->it_pid, type, si_private);
@@ -366,7 +366,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 
 			timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval);
 			ret = HRTIMER_RESTART;
-			++timr->it_requeue_pending;
+			++timr->it_signal_seq;
 			timr->it_active = 1;
 		}
 	}
@@ -660,7 +660,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 	 * is a SIGEV_NONE timer move the expiry time forward by intervals,
 	 * so expiry is > now.
 	 */
-	if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none))
+	if (iv && (timr->it_signal_seq & REQUEUE_PENDING || sig_none))
 		timr->it_overrun += kc->timer_forward(timr, now);
 
 	remaining = kc->timer_remaining(timr, now);
@@ -861,7 +861,7 @@ void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_set
 		timer->it_interval = 0;
 
 	/* Prevent reloading in case there is a signal pending */
-	timer->it_requeue_pending = (timer->it_requeue_pending + 2) & ~REQUEUE_PENDING;
+	timer->it_signal_seq = (timer->it_signal_seq + 2) & ~REQUEUE_PENDING;
 	/* Reset overrun accounting */
 	timer->it_overrun_last = 0;
 	timer->it_overrun = -1LL;
-- 
cgit v1.2.3


From 1550dde8a537b35dbf066c7f9cfe5f9b360bce0d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Oct 2024 10:42:09 +0200
Subject: posix-timers: Add proper state tracking

Right now the state tracking is done by two struct members:

 - it_active:
     A boolean which tracks armed/disarmed state

 - it_signal_seq:
     A sequence counter which is used to invalidate settings
     and prevent rearming

Replace it_active with it_status and keep properly track about the states
in one place.

This allows to reuse it_signal_seq to track reprogramming, disarm and
delete operations in order to drop signals which are related to the state
previous of those operations.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241001083835.670337048@linutronix.de
---
 include/linux/posix-timers.h   |  4 ++--
 kernel/time/alarmtimer.c       |  2 +-
 kernel/time/posix-cpu-timers.c | 15 ++++++++-------
 kernel/time/posix-timers.c     | 22 +++++++++++++---------
 kernel/time/posix-timers.h     |  6 ++++++
 5 files changed, 30 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 253d106fac2c..02afbb4da7f7 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -147,7 +147,7 @@ static inline void posix_cputimers_init_work(void) { }
  * @kclock:		Pointer to the k_clock struct handling this timer
  * @it_clock:		The posix timer clock id
  * @it_id:		The posix timer id for identifying the timer
- * @it_active:		Marker that timer is active
+ * @it_status:		The status of the timer
  * @it_overrun:		The overrun counter for pending signals
  * @it_overrun_last:	The overrun at the time of the last delivered signal
  * @it_signal_seq:	Sequence count to control signal delivery
@@ -168,7 +168,7 @@ struct k_itimer {
 	const struct k_clock	*kclock;
 	clockid_t		it_clock;
 	timer_t			it_id;
-	int			it_active;
+	int			it_status;
 	s64			it_overrun;
 	s64			it_overrun_last;
 	unsigned int		it_signal_seq;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 75f844385070..452d8aa2f6e0 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -585,7 +585,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
 		 */
 		ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true);
 		++ptr->it_signal_seq;
-		ptr->it_active = 1;
+		ptr->it_status = POSIX_TIMER_ARMED;
 		result = ALARMTIMER_RESTART;
 	}
 	spin_unlock_irqrestore(&ptr->it_lock, flags);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 993243b5be98..12f828d704b1 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -453,7 +453,6 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p)
 	struct cpu_timer *ctmr = &timer->it.cpu;
 	struct posix_cputimer_base *base;
 
-	timer->it_active = 0;
 	if (!cpu_timer_dequeue(ctmr))
 		return;
 
@@ -494,11 +493,12 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
 		 */
 		WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
 	} else {
-		if (timer->it.cpu.firing)
+		if (timer->it.cpu.firing) {
 			ret = TIMER_RETRY;
-		else
+		} else {
 			disarm_timer(timer, p);
-
+			timer->it_status = POSIX_TIMER_DISARMED;
+		}
 		unlock_task_sighand(p, &flags);
 	}
 
@@ -560,7 +560,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p)
 	struct cpu_timer *ctmr = &timer->it.cpu;
 	u64 newexp = cpu_timer_getexpires(ctmr);
 
-	timer->it_active = 1;
+	timer->it_status = POSIX_TIMER_ARMED;
 	if (!cpu_timer_enqueue(&base->tqhead, ctmr))
 		return;
 
@@ -586,7 +586,8 @@ static void cpu_timer_fire(struct k_itimer *timer)
 {
 	struct cpu_timer *ctmr = &timer->it.cpu;
 
-	timer->it_active = 0;
+	timer->it_status = POSIX_TIMER_DISARMED;
+
 	if (unlikely(timer->sigq == NULL)) {
 		/*
 		 * This a special case for clock_nanosleep,
@@ -671,7 +672,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 		ret = TIMER_RETRY;
 	} else {
 		cpu_timer_dequeue(ctmr);
-		timer->it_active = 0;
+		timer->it_status = POSIX_TIMER_DISARMED;
 	}
 
 	/*
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index fd321fcc3f6c..dd72b8e72697 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -272,7 +272,7 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info)
 	if (timr->it_interval && timr->it_signal_seq == info->si_sys_private) {
 		timr->kclock->timer_rearm(timr);
 
-		timr->it_active = 1;
+		timr->it_status = POSIX_TIMER_ARMED;
 		timr->it_overrun_last = timr->it_overrun;
 		timr->it_overrun = -1LL;
 		++timr->it_signal_seq;
@@ -292,14 +292,17 @@ out:
 
 int posix_timer_queue_signal(struct k_itimer *timr)
 {
+	enum posix_timer_state state = POSIX_TIMER_DISARMED;
 	int ret, si_private = 0;
 	enum pid_type type;
 
 	lockdep_assert_held(&timr->it_lock);
 
-	timr->it_active = 0;
-	if (timr->it_interval)
+	if (timr->it_interval) {
+		state = POSIX_TIMER_REQUEUE_PENDING;
 		si_private = ++timr->it_signal_seq;
+	}
+	timr->it_status = state;
 
 	type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID;
 	ret = send_sigqueue(timr->sigq, timr->it_pid, type, si_private);
@@ -367,7 +370,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 			timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval);
 			ret = HRTIMER_RESTART;
 			++timr->it_signal_seq;
-			timr->it_active = 1;
+			timr->it_status = POSIX_TIMER_ARMED;
 		}
 	}
 
@@ -640,10 +643,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 	/* interval timer ? */
 	if (iv) {
 		cur_setting->it_interval = ktime_to_timespec64(iv);
-	} else if (!timr->it_active) {
+	} else if (timr->it_status == POSIX_TIMER_DISARMED) {
 		/*
 		 * SIGEV_NONE oneshot timers are never queued and therefore
-		 * timr->it_active is always false. The check below
+		 * timr->it_status is always DISARMED. The check below
 		 * vs. remaining time will handle this case.
 		 *
 		 * For all other timers there is nothing to update here, so
@@ -888,7 +891,7 @@ int common_timer_set(struct k_itimer *timr, int flags,
 	if (kc->timer_try_to_cancel(timr) < 0)
 		return TIMER_RETRY;
 
-	timr->it_active = 0;
+	timr->it_status = POSIX_TIMER_DISARMED;
 	posix_timer_set_common(timr, new_setting);
 
 	/* Keep timer disarmed when it_value is zero */
@@ -901,7 +904,8 @@ int common_timer_set(struct k_itimer *timr, int flags,
 	sigev_none = timr->it_sigev_notify == SIGEV_NONE;
 
 	kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
-	timr->it_active = !sigev_none;
+	if (!sigev_none)
+		timr->it_status = POSIX_TIMER_ARMED;
 	return 0;
 }
 
@@ -1000,7 +1004,7 @@ int common_timer_del(struct k_itimer *timer)
 	timer->it_interval = 0;
 	if (kc->timer_try_to_cancel(timer) < 0)
 		return TIMER_RETRY;
-	timer->it_active = 0;
+	timer->it_status = POSIX_TIMER_DISARMED;
 	return 0;
 }
 
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 4784ea65f685..4d09677e584e 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -1,6 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #define TIMER_RETRY 1
 
+enum posix_timer_state {
+	POSIX_TIMER_DISARMED,
+	POSIX_TIMER_ARMED,
+	POSIX_TIMER_REQUEUE_PENDING,
+};
+
 struct k_clock {
 	int	(*clock_getres)(const clockid_t which_clock,
 				struct timespec64 *tp);
-- 
cgit v1.2.3


From 9cb7e40d388d6c0e4677809c6b2950bc67fd8830 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 21 Oct 2024 11:32:28 -0700
Subject: rtnetlink: Make per-netns RTNL dereference helpers to macro.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When CONFIG_DEBUG_NET_SMALL_RTNL is off, rtnl_net_dereference() is the
static inline wrapper of rtnl_dereference() returning a plain (void *)
pointer to make sure net is always evaluated as requested in [0].

But, it makes sparse complain [1] when the pointer has __rcu annotation:

  net/ipv4/devinet.c:674:47: sparse: warning: incorrect type in argument 2 (different address spaces)
  net/ipv4/devinet.c:674:47: sparse:    expected void *p
  net/ipv4/devinet.c:674:47: sparse:    got struct in_ifaddr [noderef] __rcu *

Also, if we evaluate net as (void *) in a macro, then the compiler
in turn fails to build due to -Werror=unused-value.

  #define rtnl_net_dereference(net, p)                  \
        ({                                              \
                (void *)net;                            \
                rtnl_dereference(p);                    \
        })

  net/ipv4/devinet.c: In function ‘inet_rtm_deladdr’:
  ./include/linux/rtnetlink.h:154:17: error: statement with no effect [-Werror=unused-value]
    154 |                 (void *)net;                            \
  net/ipv4/devinet.c:674:21: note: in expansion of macro ‘rtnl_net_dereference’
    674 |              (ifa = rtnl_net_dereference(net, *ifap)) != NULL;
        |                     ^~~~~~~~~~~~~~~~~~~~

Let's go back to the original simplest macro.

Note that checkpatch complains about this approach, but it's one-shot and
less noisy than the other two.

  WARNING: Argument 'net' is not used in function-like macro
  #76: FILE: include/linux/rtnetlink.h:142:
  +#define rtnl_net_dereference(net, p)			\
  +	rtnl_dereference(p)

Fixes: 844e5e7e656d ("rtnetlink: Add assertion helpers for per-netns RTNL.")
Link: https://lore.kernel.org/netdev/20241004132145.7fd208e9@kernel.org/ [0]
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202410200325.SaEJmyZS-lkp@intel.com/ [1]
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/rtnetlink.h | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 8468a4ce8510..0e62918de63b 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -137,21 +137,12 @@ static inline void ASSERT_RTNL_NET(struct net *net)
 	ASSERT_RTNL();
 }
 
-static inline void *rcu_dereference_rtnl_net(struct net *net, void *p)
-{
-	return rcu_dereference_rtnl(p);
-}
-
-static inline void *rtnl_net_dereference(struct net *net, void *p)
-{
-	return rtnl_dereference(p);
-}
-
-static inline void *rcu_replace_pointer_rtnl_net(struct net *net,
-						 void *rp, void *p)
-{
-	return rcu_replace_pointer_rtnl(rp, p);
-}
+#define rcu_dereference_rtnl_net(net, p)		\
+	rcu_dereference_rtnl(p)
+#define rtnl_net_dereference(net, p)			\
+	rtnl_dereference(p)
+#define rcu_replace_pointer_rtnl_net(net, rp, p)	\
+	rcu_replace_pointer_rtnl(rp, p)
 #endif
 
 static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
-- 
cgit v1.2.3


From d4b483208b2606add41a22bdd3c8cd6d36009319 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 21 Oct 2024 11:32:33 -0700
Subject: ipv4: Use per-netns RTNL helpers in inet_rtm_newaddr().

inet_rtm_to_ifa() and find_matching_ifa() are called
under rtnl_net_lock().

__in_dev_get_rtnl() and in_dev_for_each_ifa_rtnl() there
can use per-netns RTNL helpers.

Let's define and use __in_dev_get_rtnl_net() and
in_dev_for_each_ifa_rtnl_net().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/inetdevice.h | 9 +++++++++
 net/ipv4/devinet.c         | 8 ++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index d9c690c8c80b..5730ba6b1cfa 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -226,6 +226,10 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr)
 	for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa;	\
 	     ifa = rtnl_dereference(ifa->ifa_next))
 
+#define in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev)			\
+	for (ifa = rtnl_net_dereference(net, (in_dev)->ifa_list); ifa;	\
+	     ifa = rtnl_net_dereference(net, ifa->ifa_next))
+
 #define in_dev_for_each_ifa_rcu(ifa, in_dev)			\
 	for (ifa = rcu_dereference((in_dev)->ifa_list); ifa;	\
 	     ifa = rcu_dereference(ifa->ifa_next))
@@ -252,6 +256,11 @@ static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev)
 	return rtnl_dereference(dev->ip_ptr);
 }
 
+static inline struct in_device *__in_dev_get_rtnl_net(const struct net_device *dev)
+{
+	return rtnl_net_dereference(dev_net(dev), dev->ip_ptr);
+}
+
 /* called with rcu_read_lock or rtnl held */
 static inline bool ip_ignore_linkdown(const struct net_device *dev)
 {
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7f24bc38981b..e14e35c22054 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -901,7 +901,7 @@ static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh,
 		goto errout;
 	}
 
-	in_dev = __in_dev_get_rtnl(dev);
+	in_dev = __in_dev_get_rtnl_net(dev);
 	err = -ENOBUFS;
 	if (!in_dev)
 		goto errout;
@@ -948,12 +948,12 @@ errout:
 	return ERR_PTR(err);
 }
 
-static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
+static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa)
 {
 	struct in_device *in_dev = ifa->ifa_dev;
 	struct in_ifaddr *ifa1;
 
-	in_dev_for_each_ifa_rtnl(ifa1, in_dev) {
+	in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) {
 		if (ifa1->ifa_mask == ifa->ifa_mask &&
 		    inet_ifa_match(ifa1->ifa_address, ifa) &&
 		    ifa1->ifa_local == ifa->ifa_local)
@@ -989,7 +989,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto unlock;
 	}
 
-	ifa_existing = find_matching_ifa(ifa);
+	ifa_existing = find_matching_ifa(net, ifa);
 	if (!ifa_existing) {
 		/* It would be best to check for !NLM_F_CREATE here but
 		 * userspace already relies on not having to provide this.
-- 
cgit v1.2.3


From d1c81818aa227b37d65b40f9883109c5256b9bfb Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 21 Oct 2024 11:32:36 -0700
Subject: rtnetlink: Define rtnl_net_trylock().

We will need the per-netns version of rtnl_trylock().

rtnl_net_trylock() calls __rtnl_net_lock() only when rtnl_trylock()
successfully holds RTNL.

When RTNL is removed, we will use mutex_trylock() for per-netns RTNL.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/rtnetlink.h |  6 ++++++
 net/core/rtnetlink.c      | 11 +++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 0e62918de63b..14b88f551920 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -101,6 +101,7 @@ void __rtnl_net_lock(struct net *net);
 void __rtnl_net_unlock(struct net *net);
 void rtnl_net_lock(struct net *net);
 void rtnl_net_unlock(struct net *net);
+int rtnl_net_trylock(struct net *net);
 int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b);
 
 bool rtnl_net_is_locked(struct net *net);
@@ -132,6 +133,11 @@ static inline void rtnl_net_unlock(struct net *net)
 	rtnl_unlock();
 }
 
+static inline int rtnl_net_trylock(struct net *net)
+{
+	return rtnl_trylock();
+}
+
 static inline void ASSERT_RTNL_NET(struct net *net)
 {
 	ASSERT_RTNL();
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 194a81e5f608..dda8230fdfd4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -210,6 +210,17 @@ void rtnl_net_unlock(struct net *net)
 }
 EXPORT_SYMBOL(rtnl_net_unlock);
 
+int rtnl_net_trylock(struct net *net)
+{
+	int ret = rtnl_trylock();
+
+	if (ret)
+		__rtnl_net_lock(net);
+
+	return ret;
+}
+EXPORT_SYMBOL(rtnl_net_trylock);
+
 static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
 {
 	if (net_eq(net_a, net_b))
-- 
cgit v1.2.3


From d14772c0d88c387f881a577aa136e1e9b1291d07 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Tue, 29 Oct 2024 11:54:25 +0100
Subject: iommu: Fix prototype of iommu_paging_domain_alloc_flags()

The iommu_paging_domain_alloc_flags() prototype for
non-iommu kernel configurations lacks the 'static inline'
prefixes.

Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Fixes: 20858d4ebb42 ("iommu: Introduce iommu_paging_domain_alloc_flags()")
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index aa78d911fdda..522efdc7d815 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1080,7 +1080,7 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
 	return false;
 }
 
-struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev,
+static inline struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev,
 						     unsigned int flags)
 {
 	return ERR_PTR(-ENODEV);
-- 
cgit v1.2.3


From a33bf8d8ce7e06bf0f033865b0cea5887cd2ac8c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Tue, 29 Oct 2024 12:11:49 +0100
Subject: iommu: Restore iommu_flush_iotlb_all()

This patch restores the iommu_flush_iotlb_all() function.
Commit

	69e5a17511f6 ("iommu: Remove useless flush from iommu_create_device_direct_mappings()")

claims it removed the last call-site, except it did not. There is still
at least one caller in

	drivers/gpu/drm/msm/msm_iommu.c

so keep the function around until all call-sites are updated.

Cc: Jason Gunthorpe <jgg@ziepe.ca>
Fixes: 69e5a17511f6 ("iommu: Remove useless flush from iommu_create_device_direct_mappings()")
Acked-by: Will Deacon <will@kernel.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 522efdc7d815..8cce372a33f1 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -853,6 +853,12 @@ void iommu_set_dma_strict(void);
 extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev,
 			      unsigned long iova, int flags);
 
+static inline void iommu_flush_iotlb_all(struct iommu_domain *domain)
+{
+	if (domain->ops->flush_iotlb_all)
+		domain->ops->flush_iotlb_all(domain);
+}
+
 static inline void iommu_iotlb_sync(struct iommu_domain *domain,
 				  struct iommu_iotlb_gather *iotlb_gather)
 {
@@ -1137,6 +1143,10 @@ static inline ssize_t iommu_map_sg(struct iommu_domain *domain,
 	return -ENODEV;
 }
 
+static inline void iommu_flush_iotlb_all(struct iommu_domain *domain)
+{
+}
+
 static inline void iommu_iotlb_sync(struct iommu_domain *domain,
 				  struct iommu_iotlb_gather *iotlb_gather)
 {
-- 
cgit v1.2.3


From c382429b587ac49bd179d768f13e7fa5e7ed1787 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sat, 26 Oct 2024 21:38:02 +0200
Subject: platform/x86: wmi: Replace dev_to_wdev() with to_wmi_device()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace dev_to_wdev() with to_wmi_device() to stop duplicating
functionality.

Also switch to_wmi_device() to use container_of_const() so const
values are handled correctly.

Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20241026193803.8802-2-W_Armin@gmx.de
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/wmi.c | 9 ++++-----
 include/linux/wmi.h        | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index 4704e79197f6..b2576d5189ed 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -91,7 +91,6 @@ static const struct acpi_device_id wmi_device_ids[] = {
 MODULE_DEVICE_TABLE(acpi, wmi_device_ids);
 
 #define dev_to_wblock(__dev)	container_of_const(__dev, struct wmi_block, dev.dev)
-#define dev_to_wdev(__dev)	container_of_const(__dev, struct wmi_device, dev)
 
 /*
  * GUID parsing functions
@@ -199,7 +198,7 @@ static struct wmi_device *wmi_find_device_by_guid(const char *guid_string)
 	if (!dev)
 		return ERR_PTR(-ENODEV);
 
-	return dev_to_wdev(dev);
+	return to_wmi_device(dev);
 }
 
 static void wmi_device_put(struct wmi_device *wdev)
@@ -761,7 +760,7 @@ static DEVICE_ATTR_RO(object_id);
 static ssize_t setable_show(struct device *dev, struct device_attribute *attr,
 			    char *buf)
 {
-	struct wmi_device *wdev = dev_to_wdev(dev);
+	struct wmi_device *wdev = to_wmi_device(dev);
 
 	return sysfs_emit(buf, "%d\n", (int)wdev->setable);
 }
@@ -851,7 +850,7 @@ static int wmi_dev_probe(struct device *dev)
 		dev_warn(dev, "failed to enable device -- probing anyway\n");
 
 	if (wdriver->probe) {
-		ret = wdriver->probe(dev_to_wdev(dev),
+		ret = wdriver->probe(to_wmi_device(dev),
 				find_guid_context(wblock, wdriver));
 		if (ret) {
 			if (ACPI_FAILURE(wmi_method_enable(wblock, false)))
@@ -878,7 +877,7 @@ static void wmi_dev_remove(struct device *dev)
 	up_write(&wblock->notify_lock);
 
 	if (wdriver->remove)
-		wdriver->remove(dev_to_wdev(dev));
+		wdriver->remove(to_wmi_device(dev));
 
 	if (ACPI_FAILURE(wmi_method_enable(wblock, false)))
 		dev_warn(dev, "failed to disable device\n");
diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 120019677fc6..fca5fd43c707 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -34,7 +34,7 @@ struct wmi_device {
  *
  * Cast a struct device to a struct wmi_device.
  */
-#define to_wmi_device(device)	container_of(device, struct wmi_device, dev)
+#define to_wmi_device(device)	container_of_const(device, struct wmi_device, dev)
 
 extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev,
 					  u8 instance, u32 method_id,
-- 
cgit v1.2.3


From e001341a984e709e377b275123aecb5a763eaef9 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sat, 26 Oct 2024 21:38:03 +0200
Subject: platform/x86: wmi: Introduce to_wmi_driver()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce to_wmi_driver() as a replacement for dev_to_wdrv()
so WMI drivers can use this support macro instead of having
to duplicate its functionality.

Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20241026193803.8802-3-W_Armin@gmx.de
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/wmi.c | 12 +++++-------
 include/linux/wmi.h        |  8 ++++++++
 2 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index b2576d5189ed..646370bd6b03 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -653,8 +653,6 @@ char *wmi_get_acpi_device_uid(const char *guid_string)
 }
 EXPORT_SYMBOL_GPL(wmi_get_acpi_device_uid);
 
-#define drv_to_wdrv(__drv)	container_of_const(__drv, struct wmi_driver, driver)
-
 /*
  * sysfs interface
  */
@@ -802,7 +800,7 @@ static void wmi_dev_release(struct device *dev)
 
 static int wmi_dev_match(struct device *dev, const struct device_driver *driver)
 {
-	const struct wmi_driver *wmi_driver = drv_to_wdrv(driver);
+	const struct wmi_driver *wmi_driver = to_wmi_driver(driver);
 	struct wmi_block *wblock = dev_to_wblock(dev);
 	const struct wmi_device_id *id = wmi_driver->id_table;
 
@@ -826,7 +824,7 @@ static int wmi_dev_match(struct device *dev, const struct device_driver *driver)
 static int wmi_dev_probe(struct device *dev)
 {
 	struct wmi_block *wblock = dev_to_wblock(dev);
-	struct wmi_driver *wdriver = drv_to_wdrv(dev->driver);
+	struct wmi_driver *wdriver = to_wmi_driver(dev->driver);
 	int ret = 0;
 
 	/* Some older WMI drivers will break if instantiated multiple times,
@@ -870,7 +868,7 @@ static int wmi_dev_probe(struct device *dev)
 static void wmi_dev_remove(struct device *dev)
 {
 	struct wmi_block *wblock = dev_to_wblock(dev);
-	struct wmi_driver *wdriver = drv_to_wdrv(dev->driver);
+	struct wmi_driver *wdriver = to_wmi_driver(dev->driver);
 
 	down_write(&wblock->notify_lock);
 	wblock->driver_ready = false;
@@ -889,7 +887,7 @@ static void wmi_dev_shutdown(struct device *dev)
 	struct wmi_block *wblock;
 
 	if (dev->driver) {
-		wdriver = drv_to_wdrv(dev->driver);
+		wdriver = to_wmi_driver(dev->driver);
 		wblock = dev_to_wblock(dev);
 
 		/*
@@ -1173,7 +1171,7 @@ static int wmi_get_notify_data(struct wmi_block *wblock, union acpi_object **obj
 
 static void wmi_notify_driver(struct wmi_block *wblock, union acpi_object *obj)
 {
-	struct wmi_driver *driver = drv_to_wdrv(wblock->dev.dev.driver);
+	struct wmi_driver *driver = to_wmi_driver(wblock->dev.dev.driver);
 
 	if (!obj && !driver->no_notify_data) {
 		dev_warn(&wblock->dev.dev, "Event contains no event data\n");
diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index fca5fd43c707..10751c8e5e6a 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -73,6 +73,14 @@ struct wmi_driver {
 	void (*notify)(struct wmi_device *device, union acpi_object *data);
 };
 
+/**
+ * to_wmi_driver() - Helper macro to cast a driver to a wmi_driver
+ * @drv: driver struct
+ *
+ * Cast a struct device_driver to a struct wmi_driver.
+ */
+#define to_wmi_driver(drv)	container_of_const(drv, struct wmi_driver, driver)
+
 extern int __must_check __wmi_driver_register(struct wmi_driver *driver,
 					      struct module *owner);
 extern void wmi_driver_unregister(struct wmi_driver *driver);
-- 
cgit v1.2.3


From 2f5a65ef30a636d5030917eebd283ac447a212af Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 29 Oct 2024 15:19:37 +0100
Subject: block: add a bdev_limits helper

Add a helper to get the queue_limits from the bdev without having to
poke into the request_queue.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Link: https://lore.kernel.org/r/20241029141937.249920-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c            |  3 +--
 block/blk-settings.c         |  2 +-
 drivers/md/dm-cache-target.c |  4 ++--
 drivers/md/dm-clone-target.c |  4 ++--
 drivers/md/dm-thin.c         |  2 +-
 fs/btrfs/zoned.c             |  7 ++-----
 include/linux/blkdev.h       | 15 ++++++++++-----
 7 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8b9a9646aed8..d813d799cee7 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -411,10 +411,9 @@ struct bio *bio_split_zone_append(struct bio *bio,
  */
 struct bio *bio_split_to_limits(struct bio *bio)
 {
-	const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
 	unsigned int nr_segs;
 
-	return __bio_split_to_limits(bio, lim, &nr_segs);
+	return __bio_split_to_limits(bio, bdev_limits(bio->bi_bdev), &nr_segs);
 }
 EXPORT_SYMBOL(bio_split_to_limits);
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a446654ddee5..95fc39d09872 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -661,7 +661,7 @@ EXPORT_SYMBOL(blk_stack_limits);
 void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
 		sector_t offset, const char *pfx)
 {
-	if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits,
+	if (blk_stack_limits(t, bdev_limits(bdev),
 			get_start_sect(bdev) + offset))
 		pr_notice("%s: Warning: Device %pg is misaligned\n",
 			pfx, bdev);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index aaeeabfab09b..c4520ff7fe1a 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -3360,7 +3360,7 @@ static int cache_iterate_devices(struct dm_target *ti,
 static void disable_passdown_if_not_supported(struct cache *cache)
 {
 	struct block_device *origin_bdev = cache->origin_dev->bdev;
-	struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
+	struct queue_limits *origin_limits = bdev_limits(origin_bdev);
 	const char *reason = NULL;
 
 	if (!cache->features.discard_passdown)
@@ -3382,7 +3382,7 @@ static void disable_passdown_if_not_supported(struct cache *cache)
 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
 {
 	struct block_device *origin_bdev = cache->origin_dev->bdev;
-	struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
+	struct queue_limits *origin_limits = bdev_limits(origin_bdev);
 
 	if (!cache->features.discard_passdown) {
 		/* No passdown is done so setting own virtual limits */
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index 12bbe487a4c8..e956d980672c 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -2020,7 +2020,7 @@ static void clone_resume(struct dm_target *ti)
 static void disable_passdown_if_not_supported(struct clone *clone)
 {
 	struct block_device *dest_dev = clone->dest_dev->bdev;
-	struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
+	struct queue_limits *dest_limits = bdev_limits(dest_dev);
 	const char *reason = NULL;
 
 	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
@@ -2041,7 +2041,7 @@ static void disable_passdown_if_not_supported(struct clone *clone)
 static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
 {
 	struct block_device *dest_bdev = clone->dest_dev->bdev;
-	struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;
+	struct queue_limits *dest_limits = bdev_limits(dest_bdev);
 
 	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
 		/* No passdown is done so we set our own virtual limits */
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 89632ce97760..9095f19a84f3 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2842,7 +2842,7 @@ static void disable_discard_passdown_if_not_supported(struct pool_c *pt)
 {
 	struct pool *pool = pt->pool;
 	struct block_device *data_bdev = pt->data_dev->bdev;
-	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
+	struct queue_limits *data_limits = bdev_limits(data_bdev);
 	const char *reason = NULL;
 
 	if (!pt->adjusted_pf.discard_passdown)
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 69d03feea4e0..46b9386957e6 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -707,11 +707,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 		 * zoned mode. In this case, we don't have a valid max zone
 		 * append size.
 		 */
-		if (bdev_is_zoned(device->bdev)) {
-			blk_stack_limits(lim,
-					 &bdev_get_queue(device->bdev)->limits,
-					 0);
-		}
+		if (bdev_is_zoned(device->bdev))
+			blk_stack_limits(lim, bdev_limits(device->bdev), 0);
 	}
 
 	/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d0a52ed05e60..7bfc877e159e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1159,6 +1159,11 @@ enum blk_default_limits {
  */
 #define BLK_DEF_MAX_SECTORS_CAP	2560u
 
+static inline struct queue_limits *bdev_limits(struct block_device *bdev)
+{
+	return &bdev_get_queue(bdev)->limits;
+}
+
 static inline unsigned long queue_segment_boundary(const struct request_queue *q)
 {
 	return q->limits.seg_boundary_mask;
@@ -1293,23 +1298,23 @@ unsigned int bdev_discard_alignment(struct block_device *bdev);
 
 static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev)
 {
-	return bdev_get_queue(bdev)->limits.max_discard_sectors;
+	return bdev_limits(bdev)->max_discard_sectors;
 }
 
 static inline unsigned int bdev_discard_granularity(struct block_device *bdev)
 {
-	return bdev_get_queue(bdev)->limits.discard_granularity;
+	return bdev_limits(bdev)->discard_granularity;
 }
 
 static inline unsigned int
 bdev_max_secure_erase_sectors(struct block_device *bdev)
 {
-	return bdev_get_queue(bdev)->limits.max_secure_erase_sectors;
+	return bdev_limits(bdev)->max_secure_erase_sectors;
 }
 
 static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
 {
-	return bdev_get_queue(bdev)->limits.max_write_zeroes_sectors;
+	return bdev_limits(bdev)->max_write_zeroes_sectors;
 }
 
 static inline bool bdev_nonrot(struct block_device *bdev)
@@ -1345,7 +1350,7 @@ static inline bool bdev_write_cache(struct block_device *bdev)
 
 static inline bool bdev_fua(struct block_device *bdev)
 {
-	return bdev_get_queue(bdev)->limits.features & BLK_FEAT_FUA;
+	return bdev_limits(bdev)->features & BLK_FEAT_FUA;
 }
 
 static inline bool bdev_nowait(struct block_device *bdev)
-- 
cgit v1.2.3


From 085268829b07202cf7bf8ec1a8fb7fd9d8f6a41a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 30 Sep 2024 14:22:36 -0600
Subject: io_uring/poll: get rid of unlocked cancel hash

io_uring maintains two hash lists of inflight requests:

1) ctx->cancel_table_locked. This is used when the caller has the
   ctx->uring_lock held already. This is only an issue side parameter,
   as removal or task_work will always have it held.

2) ctx->cancel_table. This is used when the issuer does NOT have the
   ctx->uring_lock held, and relies on the table spinlocks for access.

However, it's pretty trivial to simply grab the lock in the one spot
where we care about it, for insertion. With that, we can kill the
unlocked table (and get rid of the _locked postfix for the other one).

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   6 +-
 io_uring/fdinfo.c              |  11 +---
 io_uring/io_uring.c            |   4 --
 io_uring/poll.c                | 142 ++++++++++-------------------------------
 4 files changed, 36 insertions(+), 127 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 4b9ba523978d..d8ca27da1341 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -291,7 +291,7 @@ struct io_ring_ctx {
 
 		struct xarray		io_bl_xa;
 
-		struct io_hash_table	cancel_table_locked;
+		struct io_hash_table	cancel_table;
 		struct io_alloc_cache	apoll_cache;
 		struct io_alloc_cache	netmsg_cache;
 		struct io_alloc_cache	rw_cache;
@@ -342,7 +342,6 @@ struct io_ring_ctx {
 
 	struct list_head	io_buffers_comp;
 	struct list_head	cq_overflow_list;
-	struct io_hash_table	cancel_table;
 
 	struct hlist_head	waitid_list;
 
@@ -459,7 +458,6 @@ enum {
 	REQ_F_DOUBLE_POLL_BIT,
 	REQ_F_APOLL_MULTISHOT_BIT,
 	REQ_F_CLEAR_POLLIN_BIT,
-	REQ_F_HASH_LOCKED_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
@@ -534,8 +532,6 @@ enum {
 	REQ_F_APOLL_MULTISHOT	= IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
 	/* recvmsg special flag, clear EPOLLIN */
 	REQ_F_CLEAR_POLLIN	= IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
-	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
-	REQ_F_HASH_LOCKED	= IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
 	/* don't use lazy poll wake for this request */
 	REQ_F_POLL_NO_LAZY	= IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
 	/* file is pollable */
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 6b1247664b35..a6bac533edbe 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -190,22 +190,13 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	}
 
 	seq_puts(m, "PollList:\n");
-	for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
+	for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) {
 		struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
-		struct io_hash_bucket *hbl = &ctx->cancel_table_locked.hbs[i];
 		struct io_kiocb *req;
 
-		spin_lock(&hb->lock);
 		hlist_for_each_entry(req, &hb->list, hash_node)
 			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
 					task_work_pending(req->task));
-		spin_unlock(&hb->lock);
-
-		if (!has_lock)
-			continue;
-		hlist_for_each_entry(req, &hbl->list, hash_node)
-			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
-					task_work_pending(req->task));
 	}
 
 	if (has_lock)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b2736e3491b8..f4e069cd03a5 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -294,8 +294,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	hash_bits = clamp(hash_bits, 1, 8);
 	if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
 		goto err;
-	if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
-		goto err;
 	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 			    0, GFP_KERNEL))
 		goto err;
@@ -361,7 +359,6 @@ err:
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	kfree(ctx->cancel_table.hbs);
-	kfree(ctx->cancel_table_locked.hbs);
 	xa_destroy(&ctx->io_bl_xa);
 	kfree(ctx);
 	return NULL;
@@ -2774,7 +2771,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		io_wq_put_hash(ctx->hash_map);
 	io_napi_free(ctx);
 	kfree(ctx->cancel_table.hbs);
-	kfree(ctx->cancel_table_locked.hbs);
 	xa_destroy(&ctx->io_bl_xa);
 	kfree(ctx);
 }
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 175c279e59ea..217d667e0622 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -122,28 +122,6 @@ static void io_poll_req_insert(struct io_kiocb *req)
 {
 	struct io_hash_table *table = &req->ctx->cancel_table;
 	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
-	struct io_hash_bucket *hb = &table->hbs[index];
-
-	spin_lock(&hb->lock);
-	hlist_add_head(&req->hash_node, &hb->list);
-	spin_unlock(&hb->lock);
-}
-
-static void io_poll_req_delete(struct io_kiocb *req)
-{
-	struct io_hash_table *table = &req->ctx->cancel_table;
-	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
-	spinlock_t *lock = &table->hbs[index].lock;
-
-	spin_lock(lock);
-	hash_del(&req->hash_node);
-	spin_unlock(lock);
-}
-
-static void io_poll_req_insert_locked(struct io_kiocb *req)
-{
-	struct io_hash_table *table = &req->ctx->cancel_table_locked;
-	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
 
 	lockdep_assert_held(&req->ctx->uring_lock);
 
@@ -154,19 +132,14 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (req->flags & REQ_F_HASH_LOCKED) {
-		/*
-		 * ->cancel_table_locked is protected by ->uring_lock in
-		 * contrast to per bucket spinlocks. Likely, tctx_task_work()
-		 * already grabbed the mutex for us, but there is a chance it
-		 * failed.
-		 */
-		io_tw_lock(ctx, ts);
-		hash_del(&req->hash_node);
-		req->flags &= ~REQ_F_HASH_LOCKED;
-	} else {
-		io_poll_req_delete(req);
-	}
+	/*
+	 * ->cancel_table_locked is protected by ->uring_lock in
+	 * contrast to per bucket spinlocks. Likely, tctx_task_work()
+	 * already grabbed the mutex for us, but there is a chance it
+	 * failed.
+	 */
+	io_tw_lock(ctx, ts);
+	hash_del(&req->hash_node);
 }
 
 static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
@@ -563,12 +536,13 @@ static bool io_poll_can_finish_inline(struct io_kiocb *req,
 	return pt->owning || io_poll_get_ownership(req);
 }
 
-static void io_poll_add_hash(struct io_kiocb *req)
+static void io_poll_add_hash(struct io_kiocb *req, unsigned int issue_flags)
 {
-	if (req->flags & REQ_F_HASH_LOCKED)
-		io_poll_req_insert_locked(req);
-	else
-		io_poll_req_insert(req);
+	struct io_ring_ctx *ctx = req->ctx;
+
+	io_ring_submit_lock(ctx, issue_flags);
+	io_poll_req_insert(req);
+	io_ring_submit_unlock(ctx, issue_flags);
 }
 
 /*
@@ -605,11 +579,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 	ipt->owning = issue_flags & IO_URING_F_UNLOCKED;
 	atomic_set(&req->poll_refs, (int)ipt->owning);
 
-	/* io-wq doesn't hold uring_lock */
-	if (issue_flags & IO_URING_F_UNLOCKED)
-		req->flags &= ~REQ_F_HASH_LOCKED;
-
-
 	/*
 	 * Exclusive waits may only wake a limited amount of entries
 	 * rather than all of them, this may interfere with lazy
@@ -638,7 +607,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 	if (mask &&
 	   ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
 		if (!io_poll_can_finish_inline(req, ipt)) {
-			io_poll_add_hash(req);
+			io_poll_add_hash(req, issue_flags);
 			return 0;
 		}
 		io_poll_remove_entries(req);
@@ -647,7 +616,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 		return 1;
 	}
 
-	io_poll_add_hash(req);
+	io_poll_add_hash(req, issue_flags);
 
 	if (mask && (poll->events & EPOLLET) &&
 	    io_poll_can_finish_inline(req, ipt)) {
@@ -720,12 +689,6 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 	__poll_t mask = POLLPRI | POLLERR | EPOLLET;
 	int ret;
 
-	/*
-	 * apoll requests already grab the mutex to complete in the tw handler,
-	 * so removal from the mutex-backed hash is free, use it by default.
-	 */
-	req->flags |= REQ_F_HASH_LOCKED;
-
 	if (!def->pollin && !def->pollout)
 		return IO_APOLL_ABORTED;
 	if (!io_file_can_poll(req))
@@ -761,18 +724,22 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 	return IO_APOLL_OK;
 }
 
-static __cold bool io_poll_remove_all_table(struct task_struct *tsk,
-					    struct io_hash_table *table,
-					    bool cancel_all)
+/*
+ * Returns true if we found and killed one or more poll requests
+ */
+__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
+			       bool cancel_all)
 {
-	unsigned nr_buckets = 1U << table->hash_bits;
+	unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits;
 	struct hlist_node *tmp;
 	struct io_kiocb *req;
 	bool found = false;
 	int i;
 
+	lockdep_assert_held(&ctx->uring_lock);
+
 	for (i = 0; i < nr_buckets; i++) {
-		struct io_hash_bucket *hb = &table->hbs[i];
+		struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
 
 		spin_lock(&hb->lock);
 		hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) {
@@ -787,28 +754,13 @@ static __cold bool io_poll_remove_all_table(struct task_struct *tsk,
 	return found;
 }
 
-/*
- * Returns true if we found and killed one or more poll requests
- */
-__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
-			       bool cancel_all)
-	__must_hold(&ctx->uring_lock)
-{
-	bool ret;
-
-	ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all);
-	ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all);
-	return ret;
-}
-
 static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
 				     struct io_cancel_data *cd,
-				     struct io_hash_table *table,
 				     struct io_hash_bucket **out_bucket)
 {
 	struct io_kiocb *req;
-	u32 index = hash_long(cd->data, table->hash_bits);
-	struct io_hash_bucket *hb = &table->hbs[index];
+	u32 index = hash_long(cd->data, ctx->cancel_table.hash_bits);
+	struct io_hash_bucket *hb = &ctx->cancel_table.hbs[index];
 
 	*out_bucket = NULL;
 
@@ -831,17 +783,16 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
 
 static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
 					  struct io_cancel_data *cd,
-					  struct io_hash_table *table,
 					  struct io_hash_bucket **out_bucket)
 {
-	unsigned nr_buckets = 1U << table->hash_bits;
+	unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits;
 	struct io_kiocb *req;
 	int i;
 
 	*out_bucket = NULL;
 
 	for (i = 0; i < nr_buckets; i++) {
-		struct io_hash_bucket *hb = &table->hbs[i];
+		struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
 
 		spin_lock(&hb->lock);
 		hlist_for_each_entry(req, &hb->list, hash_node) {
@@ -866,17 +817,16 @@ static int io_poll_disarm(struct io_kiocb *req)
 	return 0;
 }
 
-static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
-			    struct io_hash_table *table)
+static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
 {
 	struct io_hash_bucket *bucket;
 	struct io_kiocb *req;
 
 	if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP |
 			 IORING_ASYNC_CANCEL_ANY))
-		req = io_poll_file_find(ctx, cd, table, &bucket);
+		req = io_poll_file_find(ctx, cd, &bucket);
 	else
-		req = io_poll_find(ctx, false, cd, table, &bucket);
+		req = io_poll_find(ctx, false, cd, &bucket);
 
 	if (req)
 		io_poll_cancel_req(req);
@@ -890,12 +840,8 @@ int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
 {
 	int ret;
 
-	ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table);
-	if (ret != -ENOENT)
-		return ret;
-
 	io_ring_submit_lock(ctx, issue_flags);
-	ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked);
+	ret = __io_poll_cancel(ctx, cd);
 	io_ring_submit_unlock(ctx, issue_flags);
 	return ret;
 }
@@ -972,13 +918,6 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 
 	ipt.pt._qproc = io_poll_queue_proc;
 
-	/*
-	 * If sqpoll or single issuer, there is no contention for ->uring_lock
-	 * and we'll end up holding it in tw handlers anyway.
-	 */
-	if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER))
-		req->flags |= REQ_F_HASH_LOCKED;
-
 	ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags);
 	if (ret > 0) {
 		io_req_set_res(req, ipt.result_mask, 0);
@@ -997,18 +936,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 	int ret2, ret = 0;
 
 	io_ring_submit_lock(ctx, issue_flags);
-	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
-	ret2 = io_poll_disarm(preq);
-	if (bucket)
-		spin_unlock(&bucket->lock);
-	if (!ret2)
-		goto found;
-	if (ret2 != -ENOENT) {
-		ret = ret2;
-		goto out;
-	}
-
-	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket);
+	preq = io_poll_find(ctx, true, &cd, &bucket);
 	ret2 = io_poll_disarm(preq);
 	if (bucket)
 		spin_unlock(&bucket->lock);
@@ -1016,8 +944,6 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 		ret = ret2;
 		goto out;
 	}
-
-found:
 	if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) {
 		ret = -EFAULT;
 		goto out;
-- 
cgit v1.2.3


From ba4366f57b117c2eab996642288e5c75646ccfc9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 30 Sep 2024 14:29:06 -0600
Subject: io_uring/poll: get rid of per-hashtable bucket locks

Any access to the table is protected by ctx->uring_lock now anyway, the
per-bucket locking doesn't buy us anything.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 -
 io_uring/cancel.c              |  4 +---
 io_uring/poll.c                | 39 ++++++++++-----------------------------
 3 files changed, 11 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d8ca27da1341..9c7e1d3f06e5 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -67,7 +67,6 @@ struct io_file_table {
 };
 
 struct io_hash_bucket {
-	spinlock_t		lock;
 	struct hlist_head	list;
 } ____cacheline_aligned_in_smp;
 
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index a6e58a20efdd..755dd5506a5f 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -236,10 +236,8 @@ void init_hash_table(struct io_hash_table *table, unsigned size)
 {
 	unsigned int i;
 
-	for (i = 0; i < size; i++) {
-		spin_lock_init(&table->hbs[i].lock);
+	for (i = 0; i < size; i++)
 		INIT_HLIST_HEAD(&table->hbs[i].list);
-	}
 }
 
 static int __io_sync_cancel(struct io_uring_task *tctx,
diff --git a/io_uring/poll.c b/io_uring/poll.c
index a0d1a09c5a20..2d6698fb7400 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -728,7 +728,6 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
 	for (i = 0; i < nr_buckets; i++) {
 		struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
 
-		spin_lock(&hb->lock);
 		hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) {
 			if (io_match_task_safe(req, tsk, cancel_all)) {
 				hlist_del_init(&req->hash_node);
@@ -736,22 +735,17 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
 				found = true;
 			}
 		}
-		spin_unlock(&hb->lock);
 	}
 	return found;
 }
 
 static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
-				     struct io_cancel_data *cd,
-				     struct io_hash_bucket **out_bucket)
+				     struct io_cancel_data *cd)
 {
 	struct io_kiocb *req;
 	u32 index = hash_long(cd->data, ctx->cancel_table.hash_bits);
 	struct io_hash_bucket *hb = &ctx->cancel_table.hbs[index];
 
-	*out_bucket = NULL;
-
-	spin_lock(&hb->lock);
 	hlist_for_each_entry(req, &hb->list, hash_node) {
 		if (cd->data != req->cqe.user_data)
 			continue;
@@ -761,34 +755,25 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
 			if (io_cancel_match_sequence(req, cd->seq))
 				continue;
 		}
-		*out_bucket = hb;
 		return req;
 	}
-	spin_unlock(&hb->lock);
 	return NULL;
 }
 
 static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
-					  struct io_cancel_data *cd,
-					  struct io_hash_bucket **out_bucket)
+					  struct io_cancel_data *cd)
 {
 	unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits;
 	struct io_kiocb *req;
 	int i;
 
-	*out_bucket = NULL;
-
 	for (i = 0; i < nr_buckets; i++) {
 		struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
 
-		spin_lock(&hb->lock);
 		hlist_for_each_entry(req, &hb->list, hash_node) {
-			if (io_cancel_req_match(req, cd)) {
-				*out_bucket = hb;
+			if (io_cancel_req_match(req, cd))
 				return req;
-			}
 		}
-		spin_unlock(&hb->lock);
 	}
 	return NULL;
 }
@@ -806,20 +791,19 @@ static int io_poll_disarm(struct io_kiocb *req)
 
 static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
 {
-	struct io_hash_bucket *bucket;
 	struct io_kiocb *req;
 
 	if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP |
 			 IORING_ASYNC_CANCEL_ANY))
-		req = io_poll_file_find(ctx, cd, &bucket);
+		req = io_poll_file_find(ctx, cd);
 	else
-		req = io_poll_find(ctx, false, cd, &bucket);
+		req = io_poll_find(ctx, false, cd);
 
-	if (req)
+	if (req) {
 		io_poll_cancel_req(req);
-	if (bucket)
-		spin_unlock(&bucket->lock);
-	return req ? 0 : -ENOENT;
+		return 0;
+	}
+	return -ENOENT;
 }
 
 int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
@@ -918,15 +902,12 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, };
-	struct io_hash_bucket *bucket;
 	struct io_kiocb *preq;
 	int ret2, ret = 0;
 
 	io_ring_submit_lock(ctx, issue_flags);
-	preq = io_poll_find(ctx, true, &cd, &bucket);
+	preq = io_poll_find(ctx, true, &cd);
 	ret2 = io_poll_disarm(preq);
-	if (bucket)
-		spin_unlock(&bucket->lock);
 	if (ret2) {
 		ret = ret2;
 		goto out;
-- 
cgit v1.2.3


From 2946f08ae9ed650b94e0ffebcdfdda8de76bd926 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 18 Oct 2024 17:14:00 +0100
Subject: io_uring: clean up cqe trace points

We have too many helpers posting CQEs, instead of tracing completion
events before filling in a CQE and thus having to pass all the data,
set the CQE first, pass it to the tracing helper and let it extract
everything it needs.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/b83c1ca9ee5aed2df0f3bb743bf5ed699cce4c86.1729267437.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h  |  5 +++++
 include/trace/events/io_uring.h | 24 +++++++++---------------
 io_uring/io_uring.c             |  4 ++--
 io_uring/io_uring.h             |  7 +++----
 4 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 9c7e1d3f06e5..391087144666 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -662,4 +662,9 @@ struct io_overflow_cqe {
 	struct io_uring_cqe cqe;
 };
 
+static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
+{
+	return ctx->flags & IORING_SETUP_CQE32;
+}
+
 #endif
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 412c9c210a32..fb81c533b310 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -315,20 +315,14 @@ TRACE_EVENT(io_uring_fail_link,
  * io_uring_complete - called when completing an SQE
  *
  * @ctx:		pointer to a ring context structure
- * @req:		pointer to a submitted request
- * @user_data:		user data associated with the request
- * @res:		result of the request
- * @cflags:		completion flags
- * @extra1:		extra 64-bit data for CQE32
- * @extra2:		extra 64-bit data for CQE32
- *
+ * @req:		(optional) pointer to a submitted request
+ * @cqe:		pointer to the filled in CQE being posted
  */
 TRACE_EVENT(io_uring_complete,
 
-	TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags,
-		 u64 extra1, u64 extra2),
+TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe),
 
-	TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2),
+	TP_ARGS(ctx, req, cqe),
 
 	TP_STRUCT__entry (
 		__field(  void *,	ctx		)
@@ -343,11 +337,11 @@ TRACE_EVENT(io_uring_complete,
 	TP_fast_assign(
 		__entry->ctx		= ctx;
 		__entry->req		= req;
-		__entry->user_data	= user_data;
-		__entry->res		= res;
-		__entry->cflags		= cflags;
-		__entry->extra1		= extra1;
-		__entry->extra2		= extra2;
+		__entry->user_data	= cqe->user_data;
+		__entry->res		= cqe->res;
+		__entry->cflags		= cqe->flags;
+		__entry->extra1		= io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0;
+		__entry->extra2		= io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0;
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index fa9d31034c62..58b401900b41 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -828,8 +828,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 	 * the ring.
 	 */
 	if (likely(io_get_cqe(ctx, &cqe))) {
-		trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
-
 		WRITE_ONCE(cqe->user_data, user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, cflags);
@@ -838,6 +836,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 			WRITE_ONCE(cqe->big_cqe[0], 0);
 			WRITE_ONCE(cqe->big_cqe[1], 0);
 		}
+
+		trace_io_uring_complete(ctx, NULL, cqe);
 		return true;
 	}
 	return false;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 70b6675941ff..9cd9a127e9ed 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -189,16 +189,15 @@ static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
 	if (unlikely(!io_get_cqe(ctx, &cqe)))
 		return false;
 
-	if (trace_io_uring_complete_enabled())
-		trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
-					req->cqe.res, req->cqe.flags,
-					req->big_cqe.extra1, req->big_cqe.extra2);
 
 	memcpy(cqe, &req->cqe, sizeof(*cqe));
 	if (ctx->flags & IORING_SETUP_CQE32) {
 		memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
 		memset(&req->big_cqe, 0, sizeof(req->big_cqe));
 	}
+
+	if (trace_io_uring_complete_enabled())
+		trace_io_uring_complete(req->ctx, req, cqe);
 	return true;
 }
 
-- 
cgit v1.2.3


From e6d43739d0ee49a39505d696ba6a656f47c2bd39 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 16 Oct 2024 15:54:06 -0600
Subject: io_uring: kill 'imu' from struct io_kiocb

It's no longer being used, remove it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 391087144666..6d3ee71bd832 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -613,9 +613,6 @@ struct io_kiocb {
 	struct task_struct		*task;
 
 	union {
-		/* store used ubuf, so we can prevent reloading */
-		struct io_mapped_ubuf	*imu;
-
 		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
 		struct io_buffer	*kbuf;
 
-- 
cgit v1.2.3


From 79cfe9e59c2a12c3b3faeeefe38d23f3d8030972 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 21 Oct 2024 13:34:10 -0600
Subject: io_uring/register: add IORING_REGISTER_RESIZE_RINGS

Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.

For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.

Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.

Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.

Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.

To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   7 ++
 include/uapi/linux/io_uring.h  |   5 +
 io_uring/io_uring.c            |   1 +
 io_uring/memmap.c              |   8 ++
 io_uring/register.c            | 215 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 236 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 6d3ee71bd832..841579dcdae9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -415,6 +415,13 @@ struct io_ring_ctx {
 	/* protected by ->completion_lock */
 	unsigned			evfd_last_cq_tail;
 
+	/*
+	 * Protection for resize vs mmap races - both the mmap and resize
+	 * side will need to grab this lock, to prevent either side from
+	 * being run concurrently with the other.
+	 */
+	struct mutex			resize_lock;
+
 	/*
 	 * If IORING_SETUP_NO_MMAP is used, then the below holds
 	 * the gup'ed pages for the two rings, and the sqes.
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 86cb385fe0b5..60b9c98595fa 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -615,6 +615,11 @@ enum io_uring_register_op {
 	/* send MSG_RING without having a ring */
 	IORING_REGISTER_SEND_MSG_RING		= 31,
 
+	/* 32 reserved for zc rx */
+
+	/* resize CQ ring */
+	IORING_REGISTER_RESIZE_RINGS		= 33,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b5974bdad48b..140cd47fbdb3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -353,6 +353,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
 	INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
 	io_napi_init(ctx);
+	mutex_init(&ctx->resize_lock);
 
 	return ctx;
 
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index d614824e17bd..85c66fa54956 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -251,6 +251,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned int npages;
 	void *ptr;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
@@ -274,6 +276,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
 					 unsigned long len, unsigned long pgoff,
 					 unsigned long flags)
 {
+	struct io_ring_ctx *ctx = filp->private_data;
 	void *ptr;
 
 	/*
@@ -284,6 +287,8 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr)
 		return -EINVAL;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
 	if (IS_ERR(ptr))
 		return -ENOMEM;
@@ -329,8 +334,11 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
 					 unsigned long len, unsigned long pgoff,
 					 unsigned long flags)
 {
+	struct io_ring_ctx *ctx = file->private_data;
 	void *ptr;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(file, pgoff, len);
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
diff --git a/io_uring/register.c b/io_uring/register.c
index 52b2f9b74af8..fc6c94d694b2 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -29,6 +29,7 @@
 #include "napi.h"
 #include "eventfd.h"
 #include "msg_ring.h"
+#include "memmap.h"
 
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -361,6 +362,214 @@ static int io_register_clock(struct io_ring_ctx *ctx,
 	return 0;
 }
 
+/*
+ * State to maintain until we can swap. Both new and old state, used for
+ * either mapping or freeing.
+ */
+struct io_ring_ctx_rings {
+	unsigned short n_ring_pages;
+	unsigned short n_sqe_pages;
+	struct page **ring_pages;
+	struct page **sqe_pages;
+	struct io_uring_sqe *sq_sqes;
+	struct io_rings *rings;
+};
+
+static void io_register_free_rings(struct io_uring_params *p,
+				   struct io_ring_ctx_rings *r)
+{
+	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
+		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
+				true);
+		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
+				true);
+	} else {
+		io_pages_free(&r->ring_pages, r->n_ring_pages);
+		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
+		vunmap(r->rings);
+		vunmap(r->sq_sqes);
+	}
+}
+
+#define swap_old(ctx, o, n, field)		\
+	do {					\
+		(o).field = (ctx)->field;	\
+		(ctx)->field = (n).field;	\
+	} while (0)
+
+#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
+#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
+			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+
+static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
+	size_t size, sq_array_offset;
+	struct io_uring_params p;
+	unsigned i, tail;
+	void *ptr;
+	int ret;
+
+	/* for single issuer, must be owner resizing */
+	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
+	    current != ctx->submitter_task)
+		return -EEXIST;
+	if (copy_from_user(&p, arg, sizeof(p)))
+		return -EFAULT;
+	if (p.flags & ~RESIZE_FLAGS)
+		return -EINVAL;
+
+	/* properties that are always inherited */
+	p.flags |= (ctx->flags & COPY_FLAGS);
+
+	ret = io_uring_fill_params(p.sq_entries, &p);
+	if (unlikely(ret))
+		return ret;
+
+	/* nothing to do, but copy params back */
+	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
+		if (copy_to_user(arg, &p, sizeof(p)))
+			return -EFAULT;
+		return 0;
+	}
+
+	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
+				&sq_array_offset);
+	if (size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	if (!(p.flags & IORING_SETUP_NO_MMAP))
+		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
+	else
+		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
+						p.cq_off.user_addr, size);
+	if (IS_ERR(n.rings))
+		return PTR_ERR(n.rings);
+
+	n.rings->sq_ring_mask = p.sq_entries - 1;
+	n.rings->cq_ring_mask = p.cq_entries - 1;
+	n.rings->sq_ring_entries = p.sq_entries;
+	n.rings->cq_ring_entries = p.cq_entries;
+
+	if (copy_to_user(arg, &p, sizeof(p))) {
+		io_register_free_rings(&p, &n);
+		return -EFAULT;
+	}
+
+	if (p.flags & IORING_SETUP_SQE128)
+		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
+	else
+		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
+	if (size == SIZE_MAX) {
+		io_register_free_rings(&p, &n);
+		return -EOVERFLOW;
+	}
+
+	if (!(p.flags & IORING_SETUP_NO_MMAP))
+		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
+	else
+		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
+					p.sq_off.user_addr,
+					size);
+	if (IS_ERR(ptr)) {
+		io_register_free_rings(&p, &n);
+		return PTR_ERR(ptr);
+	}
+
+	/*
+	 * If using SQPOLL, park the thread
+	 */
+	if (ctx->sq_data) {
+		mutex_unlock(&ctx->uring_lock);
+		io_sq_thread_park(ctx->sq_data);
+		mutex_lock(&ctx->uring_lock);
+	}
+
+	/*
+	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
+	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
+	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
+	 * existing rings beyond this point will fail. Not that it could proceed
+	 * at this point anyway, as the io_uring mmap side needs go grab the
+	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
+	 * duration of the actual swap.
+	 */
+	mutex_lock(&ctx->resize_lock);
+	spin_lock(&ctx->completion_lock);
+	o.rings = ctx->rings;
+	ctx->rings = NULL;
+	o.sq_sqes = ctx->sq_sqes;
+	ctx->sq_sqes = NULL;
+
+	/*
+	 * Now copy SQ and CQ entries, if any. If either of the destination
+	 * rings can't hold what is already there, then fail the operation.
+	 */
+	n.sq_sqes = ptr;
+	tail = o.rings->sq.tail;
+	if (tail - o.rings->sq.head > p.sq_entries)
+		goto overflow;
+	for (i = o.rings->sq.head; i < tail; i++) {
+		unsigned src_head = i & (ctx->sq_entries - 1);
+		unsigned dst_head = i & n.rings->sq_ring_mask;
+
+		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
+	}
+	n.rings->sq.head = o.rings->sq.head;
+	n.rings->sq.tail = o.rings->sq.tail;
+
+	tail = o.rings->cq.tail;
+	if (tail - o.rings->cq.head > p.cq_entries) {
+overflow:
+		/* restore old rings, and return -EOVERFLOW via cleanup path */
+		ctx->rings = o.rings;
+		ctx->sq_sqes = o.sq_sqes;
+		to_free = &n;
+		ret = -EOVERFLOW;
+		goto out;
+	}
+	for (i = o.rings->cq.head; i < tail; i++) {
+		unsigned src_head = i & (ctx->cq_entries - 1);
+		unsigned dst_head = i & n.rings->cq_ring_mask;
+
+		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+	}
+	n.rings->cq.head = o.rings->cq.head;
+	n.rings->cq.tail = o.rings->cq.tail;
+	/* invalidate cached cqe refill */
+	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
+
+	n.rings->sq_dropped = o.rings->sq_dropped;
+	n.rings->sq_flags = o.rings->sq_flags;
+	n.rings->cq_flags = o.rings->cq_flags;
+	n.rings->cq_overflow = o.rings->cq_overflow;
+
+	/* all done, store old pointers and assign new ones */
+	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
+
+	ctx->sq_entries = p.sq_entries;
+	ctx->cq_entries = p.cq_entries;
+
+	ctx->rings = n.rings;
+	ctx->sq_sqes = n.sq_sqes;
+	swap_old(ctx, o, n, n_ring_pages);
+	swap_old(ctx, o, n, n_sqe_pages);
+	swap_old(ctx, o, n, ring_pages);
+	swap_old(ctx, o, n, sqe_pages);
+	to_free = &o;
+	ret = 0;
+out:
+	spin_unlock(&ctx->completion_lock);
+	mutex_unlock(&ctx->resize_lock);
+	io_register_free_rings(&p, to_free);
+
+	if (ctx->sq_data)
+		io_sq_thread_unpark(ctx->sq_data);
+
+	return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -549,6 +758,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_clone_buffers(ctx, arg);
 		break;
+	case IORING_REGISTER_RESIZE_RINGS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_resize_rings(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
cgit v1.2.3


From aa00f67adc2c0d6439f81b5a81ff181377c47a7e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 22 Oct 2024 13:47:00 -0600
Subject: io_uring: add support for fixed wait regions

Generally applications have 1 or a few waits of waiting, yet they pass
in a struct io_uring_getevents_arg every time. This needs to get copied
and, in turn, the timeout value needs to get copied.

Rather than do this for every invocation, allow the application to
register a fixed set of wait regions that can simply be indexed when
asking the kernel to wait on events.

At ring setup time, the application can register a number of these wait
regions and initialize region/index 0 upfront:

	struct io_uring_reg_wait *reg;

	reg = io_uring_setup_reg_wait(ring, nr_regions, &ret);

	/* set timeout and mark as set, sigmask/sigmask_sz as needed */
	reg->ts.tv_sec = 0;
	reg->ts.tv_nsec = 100000;
	reg->flags = IORING_REG_WAIT_TS;

where nr_regions >= 1 && nr_regions <= PAGE_SIZE / sizeof(*reg). The
above initializes index 0, but 63 other regions can be initialized,
if needed. Now, instead of doing:

	struct __kernel_timespec timeout = { .tv_nsec = 100000, };

	io_uring_submit_and_wait_timeout(ring, &cqe, nr, &t, NULL);

to wait for events for each submit_and_wait, or just wait, operation, it
can just reference the above region at offset 0 and do:

	io_uring_submit_and_wait_reg(ring, &cqe, nr, 0);

to achieve the same goal of waiting 100usec without needing to copy
both struct io_uring_getevents_arg (24b) and struct __kernel_timeout
(16b) for each invocation. Struct io_uring_reg_wait looks as follows:

struct io_uring_reg_wait {
	struct __kernel_timespec	ts;
	__u32				min_wait_usec;
	__u32				flags;
	__u64				sigmask;
	__u32				sigmask_sz;
	__u32				pad[3];
	__u64				pad2[2];
};

embedding the timeout itself in the region, rather than passing it as
a pointer as well. Note that the signal mask is still passed as a
pointer, both for compatability reasons, but also because there doesn't
seem to be a lot of high frequency waits scenarios that involve setting
and resetting the signal mask for each wait.

The application is free to modify any region before a wait call, or it
can use keep multiple regions with different settings to avoid needing to
modify the same one for wait calls. Up to a page size of regions is mapped
by default, allowing PAGE_SIZE / 64 available regions for use.

The registered region must fit within a page. On a 4kb page size system,
that allows for 64 wait regions if a full page is used, as the size of
struct io_uring_reg_wait is 64b. The region registered must be aligned
to io_uring_reg_wait in size. It's valid to register less than 64
entries.

In network performance testing with zero-copy, this reduced the time
spent waiting on the TX side from 3.12% to 0.3% and the RX side from 4.4%
to 0.3%.

Wait regions are fixed for the lifetime of the ring - once registered,
they are persistent until the ring is torn down. The regions support
minimum wait timeout as well as the regular waits.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 10 ++++++
 include/uapi/linux/io_uring.h  | 41 +++++++++++++++++++++
 io_uring/io_uring.c            | 68 +++++++++++++++++++++++++++++------
 io_uring/register.c            | 82 ++++++++++++++++++++++++++++++++++++++++++
 io_uring/register.h            |  1 +
 5 files changed, 191 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 841579dcdae9..2f12828b22a4 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -327,6 +327,14 @@ struct io_ring_ctx {
 		atomic_t		cq_wait_nr;
 		atomic_t		cq_timeouts;
 		struct wait_queue_head	cq_wait;
+
+		/*
+		 * If registered with IORING_REGISTER_CQWAIT_REG, a single
+		 * page holds N entries, mapped in cq_wait_arg. cq_wait_index
+		 * is the maximum allowable index.
+		 */
+		struct io_uring_reg_wait	*cq_wait_arg;
+		unsigned char			cq_wait_index;
 	} ____cacheline_aligned_in_smp;
 
 	/* timeouts */
@@ -430,6 +438,8 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
+
+	struct page			**cq_wait_page;
 };
 
 struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 60b9c98595fa..65b7417c1b05 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -518,6 +518,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_EXT_ARG		(1U << 3)
 #define IORING_ENTER_REGISTERED_RING	(1U << 4)
 #define IORING_ENTER_ABS_TIMER		(1U << 5)
+#define IORING_ENTER_EXT_ARG_REG	(1U << 6)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -620,6 +621,9 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
+	/* register fixed io_uring_reg_wait arguments */
+	IORING_REGISTER_CQWAIT_REG		= 34,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -803,6 +807,43 @@ enum io_uring_register_restriction_op {
 	IORING_RESTRICTION_LAST
 };
 
+enum {
+	IORING_REG_WAIT_TS		= (1U << 0),
+};
+
+/*
+ * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
+ * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is
+ * called rather than pass in a wait argument structure separately.
+ */
+struct io_uring_cqwait_reg_arg {
+	__u32		flags;
+	__u32		struct_size;
+	__u32		nr_entries;
+	__u32		pad;
+	__u64		user_addr;
+	__u64		pad2[3];
+};
+
+/*
+ * Argument for io_uring_enter(2) with
+ * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
+ * is an index into a previously registered fixed wait region described by
+ * the below structure.
+ */
+struct io_uring_reg_wait {
+	struct __kernel_timespec	ts;
+	__u32				min_wait_usec;
+	__u32				flags;
+	__u64				sigmask;
+	__u32				sigmask_sz;
+	__u32				pad[3];
+	__u64				pad2[2];
+};
+
+/*
+ * Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG
+ */
 struct io_uring_getevents_arg {
 	__u64	sigmask;
 	__u32	sigmask_sz;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4cd0ee52710d..2863b957e373 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2736,6 +2736,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
+	io_unregister_cqwait_reg(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
@@ -3224,21 +3225,43 @@ void __io_uring_cancel(bool cancel_all)
 	io_uring_cancel_generic(cancel_all, NULL);
 }
 
-static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
+static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
+			const struct io_uring_getevents_arg __user *uarg)
 {
-	if (flags & IORING_ENTER_EXT_ARG) {
-		struct io_uring_getevents_arg arg;
+	struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg);
 
-		if (argsz != sizeof(arg))
+	if (arg) {
+		unsigned int index = (unsigned int) (uintptr_t) uarg;
+
+		if (index <= ctx->cq_wait_index)
+			return arg + index;
+	}
+
+	return ERR_PTR(-EFAULT);
+}
+
+static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+			       const void __user *argp, size_t argsz)
+{
+	struct io_uring_getevents_arg arg;
+
+	if (!(flags & IORING_ENTER_EXT_ARG))
+		return 0;
+
+	if (flags & IORING_ENTER_EXT_ARG_REG) {
+		if (argsz != sizeof(struct io_uring_reg_wait))
 			return -EINVAL;
-		if (copy_from_user(&arg, argp, sizeof(arg)))
-			return -EFAULT;
+		return PTR_ERR(io_get_ext_arg_reg(ctx, argp));
 	}
+	if (argsz != sizeof(arg))
+		return -EINVAL;
+	if (copy_from_user(&arg, argp, sizeof(arg)))
+		return -EFAULT;
 	return 0;
 }
 
-static int io_get_ext_arg(unsigned flags, const void __user *argp,
-			  struct ext_arg *ext_arg)
+static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+			  const void __user *argp, struct ext_arg *ext_arg)
 {
 	const struct io_uring_getevents_arg __user *uarg = argp;
 	struct io_uring_getevents_arg arg;
@@ -3252,6 +3275,28 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp,
 		return 0;
 	}
 
+	if (flags & IORING_ENTER_EXT_ARG_REG) {
+		struct io_uring_reg_wait *w;
+
+		if (ext_arg->argsz != sizeof(struct io_uring_reg_wait))
+			return -EINVAL;
+		w = io_get_ext_arg_reg(ctx, argp);
+		if (IS_ERR(w))
+			return PTR_ERR(w);
+
+		if (w->flags & ~IORING_REG_WAIT_TS)
+			return -EINVAL;
+		ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC;
+		ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask));
+		ext_arg->argsz = READ_ONCE(w->sigmask_sz);
+		if (w->flags & IORING_REG_WAIT_TS) {
+			ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec);
+			ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec);
+			ext_arg->ts_set = true;
+		}
+		return 0;
+	}
+
 	/*
 	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
 	 * timespec and sigset_t pointers if good.
@@ -3297,7 +3342,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
 			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
 			       IORING_ENTER_REGISTERED_RING |
-			       IORING_ENTER_ABS_TIMER)))
+			       IORING_ENTER_ABS_TIMER |
+			       IORING_ENTER_EXT_ARG_REG)))
 		return -EINVAL;
 
 	/*
@@ -3380,7 +3426,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			 */
 			mutex_lock(&ctx->uring_lock);
 iopoll_locked:
-			ret2 = io_validate_ext_arg(flags, argp, argsz);
+			ret2 = io_validate_ext_arg(ctx, flags, argp, argsz);
 			if (likely(!ret2)) {
 				min_complete = min(min_complete,
 						   ctx->cq_entries);
@@ -3390,7 +3436,7 @@ iopoll_locked:
 		} else {
 			struct ext_arg ext_arg = { .argsz = argsz };
 
-			ret2 = io_get_ext_arg(flags, argp, &ext_arg);
+			ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg);
 			if (likely(!ret2)) {
 				min_complete = min(min_complete,
 						   ctx->cq_entries);
diff --git a/io_uring/register.c b/io_uring/register.c
index fc6c94d694b2..1eb686eaa310 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,6 +570,82 @@ out:
 	return ret;
 }
 
+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx)
+{
+	unsigned short npages = 1;
+
+	if (!ctx->cq_wait_page)
+		return;
+
+	io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true);
+	ctx->cq_wait_arg = NULL;
+	if (ctx->user)
+		__io_unaccount_mem(ctx->user, 1);
+}
+
+/*
+ * Register a page holding N entries of struct io_uring_reg_wait, which can
+ * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set.
+ * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing
+ * in a pointer for a struct io_uring_getevents_arg, an index into this
+ * registered array is passed, avoiding two (arg + timeout) copies per
+ * invocation.
+ */
+static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg)
+{
+	struct io_uring_cqwait_reg_arg arg;
+	struct io_uring_reg_wait *reg;
+	struct page **pages;
+	unsigned long len;
+	int nr_pages, poff;
+	int ret;
+
+	if (ctx->cq_wait_page || ctx->cq_wait_arg)
+		return -EBUSY;
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+	if (!arg.nr_entries || arg.flags)
+		return -EINVAL;
+	if (arg.struct_size != sizeof(*reg))
+		return -EINVAL;
+	if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len))
+		return -EOVERFLOW;
+	if (len > PAGE_SIZE)
+		return -EINVAL;
+	/* offset + len must fit within a page, and must be reg_wait aligned */
+	poff = arg.user_addr & ~PAGE_MASK;
+	if (len + poff > PAGE_SIZE)
+		return -EINVAL;
+	if (poff % arg.struct_size)
+		return -EINVAL;
+
+	pages = io_pin_pages(arg.user_addr, len, &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+	ret = -EINVAL;
+	if (nr_pages != 1)
+		goto out_free;
+	if (ctx->user) {
+		ret = __io_account_mem(ctx->user, 1);
+		if (ret)
+			goto out_free;
+	}
+
+	reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
+	if (reg) {
+		ctx->cq_wait_index = arg.nr_entries - 1;
+		WRITE_ONCE(ctx->cq_wait_page, pages);
+		WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff);
+		return 0;
+	}
+	ret = -ENOMEM;
+	if (ctx->user)
+		__io_unaccount_mem(ctx->user, 1);
+out_free:
+	io_pages_free(&pages, nr_pages);
+	return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -764,6 +840,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
+	case IORING_REGISTER_CQWAIT_REG:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_cqwait_reg(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/register.h b/io_uring/register.h
index a5f39d5ef9e0..3e935e8fa4b2 100644
--- a/io_uring/register.h
+++ b/io_uring/register.h
@@ -5,5 +5,6 @@
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
 struct file *io_uring_register_get_file(unsigned int fd, bool registered);
+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx);
 
 #endif
-- 
cgit v1.2.3


From ff1256b8f3c45f222bce19fbfc1e1bc498b31d03 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 25 Oct 2024 08:54:28 -0600
Subject: io_uring/rsrc: move struct io_fixed_file to rsrc.h header

There's no need for this internal structure to be visible, move it to
the private rsrc.h header instead.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 5 -----
 io_uring/filetable.h           | 1 +
 io_uring/rsrc.h                | 5 +++++
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 2f12828b22a4..d4ba4ae480d6 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -55,11 +55,6 @@ struct io_wq_work {
 	int cancel_seq;
 };
 
-struct io_fixed_file {
-	/* file * with additional FFS_* flags */
-	unsigned long file_ptr;
-};
-
 struct io_file_table {
 	struct io_fixed_file *files;
 	unsigned long *bitmap;
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index b2435c4dca1f..c027ed4ad68d 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -4,6 +4,7 @@
 
 #include <linux/file.h>
 #include <linux/io_uring_types.h>
+#include "rsrc.h"
 
 bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
 void io_free_file_tables(struct io_file_table *table);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c50d4be4aa6d..e072fb3ee351 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -40,6 +40,11 @@ struct io_rsrc_node {
 	struct io_rsrc_put		item;
 };
 
+struct io_fixed_file {
+	/* file * with additional FFS_* flags */
+	unsigned long file_ptr;
+};
+
 struct io_mapped_ubuf {
 	u64		ubuf;
 	unsigned int	len;
-- 
cgit v1.2.3


From b2473a359763e27567993e7d8f37de82f57a0829 Mon Sep 17 00:00:00 2001
From: Usama Arif <usamaarif642@gmail.com>
Date: Wed, 23 Oct 2024 18:14:26 +0100
Subject: of/fdt: add dt_phys arg to early_init_dt_scan and
 early_init_dt_verify

 __pa() is only intended to be used for linear map addresses and using
it for initial_boot_params which is in fixmap for arm64 will give an
incorrect value. Hence save the physical address when it is known at
boot time when calling early_init_dt_scan for arm64 and use it at kexec
time instead of converting the virtual address using __pa().

Note that arm64 doesn't need the FDT region reserved in the DT as the
kernel explicitly reserves the passed in FDT. Therefore, only a debug
warning is fixed with this change.

Reported-by: Breno Leitao <leitao@debian.org>
Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Fixes: ac10be5cdbfa ("arm64: Use common of_kexec_alloc_and_setup_fdt()")
Link: https://lore.kernel.org/r/20241023171426.452688-1-usamaarif642@gmail.com
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 arch/arc/kernel/devtree.c              |  2 +-
 arch/arm/kernel/devtree.c              |  2 +-
 arch/arm64/kernel/setup.c              |  6 +++++-
 arch/csky/kernel/setup.c               |  4 ++--
 arch/loongarch/kernel/setup.c          |  2 +-
 arch/microblaze/kernel/prom.c          |  2 +-
 arch/mips/kernel/prom.c                |  2 +-
 arch/mips/kernel/relocate.c            |  2 +-
 arch/nios2/kernel/prom.c               |  4 ++--
 arch/openrisc/kernel/prom.c            |  2 +-
 arch/powerpc/kernel/dt_cpu_ftrs.c      |  2 +-
 arch/powerpc/kernel/prom.c             |  2 +-
 arch/powerpc/platforms/pseries/plpks.c |  2 +-
 arch/riscv/kernel/setup.c              |  2 +-
 arch/sh/kernel/setup.c                 |  2 +-
 arch/um/kernel/dtb.c                   |  2 +-
 arch/x86/kernel/devicetree.c           |  2 +-
 arch/xtensa/kernel/setup.c             |  2 +-
 drivers/of/fdt.c                       | 14 ++++++++------
 drivers/of/kexec.c                     |  2 +-
 include/linux/of_fdt.h                 |  5 +++--
 21 files changed, 36 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/kernel/devtree.c b/arch/arc/kernel/devtree.c
index 4c9e61457b2f..cc6ac7d128aa 100644
--- a/arch/arc/kernel/devtree.c
+++ b/arch/arc/kernel/devtree.c
@@ -62,7 +62,7 @@ const struct machine_desc * __init setup_machine_fdt(void *dt)
 	const struct machine_desc *mdesc;
 	unsigned long dt_root;
 
-	if (!early_init_dt_scan(dt))
+	if (!early_init_dt_scan(dt, __pa(dt)))
 		return NULL;
 
 	mdesc = of_flat_dt_match_machine(NULL, arch_get_next_mach);
diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c
index fdb74e64206a..3b78966e750a 100644
--- a/arch/arm/kernel/devtree.c
+++ b/arch/arm/kernel/devtree.c
@@ -200,7 +200,7 @@ const struct machine_desc * __init setup_machine_fdt(void *dt_virt)
 
 	mdesc_best = &__mach_desc_GENERIC_DT;
 
-	if (!dt_virt || !early_init_dt_verify(dt_virt))
+	if (!dt_virt || !early_init_dt_verify(dt_virt, __pa(dt_virt)))
 		return NULL;
 
 	mdesc = of_flat_dt_match_machine(mdesc_best, arch_get_next_mach);
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index b22d28ec8028..87f61fd6783c 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -175,7 +175,11 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys)
 	if (dt_virt)
 		memblock_reserve(dt_phys, size);
 
-	if (!dt_virt || !early_init_dt_scan(dt_virt)) {
+	/*
+	 * dt_virt is a fixmap address, hence __pa(dt_virt) can't be used.
+	 * Pass dt_phys directly.
+	 */
+	if (!early_init_dt_scan(dt_virt, dt_phys)) {
 		pr_crit("\n"
 			"Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n"
 			"The dtb must be 8-byte aligned and must not exceed 2 MB in size\n"
diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c
index 51012e90780d..fe715b707fd0 100644
--- a/arch/csky/kernel/setup.c
+++ b/arch/csky/kernel/setup.c
@@ -112,9 +112,9 @@ asmlinkage __visible void __init csky_start(unsigned int unused,
 	pre_trap_init();
 
 	if (dtb_start == NULL)
-		early_init_dt_scan(__dtb_start);
+		early_init_dt_scan(__dtb_start, __pa(dtb_start));
 	else
-		early_init_dt_scan(dtb_start);
+		early_init_dt_scan(dtb_start, __pa(dtb_start));
 
 	start_kernel();
 
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 00e307203ddb..10d878204018 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -290,7 +290,7 @@ static void __init fdt_setup(void)
 	if (!fdt_pointer || fdt_check_header(fdt_pointer))
 		return;
 
-	early_init_dt_scan(fdt_pointer);
+	early_init_dt_scan(fdt_pointer, __pa(fdt_pointer));
 	early_init_fdt_reserve_self();
 
 	max_low_pfn = PFN_PHYS(memblock_end_of_DRAM());
diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c
index e424c796e297..76ac4cfdfb42 100644
--- a/arch/microblaze/kernel/prom.c
+++ b/arch/microblaze/kernel/prom.c
@@ -18,7 +18,7 @@ void __init early_init_devtree(void *params)
 {
 	pr_debug(" -> early_init_devtree(%p)\n", params);
 
-	early_init_dt_scan(params);
+	early_init_dt_scan(params, __pa(params));
 	if (!strlen(boot_command_line))
 		strscpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE);
 
diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c
index 6062e6fa589a..4fd6da0a06c3 100644
--- a/arch/mips/kernel/prom.c
+++ b/arch/mips/kernel/prom.c
@@ -41,7 +41,7 @@ char *mips_get_machine_name(void)
 
 void __init __dt_setup_arch(void *bph)
 {
-	if (!early_init_dt_scan(bph))
+	if (!early_init_dt_scan(bph, __pa(bph)))
 		return;
 
 	mips_set_machine_name(of_flat_dt_get_machine_name());
diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c
index 7eeeaf1ff95d..cda7983e7c18 100644
--- a/arch/mips/kernel/relocate.c
+++ b/arch/mips/kernel/relocate.c
@@ -337,7 +337,7 @@ void *__init relocate_kernel(void)
 #if defined(CONFIG_USE_OF)
 	/* Deal with the device tree */
 	fdt = plat_get_fdt();
-	early_init_dt_scan(fdt);
+	early_init_dt_scan(fdt, __pa(fdt));
 	if (boot_command_line[0]) {
 		/* Boot command line was passed in device tree */
 		strscpy(arcs_cmdline, boot_command_line, COMMAND_LINE_SIZE);
diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c
index 9a8393e6b4a8..db049249766f 100644
--- a/arch/nios2/kernel/prom.c
+++ b/arch/nios2/kernel/prom.c
@@ -27,7 +27,7 @@ void __init early_init_devtree(void *params)
 	if (be32_to_cpup((__be32 *)CONFIG_NIOS2_DTB_PHYS_ADDR) ==
 		 OF_DT_HEADER) {
 		params = (void *)CONFIG_NIOS2_DTB_PHYS_ADDR;
-		early_init_dt_scan(params);
+		early_init_dt_scan(params, __pa(params));
 		return;
 	}
 #endif
@@ -37,5 +37,5 @@ void __init early_init_devtree(void *params)
 		params = (void *)__dtb_start;
 #endif
 
-	early_init_dt_scan(params);
+	early_init_dt_scan(params, __pa(params));
 }
diff --git a/arch/openrisc/kernel/prom.c b/arch/openrisc/kernel/prom.c
index 19e6008bf114..e424e9bd12a7 100644
--- a/arch/openrisc/kernel/prom.c
+++ b/arch/openrisc/kernel/prom.c
@@ -22,6 +22,6 @@
 
 void __init early_init_devtree(void *params)
 {
-	early_init_dt_scan(params);
+	early_init_dt_scan(params, __pa(params));
 	memblock_allow_resize();
 }
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index af4263594eb2..1bee15c013e7 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -867,7 +867,7 @@ bool __init dt_cpu_ftrs_init(void *fdt)
 	using_dt_cpu_ftrs = false;
 
 	/* Setup and verify the FDT, if it fails we just bail */
-	if (!early_init_dt_verify(fdt))
+	if (!early_init_dt_verify(fdt, __pa(fdt)))
 		return false;
 
 	if (!of_scan_flat_dt(fdt_find_cpu_features, NULL))
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 0be07ed407c7..88cbe432cad5 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -791,7 +791,7 @@ void __init early_init_devtree(void *params)
 	DBG(" -> early_init_devtree(%px)\n", params);
 
 	/* Too early to BUG_ON(), do it by hand */
-	if (!early_init_dt_verify(params))
+	if (!early_init_dt_verify(params, __pa(params)))
 		panic("BUG: Failed verifying flat device tree, bad version?");
 
 	of_scan_flat_dt(early_init_dt_scan_model, NULL);
diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c
index 4a595493d28a..b1667ed05f98 100644
--- a/arch/powerpc/platforms/pseries/plpks.c
+++ b/arch/powerpc/platforms/pseries/plpks.c
@@ -683,7 +683,7 @@ void __init plpks_early_init_devtree(void)
 out:
 	fdt_nop_property(fdt, chosen_node, "ibm,plpks-pw");
 	// Since we've cleared the password, we must update the FDT checksum
-	early_init_dt_verify(fdt);
+	early_init_dt_verify(fdt, __pa(fdt));
 }
 
 static __init int pseries_plpks_init(void)
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index a2cde65b69e9..26c886db4fb3 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -227,7 +227,7 @@ static void __init init_resources(void)
 static void __init parse_dtb(void)
 {
 	/* Early scan of device tree from init memory */
-	if (early_init_dt_scan(dtb_early_va)) {
+	if (early_init_dt_scan(dtb_early_va, __pa(dtb_early_va))) {
 		const char *name = of_flat_dt_get_machine_name();
 
 		if (name) {
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 620e5cf8ae1e..f2b6f16a46b8 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -255,7 +255,7 @@ void __ref sh_fdt_init(phys_addr_t dt_phys)
 	dt_virt = phys_to_virt(dt_phys);
 #endif
 
-	if (!dt_virt || !early_init_dt_scan(dt_virt)) {
+	if (!dt_virt || !early_init_dt_scan(dt_virt, __pa(dt_virt))) {
 		pr_crit("Error: invalid device tree blob"
 			" at physical address %p\n", (void *)dt_phys);
 
diff --git a/arch/um/kernel/dtb.c b/arch/um/kernel/dtb.c
index 4954188a6a09..8d78ced9e08f 100644
--- a/arch/um/kernel/dtb.c
+++ b/arch/um/kernel/dtb.c
@@ -17,7 +17,7 @@ void uml_dtb_init(void)
 
 	area = uml_load_file(dtb, &size);
 	if (area) {
-		if (!early_init_dt_scan(area)) {
+		if (!early_init_dt_scan(area, __pa(area))) {
 			pr_err("invalid DTB %s\n", dtb);
 			memblock_free(area, size);
 			return;
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 64280879c68c..59d23cdf4ed0 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -305,7 +305,7 @@ void __init x86_flattree_get_config(void)
 			map_len = size;
 		}
 
-		early_init_dt_verify(dt);
+		early_init_dt_verify(dt, __pa(dt));
 	}
 
 	unflatten_and_copy_device_tree();
diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c
index bdec4a773af0..e51f2060e830 100644
--- a/arch/xtensa/kernel/setup.c
+++ b/arch/xtensa/kernel/setup.c
@@ -216,7 +216,7 @@ static int __init xtensa_dt_io_area(unsigned long node, const char *uname,
 
 void __init early_init_devtree(void *params)
 {
-	early_init_dt_scan(params);
+	early_init_dt_scan(params, __pa(params));
 	of_scan_flat_dt(xtensa_dt_io_area, NULL);
 
 	if (!command_line[0])
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index d0dbc8183ac4..d3ecf2bdd202 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -457,6 +457,7 @@ int __initdata dt_root_addr_cells;
 int __initdata dt_root_size_cells;
 
 void *initial_boot_params __ro_after_init;
+phys_addr_t initial_boot_params_pa __ro_after_init;
 
 #ifdef CONFIG_OF_EARLY_FLATTREE
 
@@ -1134,17 +1135,18 @@ static void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
 	return ptr;
 }
 
-bool __init early_init_dt_verify(void *params)
+bool __init early_init_dt_verify(void *dt_virt, phys_addr_t dt_phys)
 {
-	if (!params)
+	if (!dt_virt)
 		return false;
 
 	/* check device tree validity */
-	if (fdt_check_header(params))
+	if (fdt_check_header(dt_virt))
 		return false;
 
 	/* Setup flat device-tree pointer */
-	initial_boot_params = params;
+	initial_boot_params = dt_virt;
+	initial_boot_params_pa = dt_phys;
 	of_fdt_crc32 = crc32_be(~0, initial_boot_params,
 				fdt_totalsize(initial_boot_params));
 
@@ -1171,11 +1173,11 @@ void __init early_init_dt_scan_nodes(void)
 	early_init_dt_check_for_usable_mem_range();
 }
 
-bool __init early_init_dt_scan(void *params)
+bool __init early_init_dt_scan(void *dt_virt, phys_addr_t dt_phys)
 {
 	bool status;
 
-	status = early_init_dt_verify(params);
+	status = early_init_dt_verify(dt_virt, dt_phys);
 	if (!status)
 		return false;
 
diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
index 9ccde2fd77cb..5b924597a4de 100644
--- a/drivers/of/kexec.c
+++ b/drivers/of/kexec.c
@@ -301,7 +301,7 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image,
 	}
 
 	/* Remove memory reservation for the current device tree. */
-	ret = fdt_find_and_del_mem_rsv(fdt, __pa(initial_boot_params),
+	ret = fdt_find_and_del_mem_rsv(fdt, initial_boot_params_pa,
 				       fdt_totalsize(initial_boot_params));
 	if (ret == -EINVAL) {
 		pr_err("Error removing memory reservation.\n");
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index d69ad5bb1eb1..b8d6c0c20876 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -31,6 +31,7 @@ extern void *of_fdt_unflatten_tree(const unsigned long *blob,
 extern int __initdata dt_root_addr_cells;
 extern int __initdata dt_root_size_cells;
 extern void *initial_boot_params;
+extern phys_addr_t initial_boot_params_pa;
 
 extern char __dtb_start[];
 extern char __dtb_end[];
@@ -70,8 +71,8 @@ extern u64 dt_mem_next_cell(int s, const __be32 **cellp);
 /* Early flat tree scan hooks */
 extern int early_init_dt_scan_root(void);
 
-extern bool early_init_dt_scan(void *params);
-extern bool early_init_dt_verify(void *params);
+extern bool early_init_dt_scan(void *dt_virt, phys_addr_t dt_phys);
+extern bool early_init_dt_verify(void *dt_virt, phys_addr_t dt_phys);
 extern void early_init_dt_scan_nodes(void);
 
 extern const char *of_flat_dt_get_machine_name(void);
-- 
cgit v1.2.3


From 90ee6ed776c06435a3fe79c7f5344761f52e1760 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Oct 2024 16:23:59 +0200
Subject: fs: port files to file_ref

Port files to rely on file_ref reference to improve scaling and gain
overflow protection.

- We continue to WARN during get_file() in case a file that is already
  marked dead is revived as get_file() is only valid if the caller
  already holds a reference to the file. This hasn't changed just the
  check changes.

- The semantics for epoll and ttm's dmabuf usage have changed. Both
  epoll and ttm synchronize with __fput() to prevent the underlying file
  from beeing freed.

  (1) epoll

      Explaining epoll is straightforward using a simple diagram.
      Essentially, the mutex of the epoll instance needs to be taken in both
      __fput() and around epi_fget() preventing the file from being freed
      while it is polled or preventing the file from being resurrected.

          CPU1                                   CPU2
          fput(file)
          -> __fput(file)
             -> eventpoll_release(file)
                -> eventpoll_release_file(file)
                                                 mutex_lock(&ep->mtx)
                                                 epi_item_poll()
                                                 -> epi_fget()
                                                    -> file_ref_get(file)
                                                 mutex_unlock(&ep->mtx)
                   mutex_lock(&ep->mtx);
                   __ep_remove()
                   mutex_unlock(&ep->mtx);
             -> kmem_cache_free(file)

  (2) ttm dmabuf

      This explanation is a bit more involved. A regular dmabuf file stashed
      the dmabuf in file->private_data and the file in dmabuf->file:

          file->private_data = dmabuf;
          dmabuf->file = file;

      The generic release method of a dmabuf file handles file specific
      things:

          f_op->release::dma_buf_file_release()

      while the generic dentry release method of a dmabuf handles dmabuf
      freeing including driver specific things:

          dentry->d_release::dma_buf_release()

      During ttm dmabuf initialization in ttm_object_device_init() the ttm
      driver copies the provided struct dma_buf_ops into a private location:

          struct ttm_object_device {
                  spinlock_t object_lock;
                  struct dma_buf_ops ops;
                  void (*dmabuf_release)(struct dma_buf *dma_buf);
                  struct idr idr;
          };

          ttm_object_device_init(const struct dma_buf_ops *ops)
          {
                  // copy original dma_buf_ops in private location
                  tdev->ops = *ops;

                  // stash the release method of the original struct dma_buf_ops
                  tdev->dmabuf_release = tdev->ops.release;

                  // override the release method in the copy of the struct dma_buf_ops
                  // with ttm's own dmabuf release method
                  tdev->ops.release = ttm_prime_dmabuf_release;
          }

      When a new dmabuf is created the struct dma_buf_ops with the overriden
      release method set to ttm_prime_dmabuf_release is passed in exp_info.ops:

          DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
          exp_info.ops = &tdev->ops;
          exp_info.size = prime->size;
          exp_info.flags = flags;
          exp_info.priv = prime;

      The call to dma_buf_export() then sets

          mutex_lock_interruptible(&prime->mutex);
          dma_buf = dma_buf_export(&exp_info)
          {
                  dmabuf->ops = exp_info->ops;
          }
          mutex_unlock(&prime->mutex);

      which creates a new dmabuf file and then install a file descriptor to
      it in the callers file descriptor table:

          ret = dma_buf_fd(dma_buf, flags);

      When that dmabuf file is closed we now get:

          fput(file)
          -> __fput(file)
             -> f_op->release::dma_buf_file_release()
             -> dput()
                -> d_op->d_release::dma_buf_release()
                   -> dmabuf->ops->release::ttm_prime_dmabuf_release()
                      mutex_lock(&prime->mutex);
                      if (prime->dma_buf == dma_buf)
                            prime->dma_buf = NULL;
                      mutex_unlock(&prime->mutex);

      Where we can see that prime->dma_buf is set to NULL. So when we have
      the following diagram:

          CPU1                                                          CPU2
          fput(file)
          -> __fput(file)
             -> f_op->release::dma_buf_file_release()
             -> dput()
                -> d_op->d_release::dma_buf_release()
                   -> dmabuf->ops->release::ttm_prime_dmabuf_release()
                                                                        ttm_prime_handle_to_fd()
                                                                        mutex_lock_interruptible(&prime->mutex)
                                                                        dma_buf = prime->dma_buf
                                                                        dma_buf && get_dma_buf_unless_doomed(dma_buf)
                                                                        -> file_ref_get(dma_buf->file)
                                                                        mutex_unlock(&prime->mutex);

                      mutex_lock(&prime->mutex);
                      if (prime->dma_buf == dma_buf)
                            prime->dma_buf = NULL;
                      mutex_unlock(&prime->mutex);
             -> kmem_cache_free(file)

      The logic of the mechanism is the same as for epoll: sync with
      __fput() preventing the file from being freed. Here the
      synchronization happens through the ttm instance's prime->mutex.
      Basically, the lifetime of the dma_buf and the file are tighly
      coupled.

  Both (1) and (2) used to call atomic_inc_not_zero() to check whether
  the file has already been marked dead and then refuse to revive it.

  This is only safe because both (1) and (2) sync with __fput() and thus
  prevent kmem_cache_free() on the file being called and thus prevent
  the file from being immediately recycled due to SLAB_TYPESAFE_BY_RCU.

  Both (1) and (2) have been ported from atomic_inc_not_zero() to
  file_ref_get(). That means a file that is already in the process of
  being marked as FILE_REF_DEAD:

  file_ref_put()
  cnt = atomic_long_dec_return()
  -> __file_ref_put(cnt)
     if (cnt == FIlE_REF_NOREF)
             atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)

  can be revived again:

  CPU1                                                             CPU2
  file_ref_put()
  cnt = atomic_long_dec_return()
  -> __file_ref_put(cnt)
     if (cnt == FIlE_REF_NOREF)
                                                                   file_ref_get()
                                                                   // Brings reference back to FILE_REF_ONEREF
                                                                   atomic_long_add_negative()
             atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)

  This is fine and inherent to the file_ref_get()/file_ref_put()
  semantics. For both (1) and (2) this is safe because __fput() is
  prevented from making progress if file_ref_get() fails due to the
  aforementioned synchronization mechanisms.

  Two cases need to be considered that affect both (1) epoll and (2) ttm
  dmabuf:

   (i) fput()'s file_ref_put() and marks the file as FILE_REF_NOREF but
       before that fput() can mark the file as FILE_REF_DEAD someone
       manages to sneak in a file_ref_get() and brings the refcount back
       from FILE_REF_NOREF to FILE_REF_ONEREF. In that case the original
       fput() doesn't call __fput(). For epoll the poll will finish and
       for ttm dmabuf the file can be used again. For ttm dambuf this is
       actually an advantage because it avoids immediately allocating
       a new dmabuf object.

       CPU1                                                             CPU2
       file_ref_put()
       cnt = atomic_long_dec_return()
       -> __file_ref_put(cnt)
          if (cnt == FIlE_REF_NOREF)
                                                                        file_ref_get()
                                                                        // Brings reference back to FILE_REF_ONEREF
                                                                        atomic_long_add_negative()
                  atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)

  (ii) fput()'s file_ref_put() marks the file FILE_REF_NOREF and
       also suceeds in actually marking it FILE_REF_DEAD and then calls
       into __fput() to free the file.

       When either (1) or (2) call file_ref_get() they fail as
       atomic_long_add_negative() will return true.

       At the same time, both (1) and (2) all file_ref_get() under
       mutexes that __fput() must also acquire preventing
       kmem_cache_free() from freeing the file.

  So while this might be treated as a change in semantics for (1) and
  (2) it really isn't. It if should end up causing issues this can be
  fixed by adding a helper that does something like:

  long cnt = atomic_long_read(&ref->refcnt);
  do {
          if (cnt < 0)
                  return false;
  } while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1));
  return true;

  which would block FILE_REF_NOREF to FILE_REF_ONEREF transitions.

- Jann correctly pointed out that kmem_cache_zalloc() cannot be used
  anymore once files have been ported to file_ref_t.

  The kmem_cache_zalloc() call will memset() the whole struct file to
  zero when it is reallocated. This will also set file->f_ref to zero
  which mens that a concurrent file_ref_get() can return true:

  CPU1                            CPU2
                                  __get_file_rcu()
                                    rcu_dereference_raw()
  close()
    [frees file]
  alloc_empty_file()
    kmem_cache_zalloc()
      [reallocates same file]
      memset(..., 0, ...)
                                    file_ref_get()
                                      [increments 0->1, returns true]
    init_file()
      file_ref_init(..., 1)
        [sets to 0]
                                    rcu_dereference_raw()
                                    fput()
                                      file_ref_put()
                                        [decrements 0->FILE_REF_NOREF, frees file]
    [UAF]

   causing a concurrent __get_file_rcu() call to acquire a reference to
   the file that is about to be reallocated and immediately freeing it
   on realizing that it has been recycled. This causes a UAF for the
   task that reallocated/recycled the file.

   This is prevented by switching from kmem_cache_zalloc() to
   kmem_cache_alloc() and initializing the fields manually. With
   file->f_ref initialized last.

   Note that a memset() also isn't guaranteed to atomically update an
   unsigned long so it's theoretically possible to see torn and
   therefore bogus counter values.

Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-3-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/gpu/drm/i915/gt/shmem_utils.c |  2 +-
 drivers/gpu/drm/vmwgfx/ttm_object.c   |  2 +-
 fs/eventpoll.c                        |  2 +-
 fs/file.c                             | 14 +++++++-------
 fs/file_table.c                       | 34 +++++++++++++++++++++++++---------
 include/linux/fs.h                    | 10 +++++-----
 6 files changed, 40 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c
index 1fb6ff77fd89..bb696b29ee2c 100644
--- a/drivers/gpu/drm/i915/gt/shmem_utils.c
+++ b/drivers/gpu/drm/i915/gt/shmem_utils.c
@@ -40,7 +40,7 @@ struct file *shmem_create_from_object(struct drm_i915_gem_object *obj)
 
 	if (i915_gem_object_is_shmem(obj)) {
 		file = obj->base.filp;
-		atomic_long_inc(&file->f_count);
+		get_file(file);
 		return file;
 	}
 
diff --git a/drivers/gpu/drm/vmwgfx/ttm_object.c b/drivers/gpu/drm/vmwgfx/ttm_object.c
index 3353e97687d1..a17e62867f3b 100644
--- a/drivers/gpu/drm/vmwgfx/ttm_object.c
+++ b/drivers/gpu/drm/vmwgfx/ttm_object.c
@@ -471,7 +471,7 @@ void ttm_object_device_release(struct ttm_object_device **p_tdev)
  */
 static bool __must_check get_dma_buf_unless_doomed(struct dma_buf *dmabuf)
 {
-	return atomic_long_inc_not_zero(&dmabuf->file->f_count) != 0L;
+	return file_ref_get(&dmabuf->file->f_ref);
 }
 
 /**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1ae4542f0bd8..212383cefe6c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1002,7 +1002,7 @@ static struct file *epi_fget(const struct epitem *epi)
 	struct file *file;
 
 	file = epi->ffd.file;
-	if (!atomic_long_inc_not_zero(&file->f_count))
+	if (!file_ref_get(&file->f_ref))
 		file = NULL;
 	return file;
 }
diff --git a/fs/file.c b/fs/file.c
index ab15e2a3d280..9598c577f713 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -902,7 +902,7 @@ static struct file *__get_file_rcu(struct file __rcu **f)
 	if (!file)
 		return NULL;
 
-	if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+	if (unlikely(!file_ref_get(&file->f_ref)))
 		return ERR_PTR(-EAGAIN);
 
 	file_reloaded = rcu_dereference_raw(*f);
@@ -916,8 +916,8 @@ static struct file *__get_file_rcu(struct file __rcu **f)
 	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
 
 	/*
-	 * atomic_long_inc_not_zero() above provided a full memory
-	 * barrier when we acquired a reference.
+	 * file_ref_get() above provided a full memory barrier when we
+	 * acquired a reference.
 	 *
 	 * This is paired with the write barrier from assigning to the
 	 * __rcu protected file pointer so that if that pointer still
@@ -1015,11 +1015,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 * We need to confirm it by incrementing the refcount
 		 * and then check the lookup again.
 		 *
-		 * atomic_long_inc_not_zero() gives us a full memory
-		 * barrier. We only really need an 'acquire' one to
-		 * protect the loads below, but we don't have that.
+		 * file_ref_get() gives us a full memory barrier. We
+		 * only really need an 'acquire' one to protect the
+		 * loads below, but we don't have that.
 		 */
-		if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+		if (unlikely(!file_ref_get(&file->f_ref)))
 			continue;
 
 		/*
diff --git a/fs/file_table.c b/fs/file_table.c
index 4b23eb7b79dd..db4fde6fe620 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -169,16 +169,32 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 	 * the respective member when opening the file.
 	 */
 	mutex_init(&f->f_pos_lock);
-	f->f_flags = flags;
-	f->f_mode = OPEN_FMODE(flags);
-	/* f->f_version: 0 */
+	memset(&f->f_path, 0, sizeof(f->f_path));
+	memset(&f->f_ra, 0, sizeof(f->f_ra));
+
+	f->f_flags	= flags;
+	f->f_mode	= OPEN_FMODE(flags);
+
+	f->f_op		= NULL;
+	f->f_mapping	= NULL;
+	f->private_data = NULL;
+	f->f_inode	= NULL;
+	f->f_owner	= NULL;
+#ifdef CONFIG_EPOLL
+	f->f_ep		= NULL;
+#endif
+
+	f->f_iocb_flags = 0;
+	f->f_pos	= 0;
+	f->f_wb_err	= 0;
+	f->f_sb_err	= 0;
 
 	/*
 	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
 	 * fget-rcu pattern users need to be able to handle spurious
 	 * refcount bumps we should reinitialize the reused file first.
 	 */
-	atomic_long_set(&f->f_count, 1);
+	file_ref_init(&f->f_ref, 1);
 	return 0;
 }
 
@@ -210,7 +226,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
 			goto over;
 	}
 
-	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+	f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
 	if (unlikely(!f))
 		return ERR_PTR(-ENOMEM);
 
@@ -244,7 +260,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
 	struct file *f;
 	int error;
 
-	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+	f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
 	if (unlikely(!f))
 		return ERR_PTR(-ENOMEM);
 
@@ -271,7 +287,7 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
 	struct backing_file *ff;
 	int error;
 
-	ff = kmem_cache_zalloc(bfilp_cachep, GFP_KERNEL);
+	ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
 	if (unlikely(!ff))
 		return ERR_PTR(-ENOMEM);
 
@@ -483,7 +499,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
 
 void fput(struct file *file)
 {
-	if (atomic_long_dec_and_test(&file->f_count)) {
+	if (file_ref_put(&file->f_ref)) {
 		struct task_struct *task = current;
 
 		if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
@@ -516,7 +532,7 @@ void fput(struct file *file)
  */
 void __fput_sync(struct file *file)
 {
-	if (atomic_long_dec_and_test(&file->f_count))
+	if (file_ref_put(&file->f_ref))
 		__fput(file);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e3c603d01337..c13f648a1c13 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -45,6 +45,7 @@
 #include <linux/slab.h>
 #include <linux/maple_tree.h>
 #include <linux/rw_hint.h>
+#include <linux/file_ref.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1005,7 +1006,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
 
 /**
  * struct file - Represents a file
- * @f_count: reference count
+ * @f_ref: reference count
  * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
  * @f_mode: FMODE_* flags often used in hotpaths
  * @f_op: file operations
@@ -1030,7 +1031,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
  * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
  */
 struct file {
-	atomic_long_t			f_count;
+	file_ref_t			f_ref;
 	spinlock_t			f_lock;
 	fmode_t				f_mode;
 	const struct file_operations	*f_op;
@@ -1078,15 +1079,14 @@ struct file_handle {
 
 static inline struct file *get_file(struct file *f)
 {
-	long prior = atomic_long_fetch_inc_relaxed(&f->f_count);
-	WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n");
+	file_ref_inc(&f->f_ref);
 	return f;
 }
 
 struct file *get_file_rcu(struct file __rcu **f);
 struct file *get_file_active(struct file **f);
 
-#define file_count(x)	atomic_long_read(&(x)->f_count)
+#define file_count(f)	file_ref_read(&(f)->f_ref)
 
 #define	MAX_NON_LFS	((1UL<<31) - 1)
 
-- 
cgit v1.2.3


From 95b6235e36953e9d6c822270961e79751eee2276 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 28 Oct 2024 17:58:38 +0000
Subject: iommu: Make bus_iommu_probe() static

With the last external caller of bus_iommu_probe() now gone, make it
internal as it really should be.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: H. Nikolaus Schaller <hns@goldelico.com>
Reviewed-by: Kevin Hilman <khilman@baylibre.com>
Tested-by: Kevin Hilman <khilman@baylibre.com>
Tested-by: Beleswar Padhi <b-padhi@ti.com>
Link: https://lore.kernel.org/r/a7511a034a27259aff4e14d80a861d3c40fbff1e.1730136799.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 3 ++-
 include/linux/iommu.h | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 83c8e617a2c5..35291b02aab1 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -90,6 +90,7 @@ static const char * const iommu_group_resv_type_string[] = {
 #define IOMMU_CMD_LINE_DMA_API		BIT(0)
 #define IOMMU_CMD_LINE_STRICT		BIT(1)
 
+static int bus_iommu_probe(const struct bus_type *bus);
 static int iommu_bus_notifier(struct notifier_block *nb,
 			      unsigned long action, void *data);
 static void iommu_release_device(struct device *dev);
@@ -1795,7 +1796,7 @@ static void iommu_group_do_probe_finalize(struct device *dev)
 		ops->probe_finalize(dev);
 }
 
-int bus_iommu_probe(const struct bus_type *bus)
+static int bus_iommu_probe(const struct bus_type *bus)
 {
 	struct iommu_group *group, *next;
 	LIST_HEAD(group_list);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bd722f473635..84a6ed5e803a 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -784,7 +784,6 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
 	};
 }
 
-extern int bus_iommu_probe(const struct bus_type *bus);
 extern bool iommu_present(const struct bus_type *bus);
 extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap);
 extern bool iommu_group_has_isolated_msi(struct iommu_group *group);
-- 
cgit v1.2.3


From 133008e84b99e4f5f8cf3d8b768c995732df9406 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Wed, 16 Oct 2024 13:13:09 -0700
Subject: blk-integrity: remove seed for user mapped buffers

The seed is only used for kernel generation and verification. That
doesn't happen for user buffers, so passing the seed around doesn't
accomplish anything.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
Link: https://lore.kernel.org/r/20241016201309.1090320-1-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio-integrity.c         | 13 +++++--------
 block/blk-integrity.c         |  4 ++--
 drivers/nvme/host/ioctl.c     | 17 ++++++++---------
 include/linux/bio-integrity.h |  4 ++--
 include/linux/blk-integrity.h |  5 ++---
 5 files changed, 19 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 88e3ad73c385..2a4bd6611692 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -199,7 +199,7 @@ EXPORT_SYMBOL(bio_integrity_add_page);
 
 static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
 				   int nr_vecs, unsigned int len,
-				   unsigned int direction, u32 seed)
+				   unsigned int direction)
 {
 	bool write = direction == ITER_SOURCE;
 	struct bio_integrity_payload *bip;
@@ -247,7 +247,6 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
 	}
 
 	bip->bip_flags |= BIP_COPY_USER;
-	bip->bip_iter.bi_sector = seed;
 	bip->bip_vcnt = nr_vecs;
 	return 0;
 free_bip:
@@ -258,7 +257,7 @@ free_buf:
 }
 
 static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
-				   int nr_vecs, unsigned int len, u32 seed)
+				   int nr_vecs, unsigned int len)
 {
 	struct bio_integrity_payload *bip;
 
@@ -267,7 +266,6 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
 		return PTR_ERR(bip);
 
 	memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec));
-	bip->bip_iter.bi_sector = seed;
 	bip->bip_iter.bi_size = len;
 	bip->bip_vcnt = nr_vecs;
 	return 0;
@@ -303,8 +301,7 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
 	return nr_bvecs;
 }
 
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
-			   u32 seed)
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
 {
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
@@ -350,9 +347,9 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
 
 	if (copy)
 		ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
-					      direction, seed);
+					      direction);
 	else
-		ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
+		ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes);
 	if (ret)
 		goto release_pages;
 	if (bvec != stack_vec)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 83b696ba0cac..b180cac61a9d 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -113,9 +113,9 @@ new_segment:
 EXPORT_SYMBOL(blk_rq_map_integrity_sg);
 
 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
-			      ssize_t bytes, u32 seed)
+			      ssize_t bytes)
 {
-	int ret = bio_integrity_map_user(rq->bio, ubuf, bytes, seed);
+	int ret = bio_integrity_map_user(rq->bio, ubuf, bytes);
 
 	if (ret)
 		return ret;
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index b9b79ccfabf8..f697d2d1d7e4 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -114,7 +114,7 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
 
 static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
-		u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags)
+		struct io_uring_cmd *ioucmd, unsigned int flags)
 {
 	struct request_queue *q = req->q;
 	struct nvme_ns *ns = q->queuedata;
@@ -152,8 +152,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		bio_set_dev(bio, bdev);
 
 	if (has_metadata) {
-		ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len,
-						meta_seed);
+		ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len);
 		if (ret)
 			goto out_unmap;
 	}
@@ -170,7 +169,7 @@ out:
 
 static int nvme_submit_user_cmd(struct request_queue *q,
 		struct nvme_command *cmd, u64 ubuffer, unsigned bufflen,
-		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
+		void __user *meta_buffer, unsigned meta_len,
 		u64 *result, unsigned timeout, unsigned int flags)
 {
 	struct nvme_ns *ns = q->queuedata;
@@ -187,7 +186,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	req->timeout = timeout;
 	if (ubuffer && bufflen) {
 		ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
-				meta_len, meta_seed, NULL, flags);
+				meta_len, NULL, flags);
 		if (ret)
 			return ret;
 	}
@@ -268,7 +267,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	c.rw.lbatm = cpu_to_le16(io.appmask);
 
 	return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata,
-			meta_len, lower_32_bits(io.slba), NULL, 0, 0);
+			meta_len, NULL, 0, 0);
 }
 
 static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
@@ -323,7 +322,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 
 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
 			cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
-			cmd.metadata_len, 0, &result, timeout, 0);
+			cmd.metadata_len, &result, timeout, 0);
 
 	if (status >= 0) {
 		if (put_user(result, &ucmd->result))
@@ -370,7 +369,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 
 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
 			cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
-			cmd.metadata_len, 0, &cmd.result, timeout, flags);
+			cmd.metadata_len, &cmd.result, timeout, flags);
 
 	if (status >= 0) {
 		if (put_user(cmd.result, &ucmd->result))
@@ -504,7 +503,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (d.addr && d.data_len) {
 		ret = nvme_map_user_request(req, d.addr,
 			d.data_len, nvme_to_user_ptr(d.metadata),
-			d.metadata_len, 0, ioucmd, vec);
+			d.metadata_len, ioucmd, vec);
 		if (ret)
 			return ret;
 	}
diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h
index dd831c269e99..dbf0f74c1529 100644
--- a/include/linux/bio-integrity.h
+++ b/include/linux/bio-integrity.h
@@ -72,7 +72,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp,
 		unsigned int nr);
 int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len,
 		unsigned int offset);
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed);
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len);
 void bio_integrity_unmap_user(struct bio *bio);
 bool bio_integrity_prep(struct bio *bio);
 void bio_integrity_advance(struct bio *bio, unsigned int bytes_done);
@@ -99,7 +99,7 @@ static inline void bioset_integrity_free(struct bio_set *bs)
 }
 
 static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
-					 ssize_t len, u32 seed)
+					 ssize_t len)
 {
 	return -EINVAL;
 }
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index 676f8f860c47..c7eae0bfb013 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -28,7 +28,7 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t,
 int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
-			      ssize_t bytes, u32 seed);
+			      ssize_t bytes);
 
 static inline bool
 blk_integrity_queue_supports_integrity(struct request_queue *q)
@@ -104,8 +104,7 @@ static inline int blk_rq_map_integrity_sg(struct request *q,
 }
 static inline int blk_rq_integrity_map_user(struct request *rq,
 					    void __user *ubuf,
-					    ssize_t bytes,
-					    u32 seed)
+					    ssize_t bytes)
 {
 	return -EINVAL;
 }
-- 
cgit v1.2.3


From 899f44531fe6cac4b024710fec647ecc127724b8 Mon Sep 17 00:00:00 2001
From: Sibi Sankar <quic_sibis@quicinc.com>
Date: Wed, 30 Oct 2024 18:25:10 +0530
Subject: pmdomain: core: Add GENPD_FLAG_DEV_NAME_FW flag

Introduce GENPD_FLAG_DEV_NAME_FW flag which instructs genpd to generate
an unique device name using ida. It is aimed to be used by genpd providers
which derive their names directly from FW making them susceptible to
debugfs node creation failures.

Reported-by: Johan Hovold <johan+linaro@kernel.org>
Closes: https://lore.kernel.org/lkml/ZoQjAWse2YxwyRJv@hovoldconsulting.com/
Fixes: 718072ceb211 ("PM: domains: create debugfs nodes when adding power domains")
Suggested-by: Ulf Hansson <ulf.hansson@linaro.org>
Suggested-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Signed-off-by: Sibi Sankar <quic_sibis@quicinc.com>
Cc: stable@vger.kernel.org
Message-ID: <20241030125512.2884761-5-quic_sibis@quicinc.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/pmdomain/core.c   | 49 ++++++++++++++++++++++++++++++++---------------
 include/linux/pm_domain.h |  6 ++++++
 2 files changed, 40 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index 5ede0f7eda09..29ad510e881c 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -7,6 +7,7 @@
 #define pr_fmt(fmt) "PM: " fmt
 
 #include <linux/delay.h>
+#include <linux/idr.h>
 #include <linux/kernel.h>
 #include <linux/io.h>
 #include <linux/platform_device.h>
@@ -23,6 +24,9 @@
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
 
+/* Provides a unique ID for each genpd device */
+static DEFINE_IDA(genpd_ida);
+
 #define GENPD_RETRY_MAX_MS	250		/* Approximate */
 
 #define GENPD_DEV_CALLBACK(genpd, type, callback, dev)		\
@@ -171,6 +175,7 @@ static const struct genpd_lock_ops genpd_raw_spin_ops = {
 #define genpd_is_cpu_domain(genpd)	(genpd->flags & GENPD_FLAG_CPU_DOMAIN)
 #define genpd_is_rpm_always_on(genpd)	(genpd->flags & GENPD_FLAG_RPM_ALWAYS_ON)
 #define genpd_is_opp_table_fw(genpd)	(genpd->flags & GENPD_FLAG_OPP_TABLE_FW)
+#define genpd_is_dev_name_fw(genpd)	(genpd->flags & GENPD_FLAG_DEV_NAME_FW)
 
 static inline bool irq_safe_dev_in_sleep_domain(struct device *dev,
 		const struct generic_pm_domain *genpd)
@@ -189,7 +194,7 @@ static inline bool irq_safe_dev_in_sleep_domain(struct device *dev,
 
 	if (ret)
 		dev_warn_once(dev, "PM domain %s will not be powered off\n",
-				genpd->name);
+			      dev_name(&genpd->dev));
 
 	return ret;
 }
@@ -274,7 +279,7 @@ static void genpd_debug_remove(struct generic_pm_domain *genpd)
 	if (!genpd_debugfs_dir)
 		return;
 
-	debugfs_lookup_and_remove(genpd->name, genpd_debugfs_dir);
+	debugfs_lookup_and_remove(dev_name(&genpd->dev), genpd_debugfs_dir);
 }
 
 static void genpd_update_accounting(struct generic_pm_domain *genpd)
@@ -731,7 +736,7 @@ static int _genpd_power_on(struct generic_pm_domain *genpd, bool timed)
 	genpd->states[state_idx].power_on_latency_ns = elapsed_ns;
 	genpd->gd->max_off_time_changed = true;
 	pr_debug("%s: Power-%s latency exceeded, new value %lld ns\n",
-		 genpd->name, "on", elapsed_ns);
+		 dev_name(&genpd->dev), "on", elapsed_ns);
 
 out:
 	raw_notifier_call_chain(&genpd->power_notifiers, GENPD_NOTIFY_ON, NULL);
@@ -782,7 +787,7 @@ static int _genpd_power_off(struct generic_pm_domain *genpd, bool timed)
 	genpd->states[state_idx].power_off_latency_ns = elapsed_ns;
 	genpd->gd->max_off_time_changed = true;
 	pr_debug("%s: Power-%s latency exceeded, new value %lld ns\n",
-		 genpd->name, "off", elapsed_ns);
+		 dev_name(&genpd->dev), "off", elapsed_ns);
 
 out:
 	raw_notifier_call_chain(&genpd->power_notifiers, GENPD_NOTIFY_OFF,
@@ -1940,7 +1945,7 @@ int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb)
 
 	if (ret) {
 		dev_warn(dev, "failed to add notifier for PM domain %s\n",
-			 genpd->name);
+			 dev_name(&genpd->dev));
 		return ret;
 	}
 
@@ -1987,7 +1992,7 @@ int dev_pm_genpd_remove_notifier(struct device *dev)
 
 	if (ret) {
 		dev_warn(dev, "failed to remove notifier for PM domain %s\n",
-			 genpd->name);
+			 dev_name(&genpd->dev));
 		return ret;
 	}
 
@@ -2013,7 +2018,7 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
 	 */
 	if (!genpd_is_irq_safe(genpd) && genpd_is_irq_safe(subdomain)) {
 		WARN(1, "Parent %s of subdomain %s must be IRQ safe\n",
-				genpd->name, subdomain->name);
+		     dev_name(&genpd->dev), subdomain->name);
 		return -EINVAL;
 	}
 
@@ -2088,7 +2093,7 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 
 	if (!list_empty(&subdomain->parent_links) || subdomain->device_count) {
 		pr_warn("%s: unable to remove subdomain %s\n",
-			genpd->name, subdomain->name);
+			dev_name(&genpd->dev), subdomain->name);
 		ret = -EBUSY;
 		goto out;
 	}
@@ -2225,6 +2230,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
 	genpd->status = is_off ? GENPD_STATE_OFF : GENPD_STATE_ON;
 	genpd->device_count = 0;
 	genpd->provider = NULL;
+	genpd->device_id = -ENXIO;
 	genpd->has_provider = false;
 	genpd->accounting_time = ktime_get_mono_fast_ns();
 	genpd->domain.ops.runtime_suspend = genpd_runtime_suspend;
@@ -2265,7 +2271,18 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
 		return ret;
 
 	device_initialize(&genpd->dev);
-	dev_set_name(&genpd->dev, "%s", genpd->name);
+
+	if (!genpd_is_dev_name_fw(genpd)) {
+		dev_set_name(&genpd->dev, "%s", genpd->name);
+	} else {
+		ret = ida_alloc(&genpd_ida, GFP_KERNEL);
+		if (ret < 0) {
+			put_device(&genpd->dev);
+			return ret;
+		}
+		genpd->device_id = ret;
+		dev_set_name(&genpd->dev, "%s_%u", genpd->name, genpd->device_id);
+	}
 
 	mutex_lock(&gpd_list_lock);
 	list_add(&genpd->gpd_list_node, &gpd_list);
@@ -2287,13 +2304,13 @@ static int genpd_remove(struct generic_pm_domain *genpd)
 
 	if (genpd->has_provider) {
 		genpd_unlock(genpd);
-		pr_err("Provider present, unable to remove %s\n", genpd->name);
+		pr_err("Provider present, unable to remove %s\n", dev_name(&genpd->dev));
 		return -EBUSY;
 	}
 
 	if (!list_empty(&genpd->parent_links) || genpd->device_count) {
 		genpd_unlock(genpd);
-		pr_err("%s: unable to remove %s\n", __func__, genpd->name);
+		pr_err("%s: unable to remove %s\n", __func__, dev_name(&genpd->dev));
 		return -EBUSY;
 	}
 
@@ -2307,9 +2324,11 @@ static int genpd_remove(struct generic_pm_domain *genpd)
 	genpd_unlock(genpd);
 	genpd_debug_remove(genpd);
 	cancel_work_sync(&genpd->power_off_work);
+	if (genpd->device_id != -ENXIO)
+		ida_free(&genpd_ida, genpd->device_id);
 	genpd_free_data(genpd);
 
-	pr_debug("%s: removed %s\n", __func__, genpd->name);
+	pr_debug("%s: removed %s\n", __func__, dev_name(&genpd->dev));
 
 	return 0;
 }
@@ -3272,12 +3291,12 @@ static int genpd_summary_one(struct seq_file *s,
 	else
 		snprintf(state, sizeof(state), "%s",
 			 status_lookup[genpd->status]);
-	seq_printf(s, "%-30s  %-30s  %u", genpd->name, state, genpd->performance_state);
+	seq_printf(s, "%-30s  %-30s  %u", dev_name(&genpd->dev), state, genpd->performance_state);
 
 	/*
 	 * Modifications on the list require holding locks on both
 	 * parent and child, so we are safe.
-	 * Also genpd->name is immutable.
+	 * Also the device name is immutable.
 	 */
 	list_for_each_entry(link, &genpd->parent_links, parent_node) {
 		if (list_is_first(&link->parent_node, &genpd->parent_links))
@@ -3502,7 +3521,7 @@ static void genpd_debug_add(struct generic_pm_domain *genpd)
 	if (!genpd_debugfs_dir)
 		return;
 
-	d = debugfs_create_dir(genpd->name, genpd_debugfs_dir);
+	d = debugfs_create_dir(dev_name(&genpd->dev), genpd_debugfs_dir);
 
 	debugfs_create_file("current_state", 0444,
 			    d, genpd, &status_fops);
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index b637ec14025f..cf4b11be3709 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -92,6 +92,10 @@ struct dev_pm_domain_list {
  * GENPD_FLAG_OPP_TABLE_FW:	The genpd provider supports performance states,
  *				but its corresponding OPP tables are not
  *				described in DT, but are given directly by FW.
+ *
+ * GENPD_FLAG_DEV_NAME_FW:	Instructs genpd to generate an unique device name
+ *				using ida. It is used by genpd providers which
+ *				get their genpd-names directly from FW.
  */
 #define GENPD_FLAG_PM_CLK	 (1U << 0)
 #define GENPD_FLAG_IRQ_SAFE	 (1U << 1)
@@ -101,6 +105,7 @@ struct dev_pm_domain_list {
 #define GENPD_FLAG_RPM_ALWAYS_ON (1U << 5)
 #define GENPD_FLAG_MIN_RESIDENCY (1U << 6)
 #define GENPD_FLAG_OPP_TABLE_FW	 (1U << 7)
+#define GENPD_FLAG_DEV_NAME_FW	 (1U << 8)
 
 enum gpd_status {
 	GENPD_STATE_ON = 0,	/* PM domain is on */
@@ -163,6 +168,7 @@ struct generic_pm_domain {
 	atomic_t sd_count;	/* Number of subdomains with power "on" */
 	enum gpd_status status;	/* Current state of the domain */
 	unsigned int device_count;	/* Number of devices */
+	unsigned int device_id;		/* unique device id */
 	unsigned int suspended_count;	/* System suspend device counter */
 	unsigned int prepared_count;	/* Suspend counter of prepared devices */
 	unsigned int performance_state;	/* Aggregated max performance state */
-- 
cgit v1.2.3


From b35108a51cf7bab58d7eace1267d7965978bcdb8 Mon Sep 17 00:00:00 2001
From: Easwar Hariharan <eahariha@linux.microsoft.com>
Date: Wed, 30 Oct 2024 17:47:35 +0000
Subject: jiffies: Define secs_to_jiffies()

secs_to_jiffies() is defined in hci_event.c and cannot be reused by
other call sites. Hoist it into the core code to allow conversion of the
~1150 usages of msecs_to_jiffies() that either:

 - use a multiplier value of 1000 or equivalently MSEC_PER_SEC, or
 - have timeouts that are denominated in seconds (i.e. end in 000)

It's implemented as a macro to allow usage in static initializers.

This will also allow conversion of yet more sites that use (sec * HZ)
directly, and improve their readability.

Suggested-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Easwar Hariharan <eahariha@linux.microsoft.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Link: https://lore.kernel.org/all/20241030-open-coded-timeouts-v3-1-9ba123facf88@linux.microsoft.com
---
 include/linux/jiffies.h   | 13 +++++++++++++
 net/bluetooth/hci_event.c |  2 --
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 5d21dacd62bc..ed945f42e064 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -526,6 +526,19 @@ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m)
 	}
 }
 
+/**
+ * secs_to_jiffies: - convert seconds to jiffies
+ * @_secs: time in seconds
+ *
+ * Conversion is done by simple multiplication with HZ
+ *
+ * secs_to_jiffies() is defined as a macro rather than a static inline
+ * function so it can be used in static initializers.
+ *
+ * Return: jiffies value
+ */
+#define secs_to_jiffies(_secs) ((_secs) * HZ)
+
 extern unsigned long __usecs_to_jiffies(const unsigned int u);
 #if !(USEC_PER_SEC % HZ)
 static inline unsigned long _usecs_to_jiffies(const unsigned int u)
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 1c82dcdf6e8f..4bd94d432bcf 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -42,8 +42,6 @@
 #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \
 		 "\x00\x00\x00\x00\x00\x00\x00\x00"
 
-#define secs_to_jiffies(_secs) msecs_to_jiffies((_secs) * 1000)
-
 /* Handle HCI Event packets */
 
 static void *hci_ev_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
-- 
cgit v1.2.3


From d9d959c36bec59f11c0eb6b5308729e3c4901b5e Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Wed, 30 Oct 2024 12:27:34 +0100
Subject: PCI: Make pcim_request_all_regions() a public function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to remove the deprecated function
pcim_iomap_regions_request_all(), a few drivers need an interface to
request all BARs a PCI device offers.

Make pcim_request_all_regions() a public interface.

Link: https://lore.kernel.org/r/20241030112743.104395-2-pstanner@redhat.com
Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/pci/devres.c | 3 ++-
 include/linux/pci.h  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/devres.c b/drivers/pci/devres.c
index b133967faef8..2a64da5c91fb 100644
--- a/drivers/pci/devres.c
+++ b/drivers/pci/devres.c
@@ -939,7 +939,7 @@ static void pcim_release_all_regions(struct pci_dev *pdev)
  * desired, release individual regions with pcim_release_region() or all of
  * them at once with pcim_release_all_regions().
  */
-static int pcim_request_all_regions(struct pci_dev *pdev, const char *name)
+int pcim_request_all_regions(struct pci_dev *pdev, const char *name)
 {
 	int ret;
 	int bar;
@@ -957,6 +957,7 @@ err:
 
 	return ret;
 }
+EXPORT_SYMBOL(pcim_request_all_regions);
 
 /**
  * pcim_iomap_regions_request_all - Request all BARs and iomap specified ones
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 573b4c4c2be6..3b151c8331e5 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2293,6 +2293,7 @@ static inline void pci_fixup_device(enum pci_fixup_pass pass,
 				    struct pci_dev *dev) { }
 #endif
 
+int pcim_request_all_regions(struct pci_dev *pdev, const char *name);
 void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen);
 void __iomem *pcim_iomap_region(struct pci_dev *pdev, int bar,
 				const char *name);
-- 
cgit v1.2.3


From 6d9c59212523e719a63575222f1cd1b0aca3da4f Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Wed, 30 Oct 2024 12:27:43 +0100
Subject: PCI: Remove pcim_iomap_regions_request_all()

pcim_iomap_regions_request_all() have been deprecated in
commit e354bb84a4c1 ("PCI: Deprecate pcim_iomap_table(),
pcim_iomap_regions_request_all()").

All users of this function have been ported to other interfaces by now.

Remove pcim_iomap_regions_request_all().

Link: https://lore.kernel.org/r/20241030112743.104395-11-pstanner@redhat.com
Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
---
 Documentation/driver-api/driver-model/devres.rst |  1 -
 drivers/pci/devres.c                             | 56 ------------------------
 include/linux/pci.h                              |  2 -
 3 files changed, 59 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 5f2ee8d717b1..3a30cf4f6c0d 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -394,7 +394,6 @@ PCI
   pcim_enable_device()		: after success, some PCI ops become managed
   pcim_iomap()			: do iomap() on a single BAR
   pcim_iomap_regions()		: do request_region() and iomap() on multiple BARs
-  pcim_iomap_regions_request_all() : do request_region() on all and iomap() on multiple BARs
   pcim_iomap_table()		: array of mapped addresses indexed by BAR
   pcim_iounmap()		: do iounmap() on a single BAR
   pcim_iounmap_regions()	: do iounmap() and release_region() on multiple BARs
diff --git a/drivers/pci/devres.c b/drivers/pci/devres.c
index 2a64da5c91fb..319a477a2135 100644
--- a/drivers/pci/devres.c
+++ b/drivers/pci/devres.c
@@ -959,62 +959,6 @@ err:
 }
 EXPORT_SYMBOL(pcim_request_all_regions);
 
-/**
- * pcim_iomap_regions_request_all - Request all BARs and iomap specified ones
- *			(DEPRECATED)
- * @pdev: PCI device to map IO resources for
- * @mask: Mask of BARs to iomap
- * @name: Name associated with the requests
- *
- * Returns: 0 on success, negative error code on failure.
- *
- * Request all PCI BARs and iomap regions specified by @mask.
- *
- * To release these resources manually, call pcim_release_region() for the
- * regions and pcim_iounmap() for the mappings.
- *
- * This function is DEPRECATED. Don't use it in new code. Instead, use one
- * of the pcim_* region request functions in combination with a pcim_*
- * mapping function.
- */
-int pcim_iomap_regions_request_all(struct pci_dev *pdev, int mask,
-				   const char *name)
-{
-	int bar;
-	int ret;
-	void __iomem **legacy_iomap_table;
-
-	ret = pcim_request_all_regions(pdev, name);
-	if (ret != 0)
-		return ret;
-
-	for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
-		if (!mask_contains_bar(mask, bar))
-			continue;
-		if (!pcim_iomap(pdev, bar, 0))
-			goto err;
-	}
-
-	return 0;
-
-err:
-	/*
-	 * If bar is larger than 0, then pcim_iomap() above has most likely
-	 * failed because of -EINVAL. If it is equal 0, most likely the table
-	 * couldn't be created, indicating -ENOMEM.
-	 */
-	ret = bar > 0 ? -EINVAL : -ENOMEM;
-	legacy_iomap_table = (void __iomem **)pcim_iomap_table(pdev);
-
-	while (--bar >= 0)
-		pcim_iounmap(pdev, legacy_iomap_table[bar]);
-
-	pcim_release_all_regions(pdev);
-
-	return ret;
-}
-EXPORT_SYMBOL(pcim_iomap_regions_request_all);
-
 /**
  * pcim_iounmap_regions - Unmap and release PCI BARs
  * @pdev: PCI device to map IO resources for
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 3b151c8331e5..b59197635c5c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2301,8 +2301,6 @@ void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr);
 void __iomem * const *pcim_iomap_table(struct pci_dev *pdev);
 int pcim_request_region(struct pci_dev *pdev, int bar, const char *name);
 int pcim_iomap_regions(struct pci_dev *pdev, int mask, const char *name);
-int pcim_iomap_regions_request_all(struct pci_dev *pdev, int mask,
-				   const char *name);
 void pcim_iounmap_regions(struct pci_dev *pdev, int mask);
 void __iomem *pcim_iomap_range(struct pci_dev *pdev, int bar,
 				unsigned long offset, unsigned long len);
-- 
cgit v1.2.3


From 4a6afd60733c75369680f4a40c82b7c8528f4a7a Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Wed, 16 Oct 2024 11:49:04 +0200
Subject: PCI: Make pcim_iounmap_region() a public function

The function pcim_iounmap_regions() is problematic because it uses a
bitmask mechanism to release / iounmap multiple BARs at once. It, thus,
prevents getting rid of the problematic iomap table mechanism which was
deprecated in commit e354bb84a4c1 ("PCI: Deprecate pcim_iomap_table(),
pcim_iomap_regions_request_all()").

pcim_iounmap_region() does not have that problem. Make it public as the
successor of pcim_iounmap_regions().

Link: https://lore.kernel.org/r/20241016094911.24818-3-pstanner@redhat.com
Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/devres.c | 3 ++-
 include/linux/pci.h  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/devres.c b/drivers/pci/devres.c
index 319a477a2135..fc50d2456aed 100644
--- a/drivers/pci/devres.c
+++ b/drivers/pci/devres.c
@@ -773,7 +773,7 @@ EXPORT_SYMBOL(pcim_iomap_region);
  * Unmap a BAR and release its region manually. Only pass BARs that were
  * previously mapped by pcim_iomap_region().
  */
-static void pcim_iounmap_region(struct pci_dev *pdev, int bar)
+void pcim_iounmap_region(struct pci_dev *pdev, int bar)
 {
 	struct pcim_addr_devres res_searched;
 
@@ -784,6 +784,7 @@ static void pcim_iounmap_region(struct pci_dev *pdev, int bar)
 	devres_release(&pdev->dev, pcim_addr_resource_release,
 			pcim_addr_resources_match, &res_searched);
 }
+EXPORT_SYMBOL(pcim_iounmap_region);
 
 /**
  * pcim_iomap_regions - Request and iomap PCI BARs (DEPRECATED)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b59197635c5c..b8d248c136ba 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2297,6 +2297,7 @@ int pcim_request_all_regions(struct pci_dev *pdev, const char *name);
 void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen);
 void __iomem *pcim_iomap_region(struct pci_dev *pdev, int bar,
 				const char *name);
+void pcim_iounmap_region(struct pci_dev *pdev, int bar);
 void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr);
 void __iomem * const *pcim_iomap_table(struct pci_dev *pdev);
 int pcim_request_region(struct pci_dev *pdev, int bar, const char *name);
-- 
cgit v1.2.3


From 3e7f43188ee227bcf0f07f60a00f1fd1aca10e6a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 2 Aug 2024 13:01:36 -0700
Subject: KVM: Protect vCPU's "last run PID" with rwlock, not RCU

To avoid jitter on KVM_RUN due to synchronize_rcu(), use a rwlock instead
of RCU to protect vcpu->pid, a.k.a. the pid of the task last used to a
vCPU.  When userspace is doing M:N scheduling of tasks to vCPUs, e.g. to
run SEV migration helper vCPUs during post-copy, the synchronize_rcu()
needed to change the PID associated with the vCPU can stall for hundreds
of milliseconds, which is problematic for latency sensitive post-copy
operations.

In the directed yield path, do not acquire the lock if it's contended,
i.e. if the associated PID is changing, as that means the vCPU's task is
already running.

Reported-by: Steve Rutherford <srutherford@google.com>
Reviewed-by: Steve Rutherford <srutherford@google.com>
Acked-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20240802200136.329973-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/arm64/include/asm/kvm_host.h |  2 +-
 include/linux/kvm_host.h          |  3 ++-
 virt/kvm/kvm_main.c               | 39 +++++++++++++++++++++++++--------------
 3 files changed, 28 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index bf64fed9820e..04febe60c88e 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1140,7 +1140,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
 
-#define vcpu_has_run_once(vcpu)	!!rcu_access_pointer((vcpu)->pid)
+#define vcpu_has_run_once(vcpu)	(!!READ_ONCE((vcpu)->pid))
 
 #ifndef __KVM_NVHE_HYPERVISOR__
 #define kvm_call_hyp_nvhe(f, ...)						\
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 02f0206fd2dc..18a1672ffcbf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -334,7 +334,8 @@ struct kvm_vcpu {
 #ifndef __KVM_HAVE_ARCH_WQP
 	struct rcuwait wait;
 #endif
-	struct pid __rcu *pid;
+	struct pid *pid;
+	rwlock_t pid_lock;
 	int sigset_active;
 	sigset_t sigset;
 	unsigned int halt_poll_ns;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 17048d9575e3..c1876b9ed213 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -447,6 +447,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	vcpu->pid = NULL;
+	rwlock_init(&vcpu->pid_lock);
 #ifndef __KVM_HAVE_ARCH_WQP
 	rcuwait_init(&vcpu->wait);
 #endif
@@ -474,7 +475,7 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 	 * the vcpu->pid pointer, and at destruction time all file descriptors
 	 * are already gone.
 	 */
-	put_pid(rcu_dereference_protected(vcpu->pid, 1));
+	put_pid(vcpu->pid);
 
 	free_page((unsigned long)vcpu->run);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
@@ -3770,15 +3771,17 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
 
 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
 {
-	struct pid *pid;
 	struct task_struct *task = NULL;
 	int ret;
 
-	rcu_read_lock();
-	pid = rcu_dereference(target->pid);
-	if (pid)
-		task = get_pid_task(pid, PIDTYPE_PID);
-	rcu_read_unlock();
+	if (!read_trylock(&target->pid_lock))
+		return 0;
+
+	if (target->pid)
+		task = get_pid_task(target->pid, PIDTYPE_PID);
+
+	read_unlock(&target->pid_lock);
+
 	if (!task)
 		return 0;
 	ret = yield_to(task, 1);
@@ -4030,9 +4033,9 @@ static int vcpu_get_pid(void *data, u64 *val)
 {
 	struct kvm_vcpu *vcpu = data;
 
-	rcu_read_lock();
-	*val = pid_nr(rcu_dereference(vcpu->pid));
-	rcu_read_unlock();
+	read_lock(&vcpu->pid_lock);
+	*val = pid_nr(vcpu->pid);
+	read_unlock(&vcpu->pid_lock);
 	return 0;
 }
 
@@ -4318,7 +4321,14 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (arg)
 			goto out;
-		oldpid = rcu_access_pointer(vcpu->pid);
+
+		/*
+		 * Note, vcpu->pid is primarily protected by vcpu->mutex. The
+		 * dedicated r/w lock allows other tasks, e.g. other vCPUs, to
+		 * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield
+		 * directly to this vCPU
+		 */
+		oldpid = vcpu->pid;
 		if (unlikely(oldpid != task_pid(current))) {
 			/* The thread running this VCPU changed. */
 			struct pid *newpid;
@@ -4328,9 +4338,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
 				break;
 
 			newpid = get_task_pid(current, PIDTYPE_PID);
-			rcu_assign_pointer(vcpu->pid, newpid);
-			if (oldpid)
-				synchronize_rcu();
+			write_lock(&vcpu->pid_lock);
+			vcpu->pid = newpid;
+			write_unlock(&vcpu->pid_lock);
+
 			put_pid(oldpid);
 		}
 		vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
-- 
cgit v1.2.3


From 9e9af8bbb5f9b565b9faf691f96f661791e199b0 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Thu, 10 Oct 2024 07:26:04 -0700
Subject: perf/x86/rapl: Clean up cpumask and hotplug

The rapl pmu is die scope, which is supported by the generic perf_event
subsystem now.

Set the scope for the rapl PMU and remove all the cpumask and hotplug
codes.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Oliver Sang <oliver.sang@intel.com>
Tested-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Link: https://lore.kernel.org/r/20241010142604.770192-2-kan.liang@linux.intel.com
---
 arch/x86/events/rapl.c     | 90 ++++------------------------------------------
 include/linux/cpuhotplug.h |  1 -
 2 files changed, 6 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 86cbee1a8bf0..a8defc813c36 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -148,7 +148,6 @@ struct rapl_model {
  /* 1/2^hw_unit Joule */
 static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
 static struct rapl_pmus *rapl_pmus;
-static cpumask_t rapl_cpu_mask;
 static unsigned int rapl_cntr_mask;
 static u64 rapl_timer_ms;
 static struct perf_msr *rapl_msrs;
@@ -369,8 +368,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
 	if (event->cpu < 0)
 		return -EINVAL;
 
-	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
-
 	if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
 		return -EINVAL;
 
@@ -389,7 +386,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
 	pmu = cpu_to_rapl_pmu(event->cpu);
 	if (!pmu)
 		return -EINVAL;
-	event->cpu = pmu->cpu;
 	event->pmu_private = pmu;
 	event->hw.event_base = rapl_msrs[bit].msr;
 	event->hw.config = cfg;
@@ -403,23 +399,6 @@ static void rapl_pmu_event_read(struct perf_event *event)
 	rapl_event_update(event);
 }
 
-static ssize_t rapl_get_attr_cpumask(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
-}
-
-static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
-
-static struct attribute *rapl_pmu_attrs[] = {
-	&dev_attr_cpumask.attr,
-	NULL,
-};
-
-static struct attribute_group rapl_pmu_attr_group = {
-	.attrs = rapl_pmu_attrs,
-};
-
 RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
 RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
 RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
@@ -467,7 +446,6 @@ static struct attribute_group rapl_pmu_format_group = {
 };
 
 static const struct attribute_group *rapl_attr_groups[] = {
-	&rapl_pmu_attr_group,
 	&rapl_pmu_format_group,
 	&rapl_pmu_events_group,
 	NULL,
@@ -570,54 +548,6 @@ static struct perf_msr amd_rapl_msrs[] = {
 	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  NULL, false, 0 },
 };
 
-static int rapl_cpu_offline(unsigned int cpu)
-{
-	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
-	int target;
-
-	/* Check if exiting cpu is used for collecting rapl events */
-	if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
-		return 0;
-
-	pmu->cpu = -1;
-	/* Find a new cpu to collect rapl events */
-	target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu);
-
-	/* Migrate rapl events to the new target */
-	if (target < nr_cpu_ids) {
-		cpumask_set_cpu(target, &rapl_cpu_mask);
-		pmu->cpu = target;
-		perf_pmu_migrate_context(pmu->pmu, cpu, target);
-	}
-	return 0;
-}
-
-static int rapl_cpu_online(unsigned int cpu)
-{
-	s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu);
-	if (rapl_pmu_idx < 0) {
-		pr_err("topology_logical_(package/die)_id() returned a negative value");
-		return -EINVAL;
-	}
-	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
-	int target;
-
-	if (!pmu)
-		return -ENOMEM;
-
-	/*
-	 * Check if there is an online cpu in the package which collects rapl
-	 * events already.
-	 */
-	target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu));
-	if (target < nr_cpu_ids)
-		return 0;
-
-	cpumask_set_cpu(cpu, &rapl_cpu_mask);
-	pmu->cpu = cpu;
-	return 0;
-}
-
 static int rapl_check_hw_unit(struct rapl_model *rm)
 {
 	u64 msr_rapl_power_unit_bits;
@@ -725,9 +655,12 @@ free:
 static int __init init_rapl_pmus(void)
 {
 	int nr_rapl_pmu = topology_max_packages();
+	int rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
 
-	if (!rapl_pmu_is_pkg_scope())
+	if (!rapl_pmu_is_pkg_scope()) {
 		nr_rapl_pmu *= topology_max_dies_per_package();
+		rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
+	}
 
 	rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
 	if (!rapl_pmus)
@@ -743,6 +676,7 @@ static int __init init_rapl_pmus(void)
 	rapl_pmus->pmu.start		= rapl_pmu_event_start;
 	rapl_pmus->pmu.stop		= rapl_pmu_event_stop;
 	rapl_pmus->pmu.read		= rapl_pmu_event_read;
+	rapl_pmus->pmu.scope		= rapl_pmu_scope;
 	rapl_pmus->pmu.module		= THIS_MODULE;
 	rapl_pmus->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
 
@@ -892,24 +826,13 @@ static int __init rapl_pmu_init(void)
 	if (ret)
 		return ret;
 
-	/*
-	 * Install callbacks. Core will call them for each online cpu.
-	 */
-	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
-				"perf/x86/rapl:online",
-				rapl_cpu_online, rapl_cpu_offline);
-	if (ret)
-		goto out;
-
 	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
 	if (ret)
-		goto out1;
+		goto out;
 
 	rapl_advertise();
 	return 0;
 
-out1:
-	cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
 out:
 	pr_warn("Initialization failed (%d), disabled\n", ret);
 	cleanup_rapl_pmus();
@@ -919,7 +842,6 @@ module_init(rapl_pmu_init);
 
 static void __exit intel_rapl_exit(void)
 {
-	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
 	perf_pmu_unregister(&rapl_pmus->pmu);
 	cleanup_rapl_pmus();
 }
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 2361ed4d2b15..37a9afffb59e 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -208,7 +208,6 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_X86_UNCORE_ONLINE,
 	CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
 	CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
-	CPUHP_AP_PERF_X86_RAPL_ONLINE,
 	CPUHP_AP_PERF_S390_CF_ONLINE,
 	CPUHP_AP_PERF_S390_SF_ONLINE,
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
-- 
cgit v1.2.3


From dd1a7567784e2b1f80258be04f57bcfa82c997eb Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 23 Oct 2024 21:41:59 -0700
Subject: uprobes: SRCU-protect uretprobe lifetime (with timeout)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoid taking refcount on uprobe in prepare_uretprobe(), instead take
uretprobe-specific SRCU lock and keep it active as kernel transfers
control back to user space.

Given we can't rely on user space returning from traced function within
reasonable time period, we need to make sure not to keep SRCU lock
active for too long, though. To that effect, we employ a timer callback
which is meant to terminate SRCU lock region after predefined timeout
(currently set to 100ms), and instead transfer underlying struct
uprobe's lifetime protection to refcounting.

This fallback to less scalable refcounting after 100ms is a fine
tradeoff from uretprobe's scalability and performance perspective,
because uretprobing *long running* user functions inherently doesn't run
into scalability issues (there is just not enough frequency to cause
noticeable issues with either performance or scalability).

The overall trick is in ensuring synchronization between current thread
and timer's callback fired on some other thread. To cope with that with
minimal logic complications, we add hprobe wrapper which is used to
contain all the synchronization related issues behind a small number of
basic helpers: hprobe_expire() for "downgrading" uprobe from SRCU-protected
state to refcounted state, and a hprobe_consume() and hprobe_finalize()
pair of single-use consuming helpers. Other than that, whatever current
thread's logic is there stays the same, as timer thread cannot modify
return_instance state (or add new/remove old return_instances). It only
takes care of SRCU unlock and uprobe refcounting, which is hidden from
the higher-level uretprobe handling logic.

We use atomic xchg() in hprobe_consume(), which is called from
performance critical handle_uretprobe_chain() function run in the
current context. When uncontended, this xchg() doesn't seem to hurt
performance as there are no other competing CPUs fighting for the same
cache line. We also mark struct return_instance as ____cacheline_aligned
to ensure no false sharing can happen.

Another technical moment. We need to make sure that the list of return
instances can be safely traversed under RCU from timer callback, so we
delay return_instance freeing with kfree_rcu() and make sure that list
modifications use RCU-aware operations.

Also, given SRCU lock survives transition from kernel to user space and
back we need to use lower-level __srcu_read_lock() and
__srcu_read_unlock() to avoid lockdep complaining.

Just to give an impression of a kind of performance improvements this
change brings, below are benchmarking results with and without these
SRCU changes, assuming other uprobe optimizations (mainly RCU Tasks
Trace for entry uprobes, lockless RB-tree lookup, and lockless VMA to
uprobe lookup) are left intact:

WITHOUT SRCU for uretprobes
===========================
uretprobe-nop         ( 1 cpus):    2.197 ± 0.002M/s  (  2.197M/s/cpu)
uretprobe-nop         ( 2 cpus):    3.325 ± 0.001M/s  (  1.662M/s/cpu)
uretprobe-nop         ( 3 cpus):    4.129 ± 0.002M/s  (  1.376M/s/cpu)
uretprobe-nop         ( 4 cpus):    6.180 ± 0.003M/s  (  1.545M/s/cpu)
uretprobe-nop         ( 8 cpus):    7.323 ± 0.005M/s  (  0.915M/s/cpu)
uretprobe-nop         (16 cpus):    6.943 ± 0.005M/s  (  0.434M/s/cpu)
uretprobe-nop         (32 cpus):    5.931 ± 0.014M/s  (  0.185M/s/cpu)
uretprobe-nop         (64 cpus):    5.145 ± 0.003M/s  (  0.080M/s/cpu)
uretprobe-nop         (80 cpus):    4.925 ± 0.005M/s  (  0.062M/s/cpu)

WITH SRCU for uretprobes
========================
uretprobe-nop         ( 1 cpus):    1.968 ± 0.001M/s  (  1.968M/s/cpu)
uretprobe-nop         ( 2 cpus):    3.739 ± 0.003M/s  (  1.869M/s/cpu)
uretprobe-nop         ( 3 cpus):    5.616 ± 0.003M/s  (  1.872M/s/cpu)
uretprobe-nop         ( 4 cpus):    7.286 ± 0.002M/s  (  1.822M/s/cpu)
uretprobe-nop         ( 8 cpus):   13.657 ± 0.007M/s  (  1.707M/s/cpu)
uretprobe-nop         (32 cpus):   45.305 ± 0.066M/s  (  1.416M/s/cpu)
uretprobe-nop         (64 cpus):   42.390 ± 0.922M/s  (  0.662M/s/cpu)
uretprobe-nop         (80 cpus):   47.554 ± 2.411M/s  (  0.594M/s/cpu)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241024044159.3156646-3-andrii@kernel.org
---
 include/linux/uprobes.h |  54 ++++++++-
 kernel/events/uprobes.c | 287 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 304 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index dbaf04189548..7a051b5d2edd 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -15,6 +15,7 @@
 #include <linux/rbtree.h>
 #include <linux/types.h>
 #include <linux/wait.h>
+#include <linux/timer.h>
 
 struct uprobe;
 struct vm_area_struct;
@@ -67,6 +68,53 @@ enum uprobe_task_state {
 	UTASK_SSTEP_TRAPPED,
 };
 
+/* The state of hybrid-lifetime uprobe inside struct return_instance */
+enum hprobe_state {
+	HPROBE_LEASED,		/* uretprobes_srcu-protected uprobe */
+	HPROBE_STABLE,		/* refcounted uprobe */
+	HPROBE_GONE,		/* NULL uprobe, SRCU expired, refcount failed */
+	HPROBE_CONSUMED,	/* uprobe "consumed" by uretprobe handler */
+};
+
+/*
+ * Hybrid lifetime uprobe. Represents a uprobe instance that could be either
+ * SRCU protected (with SRCU protection eventually potentially timing out),
+ * refcounted using uprobe->ref, or there could be no valid uprobe (NULL).
+ *
+ * hprobe's internal state is setup such that background timer thread can
+ * atomically "downgrade" temporarily RCU-protected uprobe into refcounted one
+ * (or no uprobe, if refcounting failed).
+ *
+ * *stable* pointer always point to the uprobe (or could be NULL if there is
+ * was no valid underlying uprobe to begin with).
+ *
+ * *leased* pointer is the key to achieving race-free atomic lifetime state
+ * transition and can have three possible states:
+ *   - either the same non-NULL value as *stable*, in which case uprobe is
+ *     SRCU-protected;
+ *   - NULL, in which case uprobe (if there is any) is refcounted;
+ *   - special __UPROBE_DEAD value, which represents an uprobe that was SRCU
+ *     protected initially, but SRCU period timed out and we attempted to
+ *     convert it to refcounted, but refcount_inc_not_zero() failed, because
+ *     uprobe effectively went away (the last consumer unsubscribed). In this
+ *     case it's important to know that *stable* pointer (which still has
+ *     non-NULL uprobe pointer) shouldn't be used, because lifetime of
+ *     underlying uprobe is not guaranteed anymore. __UPROBE_DEAD is just an
+ *     internal marker and is handled transparently by hprobe_fetch() helper.
+ *
+ * When uprobe is SRCU-protected, we also record srcu_idx value, necessary for
+ * SRCU unlocking.
+ *
+ * See hprobe_expire() and hprobe_fetch() for details of race-free uprobe
+ * state transitioning details. It all hinges on atomic xchg() over *leaded*
+ * pointer. *stable* pointer, once initially set, is not modified concurrently.
+ */
+struct hprobe {
+	enum hprobe_state state;
+	int srcu_idx;
+	struct uprobe *uprobe;
+};
+
 /*
  * uprobe_task: Metadata of a task while it singlesteps.
  */
@@ -86,6 +134,7 @@ struct uprobe_task {
 	};
 
 	struct uprobe			*active_uprobe;
+	struct timer_list		ri_timer;
 	unsigned long			xol_vaddr;
 
 	struct arch_uprobe              *auprobe;
@@ -100,7 +149,7 @@ struct return_consumer {
 };
 
 struct return_instance {
-	struct uprobe		*uprobe;
+	struct hprobe		hprobe;
 	unsigned long		func;
 	unsigned long		stack;		/* stack pointer */
 	unsigned long		orig_ret_vaddr; /* original return address */
@@ -108,9 +157,10 @@ struct return_instance {
 	int			consumers_cnt;
 
 	struct return_instance	*next;		/* keep as stack */
+	struct rcu_head		rcu;
 
 	struct return_consumer	consumers[] __counted_by(consumers_cnt);
-};
+} ____cacheline_aligned;
 
 enum rp_check {
 	RP_CHECK_CALL,
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d7e489246608..a76ddc5fc982 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -28,6 +28,7 @@
 #include <linux/khugepaged.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/workqueue.h>
+#include <linux/srcu.h>
 
 #include <linux/uprobes.h>
 
@@ -51,6 +52,9 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
 
 DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
 
+/* Covers return_instance's uprobe lifetime. */
+DEFINE_STATIC_SRCU(uretprobes_srcu);
+
 /* Have a copy of original instruction */
 #define UPROBE_COPY_INSN	0
 
@@ -622,13 +626,20 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
 	return !RB_EMPTY_NODE(&uprobe->rb_node);
 }
 
-static void uprobe_free_rcu(struct rcu_head *rcu)
+static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu)
 {
 	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
 
 	kfree(uprobe);
 }
 
+static void uprobe_free_srcu(struct rcu_head *rcu)
+{
+	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+	call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace);
+}
+
 static void uprobe_free_deferred(struct work_struct *work)
 {
 	struct uprobe *uprobe = container_of(work, struct uprobe, work);
@@ -652,7 +663,8 @@ static void uprobe_free_deferred(struct work_struct *work)
 	delayed_uprobe_remove(uprobe, NULL);
 	mutex_unlock(&delayed_uprobe_lock);
 
-	call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);
+	/* start srcu -> rcu_tasks_trace -> kfree chain */
+	call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu);
 }
 
 static void put_uprobe(struct uprobe *uprobe)
@@ -664,6 +676,151 @@ static void put_uprobe(struct uprobe *uprobe)
 	schedule_work(&uprobe->work);
 }
 
+/* Initialize hprobe as SRCU-protected "leased" uprobe */
+static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx)
+{
+	WARN_ON(!uprobe);
+	hprobe->state = HPROBE_LEASED;
+	hprobe->uprobe = uprobe;
+	hprobe->srcu_idx = srcu_idx;
+}
+
+/* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */
+static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe)
+{
+	hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE;
+	hprobe->uprobe = uprobe;
+	hprobe->srcu_idx = -1;
+}
+
+/*
+ * hprobe_consume() fetches hprobe's underlying uprobe and detects whether
+ * uprobe is SRCU protected or is refcounted. hprobe_consume() can be
+ * used only once for a given hprobe.
+ *
+ * Caller has to call hprobe_finalize() and pass previous hprobe_state, so
+ * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever
+ * is appropriate.
+ */
+static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate)
+{
+	*hstate = xchg(&hprobe->state, HPROBE_CONSUMED);
+	switch (*hstate) {
+	case HPROBE_LEASED:
+	case HPROBE_STABLE:
+		return hprobe->uprobe;
+	case HPROBE_GONE:	/* uprobe is NULL, no SRCU */
+	case HPROBE_CONSUMED:	/* uprobe was finalized already, do nothing */
+		return NULL;
+	default:
+		WARN(1, "hprobe invalid state %d", *hstate);
+		return NULL;
+	}
+}
+
+/*
+ * Reset hprobe state and, if hprobe was LEASED, release SRCU lock.
+ * hprobe_finalize() can only be used from current context after
+ * hprobe_consume() call (which determines uprobe and hstate value).
+ */
+static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate)
+{
+	switch (hstate) {
+	case HPROBE_LEASED:
+		__srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+		break;
+	case HPROBE_STABLE:
+		put_uprobe(hprobe->uprobe);
+		break;
+	case HPROBE_GONE:
+	case HPROBE_CONSUMED:
+		break;
+	default:
+		WARN(1, "hprobe invalid state %d", hstate);
+		break;
+	}
+}
+
+/*
+ * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED)
+ * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of
+ * them can win the race to perform SRCU unlocking. Whoever wins must perform
+ * SRCU unlock.
+ *
+ * Returns underlying valid uprobe or NULL, if there was no underlying uprobe
+ * to begin with or we failed to bump its refcount and it's going away.
+ *
+ * Returned non-NULL uprobe can be still safely used within an ongoing SRCU
+ * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has
+ * an extra refcount for caller to assume and use. Otherwise, it's not
+ * guaranteed that returned uprobe has a positive refcount, so caller has to
+ * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current
+ * SRCU lock region. See dup_utask().
+ */
+static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get)
+{
+	enum hprobe_state hstate;
+
+	/*
+	 * return_instance's hprobe is protected by RCU.
+	 * Underlying uprobe is itself protected from reuse by SRCU.
+	 */
+	lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu));
+
+	hstate = READ_ONCE(hprobe->state);
+	switch (hstate) {
+	case HPROBE_STABLE:
+		/* uprobe has positive refcount, bump refcount, if necessary */
+		return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe;
+	case HPROBE_GONE:
+		/*
+		 * SRCU was unlocked earlier and we didn't manage to take
+		 * uprobe refcnt, so it's effectively NULL
+		 */
+		return NULL;
+	case HPROBE_CONSUMED:
+		/*
+		 * uprobe was consumed, so it's effectively NULL as far as
+		 * uretprobe processing logic is concerned
+		 */
+		return NULL;
+	case HPROBE_LEASED: {
+		struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe);
+		/*
+		 * Try to switch hprobe state, guarding against
+		 * hprobe_consume() or another hprobe_expire() racing with us.
+		 * Note, if we failed to get uprobe refcount, we use special
+		 * HPROBE_GONE state to signal that hprobe->uprobe shouldn't
+		 * be used as it will be freed after SRCU is unlocked.
+		 */
+		if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) {
+			/* We won the race, we are the ones to unlock SRCU */
+			__srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+			return get ? get_uprobe(uprobe) : uprobe;
+		}
+
+		/*
+		 * We lost the race, undo refcount bump (if it ever happened),
+		 * unless caller would like an extra refcount anyways.
+		 */
+		if (uprobe && !get)
+			put_uprobe(uprobe);
+		/*
+		 * Even if hprobe_consume() or another hprobe_expire() wins
+		 * the state update race and unlocks SRCU from under us, we
+		 * still have a guarantee that underyling uprobe won't be
+		 * freed due to ongoing caller's SRCU lock region, so we can
+		 * return it regardless. Also, if `get` was true, we also have
+		 * an extra ref for the caller to own. This is used in dup_utask().
+		 */
+		return uprobe;
+	}
+	default:
+		WARN(1, "unknown hprobe state %d", hstate);
+		return NULL;
+	}
+}
+
 static __always_inline
 int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
 	       const struct uprobe *r)
@@ -1169,6 +1326,7 @@ void uprobe_unregister_sync(void)
 	 * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
 	 */
 	synchronize_rcu_tasks_trace();
+	synchronize_srcu(&uretprobes_srcu);
 }
 EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
 
@@ -1731,11 +1889,18 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
 	return instruction_pointer(regs);
 }
 
-static struct return_instance *free_ret_instance(struct return_instance *ri)
+static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe)
 {
 	struct return_instance *next = ri->next;
-	put_uprobe(ri->uprobe);
-	kfree(ri);
+
+	if (cleanup_hprobe) {
+		enum hprobe_state hstate;
+
+		(void)hprobe_consume(&ri->hprobe, &hstate);
+		hprobe_finalize(&ri->hprobe, hstate);
+	}
+
+	kfree_rcu(ri, rcu);
 	return next;
 }
 
@@ -1753,14 +1918,48 @@ void uprobe_free_utask(struct task_struct *t)
 
 	WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);
 
+	timer_delete_sync(&utask->ri_timer);
+
 	ri = utask->return_instances;
 	while (ri)
-		ri = free_ret_instance(ri);
+		ri = free_ret_instance(ri, true /* cleanup_hprobe */);
 
 	kfree(utask);
 	t->utask = NULL;
 }
 
+#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */
+
+#define for_each_ret_instance_rcu(pos, head) \
+	for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next))
+
+static void ri_timer(struct timer_list *timer)
+{
+	struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
+	struct return_instance *ri;
+
+	/* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */
+	guard(srcu)(&uretprobes_srcu);
+	/* RCU protects return_instance from freeing. */
+	guard(rcu)();
+
+	for_each_ret_instance_rcu(ri, utask->return_instances)
+		hprobe_expire(&ri->hprobe, false);
+}
+
+static struct uprobe_task *alloc_utask(void)
+{
+	struct uprobe_task *utask;
+
+	utask = kzalloc(sizeof(*utask), GFP_KERNEL);
+	if (!utask)
+		return NULL;
+
+	timer_setup(&utask->ri_timer, ri_timer, 0);
+
+	return utask;
+}
+
 /*
  * Allocate a uprobe_task object for the task if necessary.
  * Called when the thread hits a breakpoint.
@@ -1772,7 +1971,7 @@ void uprobe_free_utask(struct task_struct *t)
 static struct uprobe_task *get_utask(void)
 {
 	if (!current->utask)
-		current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+		current->utask = alloc_utask();
 	return current->utask;
 }
 
@@ -1808,29 +2007,37 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
 {
 	struct uprobe_task *n_utask;
 	struct return_instance **p, *o, *n;
+	struct uprobe *uprobe;
 
-	n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+	n_utask = alloc_utask();
 	if (!n_utask)
 		return -ENOMEM;
 	t->utask = n_utask;
 
+	/* protect uprobes from freeing, we'll need try_get_uprobe() them */
+	guard(srcu)(&uretprobes_srcu);
+
 	p = &n_utask->return_instances;
 	for (o = o_utask->return_instances; o; o = o->next) {
 		n = dup_return_instance(o);
 		if (!n)
 			return -ENOMEM;
 
+		/* if uprobe is non-NULL, we'll have an extra refcount for uprobe */
+		uprobe = hprobe_expire(&o->hprobe, true);
+
 		/*
-		 * uprobe's refcnt has to be positive at this point, kept by
-		 * utask->return_instances items; return_instances can't be
-		 * removed right now, as task is blocked due to duping; so
-		 * get_uprobe() is safe to use here.
+		 * New utask will have stable properly refcounted uprobe or
+		 * NULL. Even if we failed to get refcounted uprobe, we still
+		 * need to preserve full set of return_instances for proper
+		 * uretprobe handling and nesting in forked task.
 		 */
-		get_uprobe(n->uprobe);
-		n->next = NULL;
+		hprobe_init_stable(&n->hprobe, uprobe);
 
-		*p = n;
+		n->next = NULL;
+		rcu_assign_pointer(*p, n);
 		p = &n->next;
+
 		n_utask->depth++;
 	}
 
@@ -1906,10 +2113,10 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
 	enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
 
 	while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
-		ri = free_ret_instance(ri);
+		ri = free_ret_instance(ri, true /* cleanup_hprobe */);
 		utask->depth--;
 	}
-	utask->return_instances = ri;
+	rcu_assign_pointer(utask->return_instances, ri);
 }
 
 static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
@@ -1918,6 +2125,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
 	struct uprobe_task *utask = current->utask;
 	unsigned long orig_ret_vaddr, trampoline_vaddr;
 	bool chained;
+	int srcu_idx;
 
 	if (!get_xol_area())
 		goto free;
@@ -1929,14 +2137,10 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
 		goto free;
 	}
 
-	/* we need to bump refcount to store uprobe in utask */
-	if (!try_get_uprobe(uprobe))
-		goto free;
-
 	trampoline_vaddr = uprobe_get_trampoline_vaddr();
 	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
 	if (orig_ret_vaddr == -1)
-		goto put;
+		goto free;
 
 	/* drop the entries invalidated by longjmp() */
 	chained = (orig_ret_vaddr == trampoline_vaddr);
@@ -1954,23 +2158,28 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
 			 * attack from user-space.
 			 */
 			uprobe_warn(current, "handle tail call");
-			goto put;
+			goto free;
 		}
 		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
 	}
-	ri->uprobe = uprobe;
+
+	/* __srcu_read_lock() because SRCU lock survives switch to user space */
+	srcu_idx = __srcu_read_lock(&uretprobes_srcu);
+
 	ri->func = instruction_pointer(regs);
 	ri->stack = user_stack_pointer(regs);
 	ri->orig_ret_vaddr = orig_ret_vaddr;
 	ri->chained = chained;
 
 	utask->depth++;
+
+	hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx);
 	ri->next = utask->return_instances;
-	utask->return_instances = ri;
+	rcu_assign_pointer(utask->return_instances, ri);
+
+	mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD);
 
 	return;
-put:
-	put_uprobe(uprobe);
 free:
 	kfree(ri);
 }
@@ -2215,13 +2424,16 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 }
 
 static void
-handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs)
 {
-	struct uprobe *uprobe = ri->uprobe;
 	struct return_consumer *ric;
 	struct uprobe_consumer *uc;
 	int ric_idx = 0;
 
+	/* all consumers unsubscribed meanwhile */
+	if (unlikely(!uprobe))
+		return;
+
 	rcu_read_lock_trace();
 	list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
 		bool session = uc->handler && uc->ret_handler;
@@ -2251,6 +2463,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
 {
 	struct uprobe_task *utask;
 	struct return_instance *ri, *next;
+	struct uprobe *uprobe;
+	enum hprobe_state hstate;
 	bool valid;
 
 	utask = current->utask;
@@ -2281,21 +2495,24 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
 			 * trampoline addresses on the stack are replaced with correct
 			 * original return addresses
 			 */
-			utask->return_instances = ri->next;
+			rcu_assign_pointer(utask->return_instances, ri->next);
+
+			uprobe = hprobe_consume(&ri->hprobe, &hstate);
 			if (valid)
-				handle_uretprobe_chain(ri, regs);
-			ri = free_ret_instance(ri);
+				handle_uretprobe_chain(ri, uprobe, regs);
+			hprobe_finalize(&ri->hprobe, hstate);
+
+			/* We already took care of hprobe, no need to waste more time on that. */
+			ri = free_ret_instance(ri, false /* !cleanup_hprobe */);
 			utask->depth--;
 		} while (ri != next);
 	} while (!valid);
 
-	utask->return_instances = ri;
 	return;
 
- sigill:
+sigill:
 	uprobe_warn(current, "handle uretprobe, sending SIGILL.");
 	force_sig(SIGILL);
-
 }
 
 bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
-- 
cgit v1.2.3


From f247fd22e9f27d919d81861e4dcc438e0b6d179b Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Wed, 23 Oct 2024 08:56:00 +0200
Subject: s390/time: Add clocksource id to TOD clock

To allow specifying the clock source in the upcoming PtP driver,
add a clocksource ID to the s390 TOD clock.

Acked-by: Heiko Carstens <hca@linux.ibm.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Link: https://patch.msgid.link/20241023065601.449586-2-svens@linux.ibm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 arch/s390/kernel/time.c         | 1 +
 include/linux/clocksource_ids.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index b713effe0579..4214901c3ab0 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -255,6 +255,7 @@ static struct clocksource clocksource_tod = {
 	.shift		= 24,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 	.vdso_clock_mode = VDSO_CLOCKMODE_TOD,
+	.id		= CSID_S390_TOD,
 };
 
 struct clocksource * __init clocksource_default_clock(void)
diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h
index 2bb4d8c2f1b0..c4ef4ae2eded 100644
--- a/include/linux/clocksource_ids.h
+++ b/include/linux/clocksource_ids.h
@@ -6,6 +6,7 @@
 enum clocksource_ids {
 	CSID_GENERIC		= 0,
 	CSID_ARM_ARCH_COUNTER,
+	CSID_S390_TOD,
 	CSID_X86_TSC_EARLY,
 	CSID_X86_TSC,
 	CSID_X86_KVM_CLK,
-- 
cgit v1.2.3


From 3b1596a21fbf210f5b763fd3c0be280650475b52 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 29 Oct 2024 13:54:43 +0100
Subject: clockevents: Shutdown and unregister current clockevents at
 CPUHP_AP_TICK_DYING

The way the clockevent devices are finally stopped while a CPU is
offlining is currently chaotic. The layout being by order:

1) tick_sched_timer_dying() stops the tick and the underlying clockevent
  but only for oneshot case. The periodic tick and its related
  clockevent still runs.

2) tick_broadcast_offline() detaches and stops the per-cpu oneshot
  broadcast and append it to the released list.

3) Some individual clockevent drivers stop the clockevents (a second time if
  the tick is oneshot)

4) Once the CPU is dead, a control CPU remotely detaches and stops
  (a 3rd time if oneshot mode) the CPU clockevent and adds it to the
  released list.

5) The released list containing the broadcast device released on step 2)
   and the remotely detached clockevent from step 4) are unregistered.

These random events can be factorized if the current clockevent is
detached and stopped by the dying CPU at the generic layer, that is
from the dying CPU:

a) Stop the tick
b) Stop/detach the underlying per-cpu oneshot broadcast clockevent
c) Stop/detach the underlying clockevent
d) Release / unregister the clockevents from b) and c)
e) Release / unregister the remaining clockevents from the dying CPU.
   This part could be performed by the dying CPU

This way the drivers and the tick layer don't need to care about
clockevent operations during cpuhotplug down. This also unifies the tick
behaviour on offline CPUs between oneshot and periodic modes, avoiding
offline ticks altogether for sanity.

Adopt the simplification.

[ tglx: Remove the WARN_ON() in clockevents_register_device() as that
  	is called from an upcoming CPU before the CPU is marked online ]

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20241029125451.54574-3-frederic@kernel.org
---
 include/linux/tick.h        |  2 --
 kernel/cpu.c                |  2 --
 kernel/time/clockevents.c   | 30 +++++++++++-------------------
 kernel/time/tick-internal.h |  3 +--
 4 files changed, 12 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 72744638c5b0..b0c74bfe0600 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -20,12 +20,10 @@ extern void __init tick_init(void);
 extern void tick_suspend_local(void);
 /* Should be core only, but XEN resume magic and ARM BL switcher require it */
 extern void tick_resume_local(void);
-extern void tick_cleanup_dead_cpu(int cpu);
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
 static inline void tick_init(void) { }
 static inline void tick_suspend_local(void) { }
 static inline void tick_resume_local(void) { }
-static inline void tick_cleanup_dead_cpu(int cpu) { }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 #if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_HOTPLUG_CPU)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d293d52a3e00..895f3287e3f3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1338,8 +1338,6 @@ static int takedown_cpu(unsigned int cpu)
 
 	cpuhp_bp_sync_dead(cpu);
 
-	tick_cleanup_dead_cpu(cpu);
-
 	/*
 	 * Callbacks must be re-integrated right away to the RCU state machine.
 	 * Otherwise an RCU callback could block a further teardown function
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 4af27994db93..f3e831f62906 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -618,39 +618,30 @@ void clockevents_resume(void)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 /**
- * tick_offline_cpu - Take CPU out of the broadcast mechanism
+ * tick_offline_cpu - Shutdown all clock events related
+ *                    to this CPU and take it out of the
+ *                    broadcast mechanism.
  * @cpu:	The outgoing CPU
  *
- * Called on the outgoing CPU after it took itself offline.
+ * Called by the dying CPU during teardown.
  */
 void tick_offline_cpu(unsigned int cpu)
-{
-	raw_spin_lock(&clockevents_lock);
-	tick_broadcast_offline(cpu);
-	raw_spin_unlock(&clockevents_lock);
-}
-# endif
-
-/**
- * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
- * @cpu:	The dead CPU
- */
-void tick_cleanup_dead_cpu(int cpu)
 {
 	struct clock_event_device *dev, *tmp;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&clockevents_lock, flags);
+	raw_spin_lock(&clockevents_lock);
 
+	tick_broadcast_offline(cpu);
 	tick_shutdown(cpu);
+
 	/*
 	 * Unregister the clock event devices which were
-	 * released from the users in the notify chain.
+	 * released above.
 	 */
 	list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
 		list_del(&dev->list);
+
 	/*
 	 * Now check whether the CPU has left unused per cpu devices
 	 */
@@ -662,7 +653,8 @@ void tick_cleanup_dead_cpu(int cpu)
 			list_del(&dev->list);
 		}
 	}
-	raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+
+	raw_spin_unlock(&clockevents_lock);
 }
 #endif
 
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 5f2105e637bd..faac36de35b9 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -25,6 +25,7 @@ extern int tick_do_timer_cpu __read_mostly;
 extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
 extern void tick_handle_periodic(struct clock_event_device *dev);
 extern void tick_check_new_device(struct clock_event_device *dev);
+extern void tick_offline_cpu(unsigned int cpu);
 extern void tick_shutdown(unsigned int cpu);
 extern void tick_suspend(void);
 extern void tick_resume(void);
@@ -142,10 +143,8 @@ static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_
 #endif /* !(BROADCAST && ONESHOT) */
 
 #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU)
-extern void tick_offline_cpu(unsigned int cpu);
 extern void tick_broadcast_offline(unsigned int cpu);
 #else
-static inline void tick_offline_cpu(unsigned int cpu) { }
 static inline void tick_broadcast_offline(unsigned int cpu) { }
 #endif
 
-- 
cgit v1.2.3


From 697a4001d31a607a72c6297e4eb0f7918c6e6929 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Mon, 7 Oct 2024 01:14:05 +0100
Subject: mfd: axp20x: Ensure relationship between IDs and model names

At the moment there is an implicit relationship between the AXP model
IDs and the order of the strings in the axp20x_model_names[] array.
This is fragile, and makes adding IDs in the middle error prone.

Make this relationship official by changing the ID type to the actual
enum used, and using indexed initialisers for the string list.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Chen-Yu Tsai <wens@csie.org>
Link: https://lore.kernel.org/r/20241007001408.27249-3-andre.przywara@arm.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/axp20x.c                 | 30 +++++++++++++++---------------
 drivers/regulator/axp20x-regulator.c |  2 +-
 include/linux/mfd/axp20x.h           |  2 +-
 3 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
index 4051551757f2..5ceea359289f 100644
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -34,20 +34,20 @@
 #define AXP806_REG_ADDR_EXT_ADDR_SLAVE_MODE	BIT(4)
 
 static const char * const axp20x_model_names[] = {
-	"AXP152",
-	"AXP192",
-	"AXP202",
-	"AXP209",
-	"AXP221",
-	"AXP223",
-	"AXP288",
-	"AXP313a",
-	"AXP717",
-	"AXP803",
-	"AXP806",
-	"AXP809",
-	"AXP813",
-	"AXP15060",
+	[AXP152_ID] = "AXP152",
+	[AXP192_ID] = "AXP192",
+	[AXP202_ID] = "AXP202",
+	[AXP209_ID] = "AXP209",
+	[AXP221_ID] = "AXP221",
+	[AXP223_ID] = "AXP223",
+	[AXP288_ID] = "AXP288",
+	[AXP313A_ID] = "AXP313a",
+	[AXP717_ID] = "AXP717",
+	[AXP803_ID] = "AXP803",
+	[AXP806_ID] = "AXP806",
+	[AXP809_ID] = "AXP809",
+	[AXP813_ID] = "AXP813",
+	[AXP15060_ID] = "AXP15060",
 };
 
 static const struct regmap_range axp152_writeable_ranges[] = {
@@ -1345,7 +1345,7 @@ int axp20x_match_device(struct axp20x_dev *axp20x)
 		axp20x->regmap_irq_chip = &axp15060_regmap_irq_chip;
 		break;
 	default:
-		dev_err(dev, "unsupported AXP20X ID %lu\n", axp20x->variant);
+		dev_err(dev, "unsupported AXP20X ID %u\n", axp20x->variant);
 		return -EINVAL;
 	}
 
diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c
index a8e91d9d028b..3ba76dbd0fb9 100644
--- a/drivers/regulator/axp20x-regulator.c
+++ b/drivers/regulator/axp20x-regulator.c
@@ -1597,7 +1597,7 @@ static int axp20x_regulator_probe(struct platform_device *pdev)
 		nregulators = AXP15060_REG_ID_MAX;
 		break;
 	default:
-		dev_err(&pdev->dev, "Unsupported AXP variant: %ld\n",
+		dev_err(&pdev->dev, "Unsupported AXP variant: %d\n",
 			axp20x->variant);
 		return -EINVAL;
 	}
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index f4dfc1871a95..79ecaaaa2070 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -959,7 +959,7 @@ struct axp20x_dev {
 	unsigned long			irq_flags;
 	struct regmap			*regmap;
 	struct regmap_irq_chip_data	*regmap_irqc;
-	long				variant;
+	enum axp20x_variants		variant;
 	int                             nr_cells;
 	const struct mfd_cell           *cells;
 	const struct regmap_config	*regmap_cfg;
-- 
cgit v1.2.3


From 35fec94afe045856456faca4879b9c560e39d1e3 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Mon, 7 Oct 2024 01:14:07 +0100
Subject: mfd: axp20x: Add support for AXP323

The X-Powers AXP323 is a very close sibling of the AXP313A. The only
difference seems to be the ability to dual-phase the first two DC/DC
converter, which adds another register.

Add the required boilerplate to introduce a new PMIC to the AXP MFD
driver. Where possible, this just maps into the existing structs defined
for the AXP313A, only deviating where needed.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Chen-Yu Tsai <wens@csie.org>
Link: https://lore.kernel.org/r/20241007001408.27249-5-andre.przywara@arm.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/axp20x-i2c.c   |  1 +
 drivers/mfd/axp20x.c       | 26 ++++++++++++++++++++++++++
 include/linux/mfd/axp20x.h |  2 ++
 3 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/axp20x-i2c.c b/drivers/mfd/axp20x-i2c.c
index 791a0b4cb64b..5c93136f977e 100644
--- a/drivers/mfd/axp20x-i2c.c
+++ b/drivers/mfd/axp20x-i2c.c
@@ -65,6 +65,7 @@ static const struct of_device_id axp20x_i2c_of_match[] = {
 	{ .compatible = "x-powers,axp221", .data = (void *)AXP221_ID },
 	{ .compatible = "x-powers,axp223", .data = (void *)AXP223_ID },
 	{ .compatible = "x-powers,axp313a", .data = (void *)AXP313A_ID },
+	{ .compatible = "x-powers,axp323", .data = (void *)AXP323_ID },
 	{ .compatible = "x-powers,axp717", .data = (void *)AXP717_ID },
 	{ .compatible = "x-powers,axp803", .data = (void *)AXP803_ID },
 	{ .compatible = "x-powers,axp806", .data = (void *)AXP806_ID },
diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
index bc08ae433260..251465a656d0 100644
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -42,6 +42,7 @@ static const char * const axp20x_model_names[] = {
 	[AXP223_ID] = "AXP223",
 	[AXP288_ID] = "AXP288",
 	[AXP313A_ID] = "AXP313a",
+	[AXP323_ID] = "AXP323",
 	[AXP717_ID] = "AXP717",
 	[AXP803_ID] = "AXP803",
 	[AXP806_ID] = "AXP806",
@@ -193,6 +194,10 @@ static const struct regmap_range axp313a_writeable_ranges[] = {
 	regmap_reg_range(AXP313A_ON_INDICATE, AXP313A_IRQ_STATE),
 };
 
+static const struct regmap_range axp323_writeable_ranges[] = {
+	regmap_reg_range(AXP313A_ON_INDICATE, AXP323_DCDC_MODE_CTRL2),
+};
+
 static const struct regmap_range axp313a_volatile_ranges[] = {
 	regmap_reg_range(AXP313A_SHUTDOWN_CTRL, AXP313A_SHUTDOWN_CTRL),
 	regmap_reg_range(AXP313A_IRQ_STATE, AXP313A_IRQ_STATE),
@@ -203,6 +208,11 @@ static const struct regmap_access_table axp313a_writeable_table = {
 	.n_yes_ranges = ARRAY_SIZE(axp313a_writeable_ranges),
 };
 
+static const struct regmap_access_table axp323_writeable_table = {
+	.yes_ranges = axp323_writeable_ranges,
+	.n_yes_ranges = ARRAY_SIZE(axp323_writeable_ranges),
+};
+
 static const struct regmap_access_table axp313a_volatile_table = {
 	.yes_ranges = axp313a_volatile_ranges,
 	.n_yes_ranges = ARRAY_SIZE(axp313a_volatile_ranges),
@@ -433,6 +443,15 @@ static const struct regmap_config axp313a_regmap_config = {
 	.cache_type = REGCACHE_MAPLE,
 };
 
+static const struct regmap_config axp323_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.wr_table = &axp323_writeable_table,
+	.volatile_table = &axp313a_volatile_table,
+	.max_register = AXP323_DCDC_MODE_CTRL2,
+	.cache_type = REGCACHE_MAPLE,
+};
+
 static const struct regmap_config axp717_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 8,
@@ -1221,6 +1240,7 @@ static int axp20x_power_off(struct sys_off_data *data)
 	unsigned int shutdown_reg;
 
 	switch (axp20x->variant) {
+	case AXP323_ID:
 	case AXP313A_ID:
 		shutdown_reg = AXP313A_SHUTDOWN_CTRL;
 		break;
@@ -1289,6 +1309,12 @@ int axp20x_match_device(struct axp20x_dev *axp20x)
 		axp20x->regmap_cfg = &axp313a_regmap_config;
 		axp20x->regmap_irq_chip = &axp313a_regmap_irq_chip;
 		break;
+	case AXP323_ID:
+		axp20x->nr_cells = ARRAY_SIZE(axp313a_cells);
+		axp20x->cells = axp313a_cells;
+		axp20x->regmap_cfg = &axp323_regmap_config;
+		axp20x->regmap_irq_chip = &axp313a_regmap_irq_chip;
+		break;
 	case AXP717_ID:
 		axp20x->nr_cells = ARRAY_SIZE(axp717_cells);
 		axp20x->cells = axp717_cells;
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 79ecaaaa2070..c3df0e615fbf 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -19,6 +19,7 @@ enum axp20x_variants {
 	AXP223_ID,
 	AXP288_ID,
 	AXP313A_ID,
+	AXP323_ID,
 	AXP717_ID,
 	AXP803_ID,
 	AXP806_ID,
@@ -113,6 +114,7 @@ enum axp20x_variants {
 #define AXP313A_SHUTDOWN_CTRL		0x1a
 #define AXP313A_IRQ_EN			0x20
 #define AXP313A_IRQ_STATE		0x21
+#define AXP323_DCDC_MODE_CTRL2		0x22
 
 #define AXP717_ON_INDICATE		0x00
 #define AXP717_PMU_STATUS_2		0x01
-- 
cgit v1.2.3


From 6e31bb8d3a63bb2c3efab2fb6bcfccac677a4581 Mon Sep 17 00:00:00 2001
From: Yassine Oudjana <y.oudjana@protonmail.com>
Date: Fri, 18 Oct 2024 11:10:46 +0300
Subject: mfd: mt6397: Add initial support for MT6328

The MT6328 PMIC is commonly used with the MT6735 SoC. Add initial
support for this PMIC.

Signed-off-by: Yassine Oudjana <y.oudjana@protonmail.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20241018081050.23592-5-y.oudjana@protonmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/mt6397-core.c            |  32 ++
 drivers/mfd/mt6397-irq.c             |  23 +
 include/linux/mfd/mt6328/core.h      |  53 +++
 include/linux/mfd/mt6328/registers.h | 822 +++++++++++++++++++++++++++++++++++
 include/linux/mfd/mt6397/core.h      |  11 +-
 5 files changed, 936 insertions(+), 5 deletions(-)
 create mode 100644 include/linux/mfd/mt6328/core.h
 create mode 100644 include/linux/mfd/mt6328/registers.h

(limited to 'include/linux')

diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c
index c2939e785818..0e5d59ae064a 100644
--- a/drivers/mfd/mt6397-core.c
+++ b/drivers/mfd/mt6397-core.c
@@ -13,12 +13,14 @@
 #include <linux/regmap.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/mt6323/core.h>
+#include <linux/mfd/mt6328/core.h>
 #include <linux/mfd/mt6331/core.h>
 #include <linux/mfd/mt6357/core.h>
 #include <linux/mfd/mt6358/core.h>
 #include <linux/mfd/mt6359/core.h>
 #include <linux/mfd/mt6397/core.h>
 #include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6328/registers.h>
 #include <linux/mfd/mt6331/registers.h>
 #include <linux/mfd/mt6357/registers.h>
 #include <linux/mfd/mt6358/registers.h>
@@ -87,6 +89,13 @@ static const struct resource mt6323_keys_resources[] = {
 	DEFINE_RES_IRQ_NAMED(MT6323_IRQ_STATUS_FCHRKEY, "homekey"),
 };
 
+static const struct resource mt6328_keys_resources[] = {
+	DEFINE_RES_IRQ_NAMED(MT6328_IRQ_STATUS_PWRKEY, "powerkey"),
+	DEFINE_RES_IRQ_NAMED(MT6328_IRQ_STATUS_HOMEKEY, "homekey"),
+	DEFINE_RES_IRQ_NAMED(MT6328_IRQ_STATUS_PWRKEY_R, "powerkey_r"),
+	DEFINE_RES_IRQ_NAMED(MT6328_IRQ_STATUS_HOMEKEY_R, "homekey_r"),
+};
+
 static const struct resource mt6357_keys_resources[] = {
 	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_PWRKEY, "powerkey"),
 	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_HOMEKEY, "homekey"),
@@ -133,6 +142,18 @@ static const struct mfd_cell mt6323_devs[] = {
 	},
 };
 
+static const struct mfd_cell mt6328_devs[] = {
+	{
+		.name = "mt6328-regulator",
+		.of_compatible = "mediatek,mt6328-regulator"
+	}, {
+		.name = "mtk-pmic-keys",
+		.num_resources = ARRAY_SIZE(mt6328_keys_resources),
+		.resources = mt6328_keys_resources,
+		.of_compatible = "mediatek,mt6328-keys"
+	},
+};
+
 static const struct mfd_cell mt6357_devs[] = {
 	{
 		.name = "mt6359-auxadc",
@@ -262,6 +283,14 @@ static const struct chip_data mt6323_core = {
 	.irq_init = mt6397_irq_init,
 };
 
+static const struct chip_data mt6328_core = {
+	.cid_addr = MT6328_HWCID,
+	.cid_shift = 0,
+	.cells = mt6328_devs,
+	.cell_size = ARRAY_SIZE(mt6328_devs),
+	.irq_init = mt6397_irq_init,
+};
+
 static const struct chip_data mt6357_core = {
 	.cid_addr = MT6357_SWCID,
 	.cid_shift = 8,
@@ -360,6 +389,9 @@ static const struct of_device_id mt6397_of_match[] = {
 	{
 		.compatible = "mediatek,mt6323",
 		.data = &mt6323_core,
+	}, {
+		.compatible = "mediatek,mt6328",
+		.data = &mt6328_core,
 	}, {
 		.compatible = "mediatek,mt6331",
 		.data = &mt6331_mt6332_core,
diff --git a/drivers/mfd/mt6397-irq.c b/drivers/mfd/mt6397-irq.c
index 886745b5b607..1310665200ed 100644
--- a/drivers/mfd/mt6397-irq.c
+++ b/drivers/mfd/mt6397-irq.c
@@ -11,6 +11,8 @@
 #include <linux/suspend.h>
 #include <linux/mfd/mt6323/core.h>
 #include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6328/core.h>
+#include <linux/mfd/mt6328/registers.h>
 #include <linux/mfd/mt6331/core.h>
 #include <linux/mfd/mt6331/registers.h>
 #include <linux/mfd/mt6397/core.h>
@@ -31,6 +33,9 @@ static void mt6397_irq_sync_unlock(struct irq_data *data)
 		     mt6397->irq_masks_cur[0]);
 	regmap_write(mt6397->regmap, mt6397->int_con[1],
 		     mt6397->irq_masks_cur[1]);
+	if (mt6397->int_con[2])
+		regmap_write(mt6397->regmap, mt6397->int_con[2],
+			     mt6397->irq_masks_cur[2]);
 
 	mutex_unlock(&mt6397->irqlock);
 }
@@ -105,6 +110,8 @@ static irqreturn_t mt6397_irq_thread(int irq, void *data)
 
 	mt6397_irq_handle_reg(mt6397, mt6397->int_status[0], 0);
 	mt6397_irq_handle_reg(mt6397, mt6397->int_status[1], 16);
+	if (mt6397->int_status[2])
+		mt6397_irq_handle_reg(mt6397, mt6397->int_status[2], 32);
 
 	return IRQ_HANDLED;
 }
@@ -138,6 +145,9 @@ static int mt6397_irq_pm_notifier(struct notifier_block *notifier,
 			     chip->int_con[0], chip->wake_mask[0]);
 		regmap_write(chip->regmap,
 			     chip->int_con[1], chip->wake_mask[1]);
+		if (chip->int_con[2])
+			regmap_write(chip->regmap,
+				     chip->int_con[2], chip->wake_mask[2]);
 		enable_irq_wake(chip->irq);
 		break;
 
@@ -146,6 +156,9 @@ static int mt6397_irq_pm_notifier(struct notifier_block *notifier,
 			     chip->int_con[0], chip->irq_masks_cur[0]);
 		regmap_write(chip->regmap,
 			     chip->int_con[1], chip->irq_masks_cur[1]);
+		if (chip->int_con[2])
+			regmap_write(chip->regmap,
+				     chip->int_con[2], chip->irq_masks_cur[2]);
 		disable_irq_wake(chip->irq);
 		break;
 
@@ -169,6 +182,14 @@ int mt6397_irq_init(struct mt6397_chip *chip)
 		chip->int_status[0] = MT6323_INT_STATUS0;
 		chip->int_status[1] = MT6323_INT_STATUS1;
 		break;
+	case MT6328_CHIP_ID:
+		chip->int_con[0] = MT6328_INT_CON0;
+		chip->int_con[1] = MT6328_INT_CON1;
+		chip->int_con[2] = MT6328_INT_CON2;
+		chip->int_status[0] = MT6328_INT_STATUS0;
+		chip->int_status[1] = MT6328_INT_STATUS1;
+		chip->int_status[2] = MT6328_INT_STATUS2;
+		break;
 	case MT6331_CHIP_ID:
 		chip->int_con[0] = MT6331_INT_CON0;
 		chip->int_con[1] = MT6331_INT_CON1;
@@ -191,6 +212,8 @@ int mt6397_irq_init(struct mt6397_chip *chip)
 	/* Mask all interrupt sources */
 	regmap_write(chip->regmap, chip->int_con[0], 0x0);
 	regmap_write(chip->regmap, chip->int_con[1], 0x0);
+	if (chip->int_con[2])
+		regmap_write(chip->regmap, chip->int_con[2], 0x0);
 
 	chip->pm_nb.notifier_call = mt6397_irq_pm_notifier;
 	chip->irq_domain = irq_domain_add_linear(chip->dev->of_node,
diff --git a/include/linux/mfd/mt6328/core.h b/include/linux/mfd/mt6328/core.h
new file mode 100644
index 000000000000..9a08aed72b9f
--- /dev/null
+++ b/include/linux/mfd/mt6328/core.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2015 MediaTek Inc.
+ * Copyright (c) 2022 Yassine Oudjana <y.oudjana@protonmail.com>
+ */
+
+#ifndef __MFD_MT6328_CORE_H__
+#define __MFD_MT6328_CORE_H__
+
+enum mt6328_irq_status_numbers {
+	MT6328_IRQ_STATUS_PWRKEY = 0,
+	MT6328_IRQ_STATUS_HOMEKEY,
+	MT6328_IRQ_STATUS_PWRKEY_R,
+	MT6328_IRQ_STATUS_HOMEKEY_R,
+	MT6328_IRQ_STATUS_THR_H,
+	MT6328_IRQ_STATUS_THR_L,
+	MT6328_IRQ_STATUS_BAT_H,
+	MT6328_IRQ_STATUS_BAT_L,
+	MT6328_IRQ_STATUS_RTC,
+	MT6328_IRQ_STATUS_AUDIO,
+	MT6328_IRQ_STATUS_ACCDET,
+	MT6328_IRQ_STATUS_ACCDET_EINT,
+	MT6328_IRQ_STATUS_ACCDET_NEGV,
+	MT6328_IRQ_STATUS_NI_LBAT_INT,
+	MT6328_IRQ_STATUS_VPROC_OC = 16,
+	MT6328_IRQ_STATUS_VSYS_OC,
+	MT6328_IRQ_STATUS_VLTE_OC,
+	MT6328_IRQ_STATUS_VCORE_OC,
+	MT6328_IRQ_STATUS_VPA_OC,
+	MT6328_IRQ_STATUS_LDO_OC,
+	MT6328_IRQ_STATUS_BAT2_H,
+	MT6328_IRQ_STATUS_BAT2_L,
+	MT6328_IRQ_STATUS_VISMPS0_H,
+	MT6328_IRQ_STATUS_VISMPS0_L,
+	MT6328_IRQ_STATUS_AUXADC_IMP,
+	MT6328_IRQ_STATUS_OV = 32,
+	MT6328_IRQ_STATUS_BVALID_DET,
+	MT6328_IRQ_STATUS_VBATON_HV,
+	MT6328_IRQ_STATUS_VBATON_UNDET,
+	MT6328_IRQ_STATUS_WATCHDOG,
+	MT6328_IRQ_STATUS_PCHR_CM_VDEC,
+	MT6328_IRQ_STATUS_CHRDET,
+	MT6328_IRQ_STATUS_PCHR_CM_VINC,
+	MT6328_IRQ_STATUS_FG_BAT_H,
+	MT6328_IRQ_STATUS_FG_BAT_L,
+	MT6328_IRQ_STATUS_FG_CUR_H,
+	MT6328_IRQ_STATUS_FG_CUR_L,
+	MT6328_IRQ_STATUS_FG_ZCV,
+	MT6328_IRQ_STATUS_SPKL_D,
+	MT6328_IRQ_STATUS_SPKL_AB,
+};
+
+#endif /* __MFD_MT6323_CORE_H__ */
diff --git a/include/linux/mfd/mt6328/registers.h b/include/linux/mfd/mt6328/registers.h
new file mode 100644
index 000000000000..8199aaea27b9
--- /dev/null
+++ b/include/linux/mfd/mt6328/registers.h
@@ -0,0 +1,822 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 Yassine Oudjana <y.oudjana@protonmail.com>
+ */
+
+#ifndef __MFD_MT6328_REGISTERS_H__
+#define __MFD_MT6328_REGISTERS_H__
+
+/* PMIC Registers */
+#define MT6328_STRUP_CON0            0x0000
+#define MT6328_STRUP_CON2            0x0002
+#define MT6328_STRUP_CON3            0x0004
+#define MT6328_STRUP_CON4            0x0006
+#define MT6328_STRUP_CON5            0x0008
+#define MT6328_STRUP_CON6            0x000a
+#define MT6328_STRUP_CON7            0x000c
+#define MT6328_STRUP_CON8            0x000e
+#define MT6328_STRUP_CON9            0x0010
+#define MT6328_STRUP_CON10           0x0012
+#define MT6328_STRUP_CON11           0x0014
+#define MT6328_STRUP_CON12           0x0016
+#define MT6328_STRUP_CON13           0x0018
+#define MT6328_STRUP_CON14           0x001a
+#define MT6328_STRUP_CON15           0x001c
+#define MT6328_STRUP_CON16           0x001e
+#define MT6328_STRUP_CON17           0x0020
+#define MT6328_STRUP_CON18           0x0022
+#define MT6328_STRUP_CON19           0x0024
+#define MT6328_STRUP_CON20           0x0026
+#define MT6328_STRUP_CON21           0x0028
+#define MT6328_STRUP_CON22           0x002a
+#define MT6328_STRUP_CON23           0x002c
+#define MT6328_STRUP_CON24           0x002e
+#define MT6328_STRUP_CON25           0x0030
+#define MT6328_STRUP_CON26           0x0032
+#define MT6328_STRUP_CON27           0x0034
+#define MT6328_STRUP_CON28           0x0036
+#define MT6328_STRUP_CON29           0x0038
+#define MT6328_STRUP_CON30           0x003a
+#define MT6328_STRUP_CON31           0x003c
+#define MT6328_STRUP_CON32           0x003e
+#define MT6328_STRUP_ANA_CON0        0x0040
+#define MT6328_HWCID                 0x0200
+#define MT6328_SWCID                 0x0202
+#define MT6328_TOP_CON               0x0204
+#define MT6328_TEST_OUT              0x0206
+#define MT6328_TEST_CON0             0x0208
+#define MT6328_TEST_CON1             0x020a
+#define MT6328_TESTMODE_SW           0x020c
+#define MT6328_EN_STATUS0            0x020e
+#define MT6328_EN_STATUS1            0x0210
+#define MT6328_EN_STATUS2            0x0212
+#define MT6328_OCSTATUS0             0x0214
+#define MT6328_OCSTATUS1             0x0216
+#define MT6328_OCSTATUS2             0x0218
+#define MT6328_PGDEBSTATUS           0x021a
+#define MT6328_PGSTATUS              0x021c
+#define MT6328_THERMALSTATUS         0x021e
+#define MT6328_TOPSTATUS             0x0220
+#define MT6328_TDSEL_CON             0x0222
+#define MT6328_RDSEL_CON             0x0224
+#define MT6328_SMT_CON0              0x0226
+#define MT6328_SMT_CON1              0x0228
+#define MT6328_SMT_CON2              0x022a
+#define MT6328_DRV_CON0              0x022c
+#define MT6328_DRV_CON1              0x022e
+#define MT6328_DRV_CON2              0x0230
+#define MT6328_DRV_CON3              0x0232
+#define MT6328_TOP_STATUS            0x0234
+#define MT6328_TOP_STATUS_SET        0x0236
+#define MT6328_TOP_STATUS_CLR        0x0238
+#define MT6328_RGS_ANA_MON           0x023a
+#define MT6328_TOP_CKPDN_CON0        0x023c
+#define MT6328_TOP_CKPDN_CON0_SET    0x023e
+#define MT6328_TOP_CKPDN_CON0_CLR    0x0240
+#define MT6328_TOP_CKPDN_CON1        0x0242
+#define MT6328_TOP_CKPDN_CON1_SET    0x0244
+#define MT6328_TOP_CKPDN_CON1_CLR    0x0246
+#define MT6328_TOP_CKPDN_CON2        0x0248
+#define MT6328_TOP_CKPDN_CON2_SET    0x024a
+#define MT6328_TOP_CKPDN_CON2_CLR    0x024c
+#define MT6328_TOP_CKPDN_CON3        0x024e
+#define MT6328_TOP_CKPDN_CON3_SET    0x0250
+#define MT6328_TOP_CKPDN_CON3_CLR    0x0252
+#define MT6328_TOP_CKPDN_CON4        0x0254
+#define MT6328_TOP_CKPDN_CON4_SET    0x0256
+#define MT6328_TOP_CKPDN_CON4_CLR    0x0258
+#define MT6328_TOP_CKSEL_CON0        0x025a
+#define MT6328_TOP_CKSEL_CON0_SET    0x025c
+#define MT6328_TOP_CKSEL_CON0_CLR    0x025e
+#define MT6328_TOP_CKSEL_CON1        0x0260
+#define MT6328_TOP_CKSEL_CON1_SET    0x0262
+#define MT6328_TOP_CKSEL_CON1_CLR    0x0264
+#define MT6328_TOP_CKSEL_CON2        0x0266
+#define MT6328_TOP_CKSEL_CON2_SET    0x0268
+#define MT6328_TOP_CKSEL_CON2_CLR    0x026a
+#define MT6328_TOP_CKDIVSEL_CON0     0x026c
+#define MT6328_TOP_CKDIVSEL_CON0_SET 0x026e
+#define MT6328_TOP_CKDIVSEL_CON0_CLR 0x0270
+#define MT6328_TOP_CKDIVSEL_CON1     0x0272
+#define MT6328_TOP_CKDIVSEL_CON1_SET 0x0274
+#define MT6328_TOP_CKDIVSEL_CON1_CLR 0x0276
+#define MT6328_TOP_CKHWEN_CON0       0x0278
+#define MT6328_TOP_CKHWEN_CON0_SET   0x027a
+#define MT6328_TOP_CKHWEN_CON0_CLR   0x027c
+#define MT6328_TOP_CKHWEN_CON1       0x027e
+#define MT6328_TOP_CKHWEN_CON1_SET   0x0280
+#define MT6328_TOP_CKHWEN_CON1_CLR   0x0282
+#define MT6328_TOP_CKTST_CON0        0x0284
+#define MT6328_TOP_CKTST_CON1        0x0286
+#define MT6328_TOP_CKTST_CON2        0x0288
+#define MT6328_TOP_CLKSQ             0x028a
+#define MT6328_TOP_CLKSQ_SET         0x028c
+#define MT6328_TOP_CLKSQ_CLR         0x028e
+#define MT6328_TOP_CLKSQ_RTC         0x0290
+#define MT6328_TOP_CLKSQ_RTC_SET     0x0292
+#define MT6328_TOP_CLKSQ_RTC_CLR     0x0294
+#define MT6328_TOP_CLK_TRIM          0x0296
+#define MT6328_TOP_RST_CON0          0x0298
+#define MT6328_TOP_RST_CON0_SET      0x029a
+#define MT6328_TOP_RST_CON0_CLR      0x029c
+#define MT6328_TOP_RST_CON1          0x029e
+#define MT6328_TOP_RST_MISC          0x02a0
+#define MT6328_TOP_RST_MISC_SET      0x02a2
+#define MT6328_TOP_RST_MISC_CLR      0x02a4
+#define MT6328_TOP_RST_STATUS        0x02a6
+#define MT6328_TOP_RST_STATUS_SET    0x02a8
+#define MT6328_TOP_RST_STATUS_CLR    0x02aa
+#define MT6328_INT_CON0              0x02ac
+#define MT6328_INT_CON0_SET          0x02ae
+#define MT6328_INT_CON0_CLR          0x02b0
+#define MT6328_INT_CON1              0x02b2
+#define MT6328_INT_CON1_SET          0x02b4
+#define MT6328_INT_CON1_CLR          0x02b6
+#define MT6328_INT_CON2              0x02b8
+#define MT6328_INT_CON2_SET          0x02ba
+#define MT6328_INT_CON2_CLR          0x02bc
+#define MT6328_INT_MISC_CON          0x02be
+#define MT6328_INT_MISC_CON_SET      0x02c0
+#define MT6328_INT_MISC_CON_CLR      0x02c2
+#define MT6328_INT_STATUS0           0x02c4
+#define MT6328_INT_STATUS1           0x02c6
+#define MT6328_INT_STATUS2           0x02c8
+#define MT6328_OC_GEAR_0             0x02ca
+#define MT6328_FQMTR_CON0            0x02cc
+#define MT6328_FQMTR_CON1            0x02ce
+#define MT6328_FQMTR_CON2            0x02d0
+#define MT6328_RG_SPI_CON            0x02d2
+#define MT6328_DEW_DIO_EN            0x02d4
+#define MT6328_DEW_READ_TEST         0x02d6
+#define MT6328_DEW_WRITE_TEST        0x02d8
+#define MT6328_DEW_CRC_SWRST         0x02da
+#define MT6328_DEW_CRC_EN            0x02dc
+#define MT6328_DEW_CRC_VAL           0x02de
+#define MT6328_DEW_DBG_MON_SEL       0x02e0
+#define MT6328_DEW_CIPHER_KEY_SEL    0x02e2
+#define MT6328_DEW_CIPHER_IV_SEL     0x02e4
+#define MT6328_DEW_CIPHER_EN         0x02e6
+#define MT6328_DEW_CIPHER_RDY        0x02e8
+#define MT6328_DEW_CIPHER_MODE       0x02ea
+#define MT6328_DEW_CIPHER_SWRST      0x02ec
+#define MT6328_DEW_RDDMY_NO          0x02ee
+#define MT6328_INT_TYPE_CON0         0x02f0
+#define MT6328_INT_TYPE_CON0_SET     0x02f2
+#define MT6328_INT_TYPE_CON0_CLR     0x02f4
+#define MT6328_INT_TYPE_CON1         0x02f6
+#define MT6328_INT_TYPE_CON1_SET     0x02f8
+#define MT6328_INT_TYPE_CON1_CLR     0x02fa
+#define MT6328_INT_TYPE_CON2         0x02fc
+#define MT6328_INT_TYPE_CON2_SET     0x02fe
+#define MT6328_INT_TYPE_CON2_CLR     0x0300
+#define MT6328_INT_STA               0x0302
+#define MT6328_BUCK_ALL_CON0         0x0400
+#define MT6328_BUCK_ALL_CON1         0x0402
+#define MT6328_BUCK_ALL_CON2         0x0404
+#define MT6328_BUCK_ALL_CON3         0x0406
+#define MT6328_BUCK_ALL_CON4         0x0408
+#define MT6328_BUCK_ALL_CON5         0x040a
+#define MT6328_BUCK_ALL_CON6         0x040c
+#define MT6328_BUCK_ALL_CON9         0x040e
+#define MT6328_BUCK_ALL_CON12        0x0410
+#define MT6328_BUCK_ALL_CON13        0x0412
+#define MT6328_BUCK_ALL_CON14        0x0414
+#define MT6328_BUCK_ALL_CON16        0x0416
+#define MT6328_BUCK_ALL_CON18        0x0418
+#define MT6328_BUCK_ALL_CON19        0x041a
+#define MT6328_BUCK_ALL_CON20        0x041c
+#define MT6328_BUCK_ALL_CON21        0x041e
+#define MT6328_BUCK_ALL_CON22        0x0420
+#define MT6328_BUCK_ALL_CON23        0x0422
+#define MT6328_BUCK_ALL_CON24        0x0424
+#define MT6328_BUCK_ALL_CON25        0x0426
+#define MT6328_BUCK_ALL_CON26        0x0428
+#define MT6328_BUCK_ALL_CON27        0x042a
+#define MT6328_BUCK_ALL_CON28        0x042c
+#define MT6328_SMPS_TOP_ANA_CON0     0x042e
+#define MT6328_SMPS_TOP_ANA_CON1     0x0430
+#define MT6328_SMPS_TOP_ANA_CON2     0x0432
+#define MT6328_SMPS_TOP_ANA_CON3     0x0434
+#define MT6328_SMPS_TOP_ANA_CON4     0x0436
+#define MT6328_SMPS_TOP_ANA_CON5     0x0438
+#define MT6328_SMPS_TOP_ANA_CON6     0x043a
+#define MT6328_SMPS_TOP_ANA_CON7     0x043c
+#define MT6328_SMPS_TOP_ANA_CON8     0x043e
+#define MT6328_VCORE_ANA_CON0        0x0440
+#define MT6328_VCORE_ANA_CON1        0x0442
+#define MT6328_VCORE_ANA_CON2        0x0444
+#define MT6328_VCORE_ANA_CON3        0x0446
+#define MT6328_VCORE_ANA_CON4        0x0448
+#define MT6328_VSYS22_ANA_CON0       0x044a
+#define MT6328_VSYS22_ANA_CON1       0x044c
+#define MT6328_VSYS22_ANA_CON2       0x044e
+#define MT6328_VSYS22_ANA_CON3       0x0450
+#define MT6328_VSYS22_ANA_CON4       0x0452
+#define MT6328_VPROC_ANA_CON0        0x0454
+#define MT6328_VPROC_ANA_CON1        0x0456
+#define MT6328_VPROC_ANA_CON2        0x0458
+#define MT6328_VPROC_ANA_CON3        0x045a
+#define MT6328_VPROC_ANA_CON4        0x045c
+#define MT6328_OSC32_ANA_CON0        0x045e
+#define MT6328_OSC32_ANA_CON1        0x0460
+#define MT6328_VPA_ANA_CON0          0x0462
+#define MT6328_VPA_ANA_CON1          0x0464
+#define MT6328_VPA_ANA_CON2          0x0466
+#define MT6328_VPA_ANA_CON3          0x0468
+#define MT6328_VLTE_ANA_CON0         0x046a
+#define MT6328_VLTE_ANA_CON1         0x046c
+#define MT6328_VLTE_ANA_CON2         0x046e
+#define MT6328_VLTE_ANA_CON3         0x0470
+#define MT6328_VLTE_ANA_CON4         0x0472
+#define MT6328_VPROC_CON0            0x0474
+#define MT6328_VPROC_CON1            0x0476
+#define MT6328_VPROC_CON2            0x0478
+#define MT6328_VPROC_CON3            0x047a
+#define MT6328_VPROC_CON4            0x047c
+#define MT6328_VPROC_CON5            0x047e
+#define MT6328_VPROC_CON6            0x0480
+#define MT6328_VPROC_CON7            0x0482
+#define MT6328_VPROC_CON8            0x0484
+#define MT6328_VPROC_CON9            0x0486
+#define MT6328_VPROC_CON10           0x0488
+#define MT6328_VPROC_CON11           0x048a
+#define MT6328_VPROC_CON12           0x048c
+#define MT6328_VPROC_CON13           0x048e
+#define MT6328_VPROC_CON14           0x0490
+#define MT6328_VPROC_CON15           0x0492
+#define MT6328_VPROC_CON16           0x0494
+#define MT6328_VPROC_CON17           0x0496
+#define MT6328_VPROC_CON18           0x0498
+#define MT6328_VPROC_CON19           0x049a
+#define MT6328_VSRAM_CON0            0x049c
+#define MT6328_VSRAM_CON1            0x049e
+#define MT6328_VSRAM_CON2            0x04a0
+#define MT6328_VSRAM_CON3            0x04a2
+#define MT6328_VSRAM_CON4            0x04a4
+#define MT6328_VSRAM_CON5            0x04a6
+#define MT6328_VSRAM_CON6            0x04a8
+#define MT6328_VSRAM_CON7            0x04aa
+#define MT6328_VSRAM_CON8            0x04ac
+#define MT6328_VSRAM_CON9            0x04ae
+#define MT6328_VSRAM_CON10           0x04b0
+#define MT6328_VSRAM_CON11           0x04b2
+#define MT6328_VSRAM_CON12           0x04b4
+#define MT6328_VSRAM_CON13           0x04b6
+#define MT6328_VSRAM_CON14           0x04b8
+#define MT6328_VSRAM_CON15           0x04ba
+#define MT6328_VSRAM_CON16           0x04bc
+#define MT6328_VSRAM_CON17           0x04be
+#define MT6328_VSRAM_CON18           0x04c0
+#define MT6328_VSRAM_CON19           0x04c2
+#define MT6328_VLTE_CON0             0x04c4
+#define MT6328_VLTE_CON1             0x04c6
+#define MT6328_VLTE_CON2             0x04c8
+#define MT6328_VLTE_CON3             0x04ca
+#define MT6328_VLTE_CON4             0x04cc
+#define MT6328_VLTE_CON5             0x04ce
+#define MT6328_VLTE_CON6             0x04d0
+#define MT6328_VLTE_CON7             0x04d2
+#define MT6328_VLTE_CON8             0x04d4
+#define MT6328_VLTE_CON9             0x04d6
+#define MT6328_VLTE_CON10            0x04d8
+#define MT6328_VLTE_CON11            0x04da
+#define MT6328_VLTE_CON12            0x04dc
+#define MT6328_VLTE_CON13            0x04de
+#define MT6328_VLTE_CON14            0x04e0
+#define MT6328_VLTE_CON15            0x04e2
+#define MT6328_VLTE_CON16            0x04e4
+#define MT6328_VLTE_CON17            0x04e6
+#define MT6328_VLTE_CON18            0x04e8
+#define MT6328_VLTE_CON19            0x04ea
+#define MT6328_VCORE1_CON0           0x0600
+#define MT6328_VCORE1_CON1           0x0602
+#define MT6328_VCORE1_CON2           0x0604
+#define MT6328_VCORE1_CON3           0x0606
+#define MT6328_VCORE1_CON4           0x0608
+#define MT6328_VCORE1_CON5           0x060a
+#define MT6328_VCORE1_CON6           0x060c
+#define MT6328_VCORE1_CON7           0x060e
+#define MT6328_VCORE1_CON8           0x0610
+#define MT6328_VCORE1_CON9           0x0612
+#define MT6328_VCORE1_CON10          0x0614
+#define MT6328_VCORE1_CON11          0x0616
+#define MT6328_VCORE1_CON12          0x0618
+#define MT6328_VCORE1_CON13          0x061a
+#define MT6328_VCORE1_CON14          0x061c
+#define MT6328_VCORE1_CON15          0x061e
+#define MT6328_VCORE1_CON16          0x0620
+#define MT6328_VCORE1_CON17          0x0622
+#define MT6328_VCORE1_CON18          0x0624
+#define MT6328_VCORE1_CON19          0x0626
+#define MT6328_VSYS22_CON0           0x0628
+#define MT6328_VSYS22_CON1           0x062a
+#define MT6328_VSYS22_CON2           0x062c
+#define MT6328_VSYS22_CON3           0x062e
+#define MT6328_VSYS22_CON4           0x0630
+#define MT6328_VSYS22_CON5           0x0632
+#define MT6328_VSYS22_CON6           0x0634
+#define MT6328_VSYS22_CON7           0x0636
+#define MT6328_VSYS22_CON8           0x0638
+#define MT6328_VSYS22_CON9           0x063a
+#define MT6328_VSYS22_CON10          0x063c
+#define MT6328_VSYS22_CON11          0x063e
+#define MT6328_VSYS22_CON12          0x0640
+#define MT6328_VSYS22_CON13          0x0642
+#define MT6328_VSYS22_CON14          0x0644
+#define MT6328_VSYS22_CON15          0x0646
+#define MT6328_VSYS22_CON16          0x0648
+#define MT6328_VSYS22_CON17          0x064a
+#define MT6328_VSYS22_CON18          0x064c
+#define MT6328_VSYS22_CON19          0x064e
+#define MT6328_VPA_CON0              0x0650
+#define MT6328_VPA_CON1              0x0652
+#define MT6328_VPA_CON2              0x0654
+#define MT6328_VPA_CON3              0x0656
+#define MT6328_VPA_CON4              0x0658
+#define MT6328_VPA_CON5              0x065a
+#define MT6328_VPA_CON6              0x065c
+#define MT6328_VPA_CON7              0x065e
+#define MT6328_VPA_CON8              0x0660
+#define MT6328_VPA_CON9              0x0662
+#define MT6328_VPA_CON10             0x0664
+#define MT6328_VPA_CON11             0x0666
+#define MT6328_VPA_CON12             0x0668
+#define MT6328_VPA_CON13             0x066a
+#define MT6328_VPA_CON14             0x066c
+#define MT6328_VPA_CON15             0x066e
+#define MT6328_VPA_CON16             0x0670
+#define MT6328_VPA_CON17             0x0672
+#define MT6328_VPA_CON18             0x0674
+#define MT6328_VPA_CON19             0x0676
+#define MT6328_VPA_CON20             0x0678
+#define MT6328_VPA_CON21             0x067a
+#define MT6328_VPA_CON22             0x067c
+#define MT6328_VPA_CON23             0x067e
+#define MT6328_VPA_CON24             0x0680
+#define MT6328_BUCK_K_CON0           0x0682
+#define MT6328_BUCK_K_CON1           0x0684
+#define MT6328_BUCK_K_CON2           0x0686
+#define MT6328_BUCK_K_CON3           0x0688
+#define MT6328_ZCD_CON0              0x0800
+#define MT6328_ZCD_CON1              0x0802
+#define MT6328_ZCD_CON2              0x0804
+#define MT6328_ZCD_CON3              0x0806
+#define MT6328_ZCD_CON4              0x0808
+#define MT6328_ZCD_CON5              0x080a
+#define MT6328_ISINK0_CON0           0x080c
+#define MT6328_ISINK0_CON1           0x080e
+#define MT6328_ISINK0_CON2           0x0810
+#define MT6328_ISINK0_CON3           0x0812
+#define MT6328_ISINK1_CON0           0x0814
+#define MT6328_ISINK1_CON1           0x0816
+#define MT6328_ISINK1_CON2           0x0818
+#define MT6328_ISINK1_CON3           0x081a
+#define MT6328_ISINK2_CON1           0x081c
+#define MT6328_ISINK3_CON1           0x081e
+#define MT6328_ISINK_ANA0            0x0820
+#define MT6328_ISINK_ANA1            0x0822
+#define MT6328_ISINK_PHASE_DLY       0x0824
+#define MT6328_ISINK_SFSTR           0x0826
+#define MT6328_ISINK_EN_CTRL         0x0828
+#define MT6328_ISINK_MODE_CTRL       0x082a
+#define MT6328_VTCXO_0_CON0          0x0a00
+#define MT6328_VTCXO_1_CON0          0x0a02
+#define MT6328_VAUD28_CON0           0x0a04
+#define MT6328_VAUX18_CON0           0x0a06
+#define MT6328_VRF18_0_CON0          0x0a08
+#define MT6328_VRF18_0_CON1          0x0a0a
+#define MT6328_VCAMA_CON0            0x0a0c
+#define MT6328_VCN28_CON0            0x0a0e
+#define MT6328_VCN33_CON0            0x0a10
+#define MT6328_VCN33_CON1            0x0a12
+#define MT6328_VCN33_CON2            0x0a14
+#define MT6328_VRF18_1_CON0          0x0a16
+#define MT6328_VRF18_1_CON1          0x0a18
+#define MT6328_VUSB33_CON0           0x0a1a
+#define MT6328_VMCH_CON0             0x0a1c
+#define MT6328_VMCH_CON1             0x0a1e
+#define MT6328_VMC_CON0              0x0a20
+#define MT6328_VMC_CON1              0x0a22
+#define MT6328_VEMC_3V3_CON0         0x0a24
+#define MT6328_VEMC_3V3_CON1         0x0a26
+#define MT6328_VIO28_CON0            0x0a28
+#define MT6328_VCAMAF_CON0           0x0a2a
+#define MT6328_VGP1_CON0             0x0a2c
+#define MT6328_VGP1_CON1             0x0a2e
+#define MT6328_VEFUSE_CON0           0x0a30
+#define MT6328_VSIM1_CON0            0x0a32
+#define MT6328_VSIM2_CON0            0x0a34
+#define MT6328_VIO18_CON0            0x0a36
+#define MT6328_VIBR_CON0             0x0a38
+#define MT6328_VCN18_CON0            0x0a3a
+#define MT6328_VCAM_CON0             0x0a3c
+#define MT6328_VCAMIO_CON0           0x0a3e
+#define MT6328_LDO_VSRAM_CON0        0x0a40
+#define MT6328_LDO_VSRAM_CON1        0x0a42
+#define MT6328_VTREF_CON0            0x0a44
+#define MT6328_VM_CON0               0x0a46
+#define MT6328_VM_CON1               0x0a48
+#define MT6328_VRTC_CON0             0x0a4a
+#define MT6328_LDO_OCFB0             0x0a4c
+#define MT6328_ALDO_ANA_CON0         0x0a4e
+#define MT6328_ADLDO_ANA_CON1        0x0a50
+#define MT6328_ADLDO_ANA_CON2        0x0a52
+#define MT6328_ADLDO_ANA_CON3        0x0a54
+#define MT6328_ADLDO_ANA_CON4        0x0a56
+#define MT6328_ADLDO_ANA_CON5        0x0a58
+#define MT6328_ADLDO_ANA_CON6        0x0a5a
+#define MT6328_ADLDO_ANA_CON7        0x0a5c
+#define MT6328_ADLDO_ANA_CON8        0x0a5e
+#define MT6328_ADLDO_ANA_CON9        0x0a60
+#define MT6328_ADLDO_ANA_CON10       0x0a62
+#define MT6328_ADLDO_ANA_CON11       0x0a64
+#define MT6328_ADLDO_ANA_CON12       0x0a66
+#define MT6328_ADLDO_ANA_CON13       0x0a68
+#define MT6328_DLDO_ANA_CON0         0x0a6a
+#define MT6328_DLDO_ANA_CON1         0x0a6c
+#define MT6328_DLDO_ANA_CON2         0x0a6e
+#define MT6328_DLDO_ANA_CON3         0x0a70
+#define MT6328_DLDO_ANA_CON4         0x0a72
+#define MT6328_DLDO_ANA_CON5         0x0a74
+#define MT6328_SLDO_ANA_CON0         0x0a76
+#define MT6328_SLDO_ANA_CON1         0x0a78
+#define MT6328_SLDO_ANA_CON2         0x0a7a
+#define MT6328_SLDO_ANA_CON3         0x0a7c
+#define MT6328_SLDO_ANA_CON4         0x0a7e
+#define MT6328_SLDO_ANA_CON5         0x0a80
+#define MT6328_SLDO_ANA_CON6         0x0a82
+#define MT6328_SLDO_ANA_CON7         0x0a84
+#define MT6328_SLDO_ANA_CON8         0x0a86
+#define MT6328_SLDO_ANA_CON9         0x0a88
+#define MT6328_SLDO_ANA_CON10        0x0a8a
+#define MT6328_LDO_RSV_CON0          0x0a8c
+#define MT6328_LDO_RSV_CON1          0x0a8e
+#define MT6328_SPK_CON0              0x0a90
+#define MT6328_SPK_CON1              0x0a92
+#define MT6328_SPK_CON2              0x0a94
+#define MT6328_SPK_CON3              0x0a96
+#define MT6328_SPK_CON4              0x0a98
+#define MT6328_SPK_CON5              0x0a9a
+#define MT6328_SPK_CON6              0x0a9c
+#define MT6328_SPK_CON7              0x0a9e
+#define MT6328_SPK_CON8              0x0aa0
+#define MT6328_SPK_CON9              0x0aa2
+#define MT6328_SPK_CON10             0x0aa4
+#define MT6328_SPK_CON11             0x0aa6
+#define MT6328_SPK_CON12             0x0aa8
+#define MT6328_SPK_CON13             0x0aaa
+#define MT6328_SPK_CON14             0x0aac
+#define MT6328_SPK_CON15             0x0aae
+#define MT6328_SPK_CON16             0x0ab0
+#define MT6328_SPK_ANA_CON0          0x0ab2
+#define MT6328_SPK_ANA_CON1          0x0ab4
+#define MT6328_SPK_ANA_CON3          0x0ab6
+#define MT6328_OTP_CON0              0x0c00
+#define MT6328_OTP_CON1              0x0c02
+#define MT6328_OTP_CON2              0x0c04
+#define MT6328_OTP_CON3              0x0c06
+#define MT6328_OTP_CON4              0x0c08
+#define MT6328_OTP_CON5              0x0c0a
+#define MT6328_OTP_CON6              0x0c0c
+#define MT6328_OTP_CON7              0x0c0e
+#define MT6328_OTP_CON8              0x0c10
+#define MT6328_OTP_CON9              0x0c12
+#define MT6328_OTP_CON10             0x0c14
+#define MT6328_OTP_CON11             0x0c16
+#define MT6328_OTP_CON12             0x0c18
+#define MT6328_OTP_CON13             0x0c1a
+#define MT6328_OTP_CON14             0x0c1c
+#define MT6328_OTP_DOUT_0_15         0x0c1e
+#define MT6328_OTP_DOUT_16_31        0x0c20
+#define MT6328_OTP_DOUT_32_47        0x0c22
+#define MT6328_OTP_DOUT_48_63        0x0c24
+#define MT6328_OTP_DOUT_64_79        0x0c26
+#define MT6328_OTP_DOUT_80_95        0x0c28
+#define MT6328_OTP_DOUT_96_111       0x0c2a
+#define MT6328_OTP_DOUT_112_127      0x0c2c
+#define MT6328_OTP_DOUT_128_143      0x0c2e
+#define MT6328_OTP_DOUT_144_159      0x0c30
+#define MT6328_OTP_DOUT_160_175      0x0c32
+#define MT6328_OTP_DOUT_176_191      0x0c34
+#define MT6328_OTP_DOUT_192_207      0x0c36
+#define MT6328_OTP_DOUT_208_223      0x0c38
+#define MT6328_OTP_DOUT_224_239      0x0c3a
+#define MT6328_OTP_DOUT_240_255      0x0c3c
+#define MT6328_OTP_DOUT_256_271      0x0c3e
+#define MT6328_OTP_DOUT_272_287      0x0c40
+#define MT6328_OTP_DOUT_288_303      0x0c42
+#define MT6328_OTP_DOUT_304_319      0x0c44
+#define MT6328_OTP_DOUT_320_335      0x0c46
+#define MT6328_OTP_DOUT_336_351      0x0c48
+#define MT6328_OTP_DOUT_352_367      0x0c4a
+#define MT6328_OTP_DOUT_368_383      0x0c4c
+#define MT6328_OTP_DOUT_384_399      0x0c4e
+#define MT6328_OTP_DOUT_400_415      0x0c50
+#define MT6328_OTP_DOUT_416_431      0x0c52
+#define MT6328_OTP_DOUT_432_447      0x0c54
+#define MT6328_OTP_DOUT_448_463      0x0c56
+#define MT6328_OTP_DOUT_464_479      0x0c58
+#define MT6328_OTP_DOUT_480_495      0x0c5a
+#define MT6328_OTP_DOUT_496_511      0x0c5c
+#define MT6328_OTP_VAL_0_15          0x0c5e
+#define MT6328_OTP_VAL_16_31         0x0c60
+#define MT6328_OTP_VAL_32_47         0x0c62
+#define MT6328_OTP_VAL_48_63         0x0c64
+#define MT6328_OTP_VAL_64_79         0x0c66
+#define MT6328_OTP_VAL_80_95         0x0c68
+#define MT6328_OTP_VAL_96_111        0x0c6a
+#define MT6328_OTP_VAL_112_127       0x0c6c
+#define MT6328_OTP_VAL_128_143       0x0c6e
+#define MT6328_OTP_VAL_144_159       0x0c70
+#define MT6328_OTP_VAL_160_175       0x0c72
+#define MT6328_OTP_VAL_176_191       0x0c74
+#define MT6328_OTP_VAL_192_207       0x0c76
+#define MT6328_OTP_VAL_208_223       0x0c78
+#define MT6328_OTP_VAL_224_239       0x0c7a
+#define MT6328_OTP_VAL_240_255       0x0c7c
+#define MT6328_OTP_VAL_256_271       0x0c7e
+#define MT6328_OTP_VAL_272_287       0x0c80
+#define MT6328_OTP_VAL_288_303       0x0c82
+#define MT6328_OTP_VAL_304_319       0x0c84
+#define MT6328_OTP_VAL_320_335       0x0c86
+#define MT6328_OTP_VAL_336_351       0x0c88
+#define MT6328_OTP_VAL_352_367       0x0c8a
+#define MT6328_OTP_VAL_368_383       0x0c8c
+#define MT6328_OTP_VAL_384_399       0x0c8e
+#define MT6328_OTP_VAL_400_415       0x0c90
+#define MT6328_OTP_VAL_416_431       0x0c92
+#define MT6328_OTP_VAL_432_447       0x0c94
+#define MT6328_OTP_VAL_448_463       0x0c96
+#define MT6328_OTP_VAL_464_479       0x0c98
+#define MT6328_OTP_VAL_480_495       0x0c9a
+#define MT6328_OTP_VAL_496_511       0x0c9c
+#define MT6328_RTC_MIX_CON0          0x0c9e
+#define MT6328_RTC_MIX_CON1          0x0ca0
+#define MT6328_RTC_MIX_CON2          0x0ca2
+#define MT6328_FGADC_CON0            0x0ca4
+#define MT6328_FGADC_CON1            0x0ca6
+#define MT6328_FGADC_CON2            0x0ca8
+#define MT6328_FGADC_CON3            0x0caa
+#define MT6328_FGADC_CON4            0x0cac
+#define MT6328_FGADC_CON5            0x0cae
+#define MT6328_FGADC_CON6            0x0cb0
+#define MT6328_FGADC_CON7            0x0cb2
+#define MT6328_FGADC_CON8            0x0cb4
+#define MT6328_FGADC_CON9            0x0cb6
+#define MT6328_FGADC_CON10           0x0cb8
+#define MT6328_FGADC_CON11           0x0cba
+#define MT6328_FGADC_CON12           0x0cbc
+#define MT6328_FGADC_CON13           0x0cbe
+#define MT6328_FGADC_CON14           0x0cc0
+#define MT6328_FGADC_CON15           0x0cc2
+#define MT6328_FGADC_CON16           0x0cc4
+#define MT6328_FGADC_CON17           0x0cc6
+#define MT6328_FGADC_CON18           0x0cc8
+#define MT6328_FGADC_CON19           0x0cca
+#define MT6328_FGADC_CON20           0x0ccc
+#define MT6328_FGADC_CON21           0x0cce
+#define MT6328_FGADC_CON22           0x0cd0
+#define MT6328_FGADC_CON23           0x0cd2
+#define MT6328_FGADC_CON24           0x0cd4
+#define MT6328_FGADC_CON25           0x0cd6
+#define MT6328_FGADC_CON26           0x0cd8
+#define MT6328_FGADC_CON27           0x0cda
+#define MT6328_AUDDEC_ANA_CON0       0x0cdc
+#define MT6328_AUDDEC_ANA_CON1       0x0cde
+#define MT6328_AUDDEC_ANA_CON2       0x0ce0
+#define MT6328_AUDDEC_ANA_CON3       0x0ce2
+#define MT6328_AUDDEC_ANA_CON4       0x0ce4
+#define MT6328_AUDDEC_ANA_CON5       0x0ce6
+#define MT6328_AUDDEC_ANA_CON6       0x0ce8
+#define MT6328_AUDDEC_ANA_CON7       0x0cea
+#define MT6328_AUDDEC_ANA_CON8       0x0cec
+#define MT6328_AUDENC_ANA_CON0       0x0cee
+#define MT6328_AUDENC_ANA_CON1       0x0cf0
+#define MT6328_AUDENC_ANA_CON2       0x0cf2
+#define MT6328_AUDENC_ANA_CON3       0x0cf4
+#define MT6328_AUDENC_ANA_CON4       0x0cf6
+#define MT6328_AUDENC_ANA_CON5       0x0cf8
+#define MT6328_AUDENC_ANA_CON6       0x0cfa
+#define MT6328_AUDENC_ANA_CON7       0x0cfc
+#define MT6328_AUDENC_ANA_CON8       0x0cfe
+#define MT6328_AUDENC_ANA_CON9       0x0d00
+#define MT6328_AUDENC_ANA_CON10      0x0d02
+#define MT6328_AUDNCP_CLKDIV_CON0    0x0d04
+#define MT6328_AUDNCP_CLKDIV_CON1    0x0d06
+#define MT6328_AUDNCP_CLKDIV_CON2    0x0d08
+#define MT6328_AUDNCP_CLKDIV_CON3    0x0d0a
+#define MT6328_AUDNCP_CLKDIV_CON4    0x0d0c
+#define MT6328_AUXADC_ADC0           0x0e00
+#define MT6328_AUXADC_ADC1           0x0e02
+#define MT6328_AUXADC_ADC2           0x0e04
+#define MT6328_AUXADC_ADC3           0x0e06
+#define MT6328_AUXADC_ADC4           0x0e08
+#define MT6328_AUXADC_ADC5           0x0e0a
+#define MT6328_AUXADC_ADC6           0x0e0c
+#define MT6328_AUXADC_ADC7           0x0e0e
+#define MT6328_AUXADC_ADC8           0x0e10
+#define MT6328_AUXADC_ADC9           0x0e12
+#define MT6328_AUXADC_ADC10          0x0e14
+#define MT6328_AUXADC_ADC11          0x0e16
+#define MT6328_AUXADC_ADC12          0x0e18
+#define MT6328_AUXADC_ADC13          0x0e1a
+#define MT6328_AUXADC_ADC14          0x0e1c
+#define MT6328_AUXADC_ADC15          0x0e1e
+#define MT6328_AUXADC_ADC16          0x0e20
+#define MT6328_AUXADC_ADC17          0x0e22
+#define MT6328_AUXADC_ADC18          0x0e24
+#define MT6328_AUXADC_ADC19          0x0e26
+#define MT6328_AUXADC_ADC20          0x0e28
+#define MT6328_AUXADC_ADC21          0x0e2a
+#define MT6328_AUXADC_ADC22          0x0e2c
+#define MT6328_AUXADC_ADC23          0x0e2e
+#define MT6328_AUXADC_ADC24          0x0e30
+#define MT6328_AUXADC_ADC25          0x0e32
+#define MT6328_AUXADC_ADC26          0x0e34
+#define MT6328_AUXADC_ADC27          0x0e36
+#define MT6328_AUXADC_ADC28          0x0e38
+#define MT6328_AUXADC_ADC29          0x0e3a
+#define MT6328_AUXADC_ADC30          0x0e3c
+#define MT6328_AUXADC_ADC31          0x0e3e
+#define MT6328_AUXADC_ADC32          0x0e40
+#define MT6328_AUXADC_ADC33          0x0e42
+#define MT6328_AUXADC_BUF0           0x0e44
+#define MT6328_AUXADC_BUF1           0x0e46
+#define MT6328_AUXADC_BUF2           0x0e48
+#define MT6328_AUXADC_BUF3           0x0e4a
+#define MT6328_AUXADC_BUF4           0x0e4c
+#define MT6328_AUXADC_BUF5           0x0e4e
+#define MT6328_AUXADC_BUF6           0x0e50
+#define MT6328_AUXADC_BUF7           0x0e52
+#define MT6328_AUXADC_BUF8           0x0e54
+#define MT6328_AUXADC_BUF9           0x0e56
+#define MT6328_AUXADC_BUF10          0x0e58
+#define MT6328_AUXADC_BUF11          0x0e5a
+#define MT6328_AUXADC_BUF12          0x0e5c
+#define MT6328_AUXADC_BUF13          0x0e5e
+#define MT6328_AUXADC_BUF14          0x0e60
+#define MT6328_AUXADC_BUF15          0x0e62
+#define MT6328_AUXADC_BUF16          0x0e64
+#define MT6328_AUXADC_BUF17          0x0e66
+#define MT6328_AUXADC_BUF18          0x0e68
+#define MT6328_AUXADC_BUF19          0x0e6a
+#define MT6328_AUXADC_BUF20          0x0e6c
+#define MT6328_AUXADC_BUF21          0x0e6e
+#define MT6328_AUXADC_BUF22          0x0e70
+#define MT6328_AUXADC_BUF23          0x0e72
+#define MT6328_AUXADC_BUF24          0x0e74
+#define MT6328_AUXADC_BUF25          0x0e76
+#define MT6328_AUXADC_BUF26          0x0e78
+#define MT6328_AUXADC_BUF27          0x0e7a
+#define MT6328_AUXADC_BUF28          0x0e7c
+#define MT6328_AUXADC_BUF29          0x0e7e
+#define MT6328_AUXADC_BUF30          0x0e80
+#define MT6328_AUXADC_BUF31          0x0e82
+#define MT6328_AUXADC_STA0           0x0e84
+#define MT6328_AUXADC_STA1           0x0e86
+#define MT6328_AUXADC_RQST0          0x0e88
+#define MT6328_AUXADC_RQST0_SET      0x0e8a
+#define MT6328_AUXADC_RQST0_CLR      0x0e8c
+#define MT6328_AUXADC_RQST1          0x0e8e
+#define MT6328_AUXADC_RQST1_SET      0x0e90
+#define MT6328_AUXADC_RQST1_CLR      0x0e92
+#define MT6328_AUXADC_CON0           0x0e94
+#define MT6328_AUXADC_CON0_SET       0x0e96
+#define MT6328_AUXADC_CON0_CLR       0x0e98
+#define MT6328_AUXADC_CON1           0x0e9a
+#define MT6328_AUXADC_CON2           0x0e9c
+#define MT6328_AUXADC_CON3           0x0e9e
+#define MT6328_AUXADC_CON4           0x0ea0
+#define MT6328_AUXADC_CON5           0x0ea2
+#define MT6328_AUXADC_CON6           0x0ea4
+#define MT6328_AUXADC_CON7           0x0ea6
+#define MT6328_AUXADC_CON8           0x0ea8
+#define MT6328_AUXADC_CON9           0x0eaa
+#define MT6328_AUXADC_CON10          0x0eac
+#define MT6328_AUXADC_CON11          0x0eae
+#define MT6328_AUXADC_CON12          0x0eb0
+#define MT6328_AUXADC_CON13          0x0eb2
+#define MT6328_AUXADC_CON14          0x0eb4
+#define MT6328_AUXADC_CON15          0x0eb6
+#define MT6328_AUXADC_CON16          0x0eb8
+#define MT6328_AUXADC_AUTORPT0       0x0eba
+#define MT6328_AUXADC_LBAT0          0x0ebc
+#define MT6328_AUXADC_LBAT1          0x0ebe
+#define MT6328_AUXADC_LBAT2          0x0ec0
+#define MT6328_AUXADC_LBAT3          0x0ec2
+#define MT6328_AUXADC_LBAT4          0x0ec4
+#define MT6328_AUXADC_LBAT5          0x0ec6
+#define MT6328_AUXADC_LBAT6          0x0ec8
+#define MT6328_AUXADC_ACCDET         0x0eca
+#define MT6328_AUXADC_THR0           0x0ecc
+#define MT6328_AUXADC_THR1           0x0ece
+#define MT6328_AUXADC_THR2           0x0ed0
+#define MT6328_AUXADC_THR3           0x0ed2
+#define MT6328_AUXADC_THR4           0x0ed4
+#define MT6328_AUXADC_THR5           0x0ed6
+#define MT6328_AUXADC_THR6           0x0ed8
+#define MT6328_AUXADC_EFUSE0         0x0eda
+#define MT6328_AUXADC_EFUSE1         0x0edc
+#define MT6328_AUXADC_EFUSE2         0x0ede
+#define MT6328_AUXADC_EFUSE3         0x0ee0
+#define MT6328_AUXADC_EFUSE4         0x0ee2
+#define MT6328_AUXADC_EFUSE5         0x0ee4
+#define MT6328_AUXADC_DBG0           0x0ee6
+#define MT6328_AUXADC_IMP0           0x0ee8
+#define MT6328_AUXADC_IMP1           0x0eea
+#define MT6328_AUXADC_VISMPS0_1      0x0eec
+#define MT6328_AUXADC_VISMPS0_2      0x0eee
+#define MT6328_AUXADC_VISMPS0_3      0x0ef0
+#define MT6328_AUXADC_VISMPS0_4      0x0ef2
+#define MT6328_AUXADC_VISMPS0_5      0x0ef4
+#define MT6328_AUXADC_VISMPS0_6      0x0ef6
+#define MT6328_AUXADC_VISMPS0_7      0x0ef8
+#define MT6328_AUXADC_LBAT2_1        0x0efa
+#define MT6328_AUXADC_LBAT2_2        0x0efc
+#define MT6328_AUXADC_LBAT2_3        0x0efe
+#define MT6328_AUXADC_LBAT2_4        0x0f00
+#define MT6328_AUXADC_LBAT2_5        0x0f02
+#define MT6328_AUXADC_LBAT2_6        0x0f04
+#define MT6328_AUXADC_LBAT2_7        0x0f06
+#define MT6328_AUXADC_MDBG_0         0x0f08
+#define MT6328_AUXADC_MDBG_1         0x0f0a
+#define MT6328_AUXADC_MDBG_2         0x0f0c
+#define MT6328_AUXADC_MDRT_0         0x0f0e
+#define MT6328_AUXADC_MDRT_1         0x0f10
+#define MT6328_AUXADC_MDRT_2         0x0f12
+#define MT6328_ACCDET_CON0           0x0f14
+#define MT6328_ACCDET_CON1           0x0f16
+#define MT6328_ACCDET_CON2           0x0f18
+#define MT6328_ACCDET_CON3           0x0f1a
+#define MT6328_ACCDET_CON4           0x0f1c
+#define MT6328_ACCDET_CON5           0x0f1e
+#define MT6328_ACCDET_CON6           0x0f20
+#define MT6328_ACCDET_CON7           0x0f22
+#define MT6328_ACCDET_CON8           0x0f24
+#define MT6328_ACCDET_CON9           0x0f26
+#define MT6328_ACCDET_CON10          0x0f28
+#define MT6328_ACCDET_CON11          0x0f2a
+#define MT6328_ACCDET_CON12          0x0f2c
+#define MT6328_ACCDET_CON13          0x0f2e
+#define MT6328_ACCDET_CON14          0x0f30
+#define MT6328_ACCDET_CON15          0x0f32
+#define MT6328_ACCDET_CON16          0x0f34
+#define MT6328_ACCDET_CON17          0x0f36
+#define MT6328_ACCDET_CON18          0x0f38
+#define MT6328_ACCDET_CON19          0x0f3a
+#define MT6328_ACCDET_CON20          0x0f3c
+#define MT6328_ACCDET_CON21          0x0f3e
+#define MT6328_ACCDET_CON22          0x0f40
+#define MT6328_ACCDET_CON23          0x0f42
+#define MT6328_ACCDET_CON24          0x0f44
+#define MT6328_ACCDET_CON25          0x0f46
+#define MT6328_CHR_CON0              0x0f48
+#define MT6328_CHR_CON1              0x0f4a
+#define MT6328_CHR_CON2              0x0f4c
+#define MT6328_CHR_CON3              0x0f4e
+#define MT6328_CHR_CON4              0x0f50
+#define MT6328_CHR_CON5              0x0f52
+#define MT6328_CHR_CON6              0x0f54
+#define MT6328_CHR_CON7              0x0f56
+#define MT6328_CHR_CON8              0x0f58
+#define MT6328_CHR_CON9              0x0f5a
+#define MT6328_CHR_CON10             0x0f5c
+#define MT6328_CHR_CON11             0x0f5e
+#define MT6328_CHR_CON12             0x0f60
+#define MT6328_CHR_CON13             0x0f62
+#define MT6328_CHR_CON14             0x0f64
+#define MT6328_CHR_CON15             0x0f66
+#define MT6328_CHR_CON16             0x0f68
+#define MT6328_CHR_CON17             0x0f6a
+#define MT6328_CHR_CON18             0x0f6c
+#define MT6328_CHR_CON19             0x0f6e
+#define MT6328_CHR_CON20             0x0f70
+#define MT6328_CHR_CON21             0x0f72
+#define MT6328_CHR_CON22             0x0f74
+#define MT6328_CHR_CON23             0x0f76
+#define MT6328_CHR_CON24             0x0f78
+#define MT6328_CHR_CON25             0x0f7a
+#define MT6328_CHR_CON26             0x0f7c
+#define MT6328_CHR_CON27             0x0f7e
+#define MT6328_CHR_CON28             0x0f80
+#define MT6328_CHR_CON29             0x0f82
+#define MT6328_CHR_CON30             0x0f84
+#define MT6328_CHR_CON31             0x0f86
+#define MT6328_CHR_CON32             0x0f88
+#define MT6328_CHR_CON33             0x0f8a
+#define MT6328_CHR_CON34             0x0f8c
+#define MT6328_CHR_CON35             0x0f8e
+#define MT6328_CHR_CON36             0x0f90
+#define MT6328_CHR_CON37             0x0f92
+#define MT6328_CHR_CON38             0x0f94
+#define MT6328_CHR_CON39             0x0f96
+#define MT6328_CHR_CON40             0x0f98
+#define MT6328_CHR_CON41             0x0f9a
+#define MT6328_CHR_CON42             0x0f9c
+#define MT6328_BATON_CON0            0x0f9e
+#define MT6328_CHR_CON43             0x0fa0
+#define MT6328_EOSC_CALI_CON0        0x0faa
+#define MT6328_EOSC_CALI_CON1        0x0fac
+#define MT6328_VRTC_PWM_CON0         0x0fae
+
+#endif /* __MFD_MT6328_REGISTERS_H__ */
diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h
index 627487e26287..b774c3a4bb62 100644
--- a/include/linux/mfd/mt6397/core.h
+++ b/include/linux/mfd/mt6397/core.h
@@ -12,6 +12,7 @@
 
 enum chip_id {
 	MT6323_CHIP_ID = 0x23,
+	MT6328_CHIP_ID = 0x30,
 	MT6331_CHIP_ID = 0x20,
 	MT6332_CHIP_ID = 0x20,
 	MT6357_CHIP_ID = 0x57,
@@ -65,11 +66,11 @@ struct mt6397_chip {
 	int irq;
 	struct irq_domain *irq_domain;
 	struct mutex irqlock;
-	u16 wake_mask[2];
-	u16 irq_masks_cur[2];
-	u16 irq_masks_cache[2];
-	u16 int_con[2];
-	u16 int_status[2];
+	u16 wake_mask[3];
+	u16 irq_masks_cur[3];
+	u16 irq_masks_cache[3];
+	u16 int_con[3];
+	u16 int_status[3];
 	u16 chip_id;
 	void *irq_data;
 };
-- 
cgit v1.2.3


From f187b9bf1a639090893c31030ddb60f9beae23f0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 30 Oct 2024 06:18:52 +0100
Subject: block: remove bio_add_zone_append_page

This is only used by the nvmet zns passthrough code, which can trivially
just use bio_add_pc_page and do the sanity check for the max zone append
limit itself.

All future zoned file systems should follow the btrfs lead and let the
upper layers fill up bios unlimited by hardware constraints and split
them to the limits in the I/O submission handler.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Link: https://lore.kernel.org/r/20241030051859.280923-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               | 33 ---------------------------------
 drivers/nvme/target/zns.c | 21 +++++++++++++--------
 include/linux/bio.h       |  2 --
 3 files changed, 13 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 6a60d62a529d..daceb0a5c1d7 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1064,39 +1064,6 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio,
 }
 EXPORT_SYMBOL(bio_add_pc_page);
 
-/**
- * bio_add_zone_append_page - attempt to add page to zone-append bio
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
- * for a zone-append request. This can fail for a number of reasons, such as the
- * bio being full or the target block device is not a zoned block device or
- * other limitations of the target block device. The target block device must
- * allow bio's up to PAGE_SIZE, so it is always possible to add a single page
- * to an empty bio.
- *
- * Returns: number of bytes added to the bio, or 0 in case of a failure.
- */
-int bio_add_zone_append_page(struct bio *bio, struct page *page,
-			     unsigned int len, unsigned int offset)
-{
-	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-	bool same_page = false;
-
-	if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
-		return 0;
-
-	if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev)))
-		return 0;
-
-	return bio_add_hw_page(q, bio, page, len, offset,
-			       queue_max_zone_append_sectors(q), &same_page);
-}
-EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
-
 /**
  * __bio_add_page - add page(s) to a bio in a new segment
  * @bio: destination bio
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
index af9e13be7678..3aef35b05111 100644
--- a/drivers/nvme/target/zns.c
+++ b/drivers/nvme/target/zns.c
@@ -537,6 +537,7 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
 	u16 status = NVME_SC_SUCCESS;
 	unsigned int total_len = 0;
 	struct scatterlist *sg;
+	u32 data_len = nvmet_rw_data_len(req);
 	struct bio *bio;
 	int sg_cnt;
 
@@ -544,6 +545,13 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
 	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
 		return;
 
+	if (data_len >
+	    bdev_max_zone_append_sectors(req->ns->bdev) << SECTOR_SHIFT) {
+		req->error_loc = offsetof(struct nvme_rw_command, length);
+		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		goto out;
+	}
+
 	if (!req->sg_cnt) {
 		nvmet_req_complete(req, 0);
 		return;
@@ -576,20 +584,17 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
 		bio->bi_opf |= REQ_FUA;
 
 	for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) {
-		struct page *p = sg_page(sg);
-		unsigned int l = sg->length;
-		unsigned int o = sg->offset;
-		unsigned int ret;
+		unsigned int len = sg->length;
 
-		ret = bio_add_zone_append_page(bio, p, l, o);
-		if (ret != sg->length) {
+		if (bio_add_pc_page(bdev_get_queue(bio->bi_bdev), bio,
+				sg_page(sg), len, sg->offset) != len) {
 			status = NVME_SC_INTERNAL;
 			goto out_put_bio;
 		}
-		total_len += sg->length;
+		total_len += len;
 	}
 
-	if (total_len != nvmet_rw_data_len(req)) {
+	if (total_len != data_len) {
 		status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
 		goto out_put_bio;
 	}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index faceadb040f9..4a1bf43ca53d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -418,8 +418,6 @@ bool __must_check bio_add_folio(struct bio *bio, struct folio *folio,
 				size_t len, size_t off);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
-int bio_add_zone_append_page(struct bio *bio, struct page *page,
-			     unsigned int len, unsigned int offset);
 void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
-- 
cgit v1.2.3


From 16aed0a6520ba01b7d22c32e193fc1ec674f92d4 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Mon, 21 Oct 2024 11:45:06 -0400
Subject: i3c: master: Replace hard code 2 with macro I3C_ADDR_SLOT_STATUS_BITS

Replace the hardcoded value 2, which indicates 2 bits for I3C address
status, with the predefined macro I3C_ADDR_SLOT_STATUS_BITS.

Improve maintainability and extensibility of the code.

Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20241021-i3c_dts_assign-v8-1-4098b8bde01e@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master.c       | 4 ++--
 include/linux/i3c/master.h | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 6f3eb710a75d..dcf8d23c5941 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -348,7 +348,7 @@ static enum i3c_addr_slot_status
 i3c_bus_get_addr_slot_status(struct i3c_bus *bus, u16 addr)
 {
 	unsigned long status;
-	int bitpos = addr * 2;
+	int bitpos = addr * I3C_ADDR_SLOT_STATUS_BITS;
 
 	if (addr > I2C_MAX_ADDR)
 		return I3C_ADDR_SLOT_RSVD;
@@ -362,7 +362,7 @@ i3c_bus_get_addr_slot_status(struct i3c_bus *bus, u16 addr)
 static void i3c_bus_set_addr_slot_status(struct i3c_bus *bus, u16 addr,
 					 enum i3c_addr_slot_status status)
 {
-	int bitpos = addr * 2;
+	int bitpos = addr * I3C_ADDR_SLOT_STATUS_BITS;
 	unsigned long *ptr;
 
 	if (addr > I2C_MAX_ADDR)
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 2a1ed05d5782..2100547b2d8d 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -313,6 +313,8 @@ enum i3c_addr_slot_status {
 	I3C_ADDR_SLOT_STATUS_MASK = 3,
 };
 
+#define I3C_ADDR_SLOT_STATUS_BITS 2
+
 /**
  * struct i3c_bus - I3C bus object
  * @cur_master: I3C master currently driving the bus. Since I3C is multi-master
@@ -354,7 +356,7 @@ enum i3c_addr_slot_status {
 struct i3c_bus {
 	struct i3c_dev_desc *cur_master;
 	int id;
-	unsigned long addrslots[((I2C_MAX_ADDR + 1) * 2) / BITS_PER_LONG];
+	unsigned long addrslots[((I2C_MAX_ADDR + 1) * I3C_ADDR_SLOT_STATUS_BITS) / BITS_PER_LONG];
 	enum i3c_bus_mode mode;
 	struct {
 		unsigned long i3c;
-- 
cgit v1.2.3


From 2f552fa280590e61bd3dbe66a7b54b99caa642a4 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Mon, 21 Oct 2024 11:45:07 -0400
Subject: i3c: master: Extend address status bit to 4 and add
 I3C_ADDR_SLOT_EXT_DESIRED
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend the address status bit to 4 and introduce the
I3C_ADDR_SLOT_EXT_DESIRED macro to indicate that a device prefers a
specific address. This is generally set by the 'assigned-address' in the
device tree source (dts) file.

 ┌────┬─────────────┬───┬─────────┬───┐
 │S/Sr│ 7'h7E RnW=0 │ACK│ ENTDAA  │ T ├────┐
 └────┴─────────────┴───┴─────────┴───┘    │
 ┌─────────────────────────────────────────┘
 │  ┌──┬─────────────┬───┬─────────────────┬────────────────┬───┬─────────┐
 └─►│Sr│7'h7E RnW=1  │ACK│48bit UID BCR DCR│Assign 7bit Addr│PAR│ ACK/NACK│
    └──┴─────────────┴───┴─────────────────┴────────────────┴───┴─────────┘

Some master controllers (such as HCI) need to prepare the entire above
transaction before sending it out to the I3C bus. This means that a 7-bit
dynamic address needs to be allocated before knowing the target device's
UID information.

However, some I3C targets may request specific addresses (called as
"init_dyn_addr"), which is typically specified by the DT-'s
assigned-address property. Lower addresses having higher IBI priority. If
it is available, i3c_bus_get_free_addr() preferably return a free address
that is not in the list of desired addresses (called as "init_dyn_addr").
This allows the device with the "init_dyn_addr" to switch to its
"init_dyn_addr" when it hot-joins the I3C bus. Otherwise, if the
"init_dyn_addr" is already in use by another I3C device, the target device
will not be able to switch to its desired address.

If the previous step fails, fallback returning one of the remaining
unassigned address, regardless of its state in the desired list.

Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20241021-i3c_dts_assign-v8-2-4098b8bde01e@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master.c       | 65 ++++++++++++++++++++++++++++++++++++++--------
 include/linux/i3c/master.h |  7 +++--
 2 files changed, 59 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index dcf8d23c5941..1bf9cb138f77 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -345,7 +345,7 @@ const struct bus_type i3c_bus_type = {
 EXPORT_SYMBOL_GPL(i3c_bus_type);
 
 static enum i3c_addr_slot_status
-i3c_bus_get_addr_slot_status(struct i3c_bus *bus, u16 addr)
+i3c_bus_get_addr_slot_status_mask(struct i3c_bus *bus, u16 addr, u32 mask)
 {
 	unsigned long status;
 	int bitpos = addr * I3C_ADDR_SLOT_STATUS_BITS;
@@ -356,11 +356,17 @@ i3c_bus_get_addr_slot_status(struct i3c_bus *bus, u16 addr)
 	status = bus->addrslots[bitpos / BITS_PER_LONG];
 	status >>= bitpos % BITS_PER_LONG;
 
-	return status & I3C_ADDR_SLOT_STATUS_MASK;
+	return status & mask;
 }
 
-static void i3c_bus_set_addr_slot_status(struct i3c_bus *bus, u16 addr,
-					 enum i3c_addr_slot_status status)
+static enum i3c_addr_slot_status
+i3c_bus_get_addr_slot_status(struct i3c_bus *bus, u16 addr)
+{
+	return i3c_bus_get_addr_slot_status_mask(bus, addr, I3C_ADDR_SLOT_STATUS_MASK);
+}
+
+static void i3c_bus_set_addr_slot_status_mask(struct i3c_bus *bus, u16 addr,
+					      enum i3c_addr_slot_status status, u32 mask)
 {
 	int bitpos = addr * I3C_ADDR_SLOT_STATUS_BITS;
 	unsigned long *ptr;
@@ -369,9 +375,14 @@ static void i3c_bus_set_addr_slot_status(struct i3c_bus *bus, u16 addr,
 		return;
 
 	ptr = bus->addrslots + (bitpos / BITS_PER_LONG);
-	*ptr &= ~((unsigned long)I3C_ADDR_SLOT_STATUS_MASK <<
-						(bitpos % BITS_PER_LONG));
-	*ptr |= (unsigned long)status << (bitpos % BITS_PER_LONG);
+	*ptr &= ~((unsigned long)mask << (bitpos % BITS_PER_LONG));
+	*ptr |= ((unsigned long)status & mask) << (bitpos % BITS_PER_LONG);
+}
+
+static void i3c_bus_set_addr_slot_status(struct i3c_bus *bus, u16 addr,
+					 enum i3c_addr_slot_status status)
+{
+	i3c_bus_set_addr_slot_status_mask(bus, addr, status, I3C_ADDR_SLOT_STATUS_MASK);
 }
 
 static bool i3c_bus_dev_addr_is_avail(struct i3c_bus *bus, u8 addr)
@@ -383,13 +394,44 @@ static bool i3c_bus_dev_addr_is_avail(struct i3c_bus *bus, u8 addr)
 	return status == I3C_ADDR_SLOT_FREE;
 }
 
+/*
+ * ┌────┬─────────────┬───┬─────────┬───┐
+ * │S/Sr│ 7'h7E RnW=0 │ACK│ ENTDAA  │ T ├────┐
+ * └────┴─────────────┴───┴─────────┴───┘    │
+ * ┌─────────────────────────────────────────┘
+ * │  ┌──┬─────────────┬───┬─────────────────┬────────────────┬───┬─────────┐
+ * └─►│Sr│7'h7E RnW=1  │ACK│48bit UID BCR DCR│Assign 7bit Addr│PAR│ ACK/NACK│
+ *    └──┴─────────────┴───┴─────────────────┴────────────────┴───┴─────────┘
+ * Some master controllers (such as HCI) need to prepare the entire above transaction before
+ * sending it out to the I3C bus. This means that a 7-bit dynamic address needs to be allocated
+ * before knowing the target device's UID information.
+ *
+ * However, some I3C targets may request specific addresses (called as "init_dyn_addr"), which is
+ * typically specified by the DT-'s assigned-address property. Lower addresses having higher IBI
+ * priority. If it is available, i3c_bus_get_free_addr() preferably return a free address that is
+ * not in the list of desired addresses (called as "init_dyn_addr"). This allows the device with
+ * the "init_dyn_addr" to switch to its "init_dyn_addr" when it hot-joins the I3C bus. Otherwise,
+ * if the "init_dyn_addr" is already in use by another I3C device, the target device will not be
+ * able to switch to its desired address.
+ *
+ * If the previous step fails, fallback returning one of the remaining unassigned address,
+ * regardless of its state in the desired list.
+ */
 static int i3c_bus_get_free_addr(struct i3c_bus *bus, u8 start_addr)
 {
 	enum i3c_addr_slot_status status;
 	u8 addr;
 
 	for (addr = start_addr; addr < I3C_MAX_ADDR; addr++) {
-		status = i3c_bus_get_addr_slot_status(bus, addr);
+		status = i3c_bus_get_addr_slot_status_mask(bus, addr,
+							   I3C_ADDR_SLOT_EXT_STATUS_MASK);
+		if (status == I3C_ADDR_SLOT_FREE)
+			return addr;
+	}
+
+	for (addr = start_addr; addr < I3C_MAX_ADDR; addr++) {
+		status = i3c_bus_get_addr_slot_status_mask(bus, addr,
+							   I3C_ADDR_SLOT_STATUS_MASK);
 		if (status == I3C_ADDR_SLOT_FREE)
 			return addr;
 	}
@@ -1918,9 +1960,10 @@ static int i3c_master_bus_init(struct i3c_master_controller *master)
 			goto err_rstdaa;
 		}
 
-		i3c_bus_set_addr_slot_status(&master->bus,
-					     i3cboardinfo->init_dyn_addr,
-					     I3C_ADDR_SLOT_I3C_DEV);
+		i3c_bus_set_addr_slot_status_mask(&master->bus,
+						  i3cboardinfo->init_dyn_addr,
+						  I3C_ADDR_SLOT_I3C_DEV | I3C_ADDR_SLOT_EXT_DESIRED,
+						  I3C_ADDR_SLOT_EXT_STATUS_MASK);
 
 		/*
 		 * Only try to create/attach devices that have a static
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 2100547b2d8d..6e5328c6c6af 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -298,7 +298,8 @@ enum i3c_open_drain_speed {
  * @I3C_ADDR_SLOT_I2C_DEV: address is assigned to an I2C device
  * @I3C_ADDR_SLOT_I3C_DEV: address is assigned to an I3C device
  * @I3C_ADDR_SLOT_STATUS_MASK: address slot mask
- *
+ * @I3C_ADDR_SLOT_EXT_DESIRED: the bitmask represents addresses that are preferred by some devices,
+ *			       such as the "assigned-address" property in a device tree source.
  * On an I3C bus, addresses are assigned dynamically, and we need to know which
  * addresses are free to use and which ones are already assigned.
  *
@@ -311,9 +312,11 @@ enum i3c_addr_slot_status {
 	I3C_ADDR_SLOT_I2C_DEV,
 	I3C_ADDR_SLOT_I3C_DEV,
 	I3C_ADDR_SLOT_STATUS_MASK = 3,
+	I3C_ADDR_SLOT_EXT_STATUS_MASK = 7,
+	I3C_ADDR_SLOT_EXT_DESIRED = BIT(2),
 };
 
-#define I3C_ADDR_SLOT_STATUS_BITS 2
+#define I3C_ADDR_SLOT_STATUS_BITS 4
 
 /**
  * struct i3c_bus - I3C bus object
-- 
cgit v1.2.3


From a911bad094b010e276f072fe9a599b66e59ed5fe Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 29 Oct 2024 19:14:25 +0000
Subject: dql: annotate data-races around dql->last_obj_cnt

dql->last_obj_cnt is read/written from different contexts,
without any lock synchronization.

Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Link: https://patch.msgid.link/20241029191425.2519085-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dynamic_queue_limits.h | 2 +-
 lib/dynamic_queue_limits.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h
index 281298e77a15..808b1a5102e7 100644
--- a/include/linux/dynamic_queue_limits.h
+++ b/include/linux/dynamic_queue_limits.h
@@ -127,7 +127,7 @@ static inline void dql_queued(struct dql *dql, unsigned int count)
 	if (WARN_ON_ONCE(count > DQL_MAX_OBJECT))
 		return;
 
-	dql->last_obj_cnt = count;
+	WRITE_ONCE(dql->last_obj_cnt, count);
 
 	/* We want to force a write first, so that cpu do not attempt
 	 * to get cache line containing last_obj_cnt, num_queued, adj_limit
diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
index e49deddd3de9..c1b7638a594a 100644
--- a/lib/dynamic_queue_limits.c
+++ b/lib/dynamic_queue_limits.c
@@ -179,7 +179,7 @@ void dql_completed(struct dql *dql, unsigned int count)
 
 	dql->adj_limit = limit + completed;
 	dql->prev_ovlimit = ovlimit;
-	dql->prev_last_obj_cnt = dql->last_obj_cnt;
+	dql->prev_last_obj_cnt = READ_ONCE(dql->last_obj_cnt);
 	dql->num_completed = completed;
 	dql->prev_num_queued = num_queued;
 
-- 
cgit v1.2.3


From d3eeb1ac0b99cdcd99420fb270041da946d2d360 Mon Sep 17 00:00:00 2001
From: Angelo Dureghello <adureghello@baylibre.com>
Date: Mon, 28 Oct 2024 22:45:30 +0100
Subject: iio: backend: extend features

Extend backend features with new calls needed later on this
patchset from axi version of ad3552r.

The follwoing calls are added:

iio_backend_ddr_enable()
	enable ddr bus transfer
iio_backend_ddr_disable()
	disable ddr bus transfer
iio_backend_data_stream_enable()
	enable data stream over bus interface
iio_backend_data_stream_disable()
	disable data stream over bus interface
iio_backend_data_transfer_addr()
	define the target register address where the DAC sample
	will be written.

Reviewed-by: Nuno Sa <nuno.sa@analog.com>
Signed-off-by: Angelo Dureghello <adureghello@baylibre.com>
Reviewed-by: David Lechner <dlechner@baylibre.com>
Link: https://patch.msgid.link/20241028-wip-bl-ad3552r-axi-v0-iio-testing-v9-3-f6960b4f9719@kernel-space.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-backend.c | 78 ++++++++++++++++++++++++++++++++++++++
 include/linux/iio/backend.h        | 17 +++++++++
 2 files changed, 95 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-backend.c b/drivers/iio/industrialio-backend.c
index 20b3b5212da7..81f3d24f0c50 100644
--- a/drivers/iio/industrialio-backend.c
+++ b/drivers/iio/industrialio-backend.c
@@ -718,6 +718,84 @@ static int __devm_iio_backend_get(struct device *dev, struct iio_backend *back)
 	return 0;
 }
 
+/**
+ * iio_backend_ddr_enable - Enable interface DDR (Double Data Rate) mode
+ * @back: Backend device
+ *
+ * Enable DDR, data is generated by the IP at each front (raising and falling)
+ * of the bus clock signal.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_ddr_enable(struct iio_backend *back)
+{
+	return iio_backend_op_call(back, ddr_enable);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_ddr_enable, IIO_BACKEND);
+
+/**
+ * iio_backend_ddr_disable - Disable interface DDR (Double Data Rate) mode
+ * @back: Backend device
+ *
+ * Disable DDR, setting into SDR mode (Single Data Rate).
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_ddr_disable(struct iio_backend *back)
+{
+	return iio_backend_op_call(back, ddr_disable);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_ddr_disable, IIO_BACKEND);
+
+/**
+ * iio_backend_data_stream_enable - Enable data stream
+ * @back: Backend device
+ *
+ * Enable data stream over the bus interface.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_data_stream_enable(struct iio_backend *back)
+{
+	return iio_backend_op_call(back, data_stream_enable);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_data_stream_enable, IIO_BACKEND);
+
+/**
+ * iio_backend_data_stream_disable - Disable data stream
+ * @back: Backend device
+ *
+ * Disable data stream over the bus interface.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_data_stream_disable(struct iio_backend *back)
+{
+	return iio_backend_op_call(back, data_stream_disable);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_data_stream_disable, IIO_BACKEND);
+
+/**
+ * iio_backend_data_transfer_addr - Set data address.
+ * @back: Backend device
+ * @address: Data register address
+ *
+ * Some devices may need to inform the backend about an address
+ * where to read or write the data.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_data_transfer_addr(struct iio_backend *back, u32 address)
+{
+	return iio_backend_op_call(back, data_transfer_addr, address);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_data_transfer_addr, IIO_BACKEND);
+
 static struct iio_backend *__devm_iio_backend_fwnode_get(struct device *dev, const char *name,
 							 struct fwnode_handle *fwnode)
 {
diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index 37d56914d485..10be00f3b120 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -14,12 +14,14 @@ struct iio_dev;
 enum iio_backend_data_type {
 	IIO_BACKEND_TWOS_COMPLEMENT,
 	IIO_BACKEND_OFFSET_BINARY,
+	IIO_BACKEND_DATA_UNSIGNED,
 	IIO_BACKEND_DATA_TYPE_MAX
 };
 
 enum iio_backend_data_source {
 	IIO_BACKEND_INTERNAL_CONTINUOUS_WAVE,
 	IIO_BACKEND_EXTERNAL,
+	IIO_BACKEND_INTERNAL_RAMP_16BIT,
 	IIO_BACKEND_DATA_SOURCE_MAX
 };
 
@@ -89,6 +91,11 @@ enum iio_backend_sample_trigger {
  * @read_raw: Read a channel attribute from a backend device
  * @debugfs_print_chan_status: Print channel status into a buffer.
  * @debugfs_reg_access: Read or write register value of backend.
+ * @ddr_enable: Enable interface DDR (Double Data Rate) mode.
+ * @ddr_disable: Disable interface DDR (Double Data Rate) mode.
+ * @data_stream_enable: Enable data stream.
+ * @data_stream_disable: Disable data stream.
+ * @data_transfer_addr: Set data address.
  **/
 struct iio_backend_ops {
 	int (*enable)(struct iio_backend *back);
@@ -129,6 +136,11 @@ struct iio_backend_ops {
 					 size_t len);
 	int (*debugfs_reg_access)(struct iio_backend *back, unsigned int reg,
 				  unsigned int writeval, unsigned int *readval);
+	int (*ddr_enable)(struct iio_backend *back);
+	int (*ddr_disable)(struct iio_backend *back);
+	int (*data_stream_enable)(struct iio_backend *back);
+	int (*data_stream_disable)(struct iio_backend *back);
+	int (*data_transfer_addr)(struct iio_backend *back, u32 address);
 };
 
 /**
@@ -164,6 +176,11 @@ int iio_backend_data_sample_trigger(struct iio_backend *back,
 int devm_iio_backend_request_buffer(struct device *dev,
 				    struct iio_backend *back,
 				    struct iio_dev *indio_dev);
+int iio_backend_ddr_enable(struct iio_backend *back);
+int iio_backend_ddr_disable(struct iio_backend *back);
+int iio_backend_data_stream_enable(struct iio_backend *back);
+int iio_backend_data_stream_disable(struct iio_backend *back);
+int iio_backend_data_transfer_addr(struct iio_backend *back, u32 address);
 ssize_t iio_backend_ext_info_set(struct iio_dev *indio_dev, uintptr_t private,
 				 const struct iio_chan_spec *chan,
 				 const char *buf, size_t len);
-- 
cgit v1.2.3


From e9f0a363473585a0f51b1b61a9bb15a63808d6ea Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Oct 2024 13:01:12 +0200
Subject: tracing: Remove TRACE_FLAG_IRQS_NOSUPPORT

It was possible to enable tracing with no IRQ tracing support. The
tracing infrastructure would then record TRACE_FLAG_IRQS_NOSUPPORT as
the only tracing flag and show an 'X' in the output.

The last user of this feature was PPC32 which managed to implement it
during PowerPC merge in 2009. Since then, it was unused and the PPC32
dependency was finally removed in commit 0ea5ee035133a ("tracing: Remove
PPC32 wart from config TRACING_SUPPORT").
Since the PowerPC merge the code behind !CONFIG_TRACE_IRQFLAGS_SUPPORT
with TRACING enabled can no longer be selected used and the 'X' is not
displayed or recorded.

Remove the CONFIG_TRACE_IRQFLAGS_SUPPORT from the tracing code. Remove
TRACE_FLAG_IRQS_NOSUPPORT.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20241022110112.XJI8I9T2@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.rst |  3 ---
 include/linux/trace_events.h   | 13 -------------
 kernel/trace/trace_output.c    |  1 -
 3 files changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 4073ca48af4a..74d5bd801b1a 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -1031,9 +1031,6 @@ explains which is which.
   CPU#: The CPU which the process was running on.
 
   irqs-off: 'd' interrupts are disabled. '.' otherwise.
-	.. caution:: If the architecture does not support a way to
-		read the irq flags variable, an 'X' will always
-		be printed here.
 
   need-resched:
 	- 'N' both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED is set,
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index f8f2e52653df..016b29a56c87 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -184,7 +184,6 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status);
 
 enum trace_flag_type {
 	TRACE_FLAG_IRQS_OFF		= 0x01,
-	TRACE_FLAG_IRQS_NOSUPPORT	= 0x02,
 	TRACE_FLAG_NEED_RESCHED		= 0x04,
 	TRACE_FLAG_HARDIRQ		= 0x08,
 	TRACE_FLAG_SOFTIRQ		= 0x10,
@@ -193,7 +192,6 @@ enum trace_flag_type {
 	TRACE_FLAG_BH_OFF		= 0x80,
 };
 
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
 {
 	unsigned int irq_status = irqs_disabled_flags(irqflags) ?
@@ -207,17 +205,6 @@ static inline unsigned int tracing_gen_ctx(void)
 	local_save_flags(irqflags);
 	return tracing_gen_ctx_flags(irqflags);
 }
-#else
-
-static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
-{
-	return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT);
-}
-static inline unsigned int tracing_gen_ctx(void)
-{
-	return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT);
-}
-#endif
 
 static inline unsigned int tracing_gen_ctx_dec(void)
 {
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 868f2f912f28..2ee6613dce6d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -460,7 +460,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 		(entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' :
 		(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
 		bh_off ? 'b' :
-		(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
 		'.';
 
 	switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
-- 
cgit v1.2.3


From a9cfb8778c43fc473ae16cddb6e9611705721b31 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Thu, 31 Oct 2024 11:20:53 -0400
Subject: tracing: Introduce tracepoint extended structure

Shrink the struct tracepoint size from 80 bytes to 72 bytes on x86-64 by
moving the (typically NULL) regfunc/unregfunc pointers to an extended
structure.

Tested-by: Jordan Rife <jrife@google.com>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jordan Rife <jrife@google.com>
Cc: linux-trace-kernel@vger.kernel.org
Link: https://lore.kernel.org/20241031152056.744137-2-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint-defs.h |  8 ++++++--
 include/linux/tracepoint.h      | 19 +++++++++++++------
 kernel/tracepoint.c             |  9 ++++-----
 3 files changed, 23 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index 60a6e8314d4c..967c08d9da84 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -29,6 +29,11 @@ struct tracepoint_func {
 	int prio;
 };
 
+struct tracepoint_ext {
+	int (*regfunc)(void);
+	void (*unregfunc)(void);
+};
+
 struct tracepoint {
 	const char *name;		/* Tracepoint name */
 	struct static_key_false key;
@@ -36,9 +41,8 @@ struct tracepoint {
 	void *static_call_tramp;
 	void *iterator;
 	void *probestub;
-	int (*regfunc)(void);
-	void (*unregfunc)(void);
 	struct tracepoint_func __rcu *funcs;
+	struct tracepoint_ext *ext;
 };
 
 #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 0dc67fad706c..862ab49177a4 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -302,7 +302,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  * structures, so we create an array of pointers that will be used for iteration
  * on the tracepoints.
  */
-#define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args)		\
+#define __DEFINE_TRACE_EXT(_name, _ext, proto, args)			\
 	static const char __tpstrtab_##_name[]				\
 	__section("__tracepoints_strings") = #_name;			\
 	extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name);	\
@@ -316,9 +316,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		.static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \
 		.iterator = &__traceiter_##_name,			\
 		.probestub = &__probestub_##_name,			\
-		.regfunc = _reg,					\
-		.unregfunc = _unreg,					\
-		.funcs = NULL };					\
+		.funcs = NULL,						\
+		.ext = _ext,						\
+	};								\
 	__TRACEPOINT_ENTRY(_name);					\
 	int __traceiter_##_name(void *__data, proto)			\
 	{								\
@@ -341,8 +341,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	}								\
 	DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);
 
-#define DEFINE_TRACE(name, proto, args)		\
-	DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args));
+#define DEFINE_TRACE_FN(_name, _reg, _unreg, _proto, _args)		\
+	static struct tracepoint_ext __tracepoint_ext_##_name = {	\
+		.regfunc = _reg,					\
+		.unregfunc = _unreg,					\
+	};								\
+	__DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args));
+
+#define DEFINE_TRACE(_name, _proto, _args)				\
+	__DEFINE_TRACE_EXT(_name, NULL, PARAMS(_proto), PARAMS(_args));
 
 #define EXPORT_TRACEPOINT_SYMBOL_GPL(name)				\
 	EXPORT_SYMBOL_GPL(__tracepoint_##name);				\
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 6474e2cf22c9..5658dc92f5b5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -278,8 +278,8 @@ static int tracepoint_add_func(struct tracepoint *tp,
 	struct tracepoint_func *old, *tp_funcs;
 	int ret;
 
-	if (tp->regfunc && !static_key_enabled(&tp->key)) {
-		ret = tp->regfunc();
+	if (tp->ext && tp->ext->regfunc && !static_key_enabled(&tp->key)) {
+		ret = tp->ext->regfunc();
 		if (ret < 0)
 			return ret;
 	}
@@ -362,9 +362,8 @@ static int tracepoint_remove_func(struct tracepoint *tp,
 	switch (nr_func_state(tp_funcs)) {
 	case TP_FUNC_0:		/* 1->0 */
 		/* Removed last function */
-		if (tp->unregfunc && static_key_enabled(&tp->key))
-			tp->unregfunc();
-
+		if (tp->ext && tp->ext->unregfunc && static_key_enabled(&tp->key))
+			tp->ext->unregfunc();
 		static_branch_disable(&tp->key);
 		/* Set iterator static call */
 		tracepoint_update_call(tp, tp_funcs);
-- 
cgit v1.2.3


From 654ced4a13774e5aec4d9466c97d22df7cb1e60b Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Thu, 31 Oct 2024 11:20:54 -0400
Subject: tracing: Introduce tracepoint_is_faultable()

Introduce a "faultable" flag within the extended structure to know
whether a tracepoint needs rcu tasks trace grace period before reclaim.
This can be queried using tracepoint_is_faultable().

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Jordan Rife <jrife@google.com>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jordan Rife <jrife@google.com>
Cc: linux-trace-kernel@vger.kernel.org
Link: https://lore.kernel.org/20241031152056.744137-3-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint-defs.h |  2 ++
 include/linux/tracepoint.h      | 24 ++++++++++++++++++++++++
 include/trace/define_trace.h    |  2 +-
 3 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index 967c08d9da84..aebf0571c736 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -32,6 +32,8 @@ struct tracepoint_func {
 struct tracepoint_ext {
 	int (*regfunc)(void);
 	void (*unregfunc)(void);
+	/* Flags. */
+	unsigned int faultable:1;
 };
 
 struct tracepoint {
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 862ab49177a4..906f3091d23d 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -104,6 +104,12 @@ void for_each_tracepoint_in_module(struct module *mod,
  * tracepoint_synchronize_unregister must be called between the last tracepoint
  * probe unregistration and the end of module exit to make sure there is no
  * caller executing a probe when it is freed.
+ *
+ * An alternative is to use the following for batch reclaim associated
+ * with a given tracepoint:
+ *
+ * - tracepoint_is_faultable() == false: call_rcu()
+ * - tracepoint_is_faultable() == true:  call_rcu_tasks_trace()
  */
 #ifdef CONFIG_TRACEPOINTS
 static inline void tracepoint_synchronize_unregister(void)
@@ -111,9 +117,17 @@ static inline void tracepoint_synchronize_unregister(void)
 	synchronize_rcu_tasks_trace();
 	synchronize_rcu();
 }
+static inline bool tracepoint_is_faultable(struct tracepoint *tp)
+{
+	return tp->ext && tp->ext->faultable;
+}
 #else
 static inline void tracepoint_synchronize_unregister(void)
 { }
+static inline bool tracepoint_is_faultable(struct tracepoint *tp)
+{
+	return false;
+}
 #endif
 
 #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
@@ -345,6 +359,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	static struct tracepoint_ext __tracepoint_ext_##_name = {	\
 		.regfunc = _reg,					\
 		.unregfunc = _unreg,					\
+		.faultable = false,					\
+	};								\
+	__DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args));
+
+#define DEFINE_TRACE_SYSCALL(_name, _reg, _unreg, _proto, _args)	\
+	static struct tracepoint_ext __tracepoint_ext_##_name = {	\
+		.regfunc = _reg,					\
+		.unregfunc = _unreg,					\
+		.faultable = true,					\
 	};								\
 	__DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args));
 
@@ -389,6 +412,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 #define __DECLARE_TRACE_SYSCALL	__DECLARE_TRACE
 
 #define DEFINE_TRACE_FN(name, reg, unreg, proto, args)
+#define DEFINE_TRACE_SYSCALL(name, reg, unreg, proto, args)
 #define DEFINE_TRACE(name, proto, args)
 #define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
 #define EXPORT_TRACEPOINT_SYMBOL(name)
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index ff5fa17a6259..63fea2218afa 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -48,7 +48,7 @@
 
 #undef TRACE_EVENT_SYSCALL
 #define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, print, reg, unreg) \
-	DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args))
+	DEFINE_TRACE_SYSCALL(name, reg, unreg, PARAMS(proto), PARAMS(args))
 
 #undef TRACE_EVENT_NOP
 #define TRACE_EVENT_NOP(name, proto, args, struct, assign, print)
-- 
cgit v1.2.3


From ee3685a98ea920369696d31e54d1fe6a939ed65f Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Thu, 31 Oct 2024 11:20:56 -0400
Subject: tracing: Add might_fault() check in __DECLARE_TRACE_SYSCALL

Catch incorrect use of syscall tracepoints even if no probes are
registered by adding a might_fault() check in trace_##name()
emitted by __DECLARE_TRACE_SYSCALL.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jordan Rife <jrife@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jordan Rife <jrife@google.com>
Cc: linux-trace-kernel@vger.kernel.org
Link: https://lore.kernel.org/20241031152056.744137-5-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 906f3091d23d..425123e921ac 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -301,6 +301,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
 	static inline void trace_##name(proto)				\
 	{								\
+		might_fault();						\
 		if (static_branch_unlikely(&__tracepoint_##name.key))	\
 			__DO_TRACE(name,				\
 				TP_ARGS(args),				\
-- 
cgit v1.2.3


From 61c6fefa92bb4ed7a34163b94f6ffac628237a29 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 1 Nov 2024 11:17:53 -0700
Subject: bpf: decouple BPF link/attach hook and BPF program sleepable
 semantics

BPF link's lifecycle protection scheme depends on both BPF hook and BPF
program. If *either* of those require RCU Tasks Trace GP, then we need
to go through a chain of GPs before putting BPF program refcount and
deallocating BPF link memory.

This patch adds bpf_link-specific sleepable flag, which can be set to
true even if underlying BPF program is not sleepable itself. If either
link->sleepable or link->prog->sleepable is true, we'll go through
a chain of RCU Tasks Trace GP and RCU GP before putting BPF program and
freeing memory.

This will be used to protect BPF link for sleepable (faultable) raw
tracepoints in the next patch.

Link: https://lore.kernel.org/20241101181754.782341-2-andrii@kernel.org
Tested-by: Jordan Rife <jrife@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/bpf.h  | 20 ++++++++++++++++++--
 kernel/bpf/syscall.c | 39 ++++++++++++++++++++++++++++-----------
 2 files changed, 46 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 19d8ca8ac960..e7236facadd4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1598,6 +1598,11 @@ struct bpf_link {
 	enum bpf_link_type type;
 	const struct bpf_link_ops *ops;
 	struct bpf_prog *prog;
+	/* whether BPF link itself has "sleepable" semantics, which can differ
+	 * from underlying BPF program having a "sleepable" semantics, as BPF
+	 * link's semantics is determined by target attach hook
+	 */
+	bool sleepable;
 	/* rcu is used before freeing, work can be used to schedule that
 	 * RCU-based freeing before that, so they never overlap
 	 */
@@ -1614,8 +1619,10 @@ struct bpf_link_ops {
 	 */
 	void (*dealloc)(struct bpf_link *link);
 	/* deallocate link resources callback, called after RCU grace period;
-	 * if underlying BPF program is sleepable we go through tasks trace
-	 * RCU GP and then "classic" RCU GP
+	 * if either the underlying BPF program is sleepable or BPF link's
+	 * target hook is sleepable, we'll go through tasks trace RCU GP and
+	 * then "classic" RCU GP; this need for chaining tasks trace and
+	 * classic RCU GPs is designated by setting bpf_link->sleepable flag
 	 */
 	void (*dealloc_deferred)(struct bpf_link *link);
 	int (*detach)(struct bpf_link *link);
@@ -2362,6 +2369,9 @@ int bpf_prog_new_fd(struct bpf_prog *prog);
 
 void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
 		   const struct bpf_link_ops *ops, struct bpf_prog *prog);
+void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
+			     const struct bpf_link_ops *ops, struct bpf_prog *prog,
+			     bool sleepable);
 int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer);
 int bpf_link_settle(struct bpf_link_primer *primer);
 void bpf_link_cleanup(struct bpf_link_primer *primer);
@@ -2717,6 +2727,12 @@ static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
 {
 }
 
+static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
+					   const struct bpf_link_ops *ops, struct bpf_prog *prog,
+					   bool sleepable)
+{
+}
+
 static inline int bpf_link_prime(struct bpf_link *link,
 				 struct bpf_link_primer *primer)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index aa7246a399f3..0f5540627911 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2933,17 +2933,33 @@ static int bpf_obj_get(const union bpf_attr *attr)
 				attr->file_flags);
 }
 
-void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
-		   const struct bpf_link_ops *ops, struct bpf_prog *prog)
+/* bpf_link_init_sleepable() allows to specify whether BPF link itself has
+ * "sleepable" semantics, which normally would mean that BPF link's attach
+ * hook can dereference link or link's underlying program for some time after
+ * detachment due to RCU Tasks Trace-based lifetime protection scheme.
+ * BPF program itself can be non-sleepable, yet, because it's transitively
+ * reachable through BPF link, its freeing has to be delayed until after RCU
+ * Tasks Trace GP.
+ */
+void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
+			     const struct bpf_link_ops *ops, struct bpf_prog *prog,
+			     bool sleepable)
 {
 	WARN_ON(ops->dealloc && ops->dealloc_deferred);
 	atomic64_set(&link->refcnt, 1);
 	link->type = type;
+	link->sleepable = sleepable;
 	link->id = 0;
 	link->ops = ops;
 	link->prog = prog;
 }
 
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+		   const struct bpf_link_ops *ops, struct bpf_prog *prog)
+{
+	bpf_link_init_sleepable(link, type, ops, prog, false);
+}
+
 static void bpf_link_free_id(int id)
 {
 	if (!id)
@@ -3008,20 +3024,21 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
 static void bpf_link_free(struct bpf_link *link)
 {
 	const struct bpf_link_ops *ops = link->ops;
-	bool sleepable = false;
 
 	bpf_link_free_id(link->id);
-	if (link->prog) {
-		sleepable = link->prog->sleepable;
-		/* detach BPF program, clean up used resources */
+	/* detach BPF program, clean up used resources */
+	if (link->prog)
 		ops->release(link);
-	}
 	if (ops->dealloc_deferred) {
-		/* schedule BPF link deallocation; if underlying BPF program
-		 * is sleepable, we need to first wait for RCU tasks trace
-		 * sync, then go through "classic" RCU grace period
+		/* Schedule BPF link deallocation, which will only then
+		 * trigger putting BPF program refcount.
+		 * If underlying BPF program is sleepable or BPF link's target
+		 * attach hookpoint is sleepable or otherwise requires RCU GPs
+		 * to ensure link and its underlying BPF program is not
+		 * reachable anymore, we need to first wait for RCU tasks
+		 * trace sync, and then go through "classic" RCU grace period
 		 */
-		if (sleepable)
+		if (link->sleepable || (link->prog && link->prog->sleepable))
 			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
 		else
 			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
-- 
cgit v1.2.3


From d44d26987bb3df6d76556827097fc9ce17565cb8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 31 Oct 2024 13:04:07 +0100
Subject: timekeeping: Remove CONFIG_DEBUG_TIMEKEEPING

Since 135225a363ae timekeeping_cycles_to_ns() handles large offsets which
would lead to 64bit multiplication overflows correctly. It's also protected
against negative motion of the clocksource unconditionally, which was
exclusive to x86 before.

timekeeping_advance() handles large offsets already correctly.

That means the value of CONFIG_DEBUG_TIMEKEEPING which analyzed these cases
is very close to zero. Remove all of it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20241031120328.536010148@linutronix.de
---
 arch/riscv/configs/defconfig                       |   1 -
 include/linux/timekeeper_internal.h                |  16 ---
 kernel/time/timekeeping.c                          | 108 +--------------------
 lib/Kconfig.debug                                  |  13 ---
 .../testing/selftests/wireguard/qemu/debug.config  |   1 -
 5 files changed, 3 insertions(+), 136 deletions(-)

(limited to 'include/linux')

diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
index 2341393cfac1..26c01b9e3434 100644
--- a/arch/riscv/configs/defconfig
+++ b/arch/riscv/configs/defconfig
@@ -301,7 +301,6 @@ CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEBUG_PER_CPU_MAPS=y
 CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_WQ_WATCHDOG=y
-CONFIG_DEBUG_TIMEKEEPING=y
 CONFIG_DEBUG_RT_MUTEXES=y
 CONFIG_DEBUG_SPINLOCK=y
 CONFIG_DEBUG_MUTEXES=y
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index a3b6380a7777..e39d4d563b19 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -76,9 +76,6 @@ struct tk_read_base {
  *				ntp shifted nano seconds.
  * @ntp_err_mult:		Multiplication factor for scaled math conversion
  * @skip_second_overflow:	Flag used to avoid updating NTP twice with same second
- * @last_warning:		Warning ratelimiter (DEBUG_TIMEKEEPING)
- * @underflow_seen:		Underflow warning flag (DEBUG_TIMEKEEPING)
- * @overflow_seen:		Overflow warning flag (DEBUG_TIMEKEEPING)
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
  * we need to add to xtime (or xtime corrected for sub jiffy times)
@@ -147,19 +144,6 @@ struct timekeeper {
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
 	u32			skip_second_overflow;
-
-#ifdef CONFIG_DEBUG_TIMEKEEPING
-	long			last_warning;
-	/*
-	 * These simple flag variables are managed
-	 * without locks, which is racy, but they are
-	 * ok since we don't really care about being
-	 * super precise about how many events were
-	 * seen, just that a problem was observed.
-	 */
-	int			underflow_seen;
-	int			overflow_seen;
-#endif
 };
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 17cae886ca82..d115adebc418 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -226,97 +226,6 @@ static inline u64 tk_clock_read(const struct tk_read_base *tkr)
 	return clock->read(clock);
 }
 
-#ifdef CONFIG_DEBUG_TIMEKEEPING
-#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-
-static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
-{
-
-	u64 max_cycles = tk->tkr_mono.clock->max_cycles;
-	const char *name = tk->tkr_mono.clock->name;
-
-	if (offset > max_cycles) {
-		printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
-				offset, name, max_cycles);
-		printk_deferred("         timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
-	} else {
-		if (offset > (max_cycles >> 1)) {
-			printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n",
-					offset, name, max_cycles >> 1);
-			printk_deferred("      timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
-		}
-	}
-
-	if (tk->underflow_seen) {
-		if (jiffies - tk->last_warning > WARNING_FREQ) {
-			printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
-			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
-			printk_deferred("         Your kernel is probably still fine.\n");
-			tk->last_warning = jiffies;
-		}
-		tk->underflow_seen = 0;
-	}
-
-	if (tk->overflow_seen) {
-		if (jiffies - tk->last_warning > WARNING_FREQ) {
-			printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
-			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
-			printk_deferred("         Your kernel is probably still fine.\n");
-			tk->last_warning = jiffies;
-		}
-		tk->overflow_seen = 0;
-	}
-}
-
-static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles);
-
-static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-	u64 now, last, mask, max, delta;
-	unsigned int seq;
-
-	/*
-	 * Since we're called holding a seqcount, the data may shift
-	 * under us while we're doing the calculation. This can cause
-	 * false positives, since we'd note a problem but throw the
-	 * results away. So nest another seqcount here to atomically
-	 * grab the points we are checking with.
-	 */
-	do {
-		seq = read_seqcount_begin(&tk_core.seq);
-		now = tk_clock_read(tkr);
-		last = tkr->cycle_last;
-		mask = tkr->mask;
-		max = tkr->clock->max_cycles;
-	} while (read_seqcount_retry(&tk_core.seq, seq));
-
-	delta = clocksource_delta(now, last, mask);
-
-	/*
-	 * Try to catch underflows by checking if we are seeing small
-	 * mask-relative negative values.
-	 */
-	if (unlikely((~delta & mask) < (mask >> 3)))
-		tk->underflow_seen = 1;
-
-	/* Check for multiplication overflows */
-	if (unlikely(delta > max))
-		tk->overflow_seen = 1;
-
-	/* timekeeping_cycles_to_ns() handles both under and overflow */
-	return timekeeping_cycles_to_ns(tkr, now);
-}
-#else
-static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
-{
-}
-static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
-{
-	BUG();
-}
-#endif
-
 /**
  * tk_setup_internals - Set up internals to use clocksource clock.
  *
@@ -421,19 +330,11 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c
 	return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
 }
 
-static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
+static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
 {
 	return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
 }
 
-static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
-{
-	if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING))
-		return timekeeping_debug_get_ns(tkr);
-
-	return __timekeeping_get_ns(tkr);
-}
-
 /**
  * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
  * @tkr: Timekeeping readout base from which we take the update
@@ -477,7 +378,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
 		now = ktime_to_ns(tkr->base);
-		now += __timekeeping_get_ns(tkr);
+		now += timekeeping_get_ns(tkr);
 	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
 
 	return now;
@@ -593,7 +494,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
 		tkr = tkf->base + (seq & 0x01);
 		basem = ktime_to_ns(tkr->base);
 		baser = ktime_to_ns(tkr->base_real);
-		delta = __timekeeping_get_ns(tkr);
+		delta = timekeeping_get_ns(tkr);
 	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
 
 	if (mono)
@@ -2333,9 +2234,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
 	if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
 		return false;
 
-	/* Do some additional sanity checking */
-	timekeeping_check_update(tk, offset);
-
 	/*
 	 * With NO_HZ we may have to accumulate many cycle_intervals
 	 * (think "ticks") worth of time at once. To do this efficiently,
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7315f643817a..14977b9fc254 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1328,19 +1328,6 @@ config SCHEDSTATS
 
 endmenu
 
-config DEBUG_TIMEKEEPING
-	bool "Enable extra timekeeping sanity checking"
-	help
-	  This option will enable additional timekeeping sanity checks
-	  which may be helpful when diagnosing issues where timekeeping
-	  problems are suspected.
-
-	  This may include checks in the timekeeping hotpaths, so this
-	  option may have a (very small) performance impact to some
-	  workloads.
-
-	  If unsure, say N.
-
 config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPTION && TRACE_IRQFLAGS_SUPPORT
diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config
index 9d172210e2c6..139fd9aa8b12 100644
--- a/tools/testing/selftests/wireguard/qemu/debug.config
+++ b/tools/testing/selftests/wireguard/qemu/debug.config
@@ -31,7 +31,6 @@ CONFIG_SCHED_DEBUG=y
 CONFIG_SCHED_INFO=y
 CONFIG_SCHEDSTATS=y
 CONFIG_SCHED_STACK_END_CHECK=y
-CONFIG_DEBUG_TIMEKEEPING=y
 CONFIG_DEBUG_PREEMPT=y
 CONFIG_DEBUG_RT_MUTEXES=y
 CONFIG_DEBUG_SPINLOCK=y
-- 
cgit v1.2.3


From 2a69297eed87c1f3ad33b8a169025c06adde5dcf Mon Sep 17 00:00:00 2001
From: Qi Tao <taoqi10@huawei.com>
Date: Sat, 26 Oct 2024 17:46:51 +0800
Subject: crypto: hisilicon - support querying the capability register

Query the capability register status of accelerator devices
(SEC, HPRE and ZIP) through the debugfs interface, for example:
cat cap_regs. The purpose is to improve the robustness and
locability of hardware devices and drivers.

Signed-off-by: Qi Tao <taoqi10@huawei.com>
Signed-off-by: Chenghai Huang <huangchenghai2@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/ABI/testing/debugfs-hisi-hpre |   7 ++
 Documentation/ABI/testing/debugfs-hisi-sec  |   7 ++
 Documentation/ABI/testing/debugfs-hisi-zip  |   7 ++
 drivers/crypto/hisilicon/hpre/hpre.h        |  23 ++++++
 drivers/crypto/hisilicon/hpre/hpre_main.c   | 121 +++++++++++++++++++++-------
 drivers/crypto/hisilicon/qm.c               |  88 +++++++++++++-------
 drivers/crypto/hisilicon/sec2/sec.h         |  26 ++++--
 drivers/crypto/hisilicon/sec2/sec_crypto.c  |   8 +-
 drivers/crypto/hisilicon/sec2/sec_main.c    |  71 +++++++++++++---
 drivers/crypto/hisilicon/zip/zip.h          |  18 +++++
 drivers/crypto/hisilicon/zip/zip_main.c     | 116 ++++++++++++++++++--------
 include/linux/hisi_acc_qm.h                 |  15 ++++
 12 files changed, 396 insertions(+), 111 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/debugfs-hisi-hpre b/Documentation/ABI/testing/debugfs-hisi-hpre
index d4e16ef9ac9a..29fb7d5ffc69 100644
--- a/Documentation/ABI/testing/debugfs-hisi-hpre
+++ b/Documentation/ABI/testing/debugfs-hisi-hpre
@@ -184,3 +184,10 @@ Date:		Apr 2020
 Contact:	linux-crypto@vger.kernel.org
 Description:	Dump the total number of time out requests.
 		Available for both PF and VF, and take no other effect on HPRE.
+
+What:           /sys/kernel/debug/hisi_hpre/<bdf>/cap_regs
+Date:           Oct 2024
+Contact:        linux-crypto@vger.kernel.org
+Description:    Dump the values of the qm and hpre capability bit registers and
+                support the query of device specifications to facilitate fault locating.
+                Available for both PF and VF, and take no other effect on HPRE.
diff --git a/Documentation/ABI/testing/debugfs-hisi-sec b/Documentation/ABI/testing/debugfs-hisi-sec
index 6c6c9a6e150a..82bf4a0dc7f7 100644
--- a/Documentation/ABI/testing/debugfs-hisi-sec
+++ b/Documentation/ABI/testing/debugfs-hisi-sec
@@ -157,3 +157,10 @@ Contact:	linux-crypto@vger.kernel.org
 Description:	Dump the total number of completed but marked error requests
 		to be received.
 		Available for both PF and VF, and take no other effect on SEC.
+
+What:           /sys/kernel/debug/hisi_sec2/<bdf>/cap_regs
+Date:           Oct 2024
+Contact:        linux-crypto@vger.kernel.org
+Description:    Dump the values of the qm and sec capability bit registers and
+                support the query of device specifications to facilitate fault locating.
+                Available for both PF and VF, and take no other effect on SEC.
diff --git a/Documentation/ABI/testing/debugfs-hisi-zip b/Documentation/ABI/testing/debugfs-hisi-zip
index a22dd6942219..0abd65d27e9b 100644
--- a/Documentation/ABI/testing/debugfs-hisi-zip
+++ b/Documentation/ABI/testing/debugfs-hisi-zip
@@ -158,3 +158,10 @@ Contact:	linux-crypto@vger.kernel.org
 Description:	Dump the total number of BD type error requests
 		to be received.
 		Available for both PF and VF, and take no other effect on ZIP.
+
+What:           /sys/kernel/debug/hisi_zip/<bdf>/cap_regs
+Date:           Oct 2024
+Contact:        linux-crypto@vger.kernel.org
+Description:    Dump the values of the qm and zip capability bit registers and
+                support the query of device specifications to facilitate fault locating.
+                Available for both PF and VF, and take no other effect on ZIP.
diff --git a/drivers/crypto/hisilicon/hpre/hpre.h b/drivers/crypto/hisilicon/hpre/hpre.h
index 9f0b94c8e03d..0f3ddbadbcf9 100644
--- a/drivers/crypto/hisilicon/hpre/hpre.h
+++ b/drivers/crypto/hisilicon/hpre/hpre.h
@@ -100,6 +100,29 @@ struct hpre_sqe {
 	__le32 rsvd1[_HPRE_SQE_ALIGN_EXT];
 };
 
+enum hpre_cap_table_type {
+	QM_RAS_NFE_TYPE = 0x0,
+	QM_RAS_NFE_RESET,
+	QM_RAS_CE_TYPE,
+	HPRE_RAS_NFE_TYPE,
+	HPRE_RAS_NFE_RESET,
+	HPRE_RAS_CE_TYPE,
+	HPRE_CORE_INFO,
+	HPRE_CORE_EN,
+	HPRE_DRV_ALG_BITMAP,
+	HPRE_ALG_BITMAP,
+	HPRE_CORE1_BITMAP_CAP,
+	HPRE_CORE2_BITMAP_CAP,
+	HPRE_CORE3_BITMAP_CAP,
+	HPRE_CORE4_BITMAP_CAP,
+	HPRE_CORE5_BITMAP_CAP,
+	HPRE_CORE6_BITMAP_CAP,
+	HPRE_CORE7_BITMAP_CAP,
+	HPRE_CORE8_BITMAP_CAP,
+	HPRE_CORE9_BITMAP_CAP,
+	HPRE_CORE10_BITMAP_CAP,
+};
+
 struct hisi_qp *hpre_create_qp(u8 type);
 int hpre_algs_register(struct hisi_qm *qm);
 void hpre_algs_unregister(struct hisi_qm *qm);
diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index f129878559c8..c5de1f6b6d3b 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -13,6 +13,7 @@
 #include <linux/uacce.h>
 #include "hpre.h"
 
+#define CAP_FILE_PERMISSION		0444
 #define HPRE_CTRL_CNT_CLR_CE_BIT	BIT(0)
 #define HPRE_CTRL_CNT_CLR_CE		0x301000
 #define HPRE_FSM_MAX_CNT		0x301008
@@ -203,7 +204,7 @@ static const struct hisi_qm_cap_info hpre_basic_info[] = {
 	{HPRE_RESET_MASK_CAP, 0x3134, 0, GENMASK(31, 0), 0x0, 0x3FFFFE, 0xBFFC3E},
 	{HPRE_OOO_SHUTDOWN_MASK_CAP, 0x3134, 0, GENMASK(31, 0), 0x0, 0x22, 0xBFFC3E},
 	{HPRE_CE_MASK_CAP, 0x3138, 0, GENMASK(31, 0), 0x0, 0x1, 0x1},
-	{HPRE_CLUSTER_NUM_CAP, 0x313c, 20, GENMASK(3, 0), 0x0,  0x4, 0x1},
+	{HPRE_CLUSTER_NUM_CAP, 0x313c, 20, GENMASK(3, 0), 0x0, 0x4, 0x1},
 	{HPRE_CORE_TYPE_NUM_CAP, 0x313c, 16, GENMASK(3, 0), 0x0, 0x2, 0x2},
 	{HPRE_CORE_NUM_CAP, 0x313c, 8, GENMASK(7, 0), 0x0, 0x8, 0xA},
 	{HPRE_CLUSTER_CORE_NUM_CAP, 0x313c, 0, GENMASK(7, 0), 0x0, 0x2, 0xA},
@@ -222,18 +223,27 @@ static const struct hisi_qm_cap_info hpre_basic_info[] = {
 	{HPRE_CORE10_ALG_BITMAP_CAP, 0x3170, 0, GENMASK(31, 0), 0x0, 0x10, 0x10}
 };
 
-enum hpre_pre_store_cap_idx {
-	HPRE_CLUSTER_NUM_CAP_IDX = 0x0,
-	HPRE_CORE_ENABLE_BITMAP_CAP_IDX,
-	HPRE_DRV_ALG_BITMAP_CAP_IDX,
-	HPRE_DEV_ALG_BITMAP_CAP_IDX,
-};
-
-static const u32 hpre_pre_store_caps[] = {
-	HPRE_CLUSTER_NUM_CAP,
-	HPRE_CORE_ENABLE_BITMAP_CAP,
-	HPRE_DRV_ALG_BITMAP_CAP,
-	HPRE_DEV_ALG_BITMAP_CAP,
+static const struct hisi_qm_cap_query_info hpre_cap_query_info[] = {
+	{QM_RAS_NFE_TYPE, "QM_RAS_NFE_TYPE             ", 0x3124, 0x0, 0x1C37, 0x7C37},
+	{QM_RAS_NFE_RESET, "QM_RAS_NFE_RESET            ", 0x3128, 0x0, 0xC77, 0x6C77},
+	{QM_RAS_CE_TYPE, "QM_RAS_CE_TYPE              ", 0x312C, 0x0, 0x8, 0x8},
+	{HPRE_RAS_NFE_TYPE, "HPRE_RAS_NFE_TYPE           ", 0x3130, 0x0, 0x3FFFFE, 0x1FFFC3E},
+	{HPRE_RAS_NFE_RESET, "HPRE_RAS_NFE_RESET          ", 0x3134, 0x0, 0x3FFFFE, 0xBFFC3E},
+	{HPRE_RAS_CE_TYPE, "HPRE_RAS_CE_TYPE            ", 0x3138, 0x0, 0x1, 0x1},
+	{HPRE_CORE_INFO, "HPRE_CORE_INFO              ", 0x313c, 0x0, 0x420802, 0x120A0A},
+	{HPRE_CORE_EN, "HPRE_CORE_EN                ", 0x3140, 0x0, 0xF, 0x3FF},
+	{HPRE_DRV_ALG_BITMAP, "HPRE_DRV_ALG_BITMAP         ", 0x3144, 0x0, 0x03, 0x27},
+	{HPRE_ALG_BITMAP, "HPRE_ALG_BITMAP             ", 0x3148, 0x0, 0x03, 0x7F},
+	{HPRE_CORE1_BITMAP_CAP, "HPRE_CORE1_BITMAP_CAP       ", 0x314c, 0x0, 0x7F, 0x7F},
+	{HPRE_CORE2_BITMAP_CAP, "HPRE_CORE2_BITMAP_CAP       ", 0x3150, 0x0, 0x7F, 0x7F},
+	{HPRE_CORE3_BITMAP_CAP, "HPRE_CORE3_BITMAP_CAP       ", 0x3154, 0x0, 0x7F, 0x7F},
+	{HPRE_CORE4_BITMAP_CAP, "HPRE_CORE4_BITMAP_CAP       ", 0x3158, 0x0, 0x7F, 0x7F},
+	{HPRE_CORE5_BITMAP_CAP, "HPRE_CORE5_BITMAP_CAP       ", 0x315c, 0x0, 0x7F, 0x7F},
+	{HPRE_CORE6_BITMAP_CAP, "HPRE_CORE6_BITMAP_CAP       ", 0x3160, 0x0, 0x7F, 0x7F},
+	{HPRE_CORE7_BITMAP_CAP, "HPRE_CORE7_BITMAP_CAP       ", 0x3164, 0x0, 0x7F, 0x7F},
+	{HPRE_CORE8_BITMAP_CAP, "HPRE_CORE8_BITMAP_CAP       ", 0x3168, 0x0, 0x7F, 0x7F},
+	{HPRE_CORE9_BITMAP_CAP, "HPRE_CORE9_BITMAP_CAP       ", 0x316c, 0x0, 0x10, 0x10},
+	{HPRE_CORE10_BITMAP_CAP, "HPRE_CORE10_BITMAP_CAP      ", 0x3170, 0x0, 0x10, 0x10},
 };
 
 static const struct hpre_hw_error hpre_hw_errors[] = {
@@ -360,7 +370,7 @@ bool hpre_check_alg_support(struct hisi_qm *qm, u32 alg)
 {
 	u32 cap_val;
 
-	cap_val = qm->cap_tables.dev_cap_table[HPRE_DRV_ALG_BITMAP_CAP_IDX].cap_val;
+	cap_val = qm->cap_tables.dev_cap_table[HPRE_DRV_ALG_BITMAP].cap_val;
 	if (alg & cap_val)
 		return true;
 
@@ -503,14 +513,17 @@ static int hpre_cfg_by_dsm(struct hisi_qm *qm)
 static int hpre_set_cluster(struct hisi_qm *qm)
 {
 	struct device *dev = &qm->pdev->dev;
-	unsigned long offset;
 	u32 cluster_core_mask;
+	unsigned long offset;
+	u32 hpre_core_info;
 	u8 clusters_num;
 	u32 val = 0;
 	int ret, i;
 
-	cluster_core_mask = qm->cap_tables.dev_cap_table[HPRE_CORE_ENABLE_BITMAP_CAP_IDX].cap_val;
-	clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+	cluster_core_mask = qm->cap_tables.dev_cap_table[HPRE_CORE_EN].cap_val;
+	hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+	clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+			hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
 	for (i = 0; i < clusters_num; i++) {
 		offset = i * HPRE_CLSTR_ADDR_INTRVL;
 
@@ -595,6 +608,7 @@ static void hpre_enable_clock_gate(struct hisi_qm *qm)
 {
 	unsigned long offset;
 	u8 clusters_num, i;
+	u32 hpre_core_info;
 	u32 val;
 
 	if (qm->ver < QM_HW_V3)
@@ -608,7 +622,9 @@ static void hpre_enable_clock_gate(struct hisi_qm *qm)
 	val |= HPRE_PEH_CFG_AUTO_GATE_EN;
 	writel(val, qm->io_base + HPRE_PEH_CFG_AUTO_GATE);
 
-	clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+	hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+	clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+			hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
 	for (i = 0; i < clusters_num; i++) {
 		offset = (unsigned long)i * HPRE_CLSTR_ADDR_INTRVL;
 		val = readl(qm->io_base + offset + HPRE_CLUSTER_DYN_CTL);
@@ -625,6 +641,7 @@ static void hpre_disable_clock_gate(struct hisi_qm *qm)
 {
 	unsigned long offset;
 	u8 clusters_num, i;
+	u32 hpre_core_info;
 	u32 val;
 
 	if (qm->ver < QM_HW_V3)
@@ -638,7 +655,9 @@ static void hpre_disable_clock_gate(struct hisi_qm *qm)
 	val &= ~HPRE_PEH_CFG_AUTO_GATE_EN;
 	writel(val, qm->io_base + HPRE_PEH_CFG_AUTO_GATE);
 
-	clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+	hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+	clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+			hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
 	for (i = 0; i < clusters_num; i++) {
 		offset = (unsigned long)i * HPRE_CLSTR_ADDR_INTRVL;
 		val = readl(qm->io_base + offset + HPRE_CLUSTER_DYN_CTL);
@@ -711,11 +730,14 @@ static int hpre_set_user_domain_and_cache(struct hisi_qm *qm)
 static void hpre_cnt_regs_clear(struct hisi_qm *qm)
 {
 	unsigned long offset;
+	u32 hpre_core_info;
 	u8 clusters_num;
 	int i;
 
 	/* clear clusterX/cluster_ctrl */
-	clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+	hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+	clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+			hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
 	for (i = 0; i < clusters_num; i++) {
 		offset = HPRE_CLSTR_BASE + i * HPRE_CLSTR_ADDR_INTRVL;
 		writel(0x0, qm->io_base + offset + HPRE_CLUSTER_INQURY);
@@ -1007,10 +1029,13 @@ static int hpre_cluster_debugfs_init(struct hisi_qm *qm)
 	char buf[HPRE_DBGFS_VAL_MAX_LEN];
 	struct debugfs_regset32 *regset;
 	struct dentry *tmp_d;
+	u32 hpre_core_info;
 	u8 clusters_num;
 	int i, ret;
 
-	clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+	hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+	clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+			hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
 	for (i = 0; i < clusters_num; i++) {
 		ret = snprintf(buf, HPRE_DBGFS_VAL_MAX_LEN, "cluster%d", i);
 		if (ret >= HPRE_DBGFS_VAL_MAX_LEN)
@@ -1053,6 +1078,26 @@ static int hpre_ctrl_debug_init(struct hisi_qm *qm)
 	return hpre_cluster_debugfs_init(qm);
 }
 
+static int hpre_cap_regs_show(struct seq_file *s, void *unused)
+{
+	struct hisi_qm *qm = s->private;
+	u32 i, size;
+
+	size = qm->cap_tables.qm_cap_size;
+	for (i = 0; i < size; i++)
+		seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.qm_cap_table[i].name,
+			   qm->cap_tables.qm_cap_table[i].cap_val);
+
+	size = qm->cap_tables.dev_cap_size;
+	for (i = 0; i < size; i++)
+		seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.dev_cap_table[i].name,
+			   qm->cap_tables.dev_cap_table[i].cap_val);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(hpre_cap_regs);
+
 static void hpre_dfx_debug_init(struct hisi_qm *qm)
 {
 	struct dfx_diff_registers *hpre_regs = qm->debug.acc_diff_regs;
@@ -1071,6 +1116,9 @@ static void hpre_dfx_debug_init(struct hisi_qm *qm)
 	if (qm->fun_type == QM_HW_PF && hpre_regs)
 		debugfs_create_file("diff_regs", 0444, parent,
 				      qm, &hpre_diff_regs_fops);
+
+	debugfs_create_file("cap_regs", CAP_FILE_PERMISSION,
+			    qm->debug.debug_root, qm, &hpre_cap_regs_fops);
 }
 
 static int hpre_debugfs_init(struct hisi_qm *qm)
@@ -1118,26 +1166,33 @@ static int hpre_pre_store_cap_reg(struct hisi_qm *qm)
 {
 	struct hisi_qm_cap_record *hpre_cap;
 	struct device *dev = &qm->pdev->dev;
+	u32 hpre_core_info;
+	u8 clusters_num;
 	size_t i, size;
 
-	size = ARRAY_SIZE(hpre_pre_store_caps);
+	size = ARRAY_SIZE(hpre_cap_query_info);
 	hpre_cap = devm_kzalloc(dev, sizeof(*hpre_cap) * size, GFP_KERNEL);
 	if (!hpre_cap)
 		return -ENOMEM;
 
 	for (i = 0; i < size; i++) {
-		hpre_cap[i].type = hpre_pre_store_caps[i];
-		hpre_cap[i].cap_val = hisi_qm_get_hw_info(qm, hpre_basic_info,
-				      hpre_pre_store_caps[i], qm->cap_ver);
+		hpre_cap[i].type = hpre_cap_query_info[i].type;
+		hpre_cap[i].name = hpre_cap_query_info[i].name;
+		hpre_cap[i].cap_val = hisi_qm_get_cap_value(qm, hpre_cap_query_info,
+					i, qm->cap_ver);
 	}
 
-	if (hpre_cap[HPRE_CLUSTER_NUM_CAP_IDX].cap_val > HPRE_CLUSTERS_NUM_MAX) {
+	hpre_core_info = hpre_cap[HPRE_CORE_INFO].cap_val;
+	clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+			hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
+	if (clusters_num > HPRE_CLUSTERS_NUM_MAX) {
 		dev_err(dev, "Device cluster num %u is out of range for driver supports %d!\n",
-			hpre_cap[HPRE_CLUSTER_NUM_CAP_IDX].cap_val, HPRE_CLUSTERS_NUM_MAX);
+			clusters_num, HPRE_CLUSTERS_NUM_MAX);
 		return -EINVAL;
 	}
 
 	qm->cap_tables.dev_cap_table = hpre_cap;
+	qm->cap_tables.dev_cap_size = size;
 
 	return 0;
 }
@@ -1184,7 +1239,7 @@ static int hpre_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 		return ret;
 	}
 
-	alg_msk = qm->cap_tables.dev_cap_table[HPRE_DEV_ALG_BITMAP_CAP_IDX].cap_val;
+	alg_msk = qm->cap_tables.dev_cap_table[HPRE_ALG_BITMAP].cap_val;
 	ret = hisi_qm_set_algs(qm, alg_msk, hpre_dev_algs, ARRAY_SIZE(hpre_dev_algs));
 	if (ret) {
 		pci_err(pdev, "Failed to set hpre algs!\n");
@@ -1200,10 +1255,13 @@ static int hpre_show_last_regs_init(struct hisi_qm *qm)
 	int com_dfx_regs_num = ARRAY_SIZE(hpre_com_dfx_regs);
 	struct qm_debug *debug = &qm->debug;
 	void __iomem *io_base;
+	u32 hpre_core_info;
 	u8 clusters_num;
 	int i, j, idx;
 
-	clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+	hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+	clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+			hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
 	debug->last_words = kcalloc(cluster_dfx_regs_num * clusters_num +
 			com_dfx_regs_num, sizeof(unsigned int), GFP_KERNEL);
 	if (!debug->last_words)
@@ -1243,6 +1301,7 @@ static void hpre_show_last_dfx_regs(struct hisi_qm *qm)
 	struct qm_debug *debug = &qm->debug;
 	struct pci_dev *pdev = qm->pdev;
 	void __iomem *io_base;
+	u32 hpre_core_info;
 	u8 clusters_num;
 	int i, j, idx;
 	u32 val;
@@ -1258,7 +1317,9 @@ static void hpre_show_last_dfx_regs(struct hisi_qm *qm)
 			  hpre_com_dfx_regs[i].name, debug->last_words[i], val);
 	}
 
-	clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+	hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+	clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+			hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
 	for (i = 0; i < clusters_num; i++) {
 		io_base = qm->io_base + hpre_cluster_offsets[i];
 		for (j = 0; j <  cluster_dfx_regs_num; j++) {
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index f7e8237e3a93..1cdda09ee817 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -307,11 +307,29 @@ enum qm_basic_type {
 	QM_VF_IRQ_NUM_CAP,
 };
 
-enum qm_pre_store_cap_idx {
-	QM_EQ_IRQ_TYPE_CAP_IDX = 0x0,
-	QM_AEQ_IRQ_TYPE_CAP_IDX,
-	QM_ABN_IRQ_TYPE_CAP_IDX,
-	QM_PF2VF_IRQ_TYPE_CAP_IDX,
+enum qm_cap_table_type {
+	QM_CAP_VF  = 0x0,
+	QM_AEQE_NUM,
+	QM_SCQE_NUM,
+	QM_EQ_IRQ,
+	QM_AEQ_IRQ,
+	QM_ABNORMAL_IRQ,
+	QM_MB_IRQ,
+	MAX_IRQ_NUM,
+	EXT_BAR_INDEX,
+};
+
+static const struct hisi_qm_cap_query_info qm_cap_query_info[] = {
+	{QM_CAP_VF, "QM_CAP_VF                   ", 0x3100, 0x0, 0x0, 0x6F01},
+	{QM_AEQE_NUM, "QM_AEQE_NUM                 ", 0x3104, 0x800, 0x4000800, 0x4000800},
+	{QM_SCQE_NUM, "QM_SCQE_NUM                 ",
+						0x3108, 0x4000400, 0x4000400, 0x4000400},
+	{QM_EQ_IRQ, "QM_EQ_IRQ                   ", 0x310c, 0x10000, 0x10000, 0x10000},
+	{QM_AEQ_IRQ, "QM_AEQ_IRQ                  ", 0x3110, 0x0, 0x10001, 0x10001},
+	{QM_ABNORMAL_IRQ, "QM_ABNORMAL_IRQ             ", 0x3114, 0x0, 0x10003, 0x10003},
+	{QM_MB_IRQ, "QM_MB_IRQ                   ", 0x3118, 0x0, 0x0, 0x10002},
+	{MAX_IRQ_NUM, "MAX_IRQ_NUM                 ", 0x311c, 0x10001, 0x40002, 0x40003},
+	{EXT_BAR_INDEX, "EXT_BAR_INDEX               ", 0x3120, 0x0, 0x0, 0x14},
 };
 
 static const struct hisi_qm_cap_info qm_cap_info_comm[] = {
@@ -344,13 +362,6 @@ static const struct hisi_qm_cap_info qm_basic_info[] = {
 	{QM_VF_IRQ_NUM_CAP,     0x311c,   0,  GENMASK(15, 0), 0x1,       0x2,       0x3},
 };
 
-static const u32 qm_pre_store_caps[] = {
-	QM_EQ_IRQ_TYPE_CAP,
-	QM_AEQ_IRQ_TYPE_CAP,
-	QM_ABN_IRQ_TYPE_CAP,
-	QM_PF2VF_IRQ_TYPE_CAP,
-};
-
 struct qm_mailbox {
 	__le16 w0;
 	__le16 queue_num;
@@ -794,6 +805,27 @@ u32 hisi_qm_get_hw_info(struct hisi_qm *qm,
 }
 EXPORT_SYMBOL_GPL(hisi_qm_get_hw_info);
 
+u32 hisi_qm_get_cap_value(struct hisi_qm *qm,
+			const struct hisi_qm_cap_query_info *info_table,
+			u32 index, bool is_read)
+{
+	u32 val;
+
+	switch (qm->ver) {
+	case QM_HW_V1:
+		return info_table[index].v1_val;
+	case QM_HW_V2:
+		return info_table[index].v2_val;
+	default:
+		if (!is_read)
+			return info_table[index].v3_val;
+
+		val = readl(qm->io_base + info_table[index].offset);
+		return val;
+	}
+}
+EXPORT_SYMBOL_GPL(hisi_qm_get_cap_value);
+
 static void qm_get_xqc_depth(struct hisi_qm *qm, u16 *low_bits,
 			     u16 *high_bits, enum qm_basic_type type)
 {
@@ -4897,7 +4929,7 @@ static void qm_unregister_abnormal_irq(struct hisi_qm *qm)
 	if (qm->fun_type == QM_HW_VF)
 		return;
 
-	val = qm->cap_tables.qm_cap_table[QM_ABN_IRQ_TYPE_CAP_IDX].cap_val;
+	val = qm->cap_tables.qm_cap_table[QM_ABNORMAL_IRQ].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_ABN_IRQ_TYPE_MASK))
 		return;
 
@@ -4914,7 +4946,7 @@ static int qm_register_abnormal_irq(struct hisi_qm *qm)
 	if (qm->fun_type == QM_HW_VF)
 		return 0;
 
-	val = qm->cap_tables.qm_cap_table[QM_ABN_IRQ_TYPE_CAP_IDX].cap_val;
+	val = qm->cap_tables.qm_cap_table[QM_ABNORMAL_IRQ].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_ABN_IRQ_TYPE_MASK))
 		return 0;
 
@@ -4931,7 +4963,7 @@ static void qm_unregister_mb_cmd_irq(struct hisi_qm *qm)
 	struct pci_dev *pdev = qm->pdev;
 	u32 irq_vector, val;
 
-	val = qm->cap_tables.qm_cap_table[QM_PF2VF_IRQ_TYPE_CAP_IDX].cap_val;
+	val = qm->cap_tables.qm_cap_table[QM_MB_IRQ].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return;
 
@@ -4945,7 +4977,7 @@ static int qm_register_mb_cmd_irq(struct hisi_qm *qm)
 	u32 irq_vector, val;
 	int ret;
 
-	val = qm->cap_tables.qm_cap_table[QM_PF2VF_IRQ_TYPE_CAP_IDX].cap_val;
+	val = qm->cap_tables.qm_cap_table[QM_MB_IRQ].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return 0;
 
@@ -4962,7 +4994,7 @@ static void qm_unregister_aeq_irq(struct hisi_qm *qm)
 	struct pci_dev *pdev = qm->pdev;
 	u32 irq_vector, val;
 
-	val = qm->cap_tables.qm_cap_table[QM_AEQ_IRQ_TYPE_CAP_IDX].cap_val;
+	val = qm->cap_tables.qm_cap_table[QM_AEQ_IRQ].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return;
 
@@ -4976,7 +5008,7 @@ static int qm_register_aeq_irq(struct hisi_qm *qm)
 	u32 irq_vector, val;
 	int ret;
 
-	val = qm->cap_tables.qm_cap_table[QM_AEQ_IRQ_TYPE_CAP_IDX].cap_val;
+	val = qm->cap_tables.qm_cap_table[QM_AEQ_IRQ].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return 0;
 
@@ -4994,7 +5026,7 @@ static void qm_unregister_eq_irq(struct hisi_qm *qm)
 	struct pci_dev *pdev = qm->pdev;
 	u32 irq_vector, val;
 
-	val = qm->cap_tables.qm_cap_table[QM_EQ_IRQ_TYPE_CAP_IDX].cap_val;
+	val = qm->cap_tables.qm_cap_table[QM_EQ_IRQ].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return;
 
@@ -5008,7 +5040,7 @@ static int qm_register_eq_irq(struct hisi_qm *qm)
 	u32 irq_vector, val;
 	int ret;
 
-	val = qm->cap_tables.qm_cap_table[QM_EQ_IRQ_TYPE_CAP_IDX].cap_val;
+	val = qm->cap_tables.qm_cap_table[QM_EQ_IRQ].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return 0;
 
@@ -5096,24 +5128,26 @@ static int qm_get_qp_num(struct hisi_qm *qm)
 	return 0;
 }
 
-static int qm_pre_store_irq_type_caps(struct hisi_qm *qm)
+static int qm_pre_store_caps(struct hisi_qm *qm)
 {
 	struct hisi_qm_cap_record *qm_cap;
 	struct pci_dev *pdev = qm->pdev;
 	size_t i, size;
 
-	size = ARRAY_SIZE(qm_pre_store_caps);
+	size = ARRAY_SIZE(qm_cap_query_info);
 	qm_cap = devm_kzalloc(&pdev->dev, sizeof(*qm_cap) * size, GFP_KERNEL);
 	if (!qm_cap)
 		return -ENOMEM;
 
 	for (i = 0; i < size; i++) {
-		qm_cap[i].type = qm_pre_store_caps[i];
-		qm_cap[i].cap_val = hisi_qm_get_hw_info(qm, qm_basic_info,
-							qm_pre_store_caps[i], qm->cap_ver);
+		qm_cap[i].type = qm_cap_query_info[i].type;
+		qm_cap[i].name = qm_cap_query_info[i].name;
+		qm_cap[i].cap_val = hisi_qm_get_cap_value(qm, qm_cap_query_info,
+							i, qm->cap_ver);
 	}
 
 	qm->cap_tables.qm_cap_table = qm_cap;
+	qm->cap_tables.qm_cap_size = size;
 
 	return 0;
 }
@@ -5150,8 +5184,8 @@ static int qm_get_hw_caps(struct hisi_qm *qm)
 			set_bit(cap_info[i].type, &qm->caps);
 	}
 
-	/* Fetch and save the value of irq type related capability registers */
-	return qm_pre_store_irq_type_caps(qm);
+	/* Fetch and save the value of qm capability registers */
+	return qm_pre_store_caps(qm);
 }
 
 static int qm_get_pci_res(struct hisi_qm *qm)
diff --git a/drivers/crypto/hisilicon/sec2/sec.h b/drivers/crypto/hisilicon/sec2/sec.h
index 410c83712e28..356188bee6fb 100644
--- a/drivers/crypto/hisilicon/sec2/sec.h
+++ b/drivers/crypto/hisilicon/sec2/sec.h
@@ -220,11 +220,27 @@ enum sec_cap_type {
 	SEC_CORE4_ALG_BITMAP_HIGH,
 };
 
-enum sec_cap_reg_record_idx {
-	SEC_DRV_ALG_BITMAP_LOW_IDX = 0x0,
-	SEC_DRV_ALG_BITMAP_HIGH_IDX,
-	SEC_DEV_ALG_BITMAP_LOW_IDX,
-	SEC_DEV_ALG_BITMAP_HIGH_IDX,
+enum sec_cap_table_type {
+	QM_RAS_NFE_TYPE = 0x0,
+	QM_RAS_NFE_RESET,
+	QM_RAS_CE_TYPE,
+	SEC_RAS_NFE_TYPE,
+	SEC_RAS_NFE_RESET,
+	SEC_RAS_CE_TYPE,
+	SEC_CORE_INFO,
+	SEC_CORE_EN,
+	SEC_DRV_ALG_BITMAP_LOW_TB,
+	SEC_DRV_ALG_BITMAP_HIGH_TB,
+	SEC_ALG_BITMAP_LOW,
+	SEC_ALG_BITMAP_HIGH,
+	SEC_CORE1_BITMAP_LOW,
+	SEC_CORE1_BITMAP_HIGH,
+	SEC_CORE2_BITMAP_LOW,
+	SEC_CORE2_BITMAP_HIGH,
+	SEC_CORE3_BITMAP_LOW,
+	SEC_CORE3_BITMAP_HIGH,
+	SEC_CORE4_BITMAP_LOW,
+	SEC_CORE4_BITMAP_HIGH,
 };
 
 void sec_destroy_qps(struct hisi_qp **qps, int qp_num);
diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c
index 0558f98e221f..ae9ebbb4103d 100644
--- a/drivers/crypto/hisilicon/sec2/sec_crypto.c
+++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c
@@ -2520,8 +2520,8 @@ int sec_register_to_crypto(struct hisi_qm *qm)
 	u64 alg_mask;
 	int ret = 0;
 
-	alg_mask = sec_get_alg_bitmap(qm, SEC_DRV_ALG_BITMAP_HIGH_IDX,
-				      SEC_DRV_ALG_BITMAP_LOW_IDX);
+	alg_mask = sec_get_alg_bitmap(qm, SEC_DRV_ALG_BITMAP_HIGH_TB,
+				      SEC_DRV_ALG_BITMAP_LOW_TB);
 
 	mutex_lock(&sec_algs_lock);
 	if (sec_available_devs) {
@@ -2553,8 +2553,8 @@ void sec_unregister_from_crypto(struct hisi_qm *qm)
 {
 	u64 alg_mask;
 
-	alg_mask = sec_get_alg_bitmap(qm, SEC_DRV_ALG_BITMAP_HIGH_IDX,
-				      SEC_DRV_ALG_BITMAP_LOW_IDX);
+	alg_mask = sec_get_alg_bitmap(qm, SEC_DRV_ALG_BITMAP_HIGH_TB,
+				      SEC_DRV_ALG_BITMAP_LOW_TB);
 
 	mutex_lock(&sec_algs_lock);
 	if (--sec_available_devs)
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 3abd12017250..316ab385e335 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -14,9 +14,9 @@
 #include <linux/seq_file.h>
 #include <linux/topology.h>
 #include <linux/uacce.h>
-
 #include "sec.h"
 
+#define CAP_FILE_PERMISSION		0444
 #define SEC_VF_NUM			63
 #define SEC_QUEUE_NUM_V1		4096
 #define PCI_DEVICE_ID_HUAWEI_SEC_PF	0xa255
@@ -167,11 +167,34 @@ static const struct hisi_qm_cap_info sec_basic_info[] = {
 	{SEC_CORE4_ALG_BITMAP_HIGH, 0x3170, 0, GENMASK(31, 0), 0x3FFF, 0x3FFF, 0x3FFF},
 };
 
-static const u32 sec_pre_store_caps[] = {
-	SEC_DRV_ALG_BITMAP_LOW,
-	SEC_DRV_ALG_BITMAP_HIGH,
-	SEC_DEV_ALG_BITMAP_LOW,
-	SEC_DEV_ALG_BITMAP_HIGH,
+static const struct hisi_qm_cap_query_info sec_cap_query_info[] = {
+	{QM_RAS_NFE_TYPE, "QM_RAS_NFE_TYPE             ", 0x3124, 0x0, 0x1C77, 0x7C77},
+	{QM_RAS_NFE_RESET, "QM_RAS_NFE_RESET            ", 0x3128, 0x0, 0xC77, 0x6C77},
+	{QM_RAS_CE_TYPE, "QM_RAS_CE_TYPE              ", 0x312C, 0x0, 0x8, 0x8},
+	{SEC_RAS_NFE_TYPE, "SEC_RAS_NFE_TYPE            ", 0x3130, 0x0, 0x177, 0x60177},
+	{SEC_RAS_NFE_RESET, "SEC_RAS_NFE_RESET           ", 0x3134, 0x0, 0x177, 0x177},
+	{SEC_RAS_CE_TYPE, "SEC_RAS_CE_TYPE             ", 0x3138, 0x0, 0x88, 0xC088},
+	{SEC_CORE_INFO, "SEC_CORE_INFO               ", 0x313c, 0x110404, 0x110404, 0x110404},
+	{SEC_CORE_EN, "SEC_CORE_EN                 ", 0x3140, 0x17F, 0x17F, 0xF},
+	{SEC_DRV_ALG_BITMAP_LOW_TB, "SEC_DRV_ALG_BITMAP_LOW      ",
+					0x3144, 0x18050CB, 0x18050CB, 0x18670CF},
+	{SEC_DRV_ALG_BITMAP_HIGH_TB, "SEC_DRV_ALG_BITMAP_HIGH     ",
+					0x3148, 0x395C, 0x395C, 0x395C},
+	{SEC_ALG_BITMAP_LOW, "SEC_ALG_BITMAP_LOW          ",
+					0x314c, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF},
+	{SEC_ALG_BITMAP_HIGH, "SEC_ALG_BITMAP_HIGH         ", 0x3150, 0x3FFF, 0x3FFF, 0x3FFF},
+	{SEC_CORE1_BITMAP_LOW, "SEC_CORE1_BITMAP_LOW        ",
+					0x3154, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF},
+	{SEC_CORE1_BITMAP_HIGH, "SEC_CORE1_BITMAP_HIGH       ", 0x3158, 0x3FFF, 0x3FFF, 0x3FFF},
+	{SEC_CORE2_BITMAP_LOW, "SEC_CORE2_BITMAP_LOW        ",
+					0x315c, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF},
+	{SEC_CORE2_BITMAP_HIGH, "SEC_CORE2_BITMAP_HIGH       ", 0x3160, 0x3FFF, 0x3FFF, 0x3FFF},
+	{SEC_CORE3_BITMAP_LOW, "SEC_CORE3_BITMAP_LOW        ",
+					0x3164, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF},
+	{SEC_CORE3_BITMAP_HIGH, "SEC_CORE3_BITMAP_HIGH       ", 0x3168, 0x3FFF, 0x3FFF, 0x3FFF},
+	{SEC_CORE4_BITMAP_LOW, "SEC_CORE4_BITMAP_LOW        ",
+					0x316c, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF},
+	{SEC_CORE4_BITMAP_HIGH, "SEC_CORE4_BITMAP_HIGH       ", 0x3170, 0x3FFF, 0x3FFF, 0x3FFF},
 };
 
 static const struct qm_dev_alg sec_dev_algs[] = { {
@@ -838,6 +861,26 @@ static int sec_regs_show(struct seq_file *s, void *unused)
 
 DEFINE_SHOW_ATTRIBUTE(sec_regs);
 
+static int sec_cap_regs_show(struct seq_file *s, void *unused)
+{
+	struct hisi_qm *qm = s->private;
+	u32 i, size;
+
+	size = qm->cap_tables.qm_cap_size;
+	for (i = 0; i < size; i++)
+		seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.qm_cap_table[i].name,
+			   qm->cap_tables.qm_cap_table[i].cap_val);
+
+	size = qm->cap_tables.dev_cap_size;
+	for (i = 0; i < size; i++)
+		seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.dev_cap_table[i].name,
+			   qm->cap_tables.dev_cap_table[i].cap_val);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(sec_cap_regs);
+
 static int sec_core_debug_init(struct hisi_qm *qm)
 {
 	struct dfx_diff_registers *sec_regs = qm->debug.acc_diff_regs;
@@ -872,6 +915,9 @@ static int sec_core_debug_init(struct hisi_qm *qm)
 				   tmp_d, data, &sec_atomic64_ops);
 	}
 
+	debugfs_create_file("cap_regs", CAP_FILE_PERMISSION,
+			    qm->debug.debug_root, qm, &sec_cap_regs_fops);
+
 	return 0;
 }
 
@@ -1085,18 +1131,20 @@ static int sec_pre_store_cap_reg(struct hisi_qm *qm)
 	struct pci_dev *pdev = qm->pdev;
 	size_t i, size;
 
-	size = ARRAY_SIZE(sec_pre_store_caps);
+	size = ARRAY_SIZE(sec_cap_query_info);
 	sec_cap = devm_kzalloc(&pdev->dev, sizeof(*sec_cap) * size, GFP_KERNEL);
 	if (!sec_cap)
 		return -ENOMEM;
 
 	for (i = 0; i < size; i++) {
-		sec_cap[i].type = sec_pre_store_caps[i];
-		sec_cap[i].cap_val = hisi_qm_get_hw_info(qm, sec_basic_info,
-				     sec_pre_store_caps[i], qm->cap_ver);
+		sec_cap[i].type = sec_cap_query_info[i].type;
+		sec_cap[i].name = sec_cap_query_info[i].name;
+		sec_cap[i].cap_val = hisi_qm_get_cap_value(qm, sec_cap_query_info,
+				     i, qm->cap_ver);
 	}
 
 	qm->cap_tables.dev_cap_table = sec_cap;
+	qm->cap_tables.dev_cap_size = size;
 
 	return 0;
 }
@@ -1146,8 +1194,7 @@ static int sec_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 		hisi_qm_uninit(qm);
 		return ret;
 	}
-
-	alg_msk = sec_get_alg_bitmap(qm, SEC_DEV_ALG_BITMAP_HIGH_IDX, SEC_DEV_ALG_BITMAP_LOW_IDX);
+	alg_msk = sec_get_alg_bitmap(qm, SEC_ALG_BITMAP_HIGH, SEC_ALG_BITMAP_LOW);
 	ret = hisi_qm_set_algs(qm, alg_msk, sec_dev_algs, ARRAY_SIZE(sec_dev_algs));
 	if (ret) {
 		pci_err(qm->pdev, "Failed to set sec algs!\n");
diff --git a/drivers/crypto/hisilicon/zip/zip.h b/drivers/crypto/hisilicon/zip/zip.h
index f2e6da3240ae..2fecf346c3c9 100644
--- a/drivers/crypto/hisilicon/zip/zip.h
+++ b/drivers/crypto/hisilicon/zip/zip.h
@@ -81,6 +81,24 @@ struct hisi_zip_sqe {
 	u32 rsvd1[4];
 };
 
+enum zip_cap_table_type {
+	QM_RAS_NFE_TYPE,
+	QM_RAS_NFE_RESET,
+	QM_RAS_CE_TYPE,
+	ZIP_RAS_NFE_TYPE,
+	ZIP_RAS_NFE_RESET,
+	ZIP_RAS_CE_TYPE,
+	ZIP_CORE_INFO,
+	ZIP_CORE_EN,
+	ZIP_DRV_ALG_BITMAP_TB,
+	ZIP_ALG_BITMAP,
+	ZIP_CORE1_BITMAP,
+	ZIP_CORE2_BITMAP,
+	ZIP_CORE3_BITMAP,
+	ZIP_CORE4_BITMAP,
+	ZIP_CORE5_BITMAP,
+};
+
 int zip_create_qps(struct hisi_qp **qps, int qp_num, int node);
 int hisi_zip_register_to_crypto(struct hisi_qm *qm);
 void hisi_zip_unregister_from_crypto(struct hisi_qm *qm);
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index f547e6732bf5..638adfc6f683 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -14,6 +14,7 @@
 #include <linux/uacce.h>
 #include "zip.h"
 
+#define CAP_FILE_PERMISSION		0444
 #define PCI_DEVICE_ID_HUAWEI_ZIP_PF	0xa250
 
 #define HZIP_QUEUE_NUM_V1		4096
@@ -250,24 +251,22 @@ static struct hisi_qm_cap_info zip_basic_cap_info[] = {
 	{ZIP_CAP_MAX, 0x317c, 0, GENMASK(0, 0), 0x0, 0x0, 0x0}
 };
 
-enum zip_pre_store_cap_idx {
-	ZIP_CORE_NUM_CAP_IDX = 0x0,
-	ZIP_CLUSTER_COMP_NUM_CAP_IDX,
-	ZIP_CLUSTER_DECOMP_NUM_CAP_IDX,
-	ZIP_DECOMP_ENABLE_BITMAP_IDX,
-	ZIP_COMP_ENABLE_BITMAP_IDX,
-	ZIP_DRV_ALG_BITMAP_IDX,
-	ZIP_DEV_ALG_BITMAP_IDX,
-};
-
-static const u32 zip_pre_store_caps[] = {
-	ZIP_CORE_NUM_CAP,
-	ZIP_CLUSTER_COMP_NUM_CAP,
-	ZIP_CLUSTER_DECOMP_NUM_CAP,
-	ZIP_DECOMP_ENABLE_BITMAP,
-	ZIP_COMP_ENABLE_BITMAP,
-	ZIP_DRV_ALG_BITMAP,
-	ZIP_DEV_ALG_BITMAP,
+static const struct hisi_qm_cap_query_info zip_cap_query_info[] = {
+	{QM_RAS_NFE_TYPE, "QM_RAS_NFE_TYPE             ", 0x3124, 0x0, 0x1C57, 0x7C77},
+	{QM_RAS_NFE_RESET, "QM_RAS_NFE_RESET            ", 0x3128, 0x0, 0xC57, 0x6C77},
+	{QM_RAS_CE_TYPE, "QM_RAS_CE_TYPE              ", 0x312C, 0x0, 0x8, 0x8},
+	{ZIP_RAS_NFE_TYPE, "ZIP_RAS_NFE_TYPE            ", 0x3130, 0x0, 0x7FE, 0x1FFE},
+	{ZIP_RAS_NFE_RESET, "ZIP_RAS_NFE_RESET           ", 0x3134, 0x0, 0x7FE, 0x7FE},
+	{ZIP_RAS_CE_TYPE, "ZIP_RAS_CE_TYPE             ", 0x3138, 0x0, 0x1, 0x1},
+	{ZIP_CORE_INFO, "ZIP_CORE_INFO               ", 0x313C, 0x12080206, 0x12080206, 0x12050203},
+	{ZIP_CORE_EN, "ZIP_CORE_EN                 ", 0x3140, 0xFC0003, 0xFC0003, 0x1C0003},
+	{ZIP_DRV_ALG_BITMAP_TB, "ZIP_DRV_ALG_BITMAP          ", 0x3144, 0x0, 0x0, 0x30},
+	{ZIP_ALG_BITMAP, "ZIP_ALG_BITMAP              ", 0x3148, 0xF, 0xF, 0x3F},
+	{ZIP_CORE1_BITMAP, "ZIP_CORE1_BITMAP            ", 0x314C, 0x5, 0x5, 0xD5},
+	{ZIP_CORE2_BITMAP, "ZIP_CORE2_BITMAP            ", 0x3150, 0x5, 0x5, 0xD5},
+	{ZIP_CORE3_BITMAP, "ZIP_CORE3_BITMAP            ", 0x3154, 0xA, 0xA, 0x2A},
+	{ZIP_CORE4_BITMAP, "ZIP_CORE4_BITMAP            ", 0x3158, 0xA, 0xA, 0x2A},
+	{ZIP_CORE5_BITMAP, "ZIP_CORE5_BITMAP            ", 0x315C, 0xA, 0xA, 0x2A},
 };
 
 static const struct debugfs_reg32 hzip_dfx_regs[] = {
@@ -442,7 +441,7 @@ bool hisi_zip_alg_support(struct hisi_qm *qm, u32 alg)
 {
 	u32 cap_val;
 
-	cap_val = qm->cap_tables.dev_cap_table[ZIP_DRV_ALG_BITMAP_IDX].cap_val;
+	cap_val = qm->cap_tables.dev_cap_table[ZIP_DRV_ALG_BITMAP_TB].cap_val;
 	if ((alg & cap_val) == alg)
 		return true;
 
@@ -530,6 +529,7 @@ static int hisi_zip_set_user_domain_and_cache(struct hisi_qm *qm)
 {
 	void __iomem *base = qm->io_base;
 	u32 dcomp_bm, comp_bm;
+	u32 zip_core_en;
 
 	/* qm user domain */
 	writel(AXUSER_BASE, base + QM_ARUSER_M_CFG_1);
@@ -567,8 +567,12 @@ static int hisi_zip_set_user_domain_and_cache(struct hisi_qm *qm)
 	}
 
 	/* let's open all compression/decompression cores */
-	dcomp_bm = qm->cap_tables.dev_cap_table[ZIP_DECOMP_ENABLE_BITMAP_IDX].cap_val;
-	comp_bm = qm->cap_tables.dev_cap_table[ZIP_COMP_ENABLE_BITMAP_IDX].cap_val;
+
+	zip_core_en = qm->cap_tables.dev_cap_table[ZIP_CORE_EN].cap_val;
+	dcomp_bm = (zip_core_en >> zip_basic_cap_info[ZIP_DECOMP_ENABLE_BITMAP].shift) &
+			zip_basic_cap_info[ZIP_DECOMP_ENABLE_BITMAP].mask;
+	comp_bm = (zip_core_en >> zip_basic_cap_info[ZIP_COMP_ENABLE_BITMAP].shift) &
+			zip_basic_cap_info[ZIP_COMP_ENABLE_BITMAP].mask;
 	writel(HZIP_DECOMP_CHECK_ENABLE | dcomp_bm | comp_bm, base + HZIP_CLOCK_GATE_CTRL);
 
 	/* enable sqc,cqc writeback */
@@ -788,7 +792,12 @@ DEFINE_SHOW_ATTRIBUTE(hisi_zip_regs);
 
 static void __iomem *get_zip_core_addr(struct hisi_qm *qm, int core_num)
 {
-	u32 zip_comp_core_num = qm->cap_tables.dev_cap_table[ZIP_CLUSTER_COMP_NUM_CAP_IDX].cap_val;
+	u8 zip_comp_core_num;
+	u32 zip_core_info;
+
+	zip_core_info =  qm->cap_tables.dev_cap_table[ZIP_CORE_INFO].cap_val;
+	zip_comp_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].shift) &
+			     zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].mask;
 
 	if (core_num < zip_comp_core_num)
 		return qm->io_base + HZIP_CORE_DFX_BASE +
@@ -803,12 +812,16 @@ static int hisi_zip_core_debug_init(struct hisi_qm *qm)
 	u32 zip_core_num, zip_comp_core_num;
 	struct device *dev = &qm->pdev->dev;
 	struct debugfs_regset32 *regset;
+	u32 zip_core_info;
 	struct dentry *tmp_d;
 	char buf[HZIP_BUF_SIZE];
 	int i;
 
-	zip_core_num = qm->cap_tables.dev_cap_table[ZIP_CORE_NUM_CAP_IDX].cap_val;
-	zip_comp_core_num = qm->cap_tables.dev_cap_table[ZIP_CLUSTER_COMP_NUM_CAP_IDX].cap_val;
+	zip_core_info =  qm->cap_tables.dev_cap_table[ZIP_CORE_INFO].cap_val;
+	zip_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CORE_NUM_CAP].shift) &
+			zip_basic_cap_info[ZIP_CORE_NUM_CAP].mask;
+	zip_comp_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].shift) &
+			zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].mask;
 
 	for (i = 0; i < zip_core_num; i++) {
 		if (i < zip_comp_core_num)
@@ -834,6 +847,26 @@ static int hisi_zip_core_debug_init(struct hisi_qm *qm)
 	return 0;
 }
 
+static int zip_cap_regs_show(struct seq_file *s, void *unused)
+{
+	struct hisi_qm *qm = s->private;
+	u32 i, size;
+
+	size = qm->cap_tables.qm_cap_size;
+	for (i = 0; i < size; i++)
+		seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.qm_cap_table[i].name,
+			   qm->cap_tables.qm_cap_table[i].cap_val);
+
+	size = qm->cap_tables.dev_cap_size;
+	for (i = 0; i < size; i++)
+		seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.dev_cap_table[i].name,
+			   qm->cap_tables.dev_cap_table[i].cap_val);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(zip_cap_regs);
+
 static void hisi_zip_dfx_debug_init(struct hisi_qm *qm)
 {
 	struct dfx_diff_registers *hzip_regs = qm->debug.acc_diff_regs;
@@ -854,6 +887,9 @@ static void hisi_zip_dfx_debug_init(struct hisi_qm *qm)
 	if (qm->fun_type == QM_HW_PF && hzip_regs)
 		debugfs_create_file("diff_regs", 0444, tmp_dir,
 				      qm, &hzip_diff_regs_fops);
+
+	debugfs_create_file("cap_regs", CAP_FILE_PERMISSION,
+			    qm->debug.debug_root, qm, &zip_cap_regs_fops);
 }
 
 static int hisi_zip_ctrl_debug_init(struct hisi_qm *qm)
@@ -912,9 +948,14 @@ debugfs_remove:
 /* hisi_zip_debug_regs_clear() - clear the zip debug regs */
 static void hisi_zip_debug_regs_clear(struct hisi_qm *qm)
 {
-	u32 zip_core_num = qm->cap_tables.dev_cap_table[ZIP_CORE_NUM_CAP_IDX].cap_val;
+	u32 zip_core_info;
+	u8 zip_core_num;
 	int i, j;
 
+	zip_core_info = qm->cap_tables.dev_cap_table[ZIP_CORE_INFO].cap_val;
+	zip_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CORE_NUM_CAP].shift) &
+			zip_basic_cap_info[ZIP_CORE_NUM_CAP].mask;
+
 	/* enable register read_clear bit */
 	writel(HZIP_RD_CNT_CLR_CE_EN, qm->io_base + HZIP_SOFT_CTRL_CNT_CLR_CE);
 	for (i = 0; i < zip_core_num; i++)
@@ -946,10 +987,13 @@ static int hisi_zip_show_last_regs_init(struct hisi_qm *qm)
 	int com_dfx_regs_num = ARRAY_SIZE(hzip_com_dfx_regs);
 	struct qm_debug *debug = &qm->debug;
 	void __iomem *io_base;
+	u32 zip_core_info;
 	u32 zip_core_num;
 	int i, j, idx;
 
-	zip_core_num = qm->cap_tables.dev_cap_table[ZIP_CORE_NUM_CAP_IDX].cap_val;
+	zip_core_info = qm->cap_tables.dev_cap_table[ZIP_CORE_INFO].cap_val;
+	zip_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CORE_NUM_CAP].shift) &
+			zip_basic_cap_info[ZIP_CORE_NUM_CAP].mask;
 
 	debug->last_words = kcalloc(core_dfx_regs_num * zip_core_num + com_dfx_regs_num,
 				    sizeof(unsigned int), GFP_KERNEL);
@@ -991,6 +1035,7 @@ static void hisi_zip_show_last_dfx_regs(struct hisi_qm *qm)
 	u32 zip_core_num, zip_comp_core_num;
 	struct qm_debug *debug = &qm->debug;
 	char buf[HZIP_BUF_SIZE];
+	u32 zip_core_info;
 	void __iomem *base;
 	int i, j, idx;
 	u32 val;
@@ -1005,8 +1050,11 @@ static void hisi_zip_show_last_dfx_regs(struct hisi_qm *qm)
 				 hzip_com_dfx_regs[i].name, debug->last_words[i], val);
 	}
 
-	zip_core_num = qm->cap_tables.dev_cap_table[ZIP_CORE_NUM_CAP_IDX].cap_val;
-	zip_comp_core_num = qm->cap_tables.dev_cap_table[ZIP_CLUSTER_COMP_NUM_CAP_IDX].cap_val;
+	zip_core_info = qm->cap_tables.dev_cap_table[ZIP_CORE_INFO].cap_val;
+	zip_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CORE_NUM_CAP].shift) &
+			zip_basic_cap_info[ZIP_CORE_NUM_CAP].mask;
+	zip_comp_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].shift) &
+			zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].mask;
 
 	for (i = 0; i < zip_core_num; i++) {
 		if (i < zip_comp_core_num)
@@ -1167,18 +1215,20 @@ static int zip_pre_store_cap_reg(struct hisi_qm *qm)
 	struct pci_dev *pdev = qm->pdev;
 	size_t i, size;
 
-	size = ARRAY_SIZE(zip_pre_store_caps);
+	size = ARRAY_SIZE(zip_cap_query_info);
 	zip_cap = devm_kzalloc(&pdev->dev, sizeof(*zip_cap) * size, GFP_KERNEL);
 	if (!zip_cap)
 		return -ENOMEM;
 
 	for (i = 0; i < size; i++) {
-		zip_cap[i].type = zip_pre_store_caps[i];
-		zip_cap[i].cap_val = hisi_qm_get_hw_info(qm, zip_basic_cap_info,
-				     zip_pre_store_caps[i], qm->cap_ver);
+		zip_cap[i].type = zip_cap_query_info[i].type;
+		zip_cap[i].name = zip_cap_query_info[i].name;
+		zip_cap[i].cap_val = hisi_qm_get_cap_value(qm, zip_cap_query_info,
+				     i, qm->cap_ver);
 	}
 
 	qm->cap_tables.dev_cap_table = zip_cap;
+	qm->cap_tables.dev_cap_size = size;
 
 	return 0;
 }
@@ -1230,7 +1280,7 @@ static int hisi_zip_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 		return ret;
 	}
 
-	alg_msk = qm->cap_tables.dev_cap_table[ZIP_DEV_ALG_BITMAP_IDX].cap_val;
+	alg_msk = qm->cap_tables.dev_cap_table[ZIP_ALG_BITMAP].cap_val;
 	ret = hisi_qm_set_algs(qm, alg_msk, zip_dev_algs, ARRAY_SIZE(zip_dev_algs));
 	if (ret) {
 		pci_err(qm->pdev, "Failed to set zip algs!\n");
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 389e95754776..7272eac850e8 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -274,13 +274,25 @@ struct hisi_qm_cap_info {
 	u32 v3_val;
 };
 
+struct hisi_qm_cap_query_info {
+	u32 type;
+	const char *name;
+	u32 offset;
+	u32 v1_val;
+	u32 v2_val;
+	u32 v3_val;
+};
+
 struct hisi_qm_cap_record {
 	u32 type;
+	const char *name;
 	u32 cap_val;
 };
 
 struct hisi_qm_cap_tables {
+	u32 qm_cap_size;
 	struct hisi_qm_cap_record *qm_cap_table;
+	u32 dev_cap_size;
 	struct hisi_qm_cap_record *dev_cap_table;
 };
 
@@ -554,6 +566,9 @@ void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset);
 u32 hisi_qm_get_hw_info(struct hisi_qm *qm,
 			const struct hisi_qm_cap_info *info_table,
 			u32 index, bool is_read);
+u32 hisi_qm_get_cap_value(struct hisi_qm *qm,
+			const struct hisi_qm_cap_query_info *info_table,
+			u32 index, bool is_read);
 int hisi_qm_set_algs(struct hisi_qm *qm, u64 alg_msk, const struct qm_dev_alg *dev_algs,
 		     u32 dev_algs_size);
 
-- 
cgit v1.2.3


From c418ba6baca3ae10ffaf47b0803d2a9e6bf1af96 Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Sat, 26 Oct 2024 19:44:29 +0800
Subject: crypto: hisilicon/qm - disable same error report before resetting

If an error indicating that the device needs to be reset is reported,
disable the error reporting before device reset is complete,
enable the error reporting after the reset is complete to prevent
the same error from being reported repeatedly.

Fixes: eaebf4c3b103 ("crypto: hisilicon - Unify hardware error init/uninit into QM")
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/hisilicon/hpre/hpre_main.c | 35 +++++++++++++++++++----
 drivers/crypto/hisilicon/qm.c             | 47 +++++++++----------------------
 drivers/crypto/hisilicon/sec2/sec_main.c  | 35 +++++++++++++++++++----
 drivers/crypto/hisilicon/zip/zip_main.c   | 35 +++++++++++++++++++----
 include/linux/hisi_acc_qm.h               |  8 +++++-
 5 files changed, 110 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index c5de1f6b6d3b..96fde9437b4b 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -1353,11 +1353,15 @@ static u32 hpre_get_hw_err_status(struct hisi_qm *qm)
 
 static void hpre_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts)
 {
-	u32 nfe;
-
 	writel(err_sts, qm->io_base + HPRE_HAC_SOURCE_INT);
-	nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver);
-	writel(nfe, qm->io_base + HPRE_RAS_NFE_ENB);
+}
+
+static void hpre_disable_error_report(struct hisi_qm *qm, u32 err_type)
+{
+	u32 nfe_mask;
+
+	nfe_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver);
+	writel(nfe_mask & (~err_type), qm->io_base + HPRE_RAS_NFE_ENB);
 }
 
 static void hpre_open_axi_master_ooo(struct hisi_qm *qm)
@@ -1371,6 +1375,27 @@ static void hpre_open_axi_master_ooo(struct hisi_qm *qm)
 	       qm->io_base + HPRE_AM_OOO_SHUTDOWN_ENB);
 }
 
+static enum acc_err_result hpre_get_err_result(struct hisi_qm *qm)
+{
+	u32 err_status;
+
+	err_status = hpre_get_hw_err_status(qm);
+	if (err_status) {
+		if (err_status & qm->err_info.ecc_2bits_mask)
+			qm->err_status.is_dev_ecc_mbit = true;
+		hpre_log_hw_error(qm, err_status);
+
+		if (err_status & qm->err_info.dev_reset_mask) {
+			/* Disable the same error reporting until device is recovered. */
+			hpre_disable_error_report(qm, err_status);
+			return ACC_ERR_NEED_RESET;
+		}
+		hpre_clear_hw_err_status(qm, err_status);
+	}
+
+	return ACC_ERR_RECOVERED;
+}
+
 static void hpre_err_info_init(struct hisi_qm *qm)
 {
 	struct hisi_qm_err_info *err_info = &qm->err_info;
@@ -1397,12 +1422,12 @@ static const struct hisi_qm_err_ini hpre_err_ini = {
 	.hw_err_disable		= hpre_hw_error_disable,
 	.get_dev_hw_err_status	= hpre_get_hw_err_status,
 	.clear_dev_hw_err_status = hpre_clear_hw_err_status,
-	.log_dev_hw_err		= hpre_log_hw_error,
 	.open_axi_master_ooo	= hpre_open_axi_master_ooo,
 	.open_sva_prefetch	= hpre_open_sva_prefetch,
 	.close_sva_prefetch	= hpre_close_sva_prefetch,
 	.show_last_dfx_regs	= hpre_show_last_dfx_regs,
 	.err_info_init		= hpre_err_info_init,
+	.get_err_result		= hpre_get_err_result,
 };
 
 static int hpre_pf_probe_init(struct hpre *hpre)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 1cdda09ee817..19c1b5d3c954 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -271,12 +271,6 @@ enum vft_type {
 	SHAPER_VFT,
 };
 
-enum acc_err_result {
-	ACC_ERR_NONE,
-	ACC_ERR_NEED_RESET,
-	ACC_ERR_RECOVERED,
-};
-
 enum qm_alg_type {
 	ALG_TYPE_0,
 	ALG_TYPE_1,
@@ -1488,22 +1482,25 @@ static void qm_log_hw_error(struct hisi_qm *qm, u32 error_status)
 
 static enum acc_err_result qm_hw_error_handle_v2(struct hisi_qm *qm)
 {
-	u32 error_status, tmp;
-
-	/* read err sts */
-	tmp = readl(qm->io_base + QM_ABNORMAL_INT_STATUS);
-	error_status = qm->error_mask & tmp;
+	u32 error_status;
 
-	if (error_status) {
+	error_status = qm_get_hw_error_status(qm);
+	if (error_status & qm->error_mask) {
 		if (error_status & QM_ECC_MBIT)
 			qm->err_status.is_qm_ecc_mbit = true;
 
 		qm_log_hw_error(qm, error_status);
-		if (error_status & qm->err_info.qm_reset_mask)
+		if (error_status & qm->err_info.qm_reset_mask) {
+			/* Disable the same error reporting until device is recovered. */
+			writel(qm->err_info.nfe & (~error_status),
+			       qm->io_base + QM_RAS_NFE_ENABLE);
 			return ACC_ERR_NEED_RESET;
+		}
 
+		/* Clear error source if not need reset. */
 		writel(error_status, qm->io_base + QM_ABNORMAL_INT_SOURCE);
 		writel(qm->err_info.nfe, qm->io_base + QM_RAS_NFE_ENABLE);
+		writel(qm->err_info.ce, qm->io_base + QM_RAS_CE_ENABLE);
 	}
 
 	return ACC_ERR_RECOVERED;
@@ -3924,30 +3921,12 @@ EXPORT_SYMBOL_GPL(hisi_qm_sriov_configure);
 
 static enum acc_err_result qm_dev_err_handle(struct hisi_qm *qm)
 {
-	u32 err_sts;
-
-	if (!qm->err_ini->get_dev_hw_err_status) {
-		dev_err(&qm->pdev->dev, "Device doesn't support get hw error status!\n");
+	if (!qm->err_ini->get_err_result) {
+		dev_err(&qm->pdev->dev, "Device doesn't support reset!\n");
 		return ACC_ERR_NONE;
 	}
 
-	/* get device hardware error status */
-	err_sts = qm->err_ini->get_dev_hw_err_status(qm);
-	if (err_sts) {
-		if (err_sts & qm->err_info.ecc_2bits_mask)
-			qm->err_status.is_dev_ecc_mbit = true;
-
-		if (qm->err_ini->log_dev_hw_err)
-			qm->err_ini->log_dev_hw_err(qm, err_sts);
-
-		if (err_sts & qm->err_info.dev_reset_mask)
-			return ACC_ERR_NEED_RESET;
-
-		if (qm->err_ini->clear_dev_hw_err_status)
-			qm->err_ini->clear_dev_hw_err_status(qm, err_sts);
-	}
-
-	return ACC_ERR_RECOVERED;
+	return qm->err_ini->get_err_result(qm);
 }
 
 static enum acc_err_result qm_process_dev_error(struct hisi_qm *qm)
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 316ab385e335..8ec5333bb5aa 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -1056,11 +1056,15 @@ static u32 sec_get_hw_err_status(struct hisi_qm *qm)
 
 static void sec_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts)
 {
-	u32 nfe;
-
 	writel(err_sts, qm->io_base + SEC_CORE_INT_SOURCE);
-	nfe = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_NFE_MASK_CAP, qm->cap_ver);
-	writel(nfe, qm->io_base + SEC_RAS_NFE_REG);
+}
+
+static void sec_disable_error_report(struct hisi_qm *qm, u32 err_type)
+{
+	u32 nfe_mask;
+
+	nfe_mask = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_NFE_MASK_CAP, qm->cap_ver);
+	writel(nfe_mask & (~err_type), qm->io_base + SEC_RAS_NFE_REG);
 }
 
 static void sec_open_axi_master_ooo(struct hisi_qm *qm)
@@ -1072,6 +1076,27 @@ static void sec_open_axi_master_ooo(struct hisi_qm *qm)
 	writel(val | SEC_AXI_SHUTDOWN_ENABLE, qm->io_base + SEC_CONTROL_REG);
 }
 
+static enum acc_err_result sec_get_err_result(struct hisi_qm *qm)
+{
+	u32 err_status;
+
+	err_status = sec_get_hw_err_status(qm);
+	if (err_status) {
+		if (err_status & qm->err_info.ecc_2bits_mask)
+			qm->err_status.is_dev_ecc_mbit = true;
+		sec_log_hw_error(qm, err_status);
+
+		if (err_status & qm->err_info.dev_reset_mask) {
+			/* Disable the same error reporting until device is recovered. */
+			sec_disable_error_report(qm, err_status);
+			return ACC_ERR_NEED_RESET;
+		}
+		sec_clear_hw_err_status(qm, err_status);
+	}
+
+	return ACC_ERR_RECOVERED;
+}
+
 static void sec_err_info_init(struct hisi_qm *qm)
 {
 	struct hisi_qm_err_info *err_info = &qm->err_info;
@@ -1098,12 +1123,12 @@ static const struct hisi_qm_err_ini sec_err_ini = {
 	.hw_err_disable		= sec_hw_error_disable,
 	.get_dev_hw_err_status	= sec_get_hw_err_status,
 	.clear_dev_hw_err_status = sec_clear_hw_err_status,
-	.log_dev_hw_err		= sec_log_hw_error,
 	.open_axi_master_ooo	= sec_open_axi_master_ooo,
 	.open_sva_prefetch	= sec_open_sva_prefetch,
 	.close_sva_prefetch	= sec_close_sva_prefetch,
 	.show_last_dfx_regs	= sec_show_last_dfx_regs,
 	.err_info_init		= sec_err_info_init,
+	.get_err_result		= sec_get_err_result,
 };
 
 static int sec_pf_probe_init(struct sec_dev *sec)
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index 638adfc6f683..9239b251c2d7 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -1107,11 +1107,15 @@ static u32 hisi_zip_get_hw_err_status(struct hisi_qm *qm)
 
 static void hisi_zip_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts)
 {
-	u32 nfe;
-
 	writel(err_sts, qm->io_base + HZIP_CORE_INT_SOURCE);
-	nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver);
-	writel(nfe, qm->io_base + HZIP_CORE_INT_RAS_NFE_ENB);
+}
+
+static void hisi_zip_disable_error_report(struct hisi_qm *qm, u32 err_type)
+{
+	u32 nfe_mask;
+
+	nfe_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver);
+	writel(nfe_mask & (~err_type), qm->io_base + HZIP_CORE_INT_RAS_NFE_ENB);
 }
 
 static void hisi_zip_open_axi_master_ooo(struct hisi_qm *qm)
@@ -1141,6 +1145,27 @@ static void hisi_zip_close_axi_master_ooo(struct hisi_qm *qm)
 	       qm->io_base + HZIP_CORE_INT_SET);
 }
 
+static enum acc_err_result hisi_zip_get_err_result(struct hisi_qm *qm)
+{
+	u32 err_status;
+
+	err_status = hisi_zip_get_hw_err_status(qm);
+	if (err_status) {
+		if (err_status & qm->err_info.ecc_2bits_mask)
+			qm->err_status.is_dev_ecc_mbit = true;
+		hisi_zip_log_hw_error(qm, err_status);
+
+		if (err_status & qm->err_info.dev_reset_mask) {
+			/* Disable the same error reporting until device is recovered. */
+			hisi_zip_disable_error_report(qm, err_status);
+			return ACC_ERR_NEED_RESET;
+		}
+		hisi_zip_clear_hw_err_status(qm, err_status);
+	}
+
+	return ACC_ERR_RECOVERED;
+}
+
 static void hisi_zip_err_info_init(struct hisi_qm *qm)
 {
 	struct hisi_qm_err_info *err_info = &qm->err_info;
@@ -1168,13 +1193,13 @@ static const struct hisi_qm_err_ini hisi_zip_err_ini = {
 	.hw_err_disable		= hisi_zip_hw_error_disable,
 	.get_dev_hw_err_status	= hisi_zip_get_hw_err_status,
 	.clear_dev_hw_err_status = hisi_zip_clear_hw_err_status,
-	.log_dev_hw_err		= hisi_zip_log_hw_error,
 	.open_axi_master_ooo	= hisi_zip_open_axi_master_ooo,
 	.close_axi_master_ooo	= hisi_zip_close_axi_master_ooo,
 	.open_sva_prefetch	= hisi_zip_open_sva_prefetch,
 	.close_sva_prefetch	= hisi_zip_close_sva_prefetch,
 	.show_last_dfx_regs	= hisi_zip_show_last_dfx_regs,
 	.err_info_init		= hisi_zip_err_info_init,
+	.get_err_result		= hisi_zip_get_err_result,
 };
 
 static int hisi_zip_pf_probe_init(struct hisi_zip *hisi_zip)
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 7272eac850e8..6dbd0d49628f 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -229,6 +229,12 @@ struct hisi_qm_status {
 
 struct hisi_qm;
 
+enum acc_err_result {
+	ACC_ERR_NONE,
+	ACC_ERR_NEED_RESET,
+	ACC_ERR_RECOVERED,
+};
+
 struct hisi_qm_err_info {
 	char *acpi_rst;
 	u32 msi_wr_port;
@@ -257,9 +263,9 @@ struct hisi_qm_err_ini {
 	void (*close_axi_master_ooo)(struct hisi_qm *qm);
 	void (*open_sva_prefetch)(struct hisi_qm *qm);
 	void (*close_sva_prefetch)(struct hisi_qm *qm);
-	void (*log_dev_hw_err)(struct hisi_qm *qm, u32 err_sts);
 	void (*show_last_dfx_regs)(struct hisi_qm *qm);
 	void (*err_info_init)(struct hisi_qm *qm);
+	enum acc_err_result (*get_err_result)(struct hisi_qm *qm);
 };
 
 struct hisi_qm_cap_info {
-- 
cgit v1.2.3


From f6ca73063754950bf3fbad753e3a9557e3aa85e3 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Sat, 2 Nov 2024 14:28:41 +0100
Subject: i3c: Document I3C_ADDR_SLOT_EXT_STATUS_MASK

As the mask is part of the enum, document it.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/r/20241102132841.2446176-1-alexandre.belloni@bootlin.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 6e5328c6c6af..12d532b012c5 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -298,6 +298,7 @@ enum i3c_open_drain_speed {
  * @I3C_ADDR_SLOT_I2C_DEV: address is assigned to an I2C device
  * @I3C_ADDR_SLOT_I3C_DEV: address is assigned to an I3C device
  * @I3C_ADDR_SLOT_STATUS_MASK: address slot mask
+ * @I3C_ADDR_SLOT_EXT_STATUS_MASK: address slot mask with extended information
  * @I3C_ADDR_SLOT_EXT_DESIRED: the bitmask represents addresses that are preferred by some devices,
  *			       such as the "assigned-address" property in a device tree source.
  * On an I3C bus, addresses are assigned dynamically, and we need to know which
-- 
cgit v1.2.3


From 7029acd8a950393ee3a3d8e1a7ee1a9b77808a3b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 25 Oct 2024 19:27:39 -0600
Subject: io_uring/rsrc: get rid of per-ring io_rsrc_node list

Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.

As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.

At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.

Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.

Outside of removing the stall in resource reclaim described above, it
has the following advantages:

1) It's a lot simpler than the previous scheme, and easier to follow.
   No need to specific quiesce handling anymore.

2) There are no resource node allocations in the fast path, all of that
   happens at resource registration time.

3) The structs related to resource handling can all get simplified
   quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
   go away completely.

4) Handling of resource tags is much simpler, and doesn't require
   persistent storage as it can simply get assigned up front at
   registration time. Just copy them in one-by-one at registration time
   and assign to the resource node.

The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.

With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  10 +-
 io_uring/fdinfo.c              |   2 +-
 io_uring/filetable.c           |  52 ++---
 io_uring/filetable.h           |  25 +--
 io_uring/io_uring.c            |  38 +---
 io_uring/net.c                 |  11 +-
 io_uring/nop.c                 |   6 +-
 io_uring/notif.c               |   3 +-
 io_uring/rsrc.c                | 481 +++++++++++++++--------------------------
 io_uring/rsrc.h                |  71 ++----
 io_uring/rw.c                  |   8 +-
 io_uring/splice.c              |  16 +-
 io_uring/uring_cmd.c           |  12 +-
 13 files changed, 270 insertions(+), 465 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d4ba4ae480d6..42c5f2c992c4 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -56,7 +56,7 @@ struct io_wq_work {
 };
 
 struct io_file_table {
-	struct io_fixed_file *files;
+	struct io_rsrc_node **nodes;
 	unsigned long *bitmap;
 	unsigned int alloc_hint;
 };
@@ -264,7 +264,6 @@ struct io_ring_ctx {
 		 * Fixed resources fast path, should be accessed only under
 		 * uring_lock, and updated through io_uring_register(2)
 		 */
-		struct io_rsrc_node	*rsrc_node;
 		atomic_t		cancel_seq;
 
 		/*
@@ -277,7 +276,7 @@ struct io_ring_ctx {
 		struct io_wq_work_list	iopoll_list;
 
 		struct io_file_table	file_table;
-		struct io_mapped_ubuf	**user_bufs;
+		struct io_rsrc_node	**user_bufs;
 		unsigned		nr_user_files;
 		unsigned		nr_user_bufs;
 
@@ -372,10 +371,7 @@ struct io_ring_ctx {
 	struct io_rsrc_data		*buf_data;
 
 	/* protected by ->uring_lock */
-	struct list_head		rsrc_ref_list;
 	struct io_alloc_cache		rsrc_node_cache;
-	struct wait_queue_head		rsrc_quiesce_wq;
-	unsigned			rsrc_quiesce;
 
 	u32			pers_next;
 	struct xarray		personalities;
@@ -642,7 +638,7 @@ struct io_kiocb {
 		__poll_t apoll_events;
 	};
 
-	struct io_rsrc_node		*rsrc_node;
+	struct io_rsrc_node		*rsrc_nodes[2];
 
 	atomic_t			refs;
 	bool				cancel_seq_set;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index a6bac533edbe..064a79475c5f 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -176,7 +176,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	}
 	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
 	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
-		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
+		struct io_mapped_ubuf *buf = ctx->user_bufs[i]->buf;
 
 		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
 	}
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 997c56d32ee6..1b12a9a1cc16 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -38,14 +38,14 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
 
 bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 {
-	table->files = kvcalloc(nr_files, sizeof(table->files[0]),
-				GFP_KERNEL_ACCOUNT);
-	if (unlikely(!table->files))
+	table->nodes = kvmalloc_array(nr_files, sizeof(struct io_src_node *),
+					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (unlikely(!table->nodes))
 		return false;
 
 	table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
 	if (unlikely(!table->bitmap)) {
-		kvfree(table->files);
+		kvfree(table->nodes);
 		return false;
 	}
 
@@ -54,9 +54,9 @@ bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 
 void io_free_file_tables(struct io_file_table *table)
 {
-	kvfree(table->files);
+	kvfree(table->nodes);
 	bitmap_free(table->bitmap);
-	table->files = NULL;
+	table->nodes = NULL;
 	table->bitmap = NULL;
 }
 
@@ -64,8 +64,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 				 u32 slot_index)
 	__must_hold(&req->ctx->uring_lock)
 {
-	struct io_fixed_file *file_slot;
-	int ret;
+	struct io_rsrc_node *node;
 
 	if (io_is_uring_fops(file))
 		return -EBADF;
@@ -74,22 +73,18 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 	if (slot_index >= ctx->nr_user_files)
 		return -EINVAL;
 
-	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
-	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
-
-	if (file_slot->file_ptr) {
-		ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
-					    io_slot_file(file_slot));
-		if (ret)
-			return ret;
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+	if (!node)
+		return -ENOMEM;
 
-		file_slot->file_ptr = 0;
-	} else {
+	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
+	if (ctx->file_table.nodes[slot_index])
+		io_put_rsrc_node(ctx->file_table.nodes[slot_index]);
+	else
 		io_file_bitmap_set(&ctx->file_table, slot_index);
-	}
 
-	*io_get_tag_slot(ctx->file_data, slot_index) = 0;
-	io_fixed_file_set(file_slot, file);
+	ctx->file_table.nodes[slot_index] = node;
+	io_fixed_file_set(node, file);
 	return 0;
 }
 
@@ -134,25 +129,16 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
 
 int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 {
-	struct io_fixed_file *file_slot;
-	int ret;
-
 	if (unlikely(!ctx->file_data))
 		return -ENXIO;
 	if (offset >= ctx->nr_user_files)
 		return -EINVAL;
 
 	offset = array_index_nospec(offset, ctx->nr_user_files);
-	file_slot = io_fixed_file_slot(&ctx->file_table, offset);
-	if (!file_slot->file_ptr)
+	if (!ctx->file_table.nodes[offset])
 		return -EBADF;
-
-	ret = io_queue_rsrc_removal(ctx->file_data, offset,
-				    io_slot_file(file_slot));
-	if (ret)
-		return ret;
-
-	file_slot->file_ptr = 0;
+	io_put_rsrc_node(ctx->file_table.nodes[offset]);
+	ctx->file_table.nodes[offset] = NULL;
 	io_file_bitmap_clear(&ctx->file_table, offset);
 	return 0;
 }
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index c027ed4ad68d..47616079abaa 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -34,36 +34,35 @@ static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
 	table->alloc_hint = bit + 1;
 }
 
-static inline struct io_fixed_file *
-io_fixed_file_slot(struct io_file_table *table, unsigned i)
-{
-	return &table->files[i];
-}
-
 #define FFS_NOWAIT		0x1UL
 #define FFS_ISREG		0x2UL
 #define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG)
 
-static inline unsigned int io_slot_flags(struct io_fixed_file *slot)
+static inline unsigned int io_slot_flags(struct io_rsrc_node *node)
 {
-	return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
+
+	return (node->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
 }
 
-static inline struct file *io_slot_file(struct io_fixed_file *slot)
+static inline struct file *io_slot_file(struct io_rsrc_node *node)
 {
-	return (struct file *)(slot->file_ptr & FFS_MASK);
+	return (struct file *)(node->file_ptr & FFS_MASK);
 }
 
 static inline struct file *io_file_from_index(struct io_file_table *table,
 					      int index)
 {
-	return io_slot_file(io_fixed_file_slot(table, index));
+	struct io_rsrc_node *node = table->nodes[index];
+
+	if (node)
+		return io_slot_file(node);
+	return NULL;
 }
 
-static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
+static inline void io_fixed_file_set(struct io_rsrc_node *node,
 				     struct file *file)
 {
-	file_slot->file_ptr = (unsigned long)file |
+	node->file_ptr = (unsigned long)file |
 		(io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT);
 }
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index a09c67b38c1b..0876aa74c739 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -333,7 +333,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->cq_wait);
 	init_waitqueue_head(&ctx->poll_wq);
-	init_waitqueue_head(&ctx->rsrc_quiesce_wq);
 	spin_lock_init(&ctx->completion_lock);
 	spin_lock_init(&ctx->timeout_lock);
 	INIT_WQ_LIST(&ctx->iopoll_list);
@@ -341,7 +340,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
 	INIT_LIST_HEAD(&ctx->ltimeout_list);
-	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	init_llist_head(&ctx->work_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
 	ctx->submit_state.free_list.next = NULL;
@@ -1415,7 +1413,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
 				io_clean_op(req);
 		}
 		io_put_file(req);
-		io_put_rsrc_node(ctx, req->rsrc_node);
+		io_req_put_rsrc_nodes(req);
 		io_put_task(req->task);
 
 		node = req->comp_list.next;
@@ -1878,7 +1876,7 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 				      unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_fixed_file *slot;
+	struct io_rsrc_node *node;
 	struct file *file = NULL;
 
 	io_ring_submit_lock(ctx, issue_flags);
@@ -1886,11 +1884,12 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
 		goto out;
 	fd = array_index_nospec(fd, ctx->nr_user_files);
-	slot = io_fixed_file_slot(&ctx->file_table, fd);
-	if (!req->rsrc_node)
-		__io_req_set_rsrc_node(req, ctx);
-	req->flags |= io_slot_flags(slot);
-	file = io_slot_file(slot);
+	node = ctx->file_table.nodes[fd];
+	if (node) {
+		io_req_assign_rsrc_node(req, node);
+		req->flags |= io_slot_flags(node);
+		file = io_slot_file(node);
+	}
 out:
 	io_ring_submit_unlock(ctx, issue_flags);
 	return file;
@@ -2036,7 +2035,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->flags = (__force io_req_flags_t) sqe_flags;
 	req->cqe.user_data = READ_ONCE(sqe->user_data);
 	req->file = NULL;
-	req->rsrc_node = NULL;
+	req->rsrc_nodes[IORING_RSRC_FILE] = NULL;
+	req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
 	req->task = current;
 	req->cancel_seq_set = false;
 
@@ -2718,15 +2718,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_sq_thread_finish(ctx);
-	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
-	if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
-		return;
 
 	mutex_lock(&ctx->uring_lock);
-	if (ctx->buf_data)
-		__io_sqe_buffers_unregister(ctx);
-	if (ctx->file_data)
-		__io_sqe_files_unregister(ctx);
+	io_sqe_buffers_unregister(ctx);
+	io_sqe_files_unregister(ctx);
 	io_cqring_overflow_kill(ctx);
 	io_eventfd_unregister(ctx);
 	io_alloc_cache_free(&ctx->apoll_cache, kfree);
@@ -2743,11 +2738,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	if (ctx->submitter_task)
 		put_task_struct(ctx->submitter_task);
 
-	/* there are no registered resources left, nobody uses it */
-	if (ctx->rsrc_node)
-		io_rsrc_node_destroy(ctx, ctx->rsrc_node);
-
-	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
 	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
@@ -3729,10 +3719,6 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (ret)
 		goto err;
 
-	ret = io_rsrc_init(ctx);
-	if (ret)
-		goto err;
-
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
diff --git a/io_uring/net.c b/io_uring/net.c
index 2040195e33ab..ce1156551d10 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1342,15 +1342,15 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
 		struct io_ring_ctx *ctx = req->ctx;
-		struct io_mapped_ubuf *imu;
+		struct io_rsrc_node *node;
 		int idx;
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
 		if (sr->buf_index < ctx->nr_user_bufs) {
 			idx = array_index_nospec(sr->buf_index, ctx->nr_user_bufs);
-			imu = READ_ONCE(ctx->user_bufs[idx]);
-			io_req_set_rsrc_node(sr->notif, ctx);
+			node = ctx->user_bufs[idx];
+			io_req_assign_rsrc_node(sr->notif, node);
 			ret = 0;
 		}
 		io_ring_submit_unlock(ctx, issue_flags);
@@ -1358,8 +1358,9 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 		if (unlikely(ret))
 			return ret;
 
-		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, imu,
-					(u64)(uintptr_t)sr->buf, sr->len);
+		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter,
+					node->buf, (u64)(uintptr_t)sr->buf,
+					sr->len);
 		if (unlikely(ret))
 			return ret;
 		kmsg->msg.sg_from_iter = io_sg_from_iter;
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 2c7a22ba4053..de91600a3bc6 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -61,15 +61,15 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 	}
 	if (nop->flags & IORING_NOP_FIXED_BUFFER) {
 		struct io_ring_ctx *ctx = req->ctx;
-		struct io_mapped_ubuf *imu;
+		struct io_rsrc_node *node;
 		int idx;
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
 		if (nop->buffer < ctx->nr_user_bufs) {
 			idx = array_index_nospec(nop->buffer, ctx->nr_user_bufs);
-			imu = READ_ONCE(ctx->user_bufs[idx]);
-			io_req_set_rsrc_node(req, ctx);
+			node = READ_ONCE(ctx->user_bufs[idx]);
+			io_req_assign_rsrc_node(req, node);
 			ret = 0;
 		}
 		io_ring_submit_unlock(ctx, issue_flags);
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 28859ae3ee6e..4f02e969cf08 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -117,7 +117,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
 	notif->file = NULL;
 	notif->task = current;
 	io_get_task_refs(1);
-	notif->rsrc_node = NULL;
+	notif->rsrc_nodes[IORING_RSRC_FILE] = NULL;
+	notif->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
 
 	nd = io_notif_to_data(notif);
 	nd->zc_report = false;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index ca2ec8a018be..88d698efd75b 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -26,10 +26,8 @@ struct io_rsrc_update {
 	u32				offset;
 };
 
-static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
-				  struct io_mapped_ubuf **pimu,
-				  struct page **last_hpage);
+static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
+			struct iovec *iov, struct page **last_hpage);
 
 /* only define max */
 #define IORING_MAX_FIXED_FILES	(1U << 20)
@@ -110,13 +108,13 @@ static int io_buffer_validate(struct iovec *iov)
 	return 0;
 }
 
-static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 {
-	struct io_mapped_ubuf *imu = *slot;
 	unsigned int i;
 
-	*slot = NULL;
-	if (imu != &dummy_ubuf) {
+	if (node->buf != &dummy_ubuf) {
+		struct io_mapped_ubuf *imu = node->buf;
+
 		if (!refcount_dec_and_test(&imu->refs))
 			return;
 		for (i = 0; i < imu->nr_bvecs; i++)
@@ -127,205 +125,55 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 	}
 }
 
-static void io_rsrc_put_work(struct io_rsrc_node *node)
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
 {
-	struct io_rsrc_put *prsrc = &node->item;
-
-	if (prsrc->tag)
-		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);
-
-	switch (node->type) {
-	case IORING_RSRC_FILE:
-		fput(prsrc->file);
-		break;
-	case IORING_RSRC_BUFFER:
-		io_rsrc_buf_put(node->ctx, prsrc);
-		break;
-	default:
-		WARN_ON_ONCE(1);
-		break;
-	}
-}
-
-void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
-{
-	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
-		kfree(node);
-}
-
-void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
-	__must_hold(&node->ctx->uring_lock)
-{
-	struct io_ring_ctx *ctx = node->ctx;
+	struct io_rsrc_node *node;
 
-	while (!list_empty(&ctx->rsrc_ref_list)) {
-		node = list_first_entry(&ctx->rsrc_ref_list,
-					    struct io_rsrc_node, node);
-		/* recycle ref nodes in order */
-		if (node->refs)
-			break;
-		list_del(&node->node);
-
-		if (likely(!node->empty))
-			io_rsrc_put_work(node);
-		io_rsrc_node_destroy(ctx, node);
-	}
-	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
-		wake_up_all(&ctx->rsrc_quiesce_wq);
-}
-
-struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
-{
-	struct io_rsrc_node *ref_node;
-
-	ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
-	if (!ref_node) {
-		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
-		if (!ref_node)
+	node = io_alloc_cache_get(&ctx->rsrc_node_cache);
+	if (!node) {
+		node = kzalloc(sizeof(*node), GFP_KERNEL);
+		if (!node)
 			return NULL;
 	}
 
-	ref_node->ctx = ctx;
-	ref_node->empty = 0;
-	ref_node->refs = 1;
-	return ref_node;
-}
-
-__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
-				      struct io_ring_ctx *ctx)
-{
-	struct io_rsrc_node *backup;
-	DEFINE_WAIT(we);
-	int ret;
-
-	/* As We may drop ->uring_lock, other task may have started quiesce */
-	if (data->quiesce)
-		return -ENXIO;
-
-	backup = io_rsrc_node_alloc(ctx);
-	if (!backup)
-		return -ENOMEM;
-	ctx->rsrc_node->empty = true;
-	ctx->rsrc_node->type = -1;
-	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
-	io_put_rsrc_node(ctx, ctx->rsrc_node);
-	ctx->rsrc_node = backup;
-
-	if (list_empty(&ctx->rsrc_ref_list))
-		return 0;
-
-	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-		atomic_set(&ctx->cq_wait_nr, 1);
-		smp_mb();
-	}
-
-	ctx->rsrc_quiesce++;
-	data->quiesce = true;
-	do {
-		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
-		mutex_unlock(&ctx->uring_lock);
-
-		ret = io_run_task_work_sig(ctx);
-		if (ret < 0) {
-			finish_wait(&ctx->rsrc_quiesce_wq, &we);
-			mutex_lock(&ctx->uring_lock);
-			if (list_empty(&ctx->rsrc_ref_list))
-				ret = 0;
-			break;
-		}
-
-		schedule();
-		mutex_lock(&ctx->uring_lock);
-		ret = 0;
-	} while (!list_empty(&ctx->rsrc_ref_list));
-
-	finish_wait(&ctx->rsrc_quiesce_wq, &we);
-	data->quiesce = false;
-	ctx->rsrc_quiesce--;
-
-	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-		atomic_set(&ctx->cq_wait_nr, 0);
-		smp_mb();
-	}
-	return ret;
-}
-
-static void io_free_page_table(void **table, size_t size)
-{
-	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
-
-	for (i = 0; i < nr_tables; i++)
-		kfree(table[i]);
-	kfree(table);
+	node->ctx = ctx;
+	node->refs = 1;
+	node->type = type;
+	return node;
 }
 
 static void io_rsrc_data_free(struct io_rsrc_data *data)
 {
-	size_t size = data->nr * sizeof(data->tags[0][0]);
-
-	if (data->tags)
-		io_free_page_table((void **)data->tags, size);
-	kfree(data);
-}
-
-static __cold void **io_alloc_page_table(size_t size)
-{
-	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
-	size_t init_size = size;
-	void **table;
-
-	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
-	if (!table)
-		return NULL;
+	int i;
 
-	for (i = 0; i < nr_tables; i++) {
-		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
+	for (i = 0; i < data->nr; i++) {
+		struct io_rsrc_node *node = data->nodes[i];
 
-		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
-		if (!table[i]) {
-			io_free_page_table(table, init_size);
-			return NULL;
-		}
-		size -= this_size;
+		io_put_rsrc_node(node);
 	}
-	return table;
+	kvfree(data->nodes);
+	kfree(data);
 }
 
-__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
-				     u64 __user *utags,
-				     unsigned nr, struct io_rsrc_data **pdata)
+__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, unsigned nr,
+				     struct io_rsrc_data **pdata)
 {
 	struct io_rsrc_data *data;
-	int ret = 0;
-	unsigned i;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
-	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
-	if (!data->tags) {
+
+	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
+					GFP_KERNEL | __GFP_ZERO);
+	if (!data->nodes) {
 		kfree(data);
 		return -ENOMEM;
 	}
 
 	data->nr = nr;
-	data->ctx = ctx;
-	data->rsrc_type = type;
-	if (utags) {
-		ret = -EFAULT;
-		for (i = 0; i < nr; i++) {
-			u64 *tag_slot = io_get_tag_slot(data, i);
-
-			if (copy_from_user(tag_slot, &utags[i],
-					   sizeof(*tag_slot)))
-				goto fail;
-		}
-	}
 	*pdata = data;
 	return 0;
-fail:
-	io_rsrc_data_free(data);
-	return ret;
 }
 
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
@@ -334,8 +182,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 {
 	u64 __user *tags = u64_to_user_ptr(up->tags);
 	__s32 __user *fds = u64_to_user_ptr(up->data);
-	struct io_rsrc_data *data = ctx->file_data;
-	struct io_fixed_file *file_slot;
 	int fd, i, err = 0;
 	unsigned int done;
 
@@ -360,18 +206,14 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 			continue;
 
 		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-		file_slot = io_fixed_file_slot(&ctx->file_table, i);
-
-		if (file_slot->file_ptr) {
-			err = io_queue_rsrc_removal(data, i,
-						    io_slot_file(file_slot));
-			if (err)
-				break;
-			file_slot->file_ptr = 0;
+		if (ctx->file_table.nodes[i]) {
+			io_put_rsrc_node(ctx->file_table.nodes[i]);
+			ctx->file_table.nodes[i] = NULL;
 			io_file_bitmap_clear(&ctx->file_table, i);
 		}
 		if (fd != -1) {
 			struct file *file = fget(fd);
+			struct io_rsrc_node *node;
 
 			if (!file) {
 				err = -EBADF;
@@ -385,8 +227,15 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
-			*io_get_tag_slot(data, i) = tag;
-			io_fixed_file_set(file_slot, file);
+			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+			if (!node) {
+				err = -ENOMEM;
+				fput(file);
+				break;
+			}
+			ctx->file_table.nodes[i] = node;
+			node->tag = tag;
+			io_fixed_file_set(node, file);
 			io_file_bitmap_set(&ctx->file_table, i);
 		}
 	}
@@ -411,7 +260,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
-		struct io_mapped_ubuf *imu;
+		struct io_rsrc_node *node;
 		u64 tag = 0;
 
 		uvec = u64_to_user_ptr(user_data);
@@ -431,23 +280,16 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			err = -EINVAL;
 			break;
 		}
-		err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
-		if (err)
-			break;
-
 		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
-		if (ctx->user_bufs[i] != &dummy_ubuf) {
-			err = io_queue_rsrc_removal(ctx->buf_data, i,
-						    ctx->user_bufs[i]);
-			if (unlikely(err)) {
-				io_buffer_unmap(ctx, &imu);
-				break;
-			}
-			ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
+		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
+		if (IS_ERR(node)) {
+			err = PTR_ERR(node);
+			break;
 		}
+		io_put_rsrc_node(ctx->user_bufs[i]);
 
-		ctx->user_bufs[i] = imu;
-		*io_get_tag_slot(ctx->buf_data, i) = tag;
+		ctx->user_bufs[i] = node;
+		node->tag = tag;
 		if (ctx->compat)
 			user_data += sizeof(struct compat_iovec);
 		else
@@ -622,38 +464,47 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 	return IOU_OK;
 }
 
-int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
+void io_free_rsrc_node(struct io_rsrc_node *node)
 {
-	struct io_ring_ctx *ctx = data->ctx;
-	struct io_rsrc_node *node = ctx->rsrc_node;
-	u64 *tag_slot = io_get_tag_slot(data, idx);
+	struct io_ring_ctx *ctx = node->ctx;
 
-	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
-	if (unlikely(!ctx->rsrc_node)) {
-		ctx->rsrc_node = node;
-		return -ENOMEM;
+	lockdep_assert_held(&ctx->uring_lock);
+
+	if (node->tag)
+		io_post_aux_cqe(node->ctx, node->tag, 0, 0);
+
+	switch (node->type) {
+	case IORING_RSRC_FILE:
+		if (io_slot_file(node))
+			fput(io_slot_file(node));
+		break;
+	case IORING_RSRC_BUFFER:
+		if (node->buf)
+			io_buffer_unmap(node->ctx, node);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
 	}
 
-	node->item.rsrc = rsrc;
-	node->type = data->rsrc_type;
-	node->item.tag = *tag_slot;
-	*tag_slot = 0;
-	list_add_tail(&node->node, &ctx->rsrc_ref_list);
-	io_put_rsrc_node(ctx, node);
-	return 0;
+	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
+		kfree(node);
 }
 
-void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
 	int i;
 
+	lockdep_assert_held(&ctx->uring_lock);
+
 	for (i = 0; i < ctx->nr_user_files; i++) {
-		struct file *file = io_file_from_index(&ctx->file_table, i);
+		struct io_rsrc_node *node = ctx->file_table.nodes[i];
 
-		if (!file)
-			continue;
-		io_file_bitmap_clear(&ctx->file_table, i);
-		fput(file);
+		if (node) {
+			io_put_rsrc_node(node);
+			io_file_bitmap_clear(&ctx->file_table, i);
+			ctx->file_table.nodes[i] = NULL;
+		}
 	}
 
 	io_free_file_tables(&ctx->file_table);
@@ -665,22 +516,11 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 
 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
-	unsigned nr = ctx->nr_user_files;
-	int ret;
-
 	if (!ctx->file_data)
 		return -ENXIO;
 
-	/*
-	 * Quiesce may unlock ->uring_lock, and while it's not held
-	 * prevent new requests using the table.
-	 */
-	ctx->nr_user_files = 0;
-	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
-	ctx->nr_user_files = nr;
-	if (!ret)
-		__io_sqe_files_unregister(ctx);
-	return ret;
+	__io_sqe_files_unregister(ctx);
+	return 0;
 }
 
 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -699,8 +539,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EMFILE;
 	if (nr_args > rlimit(RLIMIT_NOFILE))
 		return -EMFILE;
-	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
-				 &ctx->file_data);
+	ret = io_rsrc_data_alloc(ctx, nr_args, &ctx->file_data);
 	if (ret)
 		return ret;
 
@@ -711,16 +550,18 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-		struct io_fixed_file *file_slot;
+		struct io_rsrc_node *node;
+		u64 tag = 0;
 
-		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
-			ret = -EFAULT;
+		ret = -EFAULT;
+		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
+			goto fail;
+		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
 			goto fail;
-		}
 		/* allow sparse sets */
 		if (!fds || fd == -1) {
 			ret = -EINVAL;
-			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
+			if (tag)
 				goto fail;
 			continue;
 		}
@@ -737,8 +578,16 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(file);
 			goto fail;
 		}
-		file_slot = io_fixed_file_slot(&ctx->file_table, i);
-		io_fixed_file_set(file_slot, file);
+		ret = -ENOMEM;
+		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+		if (!node) {
+			fput(file);
+			goto fail;
+		}
+		if (tag)
+			node->tag = tag;
+		ctx->file_table.nodes[i] = node;
+		io_fixed_file_set(node, file);
 		io_file_bitmap_set(&ctx->file_table, i);
 	}
 
@@ -750,43 +599,30 @@ fail:
 	return ret;
 }
 
-static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
-	io_buffer_unmap(ctx, &prsrc->buf);
-	prsrc->buf = NULL;
-}
-
-void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
 	unsigned int i;
 
-	for (i = 0; i < ctx->nr_user_bufs; i++)
-		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
-	kfree(ctx->user_bufs);
-	io_rsrc_data_free(ctx->buf_data);
+	lockdep_assert_held(&ctx->uring_lock);
+
+	for (i = 0; i < ctx->nr_user_bufs; i++) {
+		io_put_rsrc_node(ctx->user_bufs[i]);
+		ctx->user_bufs[i] = NULL;
+	}
+	kvfree(ctx->user_bufs);
 	ctx->user_bufs = NULL;
+	io_rsrc_data_free(ctx->buf_data);
 	ctx->buf_data = NULL;
 	ctx->nr_user_bufs = 0;
 }
 
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
-	unsigned nr = ctx->nr_user_bufs;
-	int ret;
-
 	if (!ctx->buf_data)
 		return -ENXIO;
 
-	/*
-	 * Quiesce may unlock ->uring_lock, and while it's not held
-	 * prevent new requests using the table.
-	 */
-	ctx->nr_user_bufs = 0;
-	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
-	ctx->nr_user_bufs = nr;
-	if (!ret)
-		__io_sqe_buffers_unregister(ctx);
-	return ret;
+	__io_sqe_buffers_unregister(ctx);
+	return 0;
 }
 
 /*
@@ -813,7 +649,8 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
 
 	/* check previously registered pages */
 	for (i = 0; i < ctx->nr_user_bufs; i++) {
-		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+		struct io_rsrc_node *node = ctx->user_bufs[i];
+		struct io_mapped_ubuf *imu = node->buf;
 
 		for (j = 0; j < imu->nr_bvecs; j++) {
 			if (!PageCompound(imu->bvec[j].bv_page))
@@ -950,21 +787,28 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
 	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
 }
 
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
-				  struct io_mapped_ubuf **pimu,
-				  struct page **last_hpage)
+static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
+						   struct iovec *iov,
+						   struct page **last_hpage)
 {
 	struct io_mapped_ubuf *imu = NULL;
 	struct page **pages = NULL;
+	struct io_rsrc_node *node;
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
 	struct io_imu_folio_data data;
 	bool coalesced;
 
-	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
-	if (!iov->iov_base)
-		return 0;
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+	node->buf = NULL;
+
+	if (!iov->iov_base) {
+		node->buf = (struct io_mapped_ubuf *) &dummy_ubuf;
+		return node;
+	}
 
 	ret = -ENOMEM;
 	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
@@ -998,7 +842,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		imu->folio_shift = data.folio_shift;
 	refcount_set(&imu->refs, 1);
 	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
-	*pimu = imu;
+	node->buf = imu;
 	ret = 0;
 
 	for (i = 0; i < nr_pages; i++) {
@@ -1010,10 +854,14 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		size -= vec_len;
 	}
 done:
-	if (ret)
+	if (ret) {
 		kvfree(imu);
+		if (node)
+			io_put_rsrc_node(node);
+		node = ERR_PTR(ret);
+	}
 	kvfree(pages);
-	return ret;
+	return node;
 }
 
 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
@@ -1037,7 +885,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EBUSY;
 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
-	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
+	ret = io_rsrc_data_alloc(ctx, nr_args, &data);
 	if (ret)
 		return ret;
 	ret = io_buffers_map_alloc(ctx, nr_args);
@@ -1050,6 +898,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		memset(iov, 0, sizeof(*iov));
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+		struct io_rsrc_node *node;
+		u64 tag = 0;
+
 		if (arg) {
 			uvec = (struct iovec __user *) arg;
 			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
@@ -1066,15 +917,24 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 				arg += sizeof(struct iovec);
 		}
 
-		if (!iov->iov_base && *io_get_tag_slot(data, i)) {
-			ret = -EINVAL;
-			break;
+		if (tags) {
+			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
+				ret = -EFAULT;
+				break;
+			}
+			if (tag && !iov->iov_base) {
+				ret = -EINVAL;
+				break;
+			}
 		}
 
-		ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
-					     &last_hpage);
-		if (ret)
+		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
+		if (IS_ERR(node)) {
+			ret = PTR_ERR(node);
 			break;
+		}
+		node->tag = tag;
+		ctx->user_bufs[i] = node;
 	}
 
 	WARN_ON_ONCE(ctx->buf_data);
@@ -1148,7 +1008,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 
 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
 {
-	struct io_mapped_ubuf **user_bufs;
+	struct io_rsrc_node **user_bufs;
 	struct io_rsrc_data *data;
 	int i, ret, nbufs;
 
@@ -1163,21 +1023,31 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	nbufs = src_ctx->nr_user_bufs;
 	if (!nbufs)
 		goto out_unlock;
-	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
+	ret = io_rsrc_data_alloc(ctx, nbufs, &data);
 	if (ret)
 		goto out_unlock;
 
 	ret = -ENOMEM;
-	user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
+	user_bufs = kvmalloc_array(nbufs, sizeof(struct io_rsrc_node *),
+					GFP_KERNEL | __GFP_ZERO);
 	if (!user_bufs)
 		goto out_free_data;
 
 	for (i = 0; i < nbufs; i++) {
-		struct io_mapped_ubuf *src = src_ctx->user_bufs[i];
+		struct io_mapped_ubuf *imu = src_ctx->user_bufs[i]->buf;
+		struct io_rsrc_node *dst_node;
 
-		if (src != &dummy_ubuf)
-			refcount_inc(&src->refs);
-		user_bufs[i] = src;
+		dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
+		if (!dst_node)
+			goto out_put_free;
+
+		if (imu == &dummy_ubuf) {
+			dst_node->buf = (struct io_mapped_ubuf *) &dummy_ubuf;
+		} else {
+			refcount_inc(&imu->refs);
+			dst_node->buf = imu;
+		}
+		user_bufs[i] = dst_node;
 	}
 
 	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
@@ -1190,12 +1060,17 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		return 0;
 	}
 
+	mutex_unlock(&ctx->uring_lock);
+	mutex_lock(&src_ctx->uring_lock);
 	/* someone raced setting up buffers, dump ours */
-	for (i = 0; i < nbufs; i++)
-		io_buffer_unmap(ctx, &user_bufs[i]);
-	io_rsrc_data_free(data);
-	kfree(user_bufs);
-	return -EBUSY;
+	ret = -EBUSY;
+	i = nbufs;
+out_put_free:
+	while (i--) {
+		io_buffer_unmap(src_ctx, user_bufs[i]);
+		kfree(user_bufs[i]);
+	}
+	kvfree(user_bufs);
 out_free_data:
 	io_rsrc_data_free(data);
 out_unlock:
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 1589c9740083..20a316854238 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -13,36 +13,21 @@ enum {
 	IORING_RSRC_BUFFER		= 1,
 };
 
-struct io_rsrc_put {
-	u64 tag;
-	union {
-		void *rsrc;
-		struct file *file;
-		struct io_mapped_ubuf *buf;
-	};
-};
-
 struct io_rsrc_data {
-	struct io_ring_ctx		*ctx;
-
-	u64				**tags;
 	unsigned int			nr;
-	u16				rsrc_type;
-	bool				quiesce;
+	struct io_rsrc_node		**nodes;
 };
 
 struct io_rsrc_node {
 	struct io_ring_ctx		*ctx;
 	int				refs;
-	bool				empty;
 	u16				type;
-	struct list_head		node;
-	struct io_rsrc_put		item;
-};
 
-struct io_fixed_file {
-	/* file * with additional FFS_* flags */
-	unsigned long file_ptr;
+	u64 tag;
+	union {
+		unsigned long file_ptr;
+		struct io_mapped_ubuf *buf;
+	};
 };
 
 struct io_mapped_ubuf {
@@ -63,21 +48,17 @@ struct io_imu_folio_data {
 	unsigned int	folio_shift;
 };
 
-void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
-void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
-struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
-int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc);
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
+void io_free_rsrc_node(struct io_rsrc_node *node);
 
 int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
 			   u64 buf_addr, size_t len);
 
 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
-void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			    unsigned int nr_args, u64 __user *tags);
-void __io_sqe_files_unregister(struct io_ring_ctx *ctx);
 int io_sqe_files_unregister(struct io_ring_ctx *ctx);
 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			  unsigned nr_args, u64 __user *tags);
@@ -89,41 +70,23 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
 int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 			unsigned int size, unsigned int type);
 
-static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
+static inline void io_put_rsrc_node(struct io_rsrc_node *node)
 {
-	lockdep_assert_held(&ctx->uring_lock);
-
 	if (node && !--node->refs)
-		io_rsrc_node_ref_zero(node);
-}
-
-static inline void __io_req_set_rsrc_node(struct io_kiocb *req,
-					  struct io_ring_ctx *ctx)
-{
-	lockdep_assert_held(&ctx->uring_lock);
-	req->rsrc_node = ctx->rsrc_node;
-	ctx->rsrc_node->refs++;
+		io_free_rsrc_node(node);
 }
 
-static inline void io_req_set_rsrc_node(struct io_kiocb *req,
-					struct io_ring_ctx *ctx)
+static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
 {
-	if (!req->rsrc_node)
-		__io_req_set_rsrc_node(req, ctx);
-}
-
-static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
-{
-	unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
-	unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
-
-	return &data->tags[table_idx][off];
+	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_FILE]);
+	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_BUFFER]);
 }
 
-static inline int io_rsrc_init(struct io_ring_ctx *ctx)
+static inline void io_req_assign_rsrc_node(struct io_kiocb *req,
+					   struct io_rsrc_node *node)
 {
-	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
-	return ctx->rsrc_node ? 0 : -ENOMEM;
+	node->refs++;
+	req->rsrc_nodes[node->type] = node;
 }
 
 int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 8080ffd6d571..65491f4f2c7e 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -330,7 +330,7 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_mapped_ubuf *imu;
+	struct io_rsrc_node *node;
 	struct io_async_rw *io;
 	u16 index;
 	int ret;
@@ -342,11 +342,11 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
 		return -EFAULT;
 	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
-	imu = ctx->user_bufs[index];
-	io_req_set_rsrc_node(req, ctx);
+	node = ctx->user_bufs[index];
+	io_req_assign_rsrc_node(req, node);
 
 	io = req->async_data;
-	ret = io_import_fixed(ddir, &io->iter, imu, rw->addr, rw->len);
+	ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
 	iov_iter_save_state(&io->iter, &io->iter_state);
 	return ret;
 }
diff --git a/io_uring/splice.c b/io_uring/splice.c
index e62bc6497a94..a0b4e0435b8b 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -51,7 +51,7 @@ void io_splice_cleanup(struct io_kiocb *req)
 {
 	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 
-	io_put_rsrc_node(req->ctx, sp->rsrc_node);
+	io_put_rsrc_node(sp->rsrc_node);
 }
 
 static struct file *io_splice_get_file(struct io_kiocb *req,
@@ -59,7 +59,7 @@ static struct file *io_splice_get_file(struct io_kiocb *req,
 {
 	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_fixed_file *slot;
+	struct io_rsrc_node *node;
 	struct file *file = NULL;
 
 	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
@@ -69,11 +69,13 @@ static struct file *io_splice_get_file(struct io_kiocb *req,
 	if (unlikely(sp->splice_fd_in >= ctx->nr_user_files))
 		goto out;
 	sp->splice_fd_in = array_index_nospec(sp->splice_fd_in, ctx->nr_user_files);
-	slot = &ctx->file_table.files[sp->splice_fd_in];
-	if (!req->rsrc_node)
-		__io_req_set_rsrc_node(req, ctx);
-	file = io_slot_file(slot);
-	req->flags |= REQ_F_NEED_CLEANUP;
+	node = ctx->file_table.nodes[sp->splice_fd_in];
+	if (node) {
+		node->refs++;
+		sp->rsrc_node = node;
+		file = io_slot_file(node);
+		req->flags |= REQ_F_NEED_CLEANUP;
+	}
 out:
 	io_ring_submit_unlock(ctx, issue_flags);
 	return file;
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 6994f60d7ec7..0899c71008ae 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -220,7 +220,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		 * being called. This prevents destruction of the mapped buffer
 		 * we'll need at actual import time.
 		 */
-		io_req_set_rsrc_node(req, ctx);
+		io_req_assign_rsrc_node(req, ctx->user_bufs[req->buf_index]);
 	}
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 
@@ -276,15 +276,11 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
 {
 	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
-	struct io_ring_ctx *ctx = req->ctx;
+	struct io_rsrc_node *node = req->rsrc_nodes[IORING_RSRC_BUFFER];
 
 	/* Must have had rsrc_node assigned at prep time */
-	if (req->rsrc_node) {
-		struct io_mapped_ubuf *imu;
-
-		imu = READ_ONCE(ctx->user_bufs[req->buf_index]);
-		return io_import_fixed(rw, iter, imu, ubuf, len);
-	}
+	if (node)
+		return io_import_fixed(rw, iter, node->buf, ubuf, len);
 
 	return -EFAULT;
 }
-- 
cgit v1.2.3


From fbbb8e991d86bb7539de6161746b6c747f93f533 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 26 Oct 2024 06:43:44 -0600
Subject: io_uring/rsrc: get rid of io_rsrc_node allocation cache

It's not going to be needed in the fast path going forward, so kill it
off.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 ---
 io_uring/io_uring.c            |  6 +-----
 io_uring/rsrc.c                | 18 ++++++------------
 3 files changed, 7 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 42c5f2c992c4..696f2a05a98b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -370,9 +370,6 @@ struct io_ring_ctx {
 	struct io_rsrc_data		*file_data;
 	struct io_rsrc_data		*buf_data;
 
-	/* protected by ->uring_lock */
-	struct io_alloc_cache		rsrc_node_cache;
-
 	u32			pers_next;
 	struct xarray		personalities;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0876aa74c739..094788cca47f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -312,9 +312,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
-	ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
-			    sizeof(struct io_rsrc_node));
-	ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
+	ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
 			    sizeof(struct async_poll));
 	ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_async_msghdr));
@@ -358,7 +356,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 free_ref:
 	percpu_ref_exit(&ctx->refs);
 err:
-	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
 	io_alloc_cache_free(&ctx->apoll_cache, kfree);
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
 	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
@@ -2740,7 +2737,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
-	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
 	if (ctx->mm_account) {
 		mmdrop(ctx->mm_account);
 		ctx->mm_account = NULL;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 88d698efd75b..8f8147dd714c 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -13,7 +13,6 @@
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
-#include "alloc_cache.h"
 #include "openclose.h"
 #include "rsrc.h"
 #include "memmap.h"
@@ -129,16 +128,12 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
 {
 	struct io_rsrc_node *node;
 
-	node = io_alloc_cache_get(&ctx->rsrc_node_cache);
-	if (!node) {
-		node = kzalloc(sizeof(*node), GFP_KERNEL);
-		if (!node)
-			return NULL;
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (node) {
+		node->ctx = ctx;
+		node->refs = 1;
+		node->type = type;
 	}
-
-	node->ctx = ctx;
-	node->refs = 1;
-	node->type = type;
 	return node;
 }
 
@@ -487,8 +482,7 @@ void io_free_rsrc_node(struct io_rsrc_node *node)
 		break;
 	}
 
-	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
-		kfree(node);
+	kfree(node);
 }
 
 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
-- 
cgit v1.2.3


From 3597f2786b687a7f26361ce00a805ea0af41b65f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 26 Oct 2024 14:50:13 -0600
Subject: io_uring/rsrc: unify file and buffer resource tables

For files, there's nr_user_files/file_table/file_data, and buffers have
nr_user_bufs/user_bufs/buf_data. There's no reason why file_table and
file_data can't be the same thing, and ditto for the buffer side. That
gets rid of more io_ring_ctx state that's in two spots rather than just
being in one spot, as it should be. Put all the registered file data in
one locations, and ditto on the buffer front.

This also avoids having both io_rsrc_data->nodes being an allocated
array, and ->user_bufs[] or ->file_table.nodes. There's no reason to
have this information duplicated. Keep it in one spot, io_rsrc_data,
along with how many resources are available.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  15 ++-
 io_uring/cancel.c              |   4 +-
 io_uring/fdinfo.c              |  10 +-
 io_uring/filetable.c           |  44 ++++-----
 io_uring/filetable.h           |   2 +-
 io_uring/io_uring.c            |   7 +-
 io_uring/msg_ring.c            |   4 +-
 io_uring/net.c                 |   6 +-
 io_uring/nop.c                 |   6 +-
 io_uring/register.c            |   3 +-
 io_uring/rsrc.c                | 209 +++++++++++++----------------------------
 io_uring/rsrc.h                |   7 +-
 io_uring/rw.c                  |   6 +-
 io_uring/splice.c              |   6 +-
 io_uring/uring_cmd.c           |   6 +-
 15 files changed, 123 insertions(+), 212 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 696f2a05a98b..77fd508d043a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -55,8 +55,13 @@ struct io_wq_work {
 	int cancel_seq;
 };
 
+struct io_rsrc_data {
+	unsigned int			nr;
+	struct io_rsrc_node		**nodes;
+};
+
 struct io_file_table {
-	struct io_rsrc_node **nodes;
+	struct io_rsrc_data data;
 	unsigned long *bitmap;
 	unsigned int alloc_hint;
 };
@@ -276,9 +281,7 @@ struct io_ring_ctx {
 		struct io_wq_work_list	iopoll_list;
 
 		struct io_file_table	file_table;
-		struct io_rsrc_node	**user_bufs;
-		unsigned		nr_user_files;
-		unsigned		nr_user_bufs;
+		struct io_rsrc_data	buf_table;
 
 		struct io_submit_state	submit_state;
 
@@ -366,10 +369,6 @@ struct io_ring_ctx {
 	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
 
-	/* slow path rsrc auxilary data, used by update/register */
-	struct io_rsrc_data		*file_data;
-	struct io_rsrc_data		*buf_data;
-
 	u32			pers_next;
 	struct xarray		personalities;
 
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index cc3475b22ae5..3a2996307025 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -240,9 +240,9 @@ static int __io_sync_cancel(struct io_uring_task *tctx,
 	/* fixed must be grabbed every time since we drop the uring_lock */
 	if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
 	    (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
-		if (unlikely(fd >= ctx->nr_user_files))
+		if (unlikely(fd >= ctx->file_table.data.nr))
 			return -EBADF;
-		fd = array_index_nospec(fd, ctx->nr_user_files);
+		fd = array_index_nospec(fd, ctx->file_table.data.nr);
 		cd->file = io_file_from_index(&ctx->file_table, fd);
 		if (!cd->file)
 			return -EBADF;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 064a79475c5f..e3f5e9fe5562 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -165,8 +165,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
 	seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
 	seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
-	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
-	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
+	seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr);
+	for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) {
 		struct file *f = io_file_from_index(&ctx->file_table, i);
 
 		if (f)
@@ -174,9 +174,9 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 		else
 			seq_printf(m, "%5u: <none>\n", i);
 	}
-	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
-	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
-		struct io_mapped_ubuf *buf = ctx->user_bufs[i]->buf;
+	seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
+	for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
+		struct io_mapped_ubuf *buf = ctx->buf_table.nodes[i]->buf;
 
 		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
 	}
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 1b12a9a1cc16..c1f9f9550446 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -38,25 +38,19 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
 
 bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 {
-	table->nodes = kvmalloc_array(nr_files, sizeof(struct io_src_node *),
-					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-	if (unlikely(!table->nodes))
+	if (io_rsrc_data_alloc(&table->data, nr_files))
 		return false;
-
 	table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
-	if (unlikely(!table->bitmap)) {
-		kvfree(table->nodes);
-		return false;
-	}
-
-	return true;
+	if (table->bitmap)
+		return true;
+	io_rsrc_data_free(&table->data);
+	return false;
 }
 
 void io_free_file_tables(struct io_file_table *table)
 {
-	kvfree(table->nodes);
+	io_rsrc_data_free(&table->data);
 	bitmap_free(table->bitmap);
-	table->nodes = NULL;
 	table->bitmap = NULL;
 }
 
@@ -68,22 +62,22 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 
 	if (io_is_uring_fops(file))
 		return -EBADF;
-	if (!ctx->file_data)
+	if (!ctx->file_table.data.nr)
 		return -ENXIO;
-	if (slot_index >= ctx->nr_user_files)
+	if (slot_index >= ctx->file_table.data.nr)
 		return -EINVAL;
 
 	node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
 	if (!node)
 		return -ENOMEM;
 
-	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
-	if (ctx->file_table.nodes[slot_index])
-		io_put_rsrc_node(ctx->file_table.nodes[slot_index]);
+	slot_index = array_index_nospec(slot_index, ctx->file_table.data.nr);
+	if (ctx->file_table.data.nodes[slot_index])
+		io_put_rsrc_node(ctx->file_table.data.nodes[slot_index]);
 	else
 		io_file_bitmap_set(&ctx->file_table, slot_index);
 
-	ctx->file_table.nodes[slot_index] = node;
+	ctx->file_table.data.nodes[slot_index] = node;
 	io_fixed_file_set(node, file);
 	return 0;
 }
@@ -129,16 +123,16 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
 
 int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 {
-	if (unlikely(!ctx->file_data))
+	if (unlikely(!ctx->file_table.data.nr))
 		return -ENXIO;
-	if (offset >= ctx->nr_user_files)
+	if (offset >= ctx->file_table.data.nr)
 		return -EINVAL;
 
-	offset = array_index_nospec(offset, ctx->nr_user_files);
-	if (!ctx->file_table.nodes[offset])
+	offset = array_index_nospec(offset, ctx->file_table.data.nr);
+	if (!ctx->file_table.data.nodes[offset])
 		return -EBADF;
-	io_put_rsrc_node(ctx->file_table.nodes[offset]);
-	ctx->file_table.nodes[offset] = NULL;
+	io_put_rsrc_node(ctx->file_table.data.nodes[offset]);
+	ctx->file_table.data.nodes[offset] = NULL;
 	io_file_bitmap_clear(&ctx->file_table, offset);
 	return 0;
 }
@@ -153,7 +147,7 @@ int io_register_file_alloc_range(struct io_ring_ctx *ctx,
 		return -EFAULT;
 	if (check_add_overflow(range.off, range.len, &end))
 		return -EOVERFLOW;
-	if (range.resv || end > ctx->nr_user_files)
+	if (range.resv || end > ctx->file_table.data.nr)
 		return -EINVAL;
 
 	io_file_table_set_alloc_range(ctx, range.off, range.len);
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index 47616079abaa..664c31502dbb 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -52,7 +52,7 @@ static inline struct file *io_slot_file(struct io_rsrc_node *node)
 static inline struct file *io_file_from_index(struct io_file_table *table,
 					      int index)
 {
-	struct io_rsrc_node *node = table->nodes[index];
+	struct io_rsrc_node *node = table->data.nodes[index];
 
 	if (node)
 		return io_slot_file(node);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 60c947114fa3..78df515fb3a7 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1879,11 +1879,10 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 	struct file *file = NULL;
 
 	io_ring_submit_lock(ctx, issue_flags);
-
-	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
+	if (unlikely((unsigned int)fd >= ctx->file_table.data.nr))
 		goto out;
-	fd = array_index_nospec(fd, ctx->nr_user_files);
-	node = ctx->file_table.nodes[fd];
+	fd = array_index_nospec(fd, ctx->file_table.data.nr);
+	node = ctx->file_table.data.nodes[fd];
 	if (node) {
 		io_req_assign_rsrc_node(req, node);
 		req->flags |= io_slot_flags(node);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index edea1ffd501c..b90ab3b8f5e0 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -180,8 +180,8 @@ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_fl
 	int idx = msg->src_fd;
 
 	io_ring_submit_lock(ctx, issue_flags);
-	if (likely(idx < ctx->nr_user_files)) {
-		idx = array_index_nospec(idx, ctx->nr_user_files);
+	if (likely(idx < ctx->file_table.data.nr)) {
+		idx = array_index_nospec(idx, ctx->file_table.data.nr);
 		file = io_file_from_index(&ctx->file_table, idx);
 		if (file)
 			get_file(file);
diff --git a/io_uring/net.c b/io_uring/net.c
index ce1156551d10..3e1f31574abb 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1347,9 +1347,9 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
-		if (sr->buf_index < ctx->nr_user_bufs) {
-			idx = array_index_nospec(sr->buf_index, ctx->nr_user_bufs);
-			node = ctx->user_bufs[idx];
+		if (sr->buf_index < ctx->buf_table.nr) {
+			idx = array_index_nospec(sr->buf_index, ctx->buf_table.nr);
+			node = ctx->buf_table.nodes[idx];
 			io_req_assign_rsrc_node(sr->notif, node);
 			ret = 0;
 		}
diff --git a/io_uring/nop.c b/io_uring/nop.c
index de91600a3bc6..0dac01127de5 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -66,9 +66,9 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
-		if (nop->buffer < ctx->nr_user_bufs) {
-			idx = array_index_nospec(nop->buffer, ctx->nr_user_bufs);
-			node = READ_ONCE(ctx->user_bufs[idx]);
+		if (nop->buffer < ctx->buf_table.nr) {
+			idx = array_index_nospec(nop->buffer, ctx->buf_table.nr);
+			node = READ_ONCE(ctx->buf_table.nodes[idx]);
 			io_req_assign_rsrc_node(req, node);
 			ret = 0;
 		}
diff --git a/io_uring/register.c b/io_uring/register.c
index 1eb686eaa310..45edfc57963a 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -937,7 +937,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
 	mutex_unlock(&ctx->uring_lock);
-	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
+	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
+				ctx->buf_table.nr, ret);
 	if (!use_registered_ring)
 		fput(file);
 	return ret;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 69a9cd82460d..49a6ab5f3ae9 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -142,39 +142,28 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
 	return node;
 }
 
-static void io_rsrc_data_free(struct io_rsrc_data *data)
+__cold void io_rsrc_data_free(struct io_rsrc_data *data)
 {
-	int i;
-
-	for (i = 0; i < data->nr; i++) {
-		struct io_rsrc_node *node = data->nodes[i];
-
-		if (node)
-			io_put_rsrc_node(node);
+	if (!data->nr)
+		return;
+	while (data->nr--) {
+		if (data->nodes[data->nr])
+			io_put_rsrc_node(data->nodes[data->nr]);
 	}
 	kvfree(data->nodes);
-	kfree(data);
+	data->nodes = NULL;
+	data->nr = 0;
 }
 
-__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, unsigned nr,
-				     struct io_rsrc_data **pdata)
+__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
 {
-	struct io_rsrc_data *data;
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
 	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
-					GFP_KERNEL | __GFP_ZERO);
-	if (!data->nodes) {
-		kfree(data);
-		return -ENOMEM;
+					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (data->nodes) {
+		data->nr = nr;
+		return 0;
 	}
-
-	data->nr = nr;
-	*pdata = data;
-	return 0;
+	return -ENOMEM;
 }
 
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
@@ -186,9 +175,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 	int fd, i, err = 0;
 	unsigned int done;
 
-	if (!ctx->file_data)
+	if (!ctx->file_table.data.nr)
 		return -ENXIO;
-	if (up->offset + nr_args > ctx->nr_user_files)
+	if (up->offset + nr_args > ctx->file_table.data.nr)
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
@@ -206,10 +195,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		if (fd == IORING_REGISTER_FILES_SKIP)
 			continue;
 
-		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-		if (ctx->file_table.nodes[i]) {
-			io_put_rsrc_node(ctx->file_table.nodes[i]);
-			ctx->file_table.nodes[i] = NULL;
+		i = array_index_nospec(up->offset + done, ctx->file_table.data.nr);
+		if (ctx->file_table.data.nodes[i]) {
+			io_put_rsrc_node(ctx->file_table.data.nodes[i]);
+			ctx->file_table.data.nodes[i] = NULL;
 			io_file_bitmap_clear(&ctx->file_table, i);
 		}
 		if (fd != -1) {
@@ -234,7 +223,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				fput(file);
 				break;
 			}
-			ctx->file_table.nodes[i] = node;
+			ctx->file_table.data.nodes[i] = node;
 			if (tag)
 				node->tag = tag;
 			io_fixed_file_set(node, file);
@@ -256,9 +245,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 	__u32 done;
 	int i, err;
 
-	if (!ctx->buf_data)
+	if (!ctx->buf_table.nr)
 		return -ENXIO;
-	if (up->offset + nr_args > ctx->nr_user_bufs)
+	if (up->offset + nr_args > ctx->buf_table.nr)
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
@@ -282,16 +271,16 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			err = -EINVAL;
 			break;
 		}
-		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
 		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
 		if (IS_ERR(node)) {
 			err = PTR_ERR(node);
 			break;
 		}
-		if (ctx->user_bufs[i])
-			io_put_rsrc_node(ctx->user_bufs[i]);
+		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
+		if (ctx->buf_table.nodes[i])
+			io_put_rsrc_node(ctx->buf_table.nodes[i]);
 
-		ctx->user_bufs[i] = node;
+		ctx->buf_table.nodes[i] = node;
 		if (tag)
 			node->tag = tag;
 		if (ctx->compat)
@@ -409,7 +398,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
 	struct file *file;
 	int ret, fd;
 
-	if (!req->ctx->file_data)
+	if (!req->ctx->file_table.data.nr)
 		return -ENXIO;
 
 	for (done = 0; done < up->nr_args; done++) {
@@ -494,35 +483,13 @@ void io_free_rsrc_node(struct io_rsrc_node *node)
 	kfree(node);
 }
 
-static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
-{
-	int i;
-
-	lockdep_assert_held(&ctx->uring_lock);
-
-	for (i = 0; i < ctx->nr_user_files; i++) {
-		struct io_rsrc_node *node = ctx->file_table.nodes[i];
-
-		if (node) {
-			io_put_rsrc_node(node);
-			io_file_bitmap_clear(&ctx->file_table, i);
-			ctx->file_table.nodes[i] = NULL;
-		}
-	}
-
-	io_free_file_tables(&ctx->file_table);
-	io_file_table_set_alloc_range(ctx, 0, 0);
-	io_rsrc_data_free(ctx->file_data);
-	ctx->file_data = NULL;
-	ctx->nr_user_files = 0;
-}
-
 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
-	if (!ctx->file_data)
+	if (!ctx->file_table.data.nr)
 		return -ENXIO;
 
-	__io_sqe_files_unregister(ctx);
+	io_free_file_tables(&ctx->file_table);
+	io_file_table_set_alloc_range(ctx, 0, 0);
 	return 0;
 }
 
@@ -534,7 +501,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	int fd, ret;
 	unsigned i;
 
-	if (ctx->file_data)
+	if (ctx->file_table.data.nr)
 		return -EBUSY;
 	if (!nr_args)
 		return -EINVAL;
@@ -542,17 +509,10 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EMFILE;
 	if (nr_args > rlimit(RLIMIT_NOFILE))
 		return -EMFILE;
-	ret = io_rsrc_data_alloc(ctx, nr_args, &ctx->file_data);
-	if (ret)
-		return ret;
-
-	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
-		io_rsrc_data_free(ctx->file_data);
-		ctx->file_data = NULL;
+	if (!io_alloc_file_tables(&ctx->file_table, nr_args))
 		return -ENOMEM;
-	}
 
-	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
+	for (i = 0; i < nr_args; i++) {
 		struct io_rsrc_node *node;
 		u64 tag = 0;
 
@@ -589,44 +549,24 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		}
 		if (tag)
 			node->tag = tag;
-		ctx->file_table.nodes[i] = node;
+		ctx->file_table.data.nodes[i] = node;
 		io_fixed_file_set(node, file);
 		io_file_bitmap_set(&ctx->file_table, i);
 	}
 
 	/* default it to the whole table */
-	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
+	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
 	return 0;
 fail:
-	__io_sqe_files_unregister(ctx);
+	io_sqe_files_unregister(ctx);
 	return ret;
 }
 
-static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
-{
-	unsigned int i;
-
-	lockdep_assert_held(&ctx->uring_lock);
-
-	for (i = 0; i < ctx->nr_user_bufs; i++) {
-		if (ctx->user_bufs[i]) {
-			io_put_rsrc_node(ctx->user_bufs[i]);
-			ctx->user_bufs[i] = NULL;
-		}
-	}
-	kvfree(ctx->user_bufs);
-	ctx->user_bufs = NULL;
-	io_rsrc_data_free(ctx->buf_data);
-	ctx->buf_data = NULL;
-	ctx->nr_user_bufs = 0;
-}
-
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
-	if (!ctx->buf_data)
+	if (!ctx->buf_table.nr)
 		return -ENXIO;
-
-	__io_sqe_buffers_unregister(ctx);
+	io_rsrc_data_free(&ctx->buf_table);
 	return 0;
 }
 
@@ -653,8 +593,8 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
 	}
 
 	/* check previously registered pages */
-	for (i = 0; i < ctx->nr_user_bufs; i++) {
-		struct io_rsrc_node *node = ctx->user_bufs[i];
+	for (i = 0; i < ctx->buf_table.nr; i++) {
+		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
 		struct io_mapped_ubuf *imu = node->buf;
 
 		for (j = 0; j < imu->nr_bvecs; j++) {
@@ -805,6 +745,9 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	struct io_imu_folio_data data;
 	bool coalesced;
 
+	if (!iov->iov_base)
+		return rsrc_empty_node;
+
 	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 	if (!node)
 		return ERR_PTR(-ENOMEM);
@@ -864,40 +807,29 @@ done:
 	return node;
 }
 
-static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
-{
-	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
-	return ctx->user_bufs ? 0 : -ENOMEM;
-}
-
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			    unsigned int nr_args, u64 __user *tags)
 {
 	struct page *last_hpage = NULL;
-	struct io_rsrc_data *data;
+	struct io_rsrc_data data;
 	struct iovec fast_iov, *iov = &fast_iov;
 	const struct iovec __user *uvec;
 	int i, ret;
 
 	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
 
-	if (ctx->user_bufs)
+	if (ctx->buf_table.nr)
 		return -EBUSY;
 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
-	ret = io_rsrc_data_alloc(ctx, nr_args, &data);
+	ret = io_rsrc_data_alloc(&data, nr_args);
 	if (ret)
 		return ret;
-	ret = io_buffers_map_alloc(ctx, nr_args);
-	if (ret) {
-		io_rsrc_data_free(data);
-		return ret;
-	}
 
 	if (!arg)
 		memset(iov, 0, sizeof(*iov));
 
-	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+	for (i = 0; i < nr_args; i++) {
 		struct io_rsrc_node *node;
 		u64 tag = 0;
 
@@ -935,14 +867,12 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		}
 		if (tag)
 			node->tag = tag;
-		ctx->user_bufs[i] = node;
+		data.nodes[i] = node;
 	}
 
-	WARN_ON_ONCE(ctx->buf_data);
-
-	ctx->buf_data = data;
+	ctx->buf_table = data;
 	if (ret)
-		__io_sqe_buffers_unregister(ctx);
+		io_sqe_buffers_unregister(ctx);
 	return ret;
 }
 
@@ -1009,8 +939,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 
 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
 {
-	struct io_rsrc_node **user_bufs;
-	struct io_rsrc_data *data;
+	struct io_rsrc_data data;
 	int i, ret, nbufs;
 
 	/*
@@ -1021,43 +950,37 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 
 	mutex_lock(&src_ctx->uring_lock);
 	ret = -ENXIO;
-	nbufs = src_ctx->nr_user_bufs;
+	nbufs = src_ctx->buf_table.nr;
 	if (!nbufs)
 		goto out_unlock;
-	ret = io_rsrc_data_alloc(ctx, nbufs, &data);
+	ret = io_rsrc_data_alloc(&data, nbufs);
 	if (ret)
 		goto out_unlock;
 
-	ret = -ENOMEM;
-	user_bufs = kvmalloc_array(nbufs, sizeof(struct io_rsrc_node *),
-					GFP_KERNEL | __GFP_ZERO);
-	if (!user_bufs)
-		goto out_free_data;
-
 	for (i = 0; i < nbufs; i++) {
-		struct io_rsrc_node *src_node = src_ctx->user_bufs[i];
+		struct io_rsrc_node *src_node = src_ctx->buf_table.nodes[i];
 		struct io_rsrc_node *dst_node;
 
 		if (src_node == rsrc_empty_node) {
 			dst_node = rsrc_empty_node;
 		} else {
 			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
-			if (!dst_node)
+			if (!dst_node) {
+				ret = -ENOMEM;
 				goto out_put_free;
+			}
 
 			refcount_inc(&src_node->buf->refs);
 			dst_node->buf = src_node->buf;
 		}
-		user_bufs[i] = dst_node;
+		data.nodes[i] = dst_node;
 	}
 
 	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
 	mutex_unlock(&src_ctx->uring_lock);
 	mutex_lock(&ctx->uring_lock);
-	if (!ctx->user_bufs) {
-		ctx->user_bufs = user_bufs;
-		ctx->buf_data = data;
-		ctx->nr_user_bufs = nbufs;
+	if (!ctx->buf_table.nr) {
+		ctx->buf_table = data;
 		return 0;
 	}
 
@@ -1068,12 +991,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	i = nbufs;
 out_put_free:
 	while (i--) {
-		io_buffer_unmap(src_ctx, user_bufs[i]);
-		kfree(user_bufs[i]);
+		io_buffer_unmap(src_ctx, data.nodes[i]);
+		kfree(data.nodes[i]);
 	}
-	kvfree(user_bufs);
-out_free_data:
-	io_rsrc_data_free(data);
+	io_rsrc_data_free(&data);
 out_unlock:
 	mutex_unlock(&src_ctx->uring_lock);
 	mutex_lock(&ctx->uring_lock);
@@ -1094,7 +1015,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	struct file *file;
 	int ret;
 
-	if (ctx->user_bufs || ctx->nr_user_bufs)
+	if (ctx->buf_table.nr)
 		return -EBUSY;
 	if (copy_from_user(&buf, arg, sizeof(buf)))
 		return -EFAULT;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 48f712488f6b..569ea9ce1405 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -13,11 +13,6 @@ enum {
 	IORING_RSRC_BUFFER		= 1,
 };
 
-struct io_rsrc_data {
-	unsigned int			nr;
-	struct io_rsrc_node		**nodes;
-};
-
 struct io_rsrc_node {
 	struct io_ring_ctx		*ctx;
 	int				refs;
@@ -50,6 +45,8 @@ struct io_imu_folio_data {
 
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
 void io_free_rsrc_node(struct io_rsrc_node *node);
+void io_rsrc_data_free(struct io_rsrc_data *data);
+int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
 
 int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 65491f4f2c7e..28fff18ebb19 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -339,10 +339,10 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	if (unlikely(ret))
 		return ret;
 
-	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
+	if (unlikely(req->buf_index >= ctx->buf_table.nr))
 		return -EFAULT;
-	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
-	node = ctx->user_bufs[index];
+	index = array_index_nospec(req->buf_index, ctx->buf_table.nr);
+	node = ctx->buf_table.nodes[index];
 	io_req_assign_rsrc_node(req, node);
 
 	io = req->async_data;
diff --git a/io_uring/splice.c b/io_uring/splice.c
index f78afb575ae6..aaaddb66e90a 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -66,10 +66,10 @@ static struct file *io_splice_get_file(struct io_kiocb *req,
 		return io_file_get_normal(req, sp->splice_fd_in);
 
 	io_ring_submit_lock(ctx, issue_flags);
-	if (unlikely(sp->splice_fd_in >= ctx->nr_user_files))
+	if (unlikely(sp->splice_fd_in >= ctx->file_table.data.nr))
 		goto out;
-	sp->splice_fd_in = array_index_nospec(sp->splice_fd_in, ctx->nr_user_files);
-	node = ctx->file_table.nodes[sp->splice_fd_in];
+	sp->splice_fd_in = array_index_nospec(sp->splice_fd_in, ctx->file_table.data.nr);
+	node = ctx->file_table.data.nodes[sp->splice_fd_in];
 	if (node) {
 		node->refs++;
 		sp->rsrc_node = node;
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 0899c71008ae..17d5f5004702 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -212,15 +212,15 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		u16 index;
 
 		index = READ_ONCE(sqe->buf_index);
-		if (unlikely(index >= ctx->nr_user_bufs))
+		if (unlikely(index >= ctx->buf_table.nr))
 			return -EFAULT;
-		req->buf_index = array_index_nospec(index, ctx->nr_user_bufs);
+		req->buf_index = array_index_nospec(index, ctx->buf_table.nr);
 		/*
 		 * Pi node upfront, prior to io_uring_cmd_import_fixed()
 		 * being called. This prevents destruction of the mapped buffer
 		 * we'll need at actual import time.
 		 */
-		io_req_assign_rsrc_node(req, ctx->user_bufs[req->buf_index]);
+		io_req_assign_rsrc_node(req, ctx->buf_table.nodes[req->buf_index]);
 	}
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 
-- 
cgit v1.2.3


From 01ee194d1aba1202f0926d5047a2a4cf84d0e45d Mon Sep 17 00:00:00 2001
From: hexue <xue01.he@samsung.com>
Date: Fri, 1 Nov 2024 17:19:57 +0800
Subject: io_uring: add support for hybrid IOPOLL

A new hybrid poll is implemented on the io_uring layer. Once an IO is
issued, it will not poll immediately, but rather block first and re-run
before IO complete, then poll to reap IO. While this poll method could
be a suboptimal solution when running on a single thread, it offers
performance lower than regular polling but higher than IRQ, and CPU
utilization is also lower than polling.

To use hybrid polling, the ring must be setup with both the
IORING_SETUP_IOPOLL and IORING_SETUP_HYBRID)IOPOLL flags set. Hybrid
polling has the same restrictions as IOPOLL, in that commands must
explicitly support it.

Signed-off-by: hexue <xue01.he@samsung.com>
Link: https://lore.kernel.org/r/20241101091957.564220-2-xue01.he@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 19 ++++++++-
 include/uapi/linux/io_uring.h  |  3 ++
 io_uring/io_uring.c            |  8 +++-
 io_uring/rw.c                  | 92 +++++++++++++++++++++++++++++++++++++-----
 4 files changed, 108 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 77fd508d043a..d52fec533c51 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -298,6 +298,11 @@ struct io_ring_ctx {
 		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
 		 */
 		struct hlist_head	cancelable_uring_cmd;
+		/*
+		 * For Hybrid IOPOLL, runtime in hybrid polling, without
+		 * scheduling time
+		 */
+		u64					hybrid_poll_time;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
@@ -449,6 +454,7 @@ enum {
 	REQ_F_LINK_TIMEOUT_BIT,
 	REQ_F_NEED_CLEANUP_BIT,
 	REQ_F_POLLED_BIT,
+	REQ_F_HYBRID_IOPOLL_STATE_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_BUFFER_RING_BIT,
 	REQ_F_REISSUE_BIT,
@@ -507,6 +513,8 @@ enum {
 	REQ_F_NEED_CLEANUP	= IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
 	/* already went through poll handler */
 	REQ_F_POLLED		= IO_REQ_FLAG(REQ_F_POLLED_BIT),
+	/* every req only blocks once in hybrid poll */
+	REQ_F_IOPOLL_STATE        = IO_REQ_FLAG(REQ_F_HYBRID_IOPOLL_STATE_BIT),
 	/* buffer already selected */
 	REQ_F_BUFFER_SELECTED	= IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
 	/* buffer selected from ring, needs commit */
@@ -639,8 +647,15 @@ struct io_kiocb {
 	atomic_t			refs;
 	bool				cancel_seq_set;
 	struct io_task_work		io_task_work;
-	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
-	struct hlist_node		hash_node;
+	union {
+		/*
+		 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
+		 * poll
+		 */
+		struct hlist_node	hash_node;
+		/* For IOPOLL setup queues, with hybrid polling */
+		u64                     iopoll_start;
+	};
 	/* internal polling, see IORING_FEAT_FAST_POLL */
 	struct async_poll		*apoll;
 	/* opcode allocated if it needs to store data for async defer */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ce58c4590de6..47977a5c65f5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -200,6 +200,9 @@ enum io_uring_sqe_flags_bit {
  */
 #define IORING_SETUP_NO_SQARRAY		(1U << 16)
 
+/* Use hybrid poll in iopoll process */
+#define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 44a772013c09..f08ea7fd5998 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -307,6 +307,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 		goto err;
 
 	ctx->flags = p->flags;
+	ctx->hybrid_poll_time = LLONG_MAX;
 	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 	init_waitqueue_head(&ctx->sqo_sq_wait);
 	INIT_LIST_HEAD(&ctx->sqd_list);
@@ -3630,6 +3631,11 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
 		static_branch_inc(&io_key_has_sqarray);
 
+	/* HYBRID_IOPOLL only valid with IOPOLL */
+	if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) ==
+			IORING_SETUP_HYBRID_IOPOLL)
+		return -EINVAL;
+
 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
 	    !(ctx->flags & IORING_SETUP_IOPOLL) &&
 	    !(ctx->flags & IORING_SETUP_SQPOLL))
@@ -3785,7 +3791,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
 			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 30448f343c7f..1ea6be2ccc90 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -817,6 +817,11 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 		req->iopoll_completed = 0;
+		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
+			/* make sure every req only blocks once*/
+			req->flags &= ~REQ_F_IOPOLL_STATE;
+			req->iopoll_start = ktime_get_ns();
+		}
 	} else {
 		if (kiocb->ki_flags & IOCB_HIPRI)
 			return -EINVAL;
@@ -1115,6 +1120,78 @@ void io_rw_fail(struct io_kiocb *req)
 	io_req_set_res(req, res, req->cqe.flags);
 }
 
+static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob,
+				unsigned int poll_flags)
+{
+	struct file *file = req->file;
+
+	if (req->opcode == IORING_OP_URING_CMD) {
+		struct io_uring_cmd *ioucmd;
+
+		ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+		return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags);
+	} else {
+		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+
+		return file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
+	}
+}
+
+static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	struct hrtimer_sleeper timer;
+	enum hrtimer_mode mode;
+	ktime_t kt;
+	u64 sleep_time;
+
+	if (req->flags & REQ_F_IOPOLL_STATE)
+		return 0;
+
+	if (ctx->hybrid_poll_time == LLONG_MAX)
+		return 0;
+
+	/* Using half the running time to do schedule */
+	sleep_time = ctx->hybrid_poll_time / 2;
+
+	kt = ktime_set(0, sleep_time);
+	req->flags |= REQ_F_IOPOLL_STATE;
+
+	mode = HRTIMER_MODE_REL;
+	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+	hrtimer_set_expires(&timer.timer, kt);
+	set_current_state(TASK_INTERRUPTIBLE);
+	hrtimer_sleeper_start_expires(&timer, mode);
+
+	if (timer.task)
+		io_schedule();
+
+	hrtimer_cancel(&timer.timer);
+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&timer.timer);
+	return sleep_time;
+}
+
+static int io_uring_hybrid_poll(struct io_kiocb *req,
+				struct io_comp_batch *iob, unsigned int poll_flags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	u64 runtime, sleep_time;
+	int ret;
+
+	sleep_time = io_hybrid_iopoll_delay(ctx, req);
+	ret = io_uring_classic_poll(req, iob, poll_flags);
+	runtime = ktime_get_ns() - req->iopoll_start - sleep_time;
+
+	/*
+	 * Use minimum sleep time if we're polling devices with different
+	 * latencies. We could get more completions from the faster ones.
+	 */
+	if (ctx->hybrid_poll_time > runtime)
+		ctx->hybrid_poll_time = runtime;
+
+	return ret;
+}
+
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
@@ -1131,7 +1208,6 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 
 	wq_list_for_each(pos, start, &ctx->iopoll_list) {
 		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
-		struct file *file = req->file;
 		int ret;
 
 		/*
@@ -1142,17 +1218,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
-		if (req->opcode == IORING_OP_URING_CMD) {
-			struct io_uring_cmd *ioucmd;
-
-			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-			ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob,
-								poll_flags);
-		} else {
-			struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL)
+			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
+		else
+			ret = io_uring_classic_poll(req, &iob, poll_flags);
 
-			ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
-		}
 		if (unlikely(ret < 0))
 			return ret;
 		else if (ret)
-- 
cgit v1.2.3


From 53c0a58beb60b76e105a61aae518fd780eec03d9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 25 May 2024 23:32:20 -0400
Subject: net/socket.c: switch to CLASS(fd)

	The important part in sockfd_lookup_light() is avoiding needless
file refcount operations, not the marginal reduction of the register
pressure from not keeping a struct file pointer in the caller.

	Switch to use fdget()/fdpu(); with sane use of CLASS(fd) we can
get a better code generation...

	Would be nice if somebody tested it on networking test suites
(including benchmarks)...

	sockfd_lookup_light() does fdget(), uses sock_from_file() to
get the associated socket and returns the struct socket reference to
the caller, along with "do we need to fput()" flag.  No matching fdput(),
the caller does its equivalent manually, using the fact that sock->file
points to the struct file the socket has come from.

	Get rid of that - have the callers do fdget()/fdput() and
use sock_from_file() directly.  That kills sockfd_lookup_light()
and fput_light() (no users left).

	What's more, we can get rid of explicit fdget()/fdput() by
switching to CLASS(fd, ...) - code generation does not suffer, since
now fdput() inserted on "descriptor is not opened" failure exit
is recognized to be a no-op by compiler.

[folded a fix for braino in do_recvmmsg() caught by Simon Horman]

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/file.h |   6 -
 net/socket.c         | 303 +++++++++++++++++++++++----------------------------
 2 files changed, 137 insertions(+), 172 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/file.h b/include/linux/file.h
index f98de143245a..b49a92295b3f 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -30,12 +30,6 @@ extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount
 extern struct file *alloc_file_clone(struct file *, int flags,
 	const struct file_operations *);
 
-static inline void fput_light(struct file *file, int fput_needed)
-{
-	if (fput_needed)
-		fput(file);
-}
-
 /* either a reference to struct file + flags
  * (cloned vs. borrowed, pos locked), with
  * flags stored in lower bits of value,
diff --git a/net/socket.c b/net/socket.c
index 601ad74930ef..c3ac02d060c0 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -509,7 +509,7 @@ static int sock_map_fd(struct socket *sock, int flags)
 
 struct socket *sock_from_file(struct file *file)
 {
-	if (file->f_op == &socket_file_ops)
+	if (likely(file->f_op == &socket_file_ops))
 		return file->private_data;	/* set in sock_alloc_file */
 
 	return NULL;
@@ -549,24 +549,6 @@ struct socket *sockfd_lookup(int fd, int *err)
 }
 EXPORT_SYMBOL(sockfd_lookup);
 
-static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
-{
-	struct fd f = fdget(fd);
-	struct socket *sock;
-
-	*err = -EBADF;
-	if (fd_file(f)) {
-		sock = sock_from_file(fd_file(f));
-		if (likely(sock)) {
-			*fput_needed = f.word & FDPUT_FPUT;
-			return sock;
-		}
-		*err = -ENOTSOCK;
-		fdput(f);
-	}
-	return NULL;
-}
-
 static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
 				size_t size)
 {
@@ -1853,16 +1835,20 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
 {
 	struct socket *sock;
 	struct sockaddr_storage address;
-	int err, fput_needed;
-
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (sock) {
-		err = move_addr_to_kernel(umyaddr, addrlen, &address);
-		if (!err)
-			err = __sys_bind_socket(sock, &address, addrlen);
-		fput_light(sock->file, fput_needed);
-	}
-	return err;
+	CLASS(fd, f)(fd);
+	int err;
+
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+
+	err = move_addr_to_kernel(umyaddr, addrlen, &address);
+	if (unlikely(err))
+		return err;
+
+	return __sys_bind_socket(sock, &address, addrlen);
 }
 
 SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
@@ -1891,15 +1877,16 @@ int __sys_listen_socket(struct socket *sock, int backlog)
 
 int __sys_listen(int fd, int backlog)
 {
+	CLASS(fd, f)(fd);
 	struct socket *sock;
-	int err, fput_needed;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (sock) {
-		err = __sys_listen_socket(sock, backlog);
-		fput_light(sock->file, fput_needed);
-	}
-	return err;
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+
+	return __sys_listen_socket(sock, backlog);
 }
 
 SYSCALL_DEFINE2(listen, int, fd, int, backlog)
@@ -2009,17 +1996,12 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s
 int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 		  int __user *upeer_addrlen, int flags)
 {
-	int ret = -EBADF;
-	struct fd f;
+	CLASS(fd, f)(fd);
 
-	f = fdget(fd);
-	if (fd_file(f)) {
-		ret = __sys_accept4_file(fd_file(f), upeer_sockaddr,
+	if (fd_empty(f))
+		return -EBADF;
+	return __sys_accept4_file(fd_file(f), upeer_sockaddr,
 					 upeer_addrlen, flags);
-		fdput(f);
-	}
-
-	return ret;
 }
 
 SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
@@ -2071,20 +2053,18 @@ out:
 
 int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
 {
-	int ret = -EBADF;
-	struct fd f;
+	struct sockaddr_storage address;
+	CLASS(fd, f)(fd);
+	int ret;
 
-	f = fdget(fd);
-	if (fd_file(f)) {
-		struct sockaddr_storage address;
+	if (fd_empty(f))
+		return -EBADF;
 
-		ret = move_addr_to_kernel(uservaddr, addrlen, &address);
-		if (!ret)
-			ret = __sys_connect_file(fd_file(f), &address, addrlen, 0);
-		fdput(f);
-	}
+	ret = move_addr_to_kernel(uservaddr, addrlen, &address);
+	if (ret)
+		return ret;
 
-	return ret;
+	return __sys_connect_file(fd_file(f), &address, addrlen, 0);
 }
 
 SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
@@ -2103,26 +2083,25 @@ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
 {
 	struct socket *sock;
 	struct sockaddr_storage address;
-	int err, fput_needed;
+	CLASS(fd, f)(fd);
+	int err;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		goto out;
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
 
 	err = security_socket_getsockname(sock);
 	if (err)
-		goto out_put;
+		return err;
 
 	err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0);
 	if (err < 0)
-		goto out_put;
-	/* "err" is actually length in this case */
-	err = move_addr_to_user(&address, err, usockaddr, usockaddr_len);
+		return err;
 
-out_put:
-	fput_light(sock->file, fput_needed);
-out:
-	return err;
+	/* "err" is actually length in this case */
+	return move_addr_to_user(&address, err, usockaddr, usockaddr_len);
 }
 
 SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
@@ -2141,26 +2120,25 @@ int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
 {
 	struct socket *sock;
 	struct sockaddr_storage address;
-	int err, fput_needed;
+	CLASS(fd, f)(fd);
+	int err;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (sock != NULL) {
-		const struct proto_ops *ops = READ_ONCE(sock->ops);
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
 
-		err = security_socket_getpeername(sock);
-		if (err) {
-			fput_light(sock->file, fput_needed);
-			return err;
-		}
+	err = security_socket_getpeername(sock);
+	if (err)
+		return err;
 
-		err = ops->getname(sock, (struct sockaddr *)&address, 1);
-		if (err >= 0)
-			/* "err" is actually length in this case */
-			err = move_addr_to_user(&address, err, usockaddr,
-						usockaddr_len);
-		fput_light(sock->file, fput_needed);
-	}
-	return err;
+	err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 1);
+	if (err < 0)
+		return err;
+
+	/* "err" is actually length in this case */
+	return move_addr_to_user(&address, err, usockaddr, usockaddr_len);
 }
 
 SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
@@ -2181,14 +2159,17 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
 	struct sockaddr_storage address;
 	int err;
 	struct msghdr msg;
-	int fput_needed;
 
 	err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter);
 	if (unlikely(err))
 		return err;
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		goto out;
+
+	CLASS(fd, f)(fd);
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
 
 	msg.msg_name = NULL;
 	msg.msg_control = NULL;
@@ -2198,7 +2179,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
 	if (addr) {
 		err = move_addr_to_kernel(addr, addr_len, &address);
 		if (err < 0)
-			goto out_put;
+			return err;
 		msg.msg_name = (struct sockaddr *)&address;
 		msg.msg_namelen = addr_len;
 	}
@@ -2206,12 +2187,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
 	msg.msg_flags = flags;
-	err = __sock_sendmsg(sock, &msg);
-
-out_put:
-	fput_light(sock->file, fput_needed);
-out:
-	return err;
+	return __sock_sendmsg(sock, &msg);
 }
 
 SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
@@ -2246,14 +2222,18 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
 	};
 	struct socket *sock;
 	int err, err2;
-	int fput_needed;
 
 	err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter);
 	if (unlikely(err))
 		return err;
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		goto out;
+
+	CLASS(fd, f)(fd);
+
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
 
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
@@ -2265,9 +2245,6 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
 		if (err2 < 0)
 			err = err2;
 	}
-
-	fput_light(sock->file, fput_needed);
-out:
 	return err;
 }
 
@@ -2342,17 +2319,16 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval,
 {
 	sockptr_t optval = USER_SOCKPTR(user_optval);
 	bool compat = in_compat_syscall();
-	int err, fput_needed;
 	struct socket *sock;
+	CLASS(fd, f)(fd);
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		return err;
-
-	err = do_sock_setsockopt(sock, compat, level, optname, optval, optlen);
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
 
-	fput_light(sock->file, fput_needed);
-	return err;
+	return do_sock_setsockopt(sock, compat, level, optname, optval, optlen);
 }
 
 SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
@@ -2408,20 +2384,17 @@ EXPORT_SYMBOL(do_sock_getsockopt);
 int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
 		int __user *optlen)
 {
-	int err, fput_needed;
 	struct socket *sock;
-	bool compat;
+	CLASS(fd, f)(fd);
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		return err;
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
 
-	compat = in_compat_syscall();
-	err = do_sock_getsockopt(sock, compat, level, optname,
+	return do_sock_getsockopt(sock, in_compat_syscall(), level, optname,
 				 USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
-
-	fput_light(sock->file, fput_needed);
-	return err;
 }
 
 SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
@@ -2447,15 +2420,16 @@ int __sys_shutdown_sock(struct socket *sock, int how)
 
 int __sys_shutdown(int fd, int how)
 {
-	int err, fput_needed;
 	struct socket *sock;
+	CLASS(fd, f)(fd);
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (sock != NULL) {
-		err = __sys_shutdown_sock(sock, how);
-		fput_light(sock->file, fput_needed);
-	}
-	return err;
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+
+	return __sys_shutdown_sock(sock, how);
 }
 
 SYSCALL_DEFINE2(shutdown, int, fd, int, how)
@@ -2671,22 +2645,21 @@ long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
 long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
 		   bool forbid_cmsg_compat)
 {
-	int fput_needed, err;
 	struct msghdr msg_sys;
 	struct socket *sock;
 
 	if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
 		return -EINVAL;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		goto out;
+	CLASS(fd, f)(fd);
 
-	err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
 
-	fput_light(sock->file, fput_needed);
-out:
-	return err;
+	return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
 }
 
 SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags)
@@ -2701,7 +2674,7 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int
 int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 		   unsigned int flags, bool forbid_cmsg_compat)
 {
-	int fput_needed, err, datagrams;
+	int err, datagrams;
 	struct socket *sock;
 	struct mmsghdr __user *entry;
 	struct compat_mmsghdr __user *compat_entry;
@@ -2717,9 +2690,13 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 
 	datagrams = 0;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		return err;
+	CLASS(fd, f)(fd);
+
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
 
 	used_address.name_len = UINT_MAX;
 	entry = mmsg;
@@ -2756,8 +2733,6 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 		cond_resched();
 	}
 
-	fput_light(sock->file, fput_needed);
-
 	/* We only return an error if no datagrams were able to be sent */
 	if (datagrams != 0)
 		return datagrams;
@@ -2879,22 +2854,21 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
 long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
 		   bool forbid_cmsg_compat)
 {
-	int fput_needed, err;
 	struct msghdr msg_sys;
 	struct socket *sock;
 
 	if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
 		return -EINVAL;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		goto out;
+	CLASS(fd, f)(fd);
 
-	err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
 
-	fput_light(sock->file, fput_needed);
-out:
-	return err;
+	return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
 }
 
 SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
@@ -2911,7 +2885,7 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
 			  unsigned int vlen, unsigned int flags,
 			  struct timespec64 *timeout)
 {
-	int fput_needed, err, datagrams;
+	int err = 0, datagrams;
 	struct socket *sock;
 	struct mmsghdr __user *entry;
 	struct compat_mmsghdr __user *compat_entry;
@@ -2926,16 +2900,18 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
 
 	datagrams = 0;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		return err;
+	CLASS(fd, f)(fd);
+
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
 
 	if (likely(!(flags & MSG_ERRQUEUE))) {
 		err = sock_error(sock->sk);
-		if (err) {
-			datagrams = err;
-			goto out_put;
-		}
+		if (err)
+			return err;
 	}
 
 	entry = mmsg;
@@ -2992,12 +2968,10 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
 	}
 
 	if (err == 0)
-		goto out_put;
+		return datagrams;
 
-	if (datagrams == 0) {
-		datagrams = err;
-		goto out_put;
-	}
+	if (datagrams == 0)
+		return err;
 
 	/*
 	 * We may return less entries than requested (vlen) if the
@@ -3012,9 +2986,6 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
 		 */
 		WRITE_ONCE(sock->sk->sk_err, -err);
 	}
-out_put:
-	fput_light(sock->file, fput_needed);
-
 	return datagrams;
 }
 
-- 
cgit v1.2.3


From f302edb9d822804e72df3fa6ba270234050c678b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Jul 2024 21:49:04 -0400
Subject: switch netlink_getsockbyfilp() to taking descriptor

the only call site (in do_mq_notify()) obtains the argument
from an immediately preceding fdget() and it is immediately
followed by fdput(); might as well just replace it with
a variant that would take a descriptor instead of struct file *
and have file lookups handled inside that function.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/netlink.h  | 2 +-
 ipc/mqueue.c             | 8 +-------
 net/netlink/af_netlink.c | 9 +++++++--
 3 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index b332c2048c75..a48a30842d84 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -239,7 +239,7 @@ int netlink_register_notifier(struct notifier_block *nb);
 int netlink_unregister_notifier(struct notifier_block *nb);
 
 /* finegrained unicast helpers: */
-struct sock *netlink_getsockbyfilp(struct file *filp);
+struct sock *netlink_getsockbyfd(int fd);
 int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 		      long *timeo, struct sock *ssk);
 void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 34fa0bd8bb11..fd05e3d4f7b6 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1355,13 +1355,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
 			skb_put(nc, NOTIFY_COOKIE_LEN);
 			/* and attach it to the socket */
 retry:
-			f = fdget(notification->sigev_signo);
-			if (!fd_file(f)) {
-				ret = -EBADF;
-				goto out;
-			}
-			sock = netlink_getsockbyfilp(fd_file(f));
-			fdput(f);
+			sock = netlink_getsockbyfd(notification->sigev_signo);
 			if (IS_ERR(sock)) {
 				ret = PTR_ERR(sock);
 				goto free_skb;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 0b7a89db3ab7..42451ac355d0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1180,11 +1180,16 @@ static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
 	return sock;
 }
 
-struct sock *netlink_getsockbyfilp(struct file *filp)
+struct sock *netlink_getsockbyfd(int fd)
 {
-	struct inode *inode = file_inode(filp);
+	CLASS(fd, f)(fd);
+	struct inode *inode;
 	struct sock *sock;
 
+	if (fd_empty(f))
+		return ERR_PTR(-EBADF);
+
+	inode = file_inode(fd_file(f));
 	if (!S_ISSOCK(inode->i_mode))
 		return ERR_PTR(-ENOTSOCK);
 
-- 
cgit v1.2.3


From d7a9616ce0348b9d945d5dff82e4b44c0fe75b39 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 31 May 2024 22:10:12 -0400
Subject: introduce "fd_pos" class, convert fdget_pos() users to it.

fdget_pos() for constructor, fdput_pos() for cleanup, all users of
fd..._pos() converted trivially.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/osf_sys.c |  5 ++---
 fs/read_write.c             | 34 +++++++++++++---------------------
 fs/readdir.c                | 28 ++++++++++------------------
 include/linux/file.h        |  1 +
 4 files changed, 26 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index c0424de9e7cd..86185021f75a 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -152,7 +152,7 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
 		long __user *, basep)
 {
 	int error;
-	struct fd arg = fdget_pos(fd);
+	CLASS(fd_pos, arg)(fd);
 	struct osf_dirent_callback buf = {
 		.ctx.actor = osf_filldir,
 		.dirent = dirent,
@@ -160,7 +160,7 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
 		.count = count
 	};
 
-	if (!fd_file(arg))
+	if (fd_empty(arg))
 		return -EBADF;
 
 	error = iterate_dir(fd_file(arg), &buf.ctx);
@@ -169,7 +169,6 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
 	if (count != buf.count)
 		error = count - buf.count;
 
-	fdput_pos(arg);
 	return error;
 }
 
diff --git a/fs/read_write.c b/fs/read_write.c
index 64dc24afdb3a..ef3ee3725714 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -386,8 +386,8 @@ EXPORT_SYMBOL(vfs_llseek);
 static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
 {
 	off_t retval;
-	struct fd f = fdget_pos(fd);
-	if (!fd_file(f))
+	CLASS(fd_pos, f)(fd);
+	if (fd_empty(f))
 		return -EBADF;
 
 	retval = -EINVAL;
@@ -397,7 +397,6 @@ static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
 		if (res != (loff_t)retval)
 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
 	}
-	fdput_pos(f);
 	return retval;
 }
 
@@ -420,15 +419,14 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 		unsigned int, whence)
 {
 	int retval;
-	struct fd f = fdget_pos(fd);
+	CLASS(fd_pos, f)(fd);
 	loff_t offset;
 
-	if (!fd_file(f))
+	if (fd_empty(f))
 		return -EBADF;
 
-	retval = -EINVAL;
 	if (whence > SEEK_MAX)
-		goto out_putf;
+		return -EINVAL;
 
 	offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
 			whence);
@@ -439,8 +437,6 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 		if (!copy_to_user(result, &offset, sizeof(offset)))
 			retval = 0;
 	}
-out_putf:
-	fdput_pos(f);
 	return retval;
 }
 #endif
@@ -700,10 +696,10 @@ static inline loff_t *file_ppos(struct file *file)
 
 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 {
-	struct fd f = fdget_pos(fd);
+	CLASS(fd_pos, f)(fd);
 	ssize_t ret = -EBADF;
 
-	if (fd_file(f)) {
+	if (!fd_empty(f)) {
 		loff_t pos, *ppos = file_ppos(fd_file(f));
 		if (ppos) {
 			pos = *ppos;
@@ -712,7 +708,6 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 		ret = vfs_read(fd_file(f), buf, count, ppos);
 		if (ret >= 0 && ppos)
 			fd_file(f)->f_pos = pos;
-		fdput_pos(f);
 	}
 	return ret;
 }
@@ -724,10 +719,10 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 
 ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 {
-	struct fd f = fdget_pos(fd);
+	CLASS(fd_pos, f)(fd);
 	ssize_t ret = -EBADF;
 
-	if (fd_file(f)) {
+	if (!fd_empty(f)) {
 		loff_t pos, *ppos = file_ppos(fd_file(f));
 		if (ppos) {
 			pos = *ppos;
@@ -736,7 +731,6 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 		ret = vfs_write(fd_file(f), buf, count, ppos);
 		if (ret >= 0 && ppos)
 			fd_file(f)->f_pos = pos;
-		fdput_pos(f);
 	}
 
 	return ret;
@@ -1075,10 +1069,10 @@ out:
 static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
 			unsigned long vlen, rwf_t flags)
 {
-	struct fd f = fdget_pos(fd);
+	CLASS(fd_pos, f)(fd);
 	ssize_t ret = -EBADF;
 
-	if (fd_file(f)) {
+	if (!fd_empty(f)) {
 		loff_t pos, *ppos = file_ppos(fd_file(f));
 		if (ppos) {
 			pos = *ppos;
@@ -1087,7 +1081,6 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
 		ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
 		if (ret >= 0 && ppos)
 			fd_file(f)->f_pos = pos;
-		fdput_pos(f);
 	}
 
 	if (ret > 0)
@@ -1099,10 +1092,10 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
 static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
 			 unsigned long vlen, rwf_t flags)
 {
-	struct fd f = fdget_pos(fd);
+	CLASS(fd_pos, f)(fd);
 	ssize_t ret = -EBADF;
 
-	if (fd_file(f)) {
+	if (!fd_empty(f)) {
 		loff_t pos, *ppos = file_ppos(fd_file(f));
 		if (ppos) {
 			pos = *ppos;
@@ -1111,7 +1104,6 @@ static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
 		ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
 		if (ret >= 0 && ppos)
 			fd_file(f)->f_pos = pos;
-		fdput_pos(f);
 	}
 
 	if (ret > 0)
diff --git a/fs/readdir.c b/fs/readdir.c
index 6d29cab8576e..0038efda417b 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -219,20 +219,19 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 		struct old_linux_dirent __user *, dirent, unsigned int, count)
 {
 	int error;
-	struct fd f = fdget_pos(fd);
+	CLASS(fd_pos, f)(fd);
 	struct readdir_callback buf = {
 		.ctx.actor = fillonedir,
 		.dirent = dirent
 	};
 
-	if (!fd_file(f))
+	if (fd_empty(f))
 		return -EBADF;
 
 	error = iterate_dir(fd_file(f), &buf.ctx);
 	if (buf.result)
 		error = buf.result;
 
-	fdput_pos(f);
 	return error;
 }
 
@@ -309,7 +308,7 @@ efault:
 SYSCALL_DEFINE3(getdents, unsigned int, fd,
 		struct linux_dirent __user *, dirent, unsigned int, count)
 {
-	struct fd f;
+	CLASS(fd_pos, f)(fd);
 	struct getdents_callback buf = {
 		.ctx.actor = filldir,
 		.count = count,
@@ -317,8 +316,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	};
 	int error;
 
-	f = fdget_pos(fd);
-	if (!fd_file(f))
+	if (fd_empty(f))
 		return -EBADF;
 
 	error = iterate_dir(fd_file(f), &buf.ctx);
@@ -333,7 +331,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 		else
 			error = count - buf.count;
 	}
-	fdput_pos(f);
 	return error;
 }
 
@@ -392,7 +389,7 @@ efault:
 SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 		struct linux_dirent64 __user *, dirent, unsigned int, count)
 {
-	struct fd f;
+	CLASS(fd_pos, f)(fd);
 	struct getdents_callback64 buf = {
 		.ctx.actor = filldir64,
 		.count = count,
@@ -400,8 +397,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	};
 	int error;
 
-	f = fdget_pos(fd);
-	if (!fd_file(f))
+	if (fd_empty(f))
 		return -EBADF;
 
 	error = iterate_dir(fd_file(f), &buf.ctx);
@@ -417,7 +413,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 		else
 			error = count - buf.count;
 	}
-	fdput_pos(f);
 	return error;
 }
 
@@ -477,20 +472,19 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 		struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
 {
 	int error;
-	struct fd f = fdget_pos(fd);
+	CLASS(fd_pos, f)(fd);
 	struct compat_readdir_callback buf = {
 		.ctx.actor = compat_fillonedir,
 		.dirent = dirent
 	};
 
-	if (!fd_file(f))
+	if (fd_empty(f))
 		return -EBADF;
 
 	error = iterate_dir(fd_file(f), &buf.ctx);
 	if (buf.result)
 		error = buf.result;
 
-	fdput_pos(f);
 	return error;
 }
 
@@ -560,7 +554,7 @@ efault:
 COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
 		struct compat_linux_dirent __user *, dirent, unsigned int, count)
 {
-	struct fd f;
+	CLASS(fd_pos, f)(fd);
 	struct compat_getdents_callback buf = {
 		.ctx.actor = compat_filldir,
 		.current_dir = dirent,
@@ -568,8 +562,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	};
 	int error;
 
-	f = fdget_pos(fd);
-	if (!fd_file(f))
+	if (fd_empty(f))
 		return -EBADF;
 
 	error = iterate_dir(fd_file(f), &buf.ctx);
@@ -584,7 +577,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
 		else
 			error = count - buf.count;
 	}
-	fdput_pos(f);
 	return error;
 }
 #endif
diff --git a/include/linux/file.h b/include/linux/file.h
index b49a92295b3f..4b09a8de2fd5 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -81,6 +81,7 @@ static inline void fdput_pos(struct fd f)
 
 DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd)
 DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd)
+DEFINE_CLASS(fd_pos, struct fd, fdput_pos(_T), fdget_pos(fd), int fd)
 
 extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
 extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
-- 
cgit v1.2.3


From 38052c2dd71f5490f34bba21dc358e97fb205ee5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 6 Jun 2024 19:29:04 -0400
Subject: deal with the last remaing boolean uses of fd_file()

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/infiniband/core/uverbs_cmd.c | 8 +++-----
 include/linux/cleanup.h              | 2 +-
 sound/core/pcm_native.c              | 2 +-
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index a4cce360df21..66b02fbf077a 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -584,7 +584,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
 	if (cmd.fd != -1) {
 		/* search for file descriptor */
 		f = fdget(cmd.fd);
-		if (!fd_file(f)) {
+		if (fd_empty(f)) {
 			ret = -EBADF;
 			goto err_tree_mutex_unlock;
 		}
@@ -632,8 +632,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
 		atomic_inc(&xrcd->usecnt);
 	}
 
-	if (fd_file(f))
-		fdput(f);
+	fdput(f);
 
 	mutex_unlock(&ibudev->xrcd_tree_mutex);
 	uobj_finalize_uobj_create(&obj->uobject, attrs);
@@ -648,8 +647,7 @@ err:
 	uobj_alloc_abort(&obj->uobject, attrs);
 
 err_tree_mutex_unlock:
-	if (fd_file(f))
-		fdput(f);
+	fdput(f);
 
 	mutex_unlock(&ibudev->xrcd_tree_mutex);
 
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 038b2d523bf8..875c998275c0 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -234,7 +234,7 @@ const volatile void * __must_check_fn(const volatile void *val)
  * DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd)
  *
  *	CLASS(fdget, f)(fd);
- *	if (!fd_file(f))
+ *	if (fd_empty(f))
  *		return -EBADF;
  *
  *	// use 'f' without concern
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index b465fb6e1f5f..3320cce35a03 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2250,7 +2250,7 @@ static int snd_pcm_link(struct snd_pcm_substream *substream, int fd)
 	bool nonatomic = substream->pcm->nonatomic;
 	CLASS(fd, f)(fd);
 
-	if (!fd_file(f))
+	if (fd_empty(f))
 		return -EBADFD;
 	if (!is_pcm_file(fd_file(f)))
 		return -EBADFD;
-- 
cgit v1.2.3


From a1afb959add1fad43cb337448c244ed70bac3109 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 30 Oct 2024 09:11:56 +0100
Subject: dpll: add clock quality level attribute and op

In order to allow driver expose quality level of the clock it is
running, introduce a new netlink attr with enum to carry it to the
userspace. Also, introduce an op the dpll netlink code calls into the
driver to obtain the value.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20241030081157.966604-2-jiri@resnulli.us
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/dpll.yaml | 41 +++++++++++++++++++++++++++++++++++
 drivers/dpll/dpll_netlink.c           | 24 ++++++++++++++++++++
 include/linux/dpll.h                  |  4 ++++
 include/uapi/linux/dpll.h             | 24 ++++++++++++++++++++
 4 files changed, 93 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml
index f2894ca35de8..8feefeae5376 100644
--- a/Documentation/netlink/specs/dpll.yaml
+++ b/Documentation/netlink/specs/dpll.yaml
@@ -85,6 +85,36 @@ definitions:
           This may happen for example if dpll device was previously
           locked on an input pin of type PIN_TYPE_SYNCE_ETH_PORT.
     render-max: true
+  -
+    type: enum
+    name: clock-quality-level
+    doc: |
+      level of quality of a clock device. This mainly applies when
+      the dpll lock-status is DPLL_LOCK_STATUS_HOLDOVER.
+      The current list is defined according to the table 11-7 contained
+      in ITU-T G.8264/Y.1364 document. One may extend this list freely
+      by other ITU-T defined clock qualities, or different ones defined
+      by another standardization body (for those, please use
+      different prefix).
+    entries:
+      -
+        name: itu-opt1-prc
+        value: 1
+      -
+        name: itu-opt1-ssu-a
+      -
+        name: itu-opt1-ssu-b
+      -
+        name: itu-opt1-eec1
+      -
+        name: itu-opt1-prtc
+      -
+        name: itu-opt1-eprtc
+      -
+        name: itu-opt1-eeec
+      -
+        name: itu-opt1-eprc
+    render-max: true
   -
     type: const
     name: temp-divider
@@ -252,6 +282,17 @@ attribute-sets:
         name: lock-status-error
         type: u32
         enum: lock-status-error
+      -
+        name: clock-quality-level
+        type: u32
+        enum: clock-quality-level
+        multi-attr: true
+        doc: |
+          Level of quality of a clock device. This mainly applies when
+          the dpll lock-status is DPLL_LOCK_STATUS_HOLDOVER. This could
+          be put to message multiple times to indicate possible parallel
+          quality levels (e.g. one specified by ITU option 1 and another
+          one specified by option 2).
   -
     name: pin
     enum-name: dpll_a_pin
diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index fc0280dcddd1..c130f87147fa 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -169,6 +169,27 @@ dpll_msg_add_temp(struct sk_buff *msg, struct dpll_device *dpll,
 	return 0;
 }
 
+static int
+dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll,
+				 struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) = { 0 };
+	enum dpll_clock_quality_level ql;
+	int ret;
+
+	if (!ops->clock_quality_level_get)
+		return 0;
+	ret = ops->clock_quality_level_get(dpll, dpll_priv(dpll), qls, extack);
+	if (ret)
+		return ret;
+	for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX)
+		if (nla_put_u32(msg, DPLL_A_CLOCK_QUALITY_LEVEL, ql))
+			return -EMSGSIZE;
+
+	return 0;
+}
+
 static int
 dpll_msg_add_pin_prio(struct sk_buff *msg, struct dpll_pin *pin,
 		      struct dpll_pin_ref *ref,
@@ -557,6 +578,9 @@ dpll_device_get_one(struct dpll_device *dpll, struct sk_buff *msg,
 	if (ret)
 		return ret;
 	ret = dpll_msg_add_lock_status(msg, dpll, extack);
+	if (ret)
+		return ret;
+	ret = dpll_msg_add_clock_quality_level(msg, dpll, extack);
 	if (ret)
 		return ret;
 	ret = dpll_msg_add_mode(msg, dpll, extack);
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 81f7b623d0ba..5e4f9ab1cf75 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -26,6 +26,10 @@ struct dpll_device_ops {
 			       struct netlink_ext_ack *extack);
 	int (*temp_get)(const struct dpll_device *dpll, void *dpll_priv,
 			s32 *temp, struct netlink_ext_ack *extack);
+	int (*clock_quality_level_get)(const struct dpll_device *dpll,
+				       void *dpll_priv,
+				       unsigned long *qls,
+				       struct netlink_ext_ack *extack);
 };
 
 struct dpll_pin_ops {
diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index b0654ade7b7e..2b7ec2da4bcc 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -79,6 +79,29 @@ enum dpll_lock_status_error {
 	DPLL_LOCK_STATUS_ERROR_MAX = (__DPLL_LOCK_STATUS_ERROR_MAX - 1)
 };
 
+/**
+ * enum dpll_clock_quality_level - level of quality of a clock device. This
+ *   mainly applies when the dpll lock-status is DPLL_LOCK_STATUS_HOLDOVER. The
+ *   current list is defined according to the table 11-7 contained in ITU-T
+ *   G.8264/Y.1364 document. One may extend this list freely by other ITU-T
+ *   defined clock qualities, or different ones defined by another
+ *   standardization body (for those, please use different prefix).
+ */
+enum dpll_clock_quality_level {
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_PRC = 1,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_SSU_A,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_SSU_B,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EEC1,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_PRTC,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EPRTC,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EEEC,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EPRC,
+
+	/* private: */
+	__DPLL_CLOCK_QUALITY_LEVEL_MAX,
+	DPLL_CLOCK_QUALITY_LEVEL_MAX = (__DPLL_CLOCK_QUALITY_LEVEL_MAX - 1)
+};
+
 #define DPLL_TEMP_DIVIDER	1000
 
 /**
@@ -180,6 +203,7 @@ enum dpll_a {
 	DPLL_A_TEMP,
 	DPLL_A_TYPE,
 	DPLL_A_LOCK_STATUS_ERROR,
+	DPLL_A_CLOCK_QUALITY_LEVEL,
 
 	__DPLL_A_MAX,
 	DPLL_A_MAX = (__DPLL_A_MAX - 1)
-- 
cgit v1.2.3


From a8f80673ca0daaad990882529a5b4dc5114071e7 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Thu, 24 Oct 2024 17:37:41 +0800
Subject: compiler_types: Add noinline_for_tracing annotation

Kernel functions that are not inlined can be easily hooked with BPF for
tracing. However, functions intended for tracing may still be inlined
unexpectedly. For example, in our case, after upgrading the compiler from
GCC 9 to GCC 11, the tcp_drop_reason() function was inlined, which broke
our monitoring tools. To prevent this, we need to ensure that the function
remains non-inlined.

The noinline_for_tracing annotation is introduced as a general solution for
preventing inlining of kernel functions that need to be traced. This
approach avoids the need for adding individual noinline comments to each
function and provides a more consistent way to maintain traceability.

Link: https://lore.kernel.org/netdev/CANn89iKvr44ipuRYFaPTpzwz=B_+pgA94jsggQ946mjwreV6Aw@mail.gmail.com/
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://patch.msgid.link/20241024093742.87681-2-laoar.shao@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/compiler_types.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 1a957ea2f4fe..0c8b9601e603 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -265,6 +265,12 @@ struct ftrace_likely_data {
  */
 #define noinline_for_stack noinline
 
+/*
+ * Use noinline_for_tracing for functions that should not be inlined.
+ * For tracing reasons.
+ */
+#define noinline_for_tracing noinline
+
 /*
  * Sanitizer helper attributes: Because using __always_inline and
  * __no_sanitize_* conflict, provide helper attributes that will either expand
-- 
cgit v1.2.3


From 3bd9b9abdf1563a22041b7255baea6d449902f1a Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Tue, 29 Oct 2024 15:58:47 -0600
Subject: net: ethtool: Avoid thousands of -Wflex-array-member-not-at-end
 warnings

-Wflex-array-member-not-at-end was introduced in GCC-14, and we are
getting ready to enable it, globally.

Change the type of the middle struct member currently causing trouble from
`struct ethtool_link_settings` to `struct ethtool_link_settings_hdr`.

Additionally, update the type of some variables in various functions that
don't access the flexible-array member, changing them to the newly created
`struct ethtool_link_settings_hdr`. These changes are needed because the
type of the conflicting middle members changed. So, those instances that
expect the type to be `struct ethtool_link_settings` should be adjusted to
the newly created type `struct ethtool_link_settings_hdr`.

Also, adjust variable declarations to follow the reverse xmas tree
convention.

Fix 3338 of the following -Wflex-array-member-not-at-end warnings:

include/linux/ethtool.h:214:38: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://patch.msgid.link/0bc2809fe2a6c11dd4c8a9a10d9bd65cccdb559b.1730238285.git.gustavoars@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c   |  6 +++---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c  |  4 ++--
 drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c |  2 +-
 drivers/net/ethernet/cisco/enic/enic_ethtool.c      |  2 +-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c     |  4 ++--
 include/linux/ethtool.h                             |  2 +-
 net/ethtool/ioctl.c                                 |  2 +-
 net/ethtool/linkinfo.c                              |  8 ++++----
 net/ethtool/linkmodes.c                             | 18 +++++++++++-------
 9 files changed, 26 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index f71cc8188b4e..e0ebe69110bf 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -2781,7 +2781,7 @@ u32 bnxt_fw_to_ethtool_speed(u16 fw_link_speed)
 static void bnxt_get_default_speeds(struct ethtool_link_ksettings *lk_ksettings,
 				    struct bnxt_link_info *link_info)
 {
-	struct ethtool_link_settings *base = &lk_ksettings->base;
+	struct ethtool_link_settings_hdr *base = &lk_ksettings->base;
 
 	if (link_info->link_state == BNXT_LINK_STATE_UP) {
 		base->speed = bnxt_fw_to_ethtool_speed(link_info->link_speed);
@@ -2800,7 +2800,7 @@ static void bnxt_get_default_speeds(struct ethtool_link_ksettings *lk_ksettings,
 static int bnxt_get_link_ksettings(struct net_device *dev,
 				   struct ethtool_link_ksettings *lk_ksettings)
 {
-	struct ethtool_link_settings *base = &lk_ksettings->base;
+	struct ethtool_link_settings_hdr *base = &lk_ksettings->base;
 	enum ethtool_link_mode_bit_indices link_mode;
 	struct bnxt *bp = netdev_priv(dev);
 	struct bnxt_link_info *link_info;
@@ -3023,9 +3023,9 @@ u16 bnxt_get_fw_auto_link_speeds(const unsigned long *mode)
 static int bnxt_set_link_ksettings(struct net_device *dev,
 			   const struct ethtool_link_ksettings *lk_ksettings)
 {
+	const struct ethtool_link_settings_hdr *base = &lk_ksettings->base;
 	struct bnxt *bp = netdev_priv(dev);
 	struct bnxt_link_info *link_info = &bp->link_info;
-	const struct ethtool_link_settings *base = &lk_ksettings->base;
 	bool set_pause = false;
 	u32 speed, lanes = 0;
 	int rc = 0;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
index 7f3f5afa864f..45d28a65347e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
@@ -662,8 +662,8 @@ static unsigned int lmm_to_fw_caps(const unsigned long *link_mode_mask)
 static int get_link_ksettings(struct net_device *dev,
 			      struct ethtool_link_ksettings *link_ksettings)
 {
+	struct ethtool_link_settings_hdr *base = &link_ksettings->base;
 	struct port_info *pi = netdev_priv(dev);
-	struct ethtool_link_settings *base = &link_ksettings->base;
 
 	/* For the nonce, the Firmware doesn't send up Port State changes
 	 * when the Virtual Interface attached to the Port is down.  So
@@ -717,9 +717,9 @@ static int get_link_ksettings(struct net_device *dev,
 static int set_link_ksettings(struct net_device *dev,
 			    const struct ethtool_link_ksettings *link_ksettings)
 {
+	const struct ethtool_link_settings_hdr *base = &link_ksettings->base;
 	struct port_info *pi = netdev_priv(dev);
 	struct link_config *lc = &pi->link_cfg;
-	const struct ethtool_link_settings *base = &link_ksettings->base;
 	struct link_config old_lc;
 	unsigned int fw_caps;
 	int ret = 0;
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index 2fbe0f059a0b..61d08547e3f9 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -1436,8 +1436,8 @@ static void fw_caps_to_lmm(enum fw_port_type port_type,
 static int cxgb4vf_get_link_ksettings(struct net_device *dev,
 				  struct ethtool_link_ksettings *link_ksettings)
 {
+	struct ethtool_link_settings_hdr *base = &link_ksettings->base;
 	struct port_info *pi = netdev_priv(dev);
-	struct ethtool_link_settings *base = &link_ksettings->base;
 
 	/* For the nonce, the Firmware doesn't send up Port State changes
 	 * when the Virtual Interface attached to the Port is down.  So
diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
index f7986f2b6a17..4fe85780a950 100644
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -129,8 +129,8 @@ static void enic_intr_coal_set_rx(struct enic *enic, u32 timer)
 static int enic_get_ksettings(struct net_device *netdev,
 			      struct ethtool_link_ksettings *ecmd)
 {
+	struct ethtool_link_settings_hdr *base = &ecmd->base;
 	struct enic *enic = netdev_priv(netdev);
-	struct ethtool_link_settings *base = &ecmd->base;
 
 	ethtool_link_ksettings_add_link_mode(ecmd, supported,
 					     10000baseT_Full);
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index e50e1df0a433..c553da16d4b1 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -504,7 +504,7 @@ static int qede_get_link_ksettings(struct net_device *dev,
 				   struct ethtool_link_ksettings *cmd)
 {
 	typeof(cmd->link_modes) *link_modes = &cmd->link_modes;
-	struct ethtool_link_settings *base = &cmd->base;
+	struct ethtool_link_settings_hdr *base = &cmd->base;
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qed_link_output current_link;
 
@@ -537,7 +537,7 @@ static int qede_get_link_ksettings(struct net_device *dev,
 static int qede_set_link_ksettings(struct net_device *dev,
 				   const struct ethtool_link_ksettings *cmd)
 {
-	const struct ethtool_link_settings *base = &cmd->base;
+	const struct ethtool_link_settings_hdr *base = &cmd->base;
 	const struct ethtool_forced_speed_map *map;
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qed_link_output current_link;
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 12f6dc567598..1199e308c8dd 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -211,7 +211,7 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id);
  * fields, but they are allowed to overwrite them (will be ignored).
  */
 struct ethtool_link_ksettings {
-	struct ethtool_link_settings base;
+	struct ethtool_link_settings_hdr base;
 	struct {
 		__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
 		__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 5cc131cdb1bc..7da94e26ced6 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -425,7 +425,7 @@ convert_link_ksettings_to_legacy_settings(
 
 /* layout of the struct passed from/to userland */
 struct ethtool_link_usettings {
-	struct ethtool_link_settings base;
+	struct ethtool_link_settings_hdr base;
 	struct {
 		__u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32];
 		__u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c
index 30b8ce275159..2d5bc57160be 100644
--- a/net/ethtool/linkinfo.c
+++ b/net/ethtool/linkinfo.c
@@ -8,9 +8,9 @@ struct linkinfo_req_info {
 };
 
 struct linkinfo_reply_data {
-	struct ethnl_reply_data		base;
-	struct ethtool_link_ksettings	ksettings;
-	struct ethtool_link_settings	*lsettings;
+	struct ethnl_reply_data			base;
+	struct ethtool_link_ksettings		ksettings;
+	struct ethtool_link_settings_hdr	*lsettings;
 };
 
 #define LINKINFO_REPDATA(__reply_base) \
@@ -98,7 +98,7 @@ static int
 ethnl_set_linkinfo(struct ethnl_req_info *req_info, struct genl_info *info)
 {
 	struct ethtool_link_ksettings ksettings = {};
-	struct ethtool_link_settings *lsettings;
+	struct ethtool_link_settings_hdr *lsettings;
 	struct net_device *dev = req_info->dev;
 	struct nlattr **tb = info->attrs;
 	bool mod = false;
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
index 259cd9ef1f2a..17e49cf89f03 100644
--- a/net/ethtool/linkmodes.c
+++ b/net/ethtool/linkmodes.c
@@ -11,10 +11,10 @@ struct linkmodes_req_info {
 };
 
 struct linkmodes_reply_data {
-	struct ethnl_reply_data		base;
-	struct ethtool_link_ksettings	ksettings;
-	struct ethtool_link_settings	*lsettings;
-	bool				peer_empty;
+	struct ethnl_reply_data			base;
+	struct ethtool_link_ksettings		ksettings;
+	struct ethtool_link_settings_hdr	*lsettings;
+	bool					peer_empty;
 };
 
 #define LINKMODES_REPDATA(__reply_base) \
@@ -62,10 +62,12 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base,
 {
 	const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base);
 	const struct ethtool_link_ksettings *ksettings = &data->ksettings;
-	const struct ethtool_link_settings *lsettings = &ksettings->base;
 	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	const struct ethtool_link_settings_hdr *lsettings;
 	int len, ret;
 
+	lsettings = &ksettings->base;
+
 	len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */
 		+ nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */
 		+ nla_total_size(sizeof(u32)) /* LINKMODES_LANES */
@@ -103,10 +105,12 @@ static int linkmodes_fill_reply(struct sk_buff *skb,
 {
 	const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base);
 	const struct ethtool_link_ksettings *ksettings = &data->ksettings;
-	const struct ethtool_link_settings *lsettings = &ksettings->base;
 	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	const struct ethtool_link_settings_hdr *lsettings;
 	int ret;
 
+	lsettings = &ksettings->base;
+
 	if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, lsettings->autoneg))
 		return -EMSGSIZE;
 
@@ -237,7 +241,7 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb,
 				  struct ethtool_link_ksettings *ksettings,
 				  bool *mod, const struct net_device *dev)
 {
-	struct ethtool_link_settings *lsettings = &ksettings->base;
+	struct ethtool_link_settings_hdr *lsettings = &ksettings->base;
 	bool req_speed, req_lanes, req_duplex;
 	const struct nlattr *master_slave_cfg, *lanes_cfg;
 	int ret;
-- 
cgit v1.2.3


From 2ea25aab938a250bdf3148acd15359b56b91b40e Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Tue, 29 Oct 2024 16:18:49 -0500
Subject: pwm: core: export pwm_get_state_hw()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export the pwm_get_state_hw() function. This is useful in cases where
we want to know what the hardware is actually doing, rather than what
what we requested it should do.

Locking had to be rearranged to ensure that the chip is still
operational before trying to access ops now that this can be called
from outside the pwm core.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Link: https://lore.kernel.org/r/20241029-pwm-export-pwm_get_state_hw-v2-1-03ba063a3230@baylibre.com
[ukleinek: Add dummy for !CONFIG_PWM]
Signed-off-by: Uwe Kleine-König <ukleinek@kernel.org>
---
 drivers/pwm/core.c  | 40 +++++++++++++++++++++++++++-------------
 include/linux/pwm.h |  6 ++++++
 2 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 4399e793efaf..ccbdd6dd1410 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -718,40 +718,54 @@ int pwm_apply_atomic(struct pwm_device *pwm, const struct pwm_state *state)
 }
 EXPORT_SYMBOL_GPL(pwm_apply_atomic);
 
-static int pwm_get_state_hw(struct pwm_device *pwm, struct pwm_state *state)
+/**
+ * pwm_get_state_hw() - get the current PWM state from hardware
+ * @pwm: PWM device
+ * @state: state to fill with the current PWM state
+ *
+ * Similar to pwm_get_state() but reads the current PWM state from hardware
+ * instead of the requested state.
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ * Context: May sleep.
+ */
+int pwm_get_state_hw(struct pwm_device *pwm, struct pwm_state *state)
 {
 	struct pwm_chip *chip = pwm->chip;
 	const struct pwm_ops *ops = chip->ops;
 	int ret = -EOPNOTSUPP;
 
+	might_sleep();
+
+	guard(pwmchip)(chip);
+
+	if (!chip->operational)
+		return -ENODEV;
+
 	if (ops->read_waveform) {
 		char wfhw[WFHWSIZE];
 		struct pwm_waveform wf;
 
 		BUG_ON(WFHWSIZE < ops->sizeof_wfhw);
 
-		scoped_guard(pwmchip, chip) {
+		ret = __pwm_read_waveform(chip, pwm, &wfhw);
+		if (ret)
+			return ret;
 
-			ret = __pwm_read_waveform(chip, pwm, &wfhw);
-			if (ret)
-				return ret;
-
-			ret = __pwm_round_waveform_fromhw(chip, pwm, &wfhw, &wf);
-			if (ret)
-				return ret;
-		}
+		ret = __pwm_round_waveform_fromhw(chip, pwm, &wfhw, &wf);
+		if (ret)
+			return ret;
 
 		pwm_wf2state(&wf, state);
 
 	} else if (ops->get_state) {
-		scoped_guard(pwmchip, chip)
-			ret = ops->get_state(chip, pwm, state);
-
+		ret = ops->get_state(chip, pwm, state);
 		trace_pwm_get(pwm, state, ret);
 	}
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(pwm_get_state_hw);
 
 /**
  * pwm_adjust_config() - adjust the current PWM config to the PWM arguments
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index f1cb1e5b0a36..78827f312407 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -370,6 +370,7 @@ int pwm_get_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform *wf
 int pwm_set_waveform_might_sleep(struct pwm_device *pwm, const struct pwm_waveform *wf, bool exact);
 int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state);
 int pwm_apply_atomic(struct pwm_device *pwm, const struct pwm_state *state);
+int pwm_get_state_hw(struct pwm_device *pwm, struct pwm_state *state);
 int pwm_adjust_config(struct pwm_device *pwm);
 
 /**
@@ -494,6 +495,11 @@ static inline int pwm_apply_atomic(struct pwm_device *pwm,
 	return -EOPNOTSUPP;
 }
 
+static inline int pwm_get_state_hw(struct pwm_device *pwm, struct pwm_state *state)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int pwm_adjust_config(struct pwm_device *pwm)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 9a5a2483bc60c12d73ac6ca5ac5ab95361a895f4 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 1 Nov 2024 12:53:42 +0200
Subject: iio: Mark iio_dev::priv member with __private

The member is not supposed to be accessed directly, mark it with
__private to catch the misuses up.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20241101105342.3645018-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-core.c | 2 +-
 include/linux/iio/iio.h         | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 6a6568d4a2cb..4c543490e56c 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1665,7 +1665,7 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv)
 	indio_dev = &iio_dev_opaque->indio_dev;
 
 	if (sizeof_priv)
-		indio_dev->priv = (char *)iio_dev_opaque +
+		ACCESS_PRIVATE(indio_dev, priv) = (char *)iio_dev_opaque +
 			ALIGN(sizeof(*iio_dev_opaque), IIO_DMA_MINALIGN);
 
 	indio_dev->dev.parent = parent;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 445d6666a291..5c6682bd4cb9 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -624,7 +624,7 @@ struct iio_dev {
 	const struct iio_info		*info;
 	const struct iio_buffer_setup_ops	*setup_ops;
 
-	void				*priv;
+	void				*priv __private;
 };
 
 int iio_device_id(struct iio_dev *indio_dev);
@@ -785,7 +785,7 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv);
 /* The information at the returned address is guaranteed to be cacheline aligned */
 static inline void *iio_priv(const struct iio_dev *indio_dev)
 {
-	return indio_dev->priv;
+	return ACCESS_PRIVATE(indio_dev, priv);
 }
 
 void iio_device_free(struct iio_dev *indio_dev);
-- 
cgit v1.2.3


From 6e6738398def256126185cd25e2e3cb68f1bc0a3 Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda@chromium.org>
Date: Fri, 1 Nov 2024 07:46:27 +0000
Subject: iio: hid-sensors: Add proximity and attention IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The HID Usage Table at https://usb.org/sites/default/files/hut1_5.pdf
reserves:

- 0x4b2 for Human Proximity Range
Distance between a human and the computer. Default unit of
measure is meters;
https://www.usb.org/sites/default/files/hutrr39b_0.pdf

- 0x4bd for Human Attention Detected
Human-Presence sensors detect the presence of humans in the sensor’s
field-of-view using diverse and evolving technologies. Some presence
sensors are implemented with low resolution video cameras, which can
additionally track a subject’s attention (i.e. if the
user is ‘looking’ at the system with the integrated sensor).
A Human-Presence sensor, providing a Host with the user’s attention
state, allows the Host to optimize its behavior. For example, to
brighten/dim the system display, based on the user’s attention to the
system (potentially prolonging battery life). Default unit is
true/false;
https://www.usb.org/sites/default/files/hutrr107-humanpresenceattention_1.pdf

Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Link: https://patch.msgid.link/20241101-hpd-v3-1-e9c80b7c7164@chromium.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/hid-sensor-ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index 6730ee900ee1..8a03d9696b1c 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -30,6 +30,8 @@
 #define HID_USAGE_SENSOR_PROX                                   0x200011
 #define HID_USAGE_SENSOR_DATA_PRESENCE                          0x2004b0
 #define HID_USAGE_SENSOR_HUMAN_PRESENCE                         0x2004b1
+#define HID_USAGE_SENSOR_HUMAN_PROXIMITY                        0x2004b2
+#define HID_USAGE_SENSOR_HUMAN_ATTENTION                        0x2004bd
 
 /* Pressure (200031) */
 #define HID_USAGE_SENSOR_PRESSURE                               0x200031
-- 
cgit v1.2.3


From b4b42f28a0df6b9d31f0003f7dea3bddf272eaa4 Mon Sep 17 00:00:00 2001
From: Julien Stephan <jstephan@baylibre.com>
Date: Thu, 31 Oct 2024 16:27:02 +0100
Subject: iio: fix write_event_config signature

write_event_config callback use an int for state, but it is actually a
boolean. iio_ev_state_store is actually using kstrtobool to check user
input, then gives the converted boolean value to write_event_config.

Fix signature and update all iio drivers to use the new signature.

This patch has been partially written using coccinelle with the
following script:

$ cat iio-bool.cocci
// Options: --all-includes

virtual patch

@c1@
identifier iioinfo;
identifier wecfunc;
@@
 static const struct iio_info iioinfo = {
        ...,
        .write_event_config =
(
 wecfunc
|
 &wecfunc
),
        ...,
 };

@@
identifier c1.wecfunc;
identifier indio_dev, chan, type, dir, state;
@@
 int wecfunc(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, enum iio_event_type type, enum iio_event_direction dir,
-int
+bool
 state) {
  ...
 }

make coccicheck MODE=patch COCCI=iio-bool.cocci M=drivers/iio

Unfortunately, this script didn't match all files:
* all write_event_config callbacks using iio_device_claim_direct_scoped
  were not detected and not patched.
* all files that do not assign and declare the write_event_config
  callback in the same file.

iio.h was also manually updated.

The patch was build tested using allmodconfig config.

cc: Julia Lawall <julia.lawall@inria.fr>
Signed-off-by: Julien Stephan <jstephan@baylibre.com>
Link: https://patch.msgid.link/20241031-iio-fix-write-event-config-signature-v2-7-2bcacbb517a2@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/adxl367.c                    | 2 +-
 drivers/iio/accel/adxl372.c                    | 2 +-
 drivers/iio/accel/adxl380.c                    | 2 +-
 drivers/iio/accel/bma400_core.c                | 2 +-
 drivers/iio/accel/bmc150-accel-core.c          | 2 +-
 drivers/iio/accel/fxls8962af-core.c            | 2 +-
 drivers/iio/accel/kxcjk-1013.c                 | 2 +-
 drivers/iio/accel/mma8452.c                    | 2 +-
 drivers/iio/accel/mma9551.c                    | 2 +-
 drivers/iio/accel/mma9553.c                    | 3 ++-
 drivers/iio/accel/sca3000.c                    | 2 +-
 drivers/iio/adc/ad7091r-base.c                 | 3 ++-
 drivers/iio/adc/ad7291.c                       | 2 +-
 drivers/iio/adc/ad799x.c                       | 2 +-
 drivers/iio/adc/hi8435.c                       | 2 +-
 drivers/iio/adc/max1363.c                      | 2 +-
 drivers/iio/adc/pac1921.c                      | 3 ++-
 drivers/iio/adc/palmas_gpadc.c                 | 2 +-
 drivers/iio/adc/ti-ads1015.c                   | 2 +-
 drivers/iio/adc/xilinx-ams.c                   | 2 +-
 drivers/iio/adc/xilinx-xadc-events.c           | 2 +-
 drivers/iio/adc/xilinx-xadc.h                  | 2 +-
 drivers/iio/cdc/ad7150.c                       | 2 +-
 drivers/iio/dac/ad5421.c                       | 2 +-
 drivers/iio/dac/ad8460.c                       | 2 +-
 drivers/iio/dummy/iio_simple_dummy.h           | 2 +-
 drivers/iio/dummy/iio_simple_dummy_events.c    | 2 +-
 drivers/iio/gyro/bmg160_core.c                 | 2 +-
 drivers/iio/imu/bmi323/bmi323_core.c           | 2 +-
 drivers/iio/imu/inv_mpu6050/inv_mpu_core.c     | 2 +-
 drivers/iio/imu/kmx61.c                        | 2 +-
 drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c   | 2 +-
 drivers/iio/light/adux1020.c                   | 3 ++-
 drivers/iio/light/apds9300.c                   | 2 +-
 drivers/iio/light/apds9306.c                   | 2 +-
 drivers/iio/light/apds9960.c                   | 2 +-
 drivers/iio/light/bh1745.c                     | 2 +-
 drivers/iio/light/cm36651.c                    | 2 +-
 drivers/iio/light/gp2ap002.c                   | 2 +-
 drivers/iio/light/gp2ap020a00f.c               | 2 +-
 drivers/iio/light/iqs621-als.c                 | 2 +-
 drivers/iio/light/ltr390.c                     | 2 +-
 drivers/iio/light/ltr501.c                     | 2 +-
 drivers/iio/light/max44009.c                   | 2 +-
 drivers/iio/light/opt3001.c                    | 2 +-
 drivers/iio/light/stk3310.c                    | 2 +-
 drivers/iio/light/tcs3472.c                    | 2 +-
 drivers/iio/light/tsl2563.c                    | 2 +-
 drivers/iio/light/tsl2591.c                    | 2 +-
 drivers/iio/light/tsl2772.c                    | 2 +-
 drivers/iio/light/us5182d.c                    | 2 +-
 drivers/iio/light/vcnl4000.c                   | 5 +++--
 drivers/iio/light/veml6030.c                   | 2 +-
 drivers/iio/position/iqs624-pos.c              | 2 +-
 drivers/iio/proximity/aw96103.c                | 2 +-
 drivers/iio/proximity/cros_ec_mkbp_proximity.c | 2 +-
 drivers/iio/proximity/hx9023s.c                | 2 +-
 drivers/iio/proximity/irsd200.c                | 3 ++-
 drivers/iio/proximity/sx9500.c                 | 2 +-
 drivers/iio/proximity/sx_common.c              | 2 +-
 drivers/iio/proximity/sx_common.h              | 2 +-
 drivers/iio/proximity/vcnl3020.c               | 2 +-
 drivers/iio/temperature/mcp9600.c              | 2 +-
 drivers/iio/temperature/tmp007.c               | 2 +-
 include/linux/iio/iio.h                        | 2 +-
 65 files changed, 72 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/adxl367.c b/drivers/iio/accel/adxl367.c
index e790a66d86c7..705375f3b56e 100644
--- a/drivers/iio/accel/adxl367.c
+++ b/drivers/iio/accel/adxl367.c
@@ -1073,7 +1073,7 @@ static int adxl367_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
 				      enum iio_event_direction dir,
-				      int state)
+				      bool state)
 {
 	enum adxl367_activity_type act;
 
diff --git a/drivers/iio/accel/adxl372.c b/drivers/iio/accel/adxl372.c
index ef8dd557877b..5b9eb364760a 100644
--- a/drivers/iio/accel/adxl372.c
+++ b/drivers/iio/accel/adxl372.c
@@ -940,7 +940,7 @@ static int adxl372_read_event_config(struct iio_dev *indio_dev, const struct iio
 
 static int adxl372_write_event_config(struct iio_dev *indio_dev, const struct iio_chan_spec *chan,
 				      enum iio_event_type type, enum iio_event_direction dir,
-				      int state)
+				      bool state)
 {
 	struct adxl372_state *st = iio_priv(indio_dev);
 
diff --git a/drivers/iio/accel/adxl380.c b/drivers/iio/accel/adxl380.c
index 9f6f0a45efce..5d2bda1a6a78 100644
--- a/drivers/iio/accel/adxl380.c
+++ b/drivers/iio/accel/adxl380.c
@@ -1386,7 +1386,7 @@ static int adxl380_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
 				      enum iio_event_direction dir,
-				      int state)
+				      bool state)
 {
 	struct adxl380_state *st = iio_priv(indio_dev);
 	enum adxl380_axis axis;
diff --git a/drivers/iio/accel/bma400_core.c b/drivers/iio/accel/bma400_core.c
index 0bf5f321cfe7..906d2577be2d 100644
--- a/drivers/iio/accel/bma400_core.c
+++ b/drivers/iio/accel/bma400_core.c
@@ -1293,7 +1293,7 @@ static int bma400_disable_adv_interrupt(struct bma400_data *data)
 static int bma400_write_event_config(struct iio_dev *indio_dev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
-				     enum iio_event_direction dir, int state)
+				     enum iio_event_direction dir, bool state)
 {
 	struct bma400_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/accel/bmc150-accel-core.c b/drivers/iio/accel/bmc150-accel-core.c
index 0f32c1e92b4d..158579350d59 100644
--- a/drivers/iio/accel/bmc150-accel-core.c
+++ b/drivers/iio/accel/bmc150-accel-core.c
@@ -804,7 +804,7 @@ static int bmc150_accel_write_event_config(struct iio_dev *indio_dev,
 					   const struct iio_chan_spec *chan,
 					   enum iio_event_type type,
 					   enum iio_event_direction dir,
-					   int state)
+					   bool state)
 {
 	struct bmc150_accel_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/accel/fxls8962af-core.c b/drivers/iio/accel/fxls8962af-core.c
index ab427f3461db..f07fba17048e 100644
--- a/drivers/iio/accel/fxls8962af-core.c
+++ b/drivers/iio/accel/fxls8962af-core.c
@@ -617,7 +617,7 @@ static int
 fxls8962af_write_event_config(struct iio_dev *indio_dev,
 			      const struct iio_chan_spec *chan,
 			      enum iio_event_type type,
-			      enum iio_event_direction dir, int state)
+			      enum iio_event_direction dir, bool state)
 {
 	struct fxls8962af_data *data = iio_priv(indio_dev);
 	u8 enable_event, enable_bits;
diff --git a/drivers/iio/accel/kxcjk-1013.c b/drivers/iio/accel/kxcjk-1013.c
index f65fde06f2c1..f2496cad8ec2 100644
--- a/drivers/iio/accel/kxcjk-1013.c
+++ b/drivers/iio/accel/kxcjk-1013.c
@@ -1081,7 +1081,7 @@ static int kxcjk1013_write_event_config(struct iio_dev *indio_dev,
 					   const struct iio_chan_spec *chan,
 					   enum iio_event_type type,
 					   enum iio_event_direction dir,
-					   int state)
+					   bool state)
 {
 	struct kxcjk1013_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/accel/mma8452.c b/drivers/iio/accel/mma8452.c
index de4525b30edc..962d289065ab 100644
--- a/drivers/iio/accel/mma8452.c
+++ b/drivers/iio/accel/mma8452.c
@@ -974,7 +974,7 @@ static int mma8452_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
 				      enum iio_event_direction dir,
-				      int state)
+				      bool state)
 {
 	struct mma8452_data *data = iio_priv(indio_dev);
 	int val, ret;
diff --git a/drivers/iio/accel/mma9551.c b/drivers/iio/accel/mma9551.c
index a5d20d8d08b8..605022f5239a 100644
--- a/drivers/iio/accel/mma9551.c
+++ b/drivers/iio/accel/mma9551.c
@@ -225,7 +225,7 @@ static int mma9551_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
 				      enum iio_event_direction dir,
-				      int state)
+				      bool state)
 {
 	struct mma9551_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/accel/mma9553.c b/drivers/iio/accel/mma9553.c
index 1ea6aa007412..43ba04c606a4 100644
--- a/drivers/iio/accel/mma9553.c
+++ b/drivers/iio/accel/mma9553.c
@@ -725,7 +725,8 @@ static int mma9553_read_event_config(struct iio_dev *indio_dev,
 static int mma9553_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
-				      enum iio_event_direction dir, int state)
+				      enum iio_event_direction dir,
+				      bool state)
 {
 	struct mma9553_data *data = iio_priv(indio_dev);
 	struct mma9553_event *event;
diff --git a/drivers/iio/accel/sca3000.c b/drivers/iio/accel/sca3000.c
index 87c54e41f6cc..36cbfcbba04d 100644
--- a/drivers/iio/accel/sca3000.c
+++ b/drivers/iio/accel/sca3000.c
@@ -1253,7 +1253,7 @@ static int sca3000_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
 				      enum iio_event_direction dir,
-				      int state)
+				      bool state)
 {
 	struct sca3000_state *st = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/adc/ad7091r-base.c b/drivers/iio/adc/ad7091r-base.c
index d6876259ad14..eb0a059b4b0e 100644
--- a/drivers/iio/adc/ad7091r-base.c
+++ b/drivers/iio/adc/ad7091r-base.c
@@ -150,7 +150,8 @@ static int ad7091r_read_event_config(struct iio_dev *indio_dev,
 static int ad7091r_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
-				      enum iio_event_direction dir, int state)
+				      enum iio_event_direction dir,
+				      bool state)
 {
 	struct ad7091r_state *st = iio_priv(indio_dev);
 
diff --git a/drivers/iio/adc/ad7291.c b/drivers/iio/adc/ad7291.c
index 4c7f887adbbf..60e12faa3207 100644
--- a/drivers/iio/adc/ad7291.c
+++ b/drivers/iio/adc/ad7291.c
@@ -269,7 +269,7 @@ static int ad7291_write_event_config(struct iio_dev *indio_dev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
 				     enum iio_event_direction dir,
-				     int state)
+				     bool state)
 {
 	int ret = 0;
 	struct ad7291_chip_info *chip = iio_priv(indio_dev);
diff --git a/drivers/iio/adc/ad799x.c b/drivers/iio/adc/ad799x.c
index 0f107e3fc2c8..aa44b4e2542b 100644
--- a/drivers/iio/adc/ad799x.c
+++ b/drivers/iio/adc/ad799x.c
@@ -406,7 +406,7 @@ static int ad799x_write_event_config(struct iio_dev *indio_dev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
 				     enum iio_event_direction dir,
-				     int state)
+				     bool state)
 {
 	struct ad799x_state *st = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/adc/hi8435.c b/drivers/iio/adc/hi8435.c
index fb635a756440..689e34f06987 100644
--- a/drivers/iio/adc/hi8435.c
+++ b/drivers/iio/adc/hi8435.c
@@ -132,7 +132,7 @@ static int hi8435_read_event_config(struct iio_dev *idev,
 static int hi8435_write_event_config(struct iio_dev *idev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
-				     enum iio_event_direction dir, int state)
+				     enum iio_event_direction dir, bool state)
 {
 	struct hi8435_priv *priv = iio_priv(idev);
 	int ret;
diff --git a/drivers/iio/adc/max1363.c b/drivers/iio/adc/max1363.c
index 8da2d8d7a9c6..9a0baea08ab6 100644
--- a/drivers/iio/adc/max1363.c
+++ b/drivers/iio/adc/max1363.c
@@ -944,7 +944,7 @@ error_ret:
 
 static int max1363_write_event_config(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, enum iio_event_type type,
-	enum iio_event_direction dir, int state)
+	enum iio_event_direction dir, bool state)
 {
 	struct max1363_state *st = iio_priv(indio_dev);
 
diff --git a/drivers/iio/adc/pac1921.c b/drivers/iio/adc/pac1921.c
index 43a3dd321a50..b0f6727cfe38 100644
--- a/drivers/iio/adc/pac1921.c
+++ b/drivers/iio/adc/pac1921.c
@@ -699,7 +699,8 @@ static int pac1921_read_event_config(struct iio_dev *indio_dev,
 static int pac1921_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
-				      enum iio_event_direction dir, int state)
+				      enum iio_event_direction dir,
+				      bool state)
 {
 	struct pac1921_priv *priv = iio_priv(indio_dev);
 	u8 ovf_bit;
diff --git a/drivers/iio/adc/palmas_gpadc.c b/drivers/iio/adc/palmas_gpadc.c
index 67d567ee21b4..d283ee8fb1d2 100644
--- a/drivers/iio/adc/palmas_gpadc.c
+++ b/drivers/iio/adc/palmas_gpadc.c
@@ -676,7 +676,7 @@ static int palmas_gpadc_write_event_config(struct iio_dev *indio_dev,
 					   const struct iio_chan_spec *chan,
 					   enum iio_event_type type,
 					   enum iio_event_direction dir,
-					   int state)
+					   bool state)
 {
 	struct palmas_gpadc *adc = iio_priv(indio_dev);
 	int adc_chan = chan->channel;
diff --git a/drivers/iio/adc/ti-ads1015.c b/drivers/iio/adc/ti-ads1015.c
index 052d2124b215..47fe8e16aee4 100644
--- a/drivers/iio/adc/ti-ads1015.c
+++ b/drivers/iio/adc/ti-ads1015.c
@@ -806,7 +806,7 @@ static int ads1015_disable_event_config(struct ads1015_data *data,
 
 static int ads1015_write_event_config(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, enum iio_event_type type,
-	enum iio_event_direction dir, int state)
+	enum iio_event_direction dir, bool state)
 {
 	struct ads1015_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/adc/xilinx-ams.c b/drivers/iio/adc/xilinx-ams.c
index ebc583b07e0c..76dd0343f5f7 100644
--- a/drivers/iio/adc/xilinx-ams.c
+++ b/drivers/iio/adc/xilinx-ams.c
@@ -905,7 +905,7 @@ static int ams_write_event_config(struct iio_dev *indio_dev,
 				  const struct iio_chan_spec *chan,
 				  enum iio_event_type type,
 				  enum iio_event_direction dir,
-				  int state)
+				  bool state)
 {
 	struct ams *ams = iio_priv(indio_dev);
 	unsigned int alarm;
diff --git a/drivers/iio/adc/xilinx-xadc-events.c b/drivers/iio/adc/xilinx-xadc-events.c
index 90f62377c34d..c188d3dcab48 100644
--- a/drivers/iio/adc/xilinx-xadc-events.c
+++ b/drivers/iio/adc/xilinx-xadc-events.c
@@ -121,7 +121,7 @@ int xadc_read_event_config(struct iio_dev *indio_dev,
 
 int xadc_write_event_config(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, enum iio_event_type type,
-	enum iio_event_direction dir, int state)
+	enum iio_event_direction dir, bool state)
 {
 	unsigned int alarm = xadc_get_alarm_mask(chan);
 	struct xadc *xadc = iio_priv(indio_dev);
diff --git a/drivers/iio/adc/xilinx-xadc.h b/drivers/iio/adc/xilinx-xadc.h
index 3036f4d613ff..b4d9d4683117 100644
--- a/drivers/iio/adc/xilinx-xadc.h
+++ b/drivers/iio/adc/xilinx-xadc.h
@@ -25,7 +25,7 @@ int xadc_read_event_config(struct iio_dev *indio_dev,
 	enum iio_event_direction dir);
 int xadc_write_event_config(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, enum iio_event_type type,
-	enum iio_event_direction dir, int state);
+	enum iio_event_direction dir, bool state);
 int xadc_read_event_value(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, enum iio_event_type type,
 	enum iio_event_direction dir, enum iio_event_info info,
diff --git a/drivers/iio/cdc/ad7150.c b/drivers/iio/cdc/ad7150.c
index 4c03b9e834b8..e64a41bae32c 100644
--- a/drivers/iio/cdc/ad7150.c
+++ b/drivers/iio/cdc/ad7150.c
@@ -232,7 +232,7 @@ static int ad7150_write_event_params(struct iio_dev *indio_dev,
 static int ad7150_write_event_config(struct iio_dev *indio_dev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
-				     enum iio_event_direction dir, int state)
+				     enum iio_event_direction dir, bool state)
 {
 	struct ad7150_chip_info *chip = iio_priv(indio_dev);
 	int ret = 0;
diff --git a/drivers/iio/dac/ad5421.c b/drivers/iio/dac/ad5421.c
index 7644acfd879e..1462ee640b16 100644
--- a/drivers/iio/dac/ad5421.c
+++ b/drivers/iio/dac/ad5421.c
@@ -384,7 +384,7 @@ static int ad5421_write_raw(struct iio_dev *indio_dev,
 
 static int ad5421_write_event_config(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, enum iio_event_type type,
-	enum iio_event_direction dir, int state)
+	enum iio_event_direction dir, bool state)
 {
 	struct ad5421_state *st = iio_priv(indio_dev);
 	unsigned int mask;
diff --git a/drivers/iio/dac/ad8460.c b/drivers/iio/dac/ad8460.c
index 7470d97825e0..f235394589df 100644
--- a/drivers/iio/dac/ad8460.c
+++ b/drivers/iio/dac/ad8460.c
@@ -573,7 +573,7 @@ static int ad8460_read_event_value(struct iio_dev *indio_dev,
 static int ad8460_write_event_config(struct iio_dev *indio_dev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
-				     enum iio_event_direction dir, int val)
+				     enum iio_event_direction dir, bool val)
 {
 	struct ad8460_state *state = iio_priv(indio_dev);
 	int fault;
diff --git a/drivers/iio/dummy/iio_simple_dummy.h b/drivers/iio/dummy/iio_simple_dummy.h
index a91622ac54e0..8246f25dbad0 100644
--- a/drivers/iio/dummy/iio_simple_dummy.h
+++ b/drivers/iio/dummy/iio_simple_dummy.h
@@ -60,7 +60,7 @@ int iio_simple_dummy_write_event_config(struct iio_dev *indio_dev,
 					const struct iio_chan_spec *chan,
 					enum iio_event_type type,
 					enum iio_event_direction dir,
-					int state);
+					bool state);
 
 int iio_simple_dummy_read_event_value(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
diff --git a/drivers/iio/dummy/iio_simple_dummy_events.c b/drivers/iio/dummy/iio_simple_dummy_events.c
index 63a2b844be50..c7f2d3a4d60b 100644
--- a/drivers/iio/dummy/iio_simple_dummy_events.c
+++ b/drivers/iio/dummy/iio_simple_dummy_events.c
@@ -53,7 +53,7 @@ int iio_simple_dummy_write_event_config(struct iio_dev *indio_dev,
 					const struct iio_chan_spec *chan,
 					enum iio_event_type type,
 					enum iio_event_direction dir,
-					int state)
+					bool state)
 {
 	struct iio_dummy_state *st = iio_priv(indio_dev);
 
diff --git a/drivers/iio/gyro/bmg160_core.c b/drivers/iio/gyro/bmg160_core.c
index bb235697262b..ba877d067afb 100644
--- a/drivers/iio/gyro/bmg160_core.c
+++ b/drivers/iio/gyro/bmg160_core.c
@@ -748,7 +748,7 @@ static int bmg160_write_event_config(struct iio_dev *indio_dev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
 				     enum iio_event_direction dir,
-				     int state)
+				     bool state)
 {
 	struct bmg160_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/imu/bmi323/bmi323_core.c b/drivers/iio/imu/bmi323/bmi323_core.c
index 1e6c083ea5c1..76a88e1ccc1d 100644
--- a/drivers/iio/imu/bmi323/bmi323_core.c
+++ b/drivers/iio/imu/bmi323/bmi323_core.c
@@ -785,7 +785,7 @@ static const struct attribute_group bmi323_event_attribute_group = {
 static int bmi323_write_event_config(struct iio_dev *indio_dev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
-				     enum iio_event_direction dir, int state)
+				     enum iio_event_direction dir, bool state)
 {
 	struct bmi323_data *data = iio_priv(indio_dev);
 
diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
index 21ebf0f7e28f..40271352b02c 100644
--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
+++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
@@ -1173,7 +1173,7 @@ static int inv_mpu6050_write_event_config(struct iio_dev *indio_dev,
 					  const struct iio_chan_spec *chan,
 					  enum iio_event_type type,
 					  enum iio_event_direction dir,
-					  int state)
+					  bool state)
 {
 	struct inv_mpu6050_state *st = iio_priv(indio_dev);
 
diff --git a/drivers/iio/imu/kmx61.c b/drivers/iio/imu/kmx61.c
index 2af772775b68..324c38764656 100644
--- a/drivers/iio/imu/kmx61.c
+++ b/drivers/iio/imu/kmx61.c
@@ -942,7 +942,7 @@ static int kmx61_write_event_config(struct iio_dev *indio_dev,
 				    const struct iio_chan_spec *chan,
 				    enum iio_event_type type,
 				    enum iio_event_direction dir,
-				    int state)
+				    bool state)
 {
 	struct kmx61_data *data = kmx61_get_data(indio_dev);
 	int ret = 0;
diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c
index fb4c6c39ff2e..caefa15e559b 100644
--- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c
+++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c
@@ -1959,7 +1959,7 @@ static int
 st_lsm6dsx_write_event_config(struct iio_dev *iio_dev,
 			      const struct iio_chan_spec *chan,
 			      enum iio_event_type type,
-			      enum iio_event_direction dir, int state)
+			      enum iio_event_direction dir, bool state)
 {
 	struct st_lsm6dsx_sensor *sensor = iio_priv(iio_dev);
 	struct st_lsm6dsx_hw *hw = sensor->hw;
diff --git a/drivers/iio/light/adux1020.c b/drivers/iio/light/adux1020.c
index 06d5bc1d246c..593d614b1689 100644
--- a/drivers/iio/light/adux1020.c
+++ b/drivers/iio/light/adux1020.c
@@ -502,7 +502,8 @@ fail:
 static int adux1020_write_event_config(struct iio_dev *indio_dev,
 				       const struct iio_chan_spec *chan,
 				       enum iio_event_type type,
-				       enum iio_event_direction dir, int state)
+				       enum iio_event_direction dir,
+				       bool state)
 {
 	struct adux1020_data *data = iio_priv(indio_dev);
 	int ret, mask;
diff --git a/drivers/iio/light/apds9300.c b/drivers/iio/light/apds9300.c
index 11f2ab4ca261..95861b2a5b2d 100644
--- a/drivers/iio/light/apds9300.c
+++ b/drivers/iio/light/apds9300.c
@@ -321,7 +321,7 @@ static int apds9300_read_interrupt_config(struct iio_dev *indio_dev,
 
 static int apds9300_write_interrupt_config(struct iio_dev *indio_dev,
 		const struct iio_chan_spec *chan, enum iio_event_type type,
-		enum iio_event_direction dir, int state)
+		enum iio_event_direction dir, bool state)
 {
 	struct apds9300_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/light/apds9306.c b/drivers/iio/light/apds9306.c
index 079e02be1005..8adc74040db2 100644
--- a/drivers/iio/light/apds9306.c
+++ b/drivers/iio/light/apds9306.c
@@ -1071,7 +1071,7 @@ static int apds9306_write_event_config(struct iio_dev *indio_dev,
 				       const struct iio_chan_spec *chan,
 				       enum iio_event_type type,
 				       enum iio_event_direction dir,
-				       int state)
+				       bool state)
 {
 	struct apds9306_data *data = iio_priv(indio_dev);
 	struct apds9306_regfields *rf = &data->rf;
diff --git a/drivers/iio/light/apds9960.c b/drivers/iio/light/apds9960.c
index 3a56eaae5a68..a7f0cc99f236 100644
--- a/drivers/iio/light/apds9960.c
+++ b/drivers/iio/light/apds9960.c
@@ -757,7 +757,7 @@ static int apds9960_write_event_config(struct iio_dev *indio_dev,
 				       const struct iio_chan_spec *chan,
 				       enum iio_event_type type,
 				       enum iio_event_direction dir,
-				       int state)
+				       bool state)
 {
 	struct apds9960_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/light/bh1745.c b/drivers/iio/light/bh1745.c
index fc6bf062d4f5..23e9f16090cc 100644
--- a/drivers/iio/light/bh1745.c
+++ b/drivers/iio/light/bh1745.c
@@ -638,7 +638,7 @@ static int bh1745_read_event_config(struct iio_dev *indio_dev,
 static int bh1745_write_event_config(struct iio_dev *indio_dev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
-				     enum iio_event_direction dir, int state)
+				     enum iio_event_direction dir, bool state)
 {
 	struct bh1745_data *data = iio_priv(indio_dev);
 	int value;
diff --git a/drivers/iio/light/cm36651.c b/drivers/iio/light/cm36651.c
index a4a1505534c0..ae3fc3299eec 100644
--- a/drivers/iio/light/cm36651.c
+++ b/drivers/iio/light/cm36651.c
@@ -529,7 +529,7 @@ static int cm36651_write_prox_event_config(struct iio_dev *indio_dev,
 					const struct iio_chan_spec *chan,
 					enum iio_event_type type,
 					enum iio_event_direction dir,
-					int state)
+					bool state)
 {
 	struct cm36651_data *cm36651 = iio_priv(indio_dev);
 	int cmd, ret;
diff --git a/drivers/iio/light/gp2ap002.c b/drivers/iio/light/gp2ap002.c
index f8b1d7dd6f5f..d56ee217fe53 100644
--- a/drivers/iio/light/gp2ap002.c
+++ b/drivers/iio/light/gp2ap002.c
@@ -340,7 +340,7 @@ static int gp2ap002_write_event_config(struct iio_dev *indio_dev,
 				       const struct iio_chan_spec *chan,
 				       enum iio_event_type type,
 				       enum iio_event_direction dir,
-				       int state)
+				       bool state)
 {
 	struct gp2ap002 *gp2ap002 = iio_priv(indio_dev);
 
diff --git a/drivers/iio/light/gp2ap020a00f.c b/drivers/iio/light/gp2ap020a00f.c
index 81e718cdeae3..1a352c88598e 100644
--- a/drivers/iio/light/gp2ap020a00f.c
+++ b/drivers/iio/light/gp2ap020a00f.c
@@ -1159,7 +1159,7 @@ static int gp2ap020a00f_write_event_config(struct iio_dev *indio_dev,
 					   const struct iio_chan_spec *chan,
 					   enum iio_event_type type,
 					   enum iio_event_direction dir,
-					   int state)
+					   bool state)
 {
 	struct gp2ap020a00f_data *data = iio_priv(indio_dev);
 	enum gp2ap020a00f_cmd cmd;
diff --git a/drivers/iio/light/iqs621-als.c b/drivers/iio/light/iqs621-als.c
index 6de33feada3a..b9f230210f07 100644
--- a/drivers/iio/light/iqs621-als.c
+++ b/drivers/iio/light/iqs621-als.c
@@ -271,7 +271,7 @@ static int iqs621_als_write_event_config(struct iio_dev *indio_dev,
 					 const struct iio_chan_spec *chan,
 					 enum iio_event_type type,
 					 enum iio_event_direction dir,
-					 int state)
+					 bool state)
 {
 	struct iqs621_als_private *iqs621_als = iio_priv(indio_dev);
 	struct iqs62x_core *iqs62x = iqs621_als->iqs62x;
diff --git a/drivers/iio/light/ltr390.c b/drivers/iio/light/ltr390.c
index 3bdffb6360bc..df664f360903 100644
--- a/drivers/iio/light/ltr390.c
+++ b/drivers/iio/light/ltr390.c
@@ -553,7 +553,7 @@ static int ltr390_write_event_config(struct iio_dev *indio_dev,
 				const struct iio_chan_spec *chan,
 				enum iio_event_type type,
 				enum iio_event_direction dir,
-				int state)
+				bool state)
 {
 	struct ltr390_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/light/ltr501.c b/drivers/iio/light/ltr501.c
index 616dc6921702..604f5f900a2e 100644
--- a/drivers/iio/light/ltr501.c
+++ b/drivers/iio/light/ltr501.c
@@ -1077,7 +1077,7 @@ static int ltr501_read_event_config(struct iio_dev *indio_dev,
 static int ltr501_write_event_config(struct iio_dev *indio_dev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
-				     enum iio_event_direction dir, int state)
+				     enum iio_event_direction dir, bool state)
 {
 	struct ltr501_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/light/max44009.c b/drivers/iio/light/max44009.c
index 3b92362675dc..8cd7f5664e5b 100644
--- a/drivers/iio/light/max44009.c
+++ b/drivers/iio/light/max44009.c
@@ -422,7 +422,7 @@ static int max44009_write_event_config(struct iio_dev *indio_dev,
 				       const struct iio_chan_spec *chan,
 				       enum iio_event_type type,
 				       enum iio_event_direction dir,
-				       int state)
+				       bool state)
 {
 	struct max44009_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/light/opt3001.c b/drivers/iio/light/opt3001.c
index ff7fc0d4b08f..65b295877b41 100644
--- a/drivers/iio/light/opt3001.c
+++ b/drivers/iio/light/opt3001.c
@@ -634,7 +634,7 @@ static int opt3001_read_event_config(struct iio_dev *iio,
 
 static int opt3001_write_event_config(struct iio_dev *iio,
 		const struct iio_chan_spec *chan, enum iio_event_type type,
-		enum iio_event_direction dir, int state)
+		enum iio_event_direction dir, bool state)
 {
 	struct opt3001 *opt = iio_priv(iio);
 	int ret;
diff --git a/drivers/iio/light/stk3310.c b/drivers/iio/light/stk3310.c
index c6f950af5afa..b81cc44db43c 100644
--- a/drivers/iio/light/stk3310.c
+++ b/drivers/iio/light/stk3310.c
@@ -324,7 +324,7 @@ static int stk3310_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
 				      enum iio_event_direction dir,
-				      int state)
+				      bool state)
 {
 	int ret;
 	struct stk3310_data *data = iio_priv(indio_dev);
diff --git a/drivers/iio/light/tcs3472.c b/drivers/iio/light/tcs3472.c
index 04452b4664f3..4186aac04902 100644
--- a/drivers/iio/light/tcs3472.c
+++ b/drivers/iio/light/tcs3472.c
@@ -327,7 +327,7 @@ static int tcs3472_read_event_config(struct iio_dev *indio_dev,
 
 static int tcs3472_write_event_config(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, enum iio_event_type type,
-	enum iio_event_direction dir, int state)
+	enum iio_event_direction dir, bool state)
 {
 	struct tcs3472_data *data = iio_priv(indio_dev);
 	int ret = 0;
diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c
index 1a6f514bced6..f1fe7640fce6 100644
--- a/drivers/iio/light/tsl2563.c
+++ b/drivers/iio/light/tsl2563.c
@@ -630,7 +630,7 @@ static irqreturn_t tsl2563_event_handler(int irq, void *private)
 
 static int tsl2563_write_interrupt_config(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, enum iio_event_type type,
-	enum iio_event_direction dir, int state)
+	enum iio_event_direction dir, bool state)
 {
 	struct tsl2563_chip *chip = iio_priv(indio_dev);
 	int ret = 0;
diff --git a/drivers/iio/light/tsl2591.c b/drivers/iio/light/tsl2591.c
index 850c2465992f..b81ca6f73f92 100644
--- a/drivers/iio/light/tsl2591.c
+++ b/drivers/iio/light/tsl2591.c
@@ -985,7 +985,7 @@ static int tsl2591_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
 				      enum iio_event_direction dir,
-				      int state)
+				      bool state)
 {
 	struct tsl2591_chip *chip = iio_priv(indio_dev);
 	struct i2c_client *client = chip->client;
diff --git a/drivers/iio/light/tsl2772.c b/drivers/iio/light/tsl2772.c
index 26082f239c4c..349afdcbe30d 100644
--- a/drivers/iio/light/tsl2772.c
+++ b/drivers/iio/light/tsl2772.c
@@ -1081,7 +1081,7 @@ static int tsl2772_write_interrupt_config(struct iio_dev *indio_dev,
 					  const struct iio_chan_spec *chan,
 					  enum iio_event_type type,
 					  enum iio_event_direction dir,
-					  int val)
+					  bool val)
 {
 	struct tsl2772_chip *chip = iio_priv(indio_dev);
 
diff --git a/drivers/iio/light/us5182d.c b/drivers/iio/light/us5182d.c
index de6967ac3b0b..c83114aed6b2 100644
--- a/drivers/iio/light/us5182d.c
+++ b/drivers/iio/light/us5182d.c
@@ -627,7 +627,7 @@ static int us5182d_read_event_config(struct iio_dev *indio_dev,
 
 static int us5182d_write_event_config(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, enum iio_event_type type,
-	enum iio_event_direction dir, int state)
+	enum iio_event_direction dir, bool state)
 {
 	struct us5182d_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/light/vcnl4000.c b/drivers/iio/light/vcnl4000.c
index 4e3641ff2ed4..e19199b17f2e 100644
--- a/drivers/iio/light/vcnl4000.c
+++ b/drivers/iio/light/vcnl4000.c
@@ -1456,7 +1456,7 @@ static int vcnl4010_write_event_config(struct iio_dev *indio_dev,
 				       const struct iio_chan_spec *chan,
 				       enum iio_event_type type,
 				       enum iio_event_direction dir,
-				       int state)
+				       bool state)
 {
 	switch (chan->type) {
 	case IIO_PROXIMITY:
@@ -1501,7 +1501,8 @@ static int vcnl4040_read_event_config(struct iio_dev *indio_dev,
 static int vcnl4040_write_event_config(struct iio_dev *indio_dev,
 				       const struct iio_chan_spec *chan,
 				       enum iio_event_type type,
-				       enum iio_event_direction dir, int state)
+				       enum iio_event_direction dir,
+				       bool state)
 {
 	int ret = -EINVAL;
 	u16 val, mask;
diff --git a/drivers/iio/light/veml6030.c b/drivers/iio/light/veml6030.c
index 95751c101590..208a040ee345 100644
--- a/drivers/iio/light/veml6030.c
+++ b/drivers/iio/light/veml6030.c
@@ -821,7 +821,7 @@ static int veml6030_read_interrupt_config(struct iio_dev *indio_dev,
  */
 static int veml6030_write_interrupt_config(struct iio_dev *indio_dev,
 		const struct iio_chan_spec *chan, enum iio_event_type type,
-		enum iio_event_direction dir, int state)
+		enum iio_event_direction dir, bool state)
 {
 	int ret;
 	struct veml6030_data *data = iio_priv(indio_dev);
diff --git a/drivers/iio/position/iqs624-pos.c b/drivers/iio/position/iqs624-pos.c
index 4d7452314209..8239239c6ee2 100644
--- a/drivers/iio/position/iqs624-pos.c
+++ b/drivers/iio/position/iqs624-pos.c
@@ -181,7 +181,7 @@ static int iqs624_pos_write_event_config(struct iio_dev *indio_dev,
 					 const struct iio_chan_spec *chan,
 					 enum iio_event_type type,
 					 enum iio_event_direction dir,
-					 int state)
+					 bool state)
 {
 	struct iqs624_pos_private *iqs624_pos = iio_priv(indio_dev);
 	struct iqs62x_core *iqs62x = iqs624_pos->iqs62x;
diff --git a/drivers/iio/proximity/aw96103.c b/drivers/iio/proximity/aw96103.c
index 707ba0a510aa..cdd254da9e50 100644
--- a/drivers/iio/proximity/aw96103.c
+++ b/drivers/iio/proximity/aw96103.c
@@ -422,7 +422,7 @@ static int aw96103_read_event_config(struct iio_dev *indio_dev,
 static int aw96103_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
-				      enum iio_event_direction dir, int state)
+				      enum iio_event_direction dir, bool state)
 {
 	struct aw96103 *aw96103 = iio_priv(indio_dev);
 
diff --git a/drivers/iio/proximity/cros_ec_mkbp_proximity.c b/drivers/iio/proximity/cros_ec_mkbp_proximity.c
index b1a4a923e788..667369be0555 100644
--- a/drivers/iio/proximity/cros_ec_mkbp_proximity.c
+++ b/drivers/iio/proximity/cros_ec_mkbp_proximity.c
@@ -167,7 +167,7 @@ static int cros_ec_mkbp_proximity_read_event_config(struct iio_dev *indio_dev,
 static int cros_ec_mkbp_proximity_write_event_config(struct iio_dev *indio_dev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
-				     enum iio_event_direction dir, int state)
+				     enum iio_event_direction dir, bool state)
 {
 	struct cros_ec_mkbp_proximity_data *data = iio_priv(indio_dev);
 
diff --git a/drivers/iio/proximity/hx9023s.c b/drivers/iio/proximity/hx9023s.c
index 38441b1ee040..4021feb7a7ac 100644
--- a/drivers/iio/proximity/hx9023s.c
+++ b/drivers/iio/proximity/hx9023s.c
@@ -874,7 +874,7 @@ static int hx9023s_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
 				      enum iio_event_direction dir,
-				      int state)
+				      bool state)
 {
 	struct hx9023s_data *data = iio_priv(indio_dev);
 
diff --git a/drivers/iio/proximity/irsd200.c b/drivers/iio/proximity/irsd200.c
index fb0691da99ee..b09d15230111 100644
--- a/drivers/iio/proximity/irsd200.c
+++ b/drivers/iio/proximity/irsd200.c
@@ -648,7 +648,8 @@ static int irsd200_read_event_config(struct iio_dev *indio_dev,
 static int irsd200_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
-				      enum iio_event_direction dir, int state)
+				      enum iio_event_direction dir,
+				      bool state)
 {
 	struct irsd200_data *data = iio_priv(indio_dev);
 	unsigned int tmp;
diff --git a/drivers/iio/proximity/sx9500.c b/drivers/iio/proximity/sx9500.c
index e3da709424d5..c4e94d0fb163 100644
--- a/drivers/iio/proximity/sx9500.c
+++ b/drivers/iio/proximity/sx9500.c
@@ -540,7 +540,7 @@ static int sx9500_write_event_config(struct iio_dev *indio_dev,
 				     const struct iio_chan_spec *chan,
 				     enum iio_event_type type,
 				     enum iio_event_direction dir,
-				     int state)
+				     bool state)
 {
 	struct sx9500_data *data = iio_priv(indio_dev);
 	int ret;
diff --git a/drivers/iio/proximity/sx_common.c b/drivers/iio/proximity/sx_common.c
index bcf502e02342..76384c74fe01 100644
--- a/drivers/iio/proximity/sx_common.c
+++ b/drivers/iio/proximity/sx_common.c
@@ -268,7 +268,7 @@ EXPORT_SYMBOL_NS_GPL(sx_common_read_event_config, SEMTECH_PROX);
 int sx_common_write_event_config(struct iio_dev *indio_dev,
 				 const struct iio_chan_spec *chan,
 				 enum iio_event_type type,
-				 enum iio_event_direction dir, int state)
+				 enum iio_event_direction dir, bool state)
 {
 	struct sx_common_data *data = iio_priv(indio_dev);
 	unsigned int eventirq = SX_COMMON_FAR_IRQ | SX_COMMON_CLOSE_IRQ;
diff --git a/drivers/iio/proximity/sx_common.h b/drivers/iio/proximity/sx_common.h
index da53268201a9..fb14e6f06a6d 100644
--- a/drivers/iio/proximity/sx_common.h
+++ b/drivers/iio/proximity/sx_common.h
@@ -143,7 +143,7 @@ int sx_common_read_event_config(struct iio_dev *indio_dev,
 int sx_common_write_event_config(struct iio_dev *indio_dev,
 				 const struct iio_chan_spec *chan,
 				 enum iio_event_type type,
-				 enum iio_event_direction dir, int state);
+				 enum iio_event_direction dir, bool state);
 
 int sx_common_probe(struct i2c_client *client,
 		    const struct sx_common_chip_info *chip_info,
diff --git a/drivers/iio/proximity/vcnl3020.c b/drivers/iio/proximity/vcnl3020.c
index d1ddf85f5383..bb6c9cc88b35 100644
--- a/drivers/iio/proximity/vcnl3020.c
+++ b/drivers/iio/proximity/vcnl3020.c
@@ -449,7 +449,7 @@ static int vcnl3020_write_event_config(struct iio_dev *indio_dev,
 				       const struct iio_chan_spec *chan,
 				       enum iio_event_type type,
 				       enum iio_event_direction dir,
-				       int state)
+				       bool state)
 {
 	switch (chan->type) {
 	case IIO_PROXIMITY:
diff --git a/drivers/iio/temperature/mcp9600.c b/drivers/iio/temperature/mcp9600.c
index f1bb0976273d..c2447860adfd 100644
--- a/drivers/iio/temperature/mcp9600.c
+++ b/drivers/iio/temperature/mcp9600.c
@@ -200,7 +200,7 @@ static int mcp9600_write_event_config(struct iio_dev *indio_dev,
 				      const struct iio_chan_spec *chan,
 				      enum iio_event_type type,
 				      enum iio_event_direction dir,
-				      int state)
+				      bool state)
 {
 	struct mcp9600_data *data = iio_priv(indio_dev);
 	struct i2c_client *client = data->client;
diff --git a/drivers/iio/temperature/tmp007.c b/drivers/iio/temperature/tmp007.c
index 9bdfa9423492..fd4d389ce1df 100644
--- a/drivers/iio/temperature/tmp007.c
+++ b/drivers/iio/temperature/tmp007.c
@@ -216,7 +216,7 @@ static irqreturn_t tmp007_interrupt_handler(int irq, void *private)
 
 static int tmp007_write_event_config(struct iio_dev *indio_dev,
 		const struct iio_chan_spec *chan, enum iio_event_type type,
-		enum iio_event_direction dir, int state)
+		enum iio_event_direction dir, bool state)
 {
 	struct tmp007_data *data = iio_priv(indio_dev);
 	unsigned int status_mask;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 5c6682bd4cb9..59c58f455311 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -514,7 +514,7 @@ struct iio_info {
 				  const struct iio_chan_spec *chan,
 				  enum iio_event_type type,
 				  enum iio_event_direction dir,
-				  int state);
+				  bool state);
 
 	int (*read_event_value)(struct iio_dev *indio_dev,
 				const struct iio_chan_spec *chan,
-- 
cgit v1.2.3


From 5d8173b8493151d32b99ec6732fb29c58256a7c8 Mon Sep 17 00:00:00 2001
From: Julien Stephan <jstephan@baylibre.com>
Date: Mon, 28 Oct 2024 17:38:11 +0100
Subject: iio: events.h: add event identifier macros for differential channel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, there are 3 helper macros in iio/events.h to create event
identifiers:
- IIO_EVENT_CODE : create generic event identifier for differential and non
  differential channels
- IIO_MOD_EVENT_CODE : create event identifier for modified (non
  differential) channels
- IIO_UNMOD_EVENT_CODE : create event identifier for unmodified (non
  differential) channels

For differential channels, drivers are expected to use IIO_EVENT_CODE.
However, only one driver in drivers/iio currently uses it correctly,
leading to inconsistent event identifiers for differential channels that
don’t match the intended attributes (such as max1363.c that supports
differential channels, but only uses IIO_UNMOD_EVENT_CODE).

To prevent such issues in future drivers, a new helper macro,
IIO_DIFF_EVENT_CODE, is introduced to specifically create event identifiers
for differential channels. Only one helper is needed for differential
channels since they cannot have modifiers.

Additionally, the descriptions for IIO_MOD_EVENT_CODE and
IIO_UNMOD_EVENT_CODE have been updated to clarify that they are intended
for non-differential channels,

Signed-off-by: Julien Stephan <jstephan@baylibre.com>
Reviewed-by: David Lechner <dlechner@baylibre.com>
Link: https://patch.msgid.link/20241028-iio-add-macro-for-even-identifier-for-differential-channels-v1-1-b452c90f7ea6@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/events.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/events.h b/include/linux/iio/events.h
index a4558c45a548..eeaba5e1525e 100644
--- a/include/linux/iio/events.h
+++ b/include/linux/iio/events.h
@@ -30,7 +30,8 @@
 
 
 /**
- * IIO_MOD_EVENT_CODE() - create event identifier for modified channels
+ * IIO_MOD_EVENT_CODE() - create event identifier for modified (non
+ * differential) channels
  * @chan_type:	Type of the channel. Should be one of enum iio_chan_type.
  * @number:	Channel number.
  * @modifier:	Modifier for the channel. Should be one of enum iio_modifier.
@@ -43,7 +44,8 @@
 	IIO_EVENT_CODE(chan_type, 0, modifier, direction, type, number, 0, 0)
 
 /**
- * IIO_UNMOD_EVENT_CODE() - create event identifier for unmodified channels
+ * IIO_UNMOD_EVENT_CODE() - create event identifier for unmodified (non
+ * differential) channels
  * @chan_type:	Type of the channel. Should be one of enum iio_chan_type.
  * @number:	Channel number.
  * @type:	Type of the event. Should be one of enum iio_event_type.
@@ -53,4 +55,16 @@
 #define IIO_UNMOD_EVENT_CODE(chan_type, number, type, direction)	\
 	IIO_EVENT_CODE(chan_type, 0, 0, direction, type, number, 0, 0)
 
+/**
+ * IIO_DIFF_EVENT_CODE() - create event identifier for differential channels
+ * @chan_type:	Type of the channel. Should be one of enum iio_chan_type.
+ * @chan1:	First channel number for differential channels.
+ * @chan2:	Second channel number for differential channels.
+ * @type:	Type of the event. Should be one of enum iio_event_type.
+ * @direction:	Direction of the event. One of enum iio_event_direction.
+ */
+
+#define IIO_DIFF_EVENT_CODE(chan_type, chan1, chan2, type, direction)	\
+	IIO_EVENT_CODE(chan_type, 1, 0, direction, type, 0, chan1, chan2)
+
 #endif
-- 
cgit v1.2.3


From 01f567d22152dfa8799e9fde5f18bbb5650d8681 Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Fri, 1 Nov 2024 17:17:10 -0500
Subject: iio: events: make IIO_EVENT_CODE macro private

Make IIO_EVENT_CODE "private" by adding a leading underscore.

There are no more users of this macro in the kernel so we can make it
"private" and encourage developers to use the specialized versions of
the macro instead.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Link: https://patch.msgid.link/20241101-iio-fix-event-macro-use-v1-3-0000c5d09f6d@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/events.h | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/events.h b/include/linux/iio/events.h
index eeaba5e1525e..72062a0c7c87 100644
--- a/include/linux/iio/events.h
+++ b/include/linux/iio/events.h
@@ -10,7 +10,7 @@
 #include <uapi/linux/iio/events.h>
 
 /**
- * IIO_EVENT_CODE() - create event identifier
+ * _IIO_EVENT_CODE() - create event identifier
  * @chan_type:	Type of the channel. Should be one of enum iio_chan_type.
  * @diff:	Whether the event is for an differential channel or not.
  * @modifier:	Modifier for the channel. Should be one of enum iio_modifier.
@@ -19,10 +19,13 @@
  * @chan:	Channel number for non-differential channels.
  * @chan1:	First channel number for differential channels.
  * @chan2:	Second channel number for differential channels.
+ *
+ * Drivers should use the specialized macros below instead of using this one
+ * directly.
  */
 
-#define IIO_EVENT_CODE(chan_type, diff, modifier, direction,		\
-		       type, chan, chan1, chan2)			\
+#define _IIO_EVENT_CODE(chan_type, diff, modifier, direction,		\
+			type, chan, chan1, chan2)			\
 	(((u64)type << 56) | ((u64)diff << 55) |			\
 	 ((u64)direction << 48) | ((u64)modifier << 40) |		\
 	 ((u64)chan_type << 32) | (((u16)chan2) << 16) | ((u16)chan1) | \
@@ -41,7 +44,7 @@
 
 #define IIO_MOD_EVENT_CODE(chan_type, number, modifier,		\
 			   type, direction)				\
-	IIO_EVENT_CODE(chan_type, 0, modifier, direction, type, number, 0, 0)
+	_IIO_EVENT_CODE(chan_type, 0, modifier, direction, type, number, 0, 0)
 
 /**
  * IIO_UNMOD_EVENT_CODE() - create event identifier for unmodified (non
@@ -53,7 +56,7 @@
  */
 
 #define IIO_UNMOD_EVENT_CODE(chan_type, number, type, direction)	\
-	IIO_EVENT_CODE(chan_type, 0, 0, direction, type, number, 0, 0)
+	_IIO_EVENT_CODE(chan_type, 0, 0, direction, type, number, 0, 0)
 
 /**
  * IIO_DIFF_EVENT_CODE() - create event identifier for differential channels
@@ -65,6 +68,6 @@
  */
 
 #define IIO_DIFF_EVENT_CODE(chan_type, chan1, chan2, type, direction)	\
-	IIO_EVENT_CODE(chan_type, 1, 0, direction, type, 0, chan1, chan2)
+	_IIO_EVENT_CODE(chan_type, 1, 0, direction, type, 0, chan1, chan2)
 
 #endif
-- 
cgit v1.2.3


From a865276872ec4f129f8a582634be82dcc275dc2a Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Wed, 30 Oct 2024 18:23:25 -0600
Subject: dim: make dim_calc_stats() inputs const pointers

Make the start and end arguments to dim_calc_stats() const pointers
to clarify that the function does not modify their values.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Arthur Kiyanovski <akiyano@amazon.com>
Link: https://patch.msgid.link/20241031002326.3426181-1-csander@purestorage.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dim.h | 3 ++-
 lib/dim/dim.c       | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dim.h b/include/linux/dim.h
index 1b581ff25a15..84579a50ae7f 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -351,7 +351,8 @@ void dim_park_tired(struct dim *dim);
  * Takes into consideration counter wrap-around.
  * Returned boolean indicates whether curr_stats are reliable.
  */
-bool dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
+bool dim_calc_stats(const struct dim_sample *start,
+		    const struct dim_sample *end,
 		    struct dim_stats *curr_stats);
 
 /**
diff --git a/lib/dim/dim.c b/lib/dim/dim.c
index 83b65ac74d73..97c3d084ebf0 100644
--- a/lib/dim/dim.c
+++ b/lib/dim/dim.c
@@ -54,7 +54,8 @@ void dim_park_tired(struct dim *dim)
 }
 EXPORT_SYMBOL(dim_park_tired);
 
-bool dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
+bool dim_calc_stats(const struct dim_sample *start,
+		    const struct dim_sample *end,
 		    struct dim_stats *curr_stats)
 {
 	/* u32 holds up to 71 minutes, should be enough */
-- 
cgit v1.2.3


From 61bf0009a7657d394d942c8ee961b9ea5f2168fe Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Wed, 30 Oct 2024 18:23:26 -0600
Subject: dim: pass dim_sample to net_dim() by reference

net_dim() is currently passed a struct dim_sample argument by value.
struct dim_sample is 24 bytes. Since this is greater 16 bytes, x86-64
passes it on the stack. All callers have already initialized dim_sample
on the stack, so passing it by value requires pushing a duplicated copy
to the stack. Either witing to the stack and immediately reading it, or
perhaps dereferencing addresses relative to the stack pointer in a chain
of push instructions, seems to perform quite poorly.

In a heavy TCP workload, mlx5e_handle_rx_dim() consumes 3% of CPU time,
94% of which is attributed to the first push instruction to copy
dim_sample on the stack for the call to net_dim():
// Call ktime_get()
  0.26 |4ead2:   call   4ead7 <mlx5e_handle_rx_dim+0x47>
// Pass the address of struct dim in %rdi
       |4ead7:   lea    0x3d0(%rbx),%rdi
// Set dim_sample.pkt_ctr
       |4eade:   mov    %r13d,0x8(%rsp)
// Set dim_sample.byte_ctr
       |4eae3:   mov    %r12d,0xc(%rsp)
// Set dim_sample.event_ctr
  0.15 |4eae8:   mov    %bp,0x10(%rsp)
// Duplicate dim_sample on the stack
 94.16 |4eaed:   push   0x10(%rsp)
  2.79 |4eaf1:   push   0x10(%rsp)
  0.07 |4eaf5:   push   %rax
// Call net_dim()
  0.21 |4eaf6:   call   4eafb <mlx5e_handle_rx_dim+0x6b>

To allow the caller to reuse the struct dim_sample already on the stack,
pass the struct dim_sample by reference to net_dim().

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Shannon Nelson <shannon.nelson@amd.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Arthur Kiyanovski <akiyano@amazon.com>
Reviewed-by: Louis Peens <louis.peens@corigine.com>
Link: https://patch.msgid.link/20241031002326.3426181-2-csander@purestorage.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/net_dim.rst                   |  2 +-
 drivers/net/ethernet/amazon/ena/ena_netdev.c           |  2 +-
 drivers/net/ethernet/broadcom/bcmsysport.c             |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c              |  4 ++--
 drivers/net/ethernet/broadcom/genet/bcmgenet.c         |  2 +-
 drivers/net/ethernet/freescale/enetc/enetc.c           |  2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c        |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c              |  4 ++--
 drivers/net/ethernet/intel/idpf/idpf_txrx.c            |  4 ++--
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c |  2 +-
 drivers/net/ethernet/mediatek/mtk_eth_soc.c            |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c      |  4 ++--
 drivers/net/ethernet/netronome/nfp/nfd3/dp.c           |  4 ++--
 drivers/net/ethernet/netronome/nfp/nfdk/dp.c           |  4 ++--
 drivers/net/ethernet/pensando/ionic/ionic_txrx.c       |  2 +-
 drivers/net/virtio_net.c                               |  2 +-
 drivers/soc/fsl/dpio/dpio-service.c                    |  2 +-
 include/linux/dim.h                                    |  2 +-
 lib/dim/net_dim.c                                      | 10 +++++-----
 19 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/net_dim.rst b/Documentation/networking/net_dim.rst
index 8908fd7b0a8d..4377998e6826 100644
--- a/Documentation/networking/net_dim.rst
+++ b/Documentation/networking/net_dim.rst
@@ -156,7 +156,7 @@ usage is not complete but it should make the outline of the usage clear.
 		          my_entity->bytes,
 		          &dim_sample);
 	/* Call net DIM */
-	net_dim(&my_entity->dim, dim_sample);
+	net_dim(&my_entity->dim, &dim_sample);
 	...
   }
 
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 96df20854eb9..63c8a2328142 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -1383,7 +1383,7 @@ static void ena_adjust_adaptive_rx_intr_moderation(struct ena_napi *ena_napi)
 			  rx_ring->rx_stats.bytes,
 			  &dim_sample);
 
-	net_dim(&ena_napi->dim, dim_sample);
+	net_dim(&ena_napi->dim, &dim_sample);
 
 	rx_ring->per_napi_packets = 0;
 }
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index caff6e87a488..031e9e0cca53 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1029,7 +1029,7 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
 	if (priv->dim.use_dim) {
 		dim_update_sample(priv->dim.event_ctr, priv->dim.packets,
 				  priv->dim.bytes, &dim_sample);
-		net_dim(&priv->dim.dim, dim_sample);
+		net_dim(&priv->dim.dim, &dim_sample);
 	}
 
 	return work_done;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 6dd6541d8619..ca42b81133d7 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3102,7 +3102,7 @@ static int bnxt_poll(struct napi_struct *napi, int budget)
 				  cpr->rx_packets,
 				  cpr->rx_bytes,
 				  &dim_sample);
-		net_dim(&cpr->dim, dim_sample);
+		net_dim(&cpr->dim, &dim_sample);
 	}
 	return work_done;
 }
@@ -3233,7 +3233,7 @@ poll_done:
 				  cpr_rx->rx_packets,
 				  cpr_rx->rx_bytes,
 				  &dim_sample);
-		net_dim(&cpr->dim, dim_sample);
+		net_dim(&cpr->dim, &dim_sample);
 	}
 	return work_done;
 }
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 10966ab15373..53a949eb9180 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -2405,7 +2405,7 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 	if (ring->dim.use_dim) {
 		dim_update_sample(ring->dim.event_ctr, ring->dim.packets,
 				  ring->dim.bytes, &dim_sample);
-		net_dim(&ring->dim.dim, dim_sample);
+		net_dim(&ring->dim.dim, &dim_sample);
 	}
 
 	return work_done;
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
index c09370eab319..05dedea6185a 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -718,7 +718,7 @@ static void enetc_rx_net_dim(struct enetc_int_vector *v)
 			  v->rx_ring.stats.packets,
 			  v->rx_ring.stats.bytes,
 			  &dim_sample);
-	net_dim(&v->rx_dim, dim_sample);
+	net_dim(&v->rx_dim, &dim_sample);
 }
 
 static int enetc_bd_ready_count(struct enetc_bdr *tx_ring, int ci)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index b09f0cca34dc..fbfd3ee5648f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -4478,7 +4478,7 @@ static void hns3_update_rx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector)
 
 	dim_update_sample(tqp_vector->event_cnt, rx_group->total_packets,
 			  rx_group->total_bytes, &sample);
-	net_dim(&rx_group->dim, sample);
+	net_dim(&rx_group->dim, &sample);
 }
 
 static void hns3_update_tx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector)
@@ -4491,7 +4491,7 @@ static void hns3_update_tx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector)
 
 	dim_update_sample(tqp_vector->event_cnt, tx_group->total_packets,
 			  tx_group->total_bytes, &sample);
-	net_dim(&tx_group->dim, sample);
+	net_dim(&tx_group->dim, &sample);
 }
 
 static int hns3_nic_common_poll(struct napi_struct *napi, int budget)
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8208055d6e7f..5d2d7736fd5f 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -1352,14 +1352,14 @@ static void ice_net_dim(struct ice_q_vector *q_vector)
 		struct dim_sample dim_sample;
 
 		__ice_update_sample(q_vector, tx, &dim_sample, true);
-		net_dim(&tx->dim, dim_sample);
+		net_dim(&tx->dim, &dim_sample);
 	}
 
 	if (ITR_IS_DYNAMIC(rx)) {
 		struct dim_sample dim_sample;
 
 		__ice_update_sample(q_vector, rx, &dim_sample, false);
-		net_dim(&rx->dim, dim_sample);
+		net_dim(&rx->dim, &dim_sample);
 	}
 }
 
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
index d4e6f0e10487..da2a5becf62f 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
@@ -3679,7 +3679,7 @@ static void idpf_net_dim(struct idpf_q_vector *q_vector)
 
 	idpf_update_dim_sample(q_vector, &dim_sample, &q_vector->tx_dim,
 			       packets, bytes);
-	net_dim(&q_vector->tx_dim, dim_sample);
+	net_dim(&q_vector->tx_dim, &dim_sample);
 
 check_rx_itr:
 	if (!IDPF_ITR_IS_DYNAMIC(q_vector->rx_intr_mode))
@@ -3698,7 +3698,7 @@ check_rx_itr:
 
 	idpf_update_dim_sample(q_vector, &dim_sample, &q_vector->rx_dim,
 			       packets, bytes);
-	net_dim(&q_vector->rx_dim, dim_sample);
+	net_dim(&q_vector->rx_dim, &dim_sample);
 }
 
 /**
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
index 933e18ba2fb2..7aaf32e9aa95 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
@@ -527,7 +527,7 @@ static void otx2_adjust_adaptive_coalese(struct otx2_nic *pfvf, struct otx2_cq_p
 			  rx_frames + tx_frames,
 			  rx_bytes + tx_bytes,
 			  &dim_sample);
-	net_dim(&cq_poll->dim, dim_sample);
+	net_dim(&cq_poll->dim, &dim_sample);
 }
 
 int otx2_napi_handler(struct napi_struct *napi, int budget)
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index f01ceee5f02d..53485142938c 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -2227,7 +2227,7 @@ rx_done:
 	eth->rx_bytes += bytes;
 	dim_update_sample(eth->rx_events, eth->rx_packets, eth->rx_bytes,
 			  &dim_sample);
-	net_dim(&eth->rx_dim, dim_sample);
+	net_dim(&eth->rx_dim, &dim_sample);
 
 	if (xdp_flush)
 		xdp_do_flush();
@@ -2377,7 +2377,7 @@ static int mtk_poll_tx(struct mtk_eth *eth, int budget)
 
 	dim_update_sample(eth->tx_events, eth->tx_packets, eth->tx_bytes,
 			  &dim_sample);
-	net_dim(&eth->tx_dim, dim_sample);
+	net_dim(&eth->tx_dim, &dim_sample);
 
 	if (mtk_queue_stopped(eth) &&
 	    (atomic_read(&ring->free_count) > ring->thresh))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 5873fde65c2e..417098f0b2bb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -55,7 +55,7 @@ static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)
 		return;
 
 	dim_update_sample(sq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
-	net_dim(sq->dim, dim_sample);
+	net_dim(sq->dim, &dim_sample);
 }
 
 static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq)
@@ -67,7 +67,7 @@ static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq)
 		return;
 
 	dim_update_sample(rq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
-	net_dim(rq->dim, dim_sample);
+	net_dim(rq->dim, &dim_sample);
 }
 
 void mlx5e_trigger_irq(struct mlx5e_icosq *sq)
diff --git a/drivers/net/ethernet/netronome/nfp/nfd3/dp.c b/drivers/net/ethernet/netronome/nfp/nfd3/dp.c
index d215efc6cad0..f1c6c47564b1 100644
--- a/drivers/net/ethernet/netronome/nfp/nfd3/dp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfd3/dp.c
@@ -1179,7 +1179,7 @@ int nfp_nfd3_poll(struct napi_struct *napi, int budget)
 		} while (u64_stats_fetch_retry(&r_vec->rx_sync, start));
 
 		dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample);
-		net_dim(&r_vec->rx_dim, dim_sample);
+		net_dim(&r_vec->rx_dim, &dim_sample);
 	}
 
 	if (r_vec->nfp_net->tx_coalesce_adapt_on && r_vec->tx_ring) {
@@ -1194,7 +1194,7 @@ int nfp_nfd3_poll(struct napi_struct *napi, int budget)
 		} while (u64_stats_fetch_retry(&r_vec->tx_sync, start));
 
 		dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample);
-		net_dim(&r_vec->tx_dim, dim_sample);
+		net_dim(&r_vec->tx_dim, &dim_sample);
 	}
 
 	return pkts_polled;
diff --git a/drivers/net/ethernet/netronome/nfp/nfdk/dp.c b/drivers/net/ethernet/netronome/nfp/nfdk/dp.c
index dae5af7d1845..ebeb6ab4465c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfdk/dp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfdk/dp.c
@@ -1289,7 +1289,7 @@ int nfp_nfdk_poll(struct napi_struct *napi, int budget)
 		} while (u64_stats_fetch_retry(&r_vec->rx_sync, start));
 
 		dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample);
-		net_dim(&r_vec->rx_dim, dim_sample);
+		net_dim(&r_vec->rx_dim, &dim_sample);
 	}
 
 	if (r_vec->nfp_net->tx_coalesce_adapt_on && r_vec->tx_ring) {
@@ -1304,7 +1304,7 @@ int nfp_nfdk_poll(struct napi_struct *napi, int budget)
 		} while (u64_stats_fetch_retry(&r_vec->tx_sync, start));
 
 		dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample);
-		net_dim(&r_vec->tx_dim, dim_sample);
+		net_dim(&r_vec->tx_dim, &dim_sample);
 	}
 
 	return pkts_polled;
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
index 0eeda7e502db..2ac59564ded1 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
@@ -928,7 +928,7 @@ static void ionic_dim_update(struct ionic_qcq *qcq, int napi_mode)
 	dim_update_sample(qcq->cq.bound_intr->rearm_count,
 			  pkts, bytes, &dim_sample);
 
-	net_dim(&qcq->dim, dim_sample);
+	net_dim(&qcq->dim, &dim_sample);
 }
 
 int ionic_tx_napi(struct napi_struct *napi, int budget)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 792e9eadbfc3..869586c17ffd 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2804,7 +2804,7 @@ static void virtnet_rx_dim_update(struct virtnet_info *vi, struct receive_queue
 			  u64_stats_read(&rq->stats.bytes),
 			  &cur_sample);
 
-	net_dim(&rq->dim, cur_sample);
+	net_dim(&rq->dim, &cur_sample);
 	rq->packets_in_napi = 0;
 }
 
diff --git a/drivers/soc/fsl/dpio/dpio-service.c b/drivers/soc/fsl/dpio/dpio-service.c
index b811446e0fa5..0b60ed16297c 100644
--- a/drivers/soc/fsl/dpio/dpio-service.c
+++ b/drivers/soc/fsl/dpio/dpio-service.c
@@ -891,7 +891,7 @@ void dpaa2_io_update_net_dim(struct dpaa2_io *d, __u64 frames, __u64 bytes)
 	d->frames += frames;
 
 	dim_update_sample(d->event_ctr, d->frames, d->bytes, &dim_sample);
-	net_dim(&d->rx_dim, dim_sample);
+	net_dim(&d->rx_dim, &dim_sample);
 
 	spin_unlock(&d->dim_lock);
 }
diff --git a/include/linux/dim.h b/include/linux/dim.h
index 84579a50ae7f..06543fd40fcc 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -425,7 +425,7 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode);
  * This is the main logic of the algorithm, where data is processed in order
  * to decide on next required action.
  */
-void net_dim(struct dim *dim, struct dim_sample end_sample);
+void net_dim(struct dim *dim, const struct dim_sample *end_sample);
 
 /* RDMA DIM */
 
diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c
index d7e7028e9b19..d6aa09a979b3 100644
--- a/lib/dim/net_dim.c
+++ b/lib/dim/net_dim.c
@@ -347,7 +347,7 @@ static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim)
 	return dim->profile_ix != prev_ix;
 }
 
-void net_dim(struct dim *dim, struct dim_sample end_sample)
+void net_dim(struct dim *dim, const struct dim_sample *end_sample)
 {
 	struct dim_stats curr_stats;
 	u16 nevents;
@@ -355,11 +355,11 @@ void net_dim(struct dim *dim, struct dim_sample end_sample)
 	switch (dim->state) {
 	case DIM_MEASURE_IN_PROGRESS:
 		nevents = BIT_GAP(BITS_PER_TYPE(u16),
-				  end_sample.event_ctr,
+				  end_sample->event_ctr,
 				  dim->start_sample.event_ctr);
 		if (nevents < DIM_NEVENTS)
 			break;
-		if (!dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats))
+		if (!dim_calc_stats(&dim->start_sample, end_sample, &curr_stats))
 			break;
 		if (net_dim_decision(&curr_stats, dim)) {
 			dim->state = DIM_APPLY_NEW_PROFILE;
@@ -368,8 +368,8 @@ void net_dim(struct dim *dim, struct dim_sample end_sample)
 		}
 		fallthrough;
 	case DIM_START_MEASURE:
-		dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr,
-				  end_sample.byte_ctr, &dim->start_sample);
+		dim_update_sample(end_sample->event_ctr, end_sample->pkt_ctr,
+				  end_sample->byte_ctr, &dim->start_sample);
 		dim->state = DIM_MEASURE_IN_PROGRESS;
 		break;
 	case DIM_APPLY_NEW_PROFILE:
-- 
cgit v1.2.3


From 1685f685ff8036c74c3b5e9006ec7ceda5bafefa Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Sat, 26 Oct 2024 18:43:33 +0300
Subject: soc: qcom: llcc: add support for SAR2130P and SAR1130P

Implement necessary support for the LLCC control on the SAR1130P and
SAR2130P platforms. These two platforms use different ATTR1_MAX_CAP
shift and also require manual override for num_banks.

Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20241026-sar2130p-llcc-v3-3-2a58fa1b4d12@linaro.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/soc/qcom/llcc-qcom.c       | 460 ++++++++++++++++++++++++++++++++++++-
 include/linux/soc/qcom/llcc-qcom.h |  12 +
 2 files changed, 466 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c
index e69bcaf8bffd..c0a01904ae4c 100644
--- a/drivers/soc/qcom/llcc-qcom.c
+++ b/drivers/soc/qcom/llcc-qcom.c
@@ -136,6 +136,8 @@ struct qcom_llcc_config {
 	const struct llcc_slice_config *sct_data;
 	const u32 *reg_offset;
 	const struct llcc_edac_reg_offset *edac_reg_offset;
+	u32 max_cap_shift; /* instead of ATTR1_MAX_CAP_SHIFT */
+	u32 num_banks;
 	int size;
 	bool need_llcc_cfg;
 	bool no_edac;
@@ -297,6 +299,408 @@ static const struct llcc_slice_config sa8775p_data[] =  {
 	},
 };
 
+static const struct llcc_slice_config sar1130p_data[] = {
+	{
+		.usecase_id = LLCC_CPUSS,
+		.slice_id = 1,
+		.max_cap = 4096,
+		.priority = 1,
+		.bonus_ways = 0x1fff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+		.activate_on_init = true,
+	}, {
+		.usecase_id = LLCC_VIDSC0,
+		.slice_id = 2,
+		.max_cap = 512,
+		.priority = 3,
+		.fixed_size = true,
+		.bonus_ways = 0x1fff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_AUDIO,
+		.slice_id = 6,
+		.max_cap = 1024,
+		.priority = 3,
+		.fixed_size = true,
+		.bonus_ways = 0x1fff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_CMPT,
+		.slice_id = 10,
+		.max_cap = 1024,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x1fff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_GPUHTW,
+		.slice_id = 11,
+		.max_cap = 0,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x1fff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_GPU,
+		.slice_id = 12,
+		.max_cap = 3072,
+		.priority = 3,
+		.fixed_size = true,
+		.bonus_ways = 0x1fff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+		.write_scid_en = true,
+	}, {
+		.usecase_id = LLCC_MMUHWT,
+		.slice_id = 13,
+		.max_cap = 512,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x1fff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+	}, {
+		.usecase_id = LLCC_DISP,
+		.slice_id = 16,
+		.max_cap = 12800,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x1fff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_CVP,
+		.slice_id = 28,
+		.max_cap = 256,
+		.priority = 3,
+		.fixed_size = true,
+		.bonus_ways = 0x1fff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_APTCM,
+		.slice_id = 26,
+		.max_cap = 2048,
+		.priority = 3,
+		.fixed_size = true,
+		.bonus_ways = 0x0,
+		.res_ways = 0x3,
+		.cache_mode = true,
+		.dis_cap_alloc = true,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_WRCACHE,
+		.slice_id = 31,
+		.max_cap = 256,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x1fff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.activate_on_init = true,
+	}, {
+		.usecase_id = LLCC_AENPU,
+		.slice_id = 30,
+		.max_cap = 3072,
+		.priority = 3,
+		.fixed_size = true,
+		.bonus_ways = 0x1fff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_DISP_LEFT,
+		.slice_id = 17,
+		.max_cap = 0,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x0,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_DISP_RIGHT,
+		.slice_id = 18,
+		.max_cap = 0,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x0,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_EVCS_LEFT,
+		.slice_id = 22,
+		.max_cap = 0,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x0,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_EVCS_RIGHT,
+		.slice_id = 23,
+		.max_cap = 0,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x0,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	},
+};
+
+static const struct llcc_slice_config sar2130p_data[] = {
+	{
+		.usecase_id = LLCC_CPUSS,
+		.slice_id = 1,
+		.max_cap = 6144,
+		.priority = 1,
+		.fixed_size = 0,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+		.activate_on_init = true,
+	}, {
+		.usecase_id = LLCC_VIDSC0,
+		.slice_id = 2,
+		.max_cap = 128,
+		.priority = 2,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_AUDIO,
+		.slice_id = 6,
+		.max_cap = 1024,
+		.priority = 3,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_CMPT,
+		.slice_id = 10,
+		.max_cap = 1024,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_GPUHTW,
+		.slice_id = 11,
+		.max_cap = 0,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_GPU,
+		.slice_id = 12,
+		.max_cap = 1536,
+		.priority = 2,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+		.write_scid_en = true,
+	}, {
+		.usecase_id = LLCC_MMUHWT,
+		.slice_id = 13,
+		.max_cap = 1024,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.activate_on_init = true,
+	}, {
+		.usecase_id = LLCC_DISP,
+		.slice_id = 16,
+		.max_cap = 0,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_APTCM,
+		.slice_id = 26,
+		.max_cap = 2048,
+		.priority = 3,
+		.fixed_size = true,
+		.bonus_ways = 0x0,
+		.res_ways = 0x3,
+		.cache_mode = true,
+		.dis_cap_alloc = true,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_WRCACHE,
+		.slice_id = 31,
+		.max_cap = 256,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.activate_on_init = true,
+	}, {
+		.usecase_id = LLCC_VIEYE,
+		.slice_id = 7,
+		.max_cap = 7168,
+		.priority = 4,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_VIDPTH,
+		.slice_id = 8,
+		.max_cap = 7168,
+		.priority = 4,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_GPUMV,
+		.slice_id = 9,
+		.max_cap = 2048,
+		.priority = 2,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_EVA_LEFT,
+		.slice_id = 20,
+		.max_cap = 7168,
+		.priority = 5,
+		.fixed_size = true,
+		.bonus_ways = 0x3ffffffc,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_EVA_RIGHT,
+		.slice_id = 21,
+		.max_cap = 7168,
+		.priority = 5,
+		.fixed_size = true,
+		.bonus_ways = 0x3ffffffc,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_EVAGAIN,
+		.slice_id = 25,
+		.max_cap = 1024,
+		.priority = 2,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_AENPU,
+		.slice_id = 30,
+		.max_cap = 3072,
+		.priority = 3,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_VIPTH,
+		.slice_id = 29,
+		.max_cap = 1024,
+		.priority = 4,
+		.fixed_size = true,
+		.bonus_ways = 0x3fffffff,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_DISP_LEFT,
+		.slice_id = 17,
+		.max_cap = 0,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x0,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_DISP_RIGHT,
+		.slice_id = 18,
+		.max_cap = 0,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x0,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_EVCS_LEFT,
+		.slice_id = 22,
+		.max_cap = 0,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x0,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_EVCS_RIGHT,
+		.slice_id = 23,
+		.max_cap = 0,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x0,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	}, {
+		.usecase_id = LLCC_SPAD,
+		.slice_id = 24,
+		.max_cap = 7168,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0x0,
+		.res_ways = 0x0,
+		.cache_mode = 0,
+		.retain_on_pc = true,
+	},
+};
+
 static const struct llcc_slice_config sc7180_data[] =  {
 	{
 		.usecase_id = LLCC_CPUSS,
@@ -2686,6 +3090,30 @@ static const struct qcom_llcc_config sa8775p_cfg[] = {
 	},
 };
 
+static const struct qcom_llcc_config sar1130p_cfg[] = {
+	{
+		.sct_data	= sar1130p_data,
+		.size		= ARRAY_SIZE(sar1130p_data),
+		.need_llcc_cfg	= true,
+		.reg_offset	= llcc_v2_1_reg_offset,
+		.edac_reg_offset = &llcc_v2_1_edac_reg_offset,
+		.max_cap_shift	= 14,
+		.num_banks	= 2,
+	},
+};
+
+static const struct qcom_llcc_config sar2130p_cfg[] = {
+	{
+		.sct_data	= sar2130p_data,
+		.size		= ARRAY_SIZE(sar2130p_data),
+		.need_llcc_cfg	= true,
+		.reg_offset	= llcc_v2_1_reg_offset,
+		.edac_reg_offset = &llcc_v2_1_edac_reg_offset,
+		.max_cap_shift	= 14,
+		.num_banks	= 2,
+	},
+};
+
 static const struct qcom_llcc_config sc7180_cfg[] = {
 	{
 		.sct_data	= sc7180_data,
@@ -2837,6 +3265,16 @@ static const struct qcom_sct_config sa8775p_cfgs = {
 	.num_config	= ARRAY_SIZE(sa8775p_cfg),
 };
 
+static const struct qcom_sct_config sar1130p_cfgs = {
+	.llcc_config	= sar1130p_cfg,
+	.num_config	= ARRAY_SIZE(sar1130p_cfg),
+};
+
+static const struct qcom_sct_config sar2130p_cfgs = {
+	.llcc_config	= sar2130p_cfg,
+	.num_config	= ARRAY_SIZE(sar2130p_cfg),
+};
+
 static const struct qcom_sct_config sc7180_cfgs = {
 	.llcc_config	= sc7180_cfg,
 	.num_config	= ARRAY_SIZE(sc7180_cfg),
@@ -3144,7 +3582,10 @@ static int _qcom_llcc_cfg_program(const struct llcc_slice_config *config,
 	 */
 	max_cap_cacheline = max_cap_cacheline / drv_data->num_banks;
 	max_cap_cacheline >>= CACHE_LINE_SIZE_SHIFT;
-	attr1_val |= max_cap_cacheline << ATTR1_MAX_CAP_SHIFT;
+	if (cfg->max_cap_shift)
+		attr1_val |= max_cap_cacheline << cfg->max_cap_shift;
+	else
+		attr1_val |= max_cap_cacheline << ATTR1_MAX_CAP_SHIFT;
 
 	attr1_cfg = LLCC_TRP_ATTR1_CFGn(config->slice_id);
 
@@ -3381,12 +3822,17 @@ static int qcom_llcc_probe(struct platform_device *pdev)
 		goto err;
 	cfg = &cfgs->llcc_config[cfg_index];
 
-	ret = regmap_read(regmap, cfg->reg_offset[LLCC_COMMON_STATUS0], &num_banks);
-	if (ret)
-		goto err;
+	if (cfg->num_banks) {
+		num_banks = cfg->num_banks;
+	} else {
+		ret = regmap_read(regmap, cfg->reg_offset[LLCC_COMMON_STATUS0], &num_banks);
+		if (ret)
+			goto err;
+
+		num_banks &= LLCC_LB_CNT_MASK;
+		num_banks >>= LLCC_LB_CNT_SHIFT;
+	}
 
-	num_banks &= LLCC_LB_CNT_MASK;
-	num_banks >>= LLCC_LB_CNT_SHIFT;
 	drv_data->num_banks = num_banks;
 
 	drv_data->regmaps = devm_kcalloc(dev, num_banks, sizeof(*drv_data->regmaps), GFP_KERNEL);
@@ -3483,6 +3929,8 @@ err:
 static const struct of_device_id qcom_llcc_of_match[] = {
 	{ .compatible = "qcom,qdu1000-llcc", .data = &qdu1000_cfgs},
 	{ .compatible = "qcom,sa8775p-llcc", .data = &sa8775p_cfgs },
+	{ .compatible = "qcom,sar1130p-llcc", .data = &sar1130p_cfgs },
+	{ .compatible = "qcom,sar2130p-llcc", .data = &sar2130p_cfgs },
 	{ .compatible = "qcom,sc7180-llcc", .data = &sc7180_cfgs },
 	{ .compatible = "qcom,sc7280-llcc", .data = &sc7280_cfgs },
 	{ .compatible = "qcom,sc8180x-llcc", .data = &sc8180x_cfgs },
diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
index 9e9f528b1370..a79bfac230c6 100644
--- a/include/linux/soc/qcom/llcc-qcom.h
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -54,7 +54,19 @@
 #define LLCC_CAMEXP4	 52
 #define LLCC_DISP_WB	 53
 #define LLCC_DISP_1	 54
+#define LLCC_VIEYE	 57
+#define LLCC_VIDPTH	 58
+#define LLCC_GPUMV	 59
+#define LLCC_EVA_LEFT	 60
+#define LLCC_EVA_RIGHT	 61
+#define LLCC_EVAGAIN	 62
+#define LLCC_VIPTH	 63
 #define LLCC_VIDVSP	 64
+#define LLCC_DISP_LEFT	 65
+#define LLCC_DISP_RIGHT	 66
+#define LLCC_EVCS_LEFT	 67
+#define LLCC_EVCS_RIGHT	 68
+#define LLCC_SPAD	 69
 
 /**
  * struct llcc_slice_desc - Cache slice descriptor
-- 
cgit v1.2.3


From 8ab3138a9b2dcb0ddf281240cf8cba414eb1224a Mon Sep 17 00:00:00 2001
From: Edward Srouji <edwards@nvidia.com>
Date: Tue, 3 Sep 2024 14:37:51 +0300
Subject: net/mlx5: Introduce data placement ordering bits

Introduce out-of-order (OOO) data placement (DP) IFC related bits to
support OOO DP QP.

Signed-off-by: Edward Srouji <edwards@nvidia.com>
Reviewed-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://patch.msgid.link/f30e5cbb5459fd02f27f35909bb545cab346b58b.1725362773.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 96d369112bfa..2a037843b117 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1872,7 +1872,11 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_328[0x2];
 	u8	   relaxed_ordering_read[0x1];
 	u8         log_max_pd[0x5];
-	u8         reserved_at_330[0x5];
+	u8         dp_ordering_ooo_all_ud[0x1];
+	u8         dp_ordering_ooo_all_uc[0x1];
+	u8         dp_ordering_ooo_all_xrc[0x1];
+	u8         dp_ordering_ooo_all_dc[0x1];
+	u8         dp_ordering_ooo_all_rc[0x1];
 	u8         pcie_reset_using_hotreset_method[0x1];
 	u8         pci_sync_for_fw_update_with_driver_unload[0x1];
 	u8         vnic_env_cnt_steering_fail[0x1];
@@ -2094,7 +2098,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_0[0x80];
 
 	u8         migratable[0x1];
-	u8         reserved_at_81[0x11];
+	u8         reserved_at_81[0x7];
+	u8         dp_ordering_force[0x1];
+	u8         reserved_at_89[0x9];
 	u8         query_vuid[0x1];
 	u8         reserved_at_93[0x5];
 	u8         umr_log_entity_size_5[0x1];
@@ -3524,7 +3530,8 @@ struct mlx5_ifc_qpc_bits {
 	u8         latency_sensitive[0x1];
 	u8         reserved_at_24[0x1];
 	u8         drain_sigerr[0x1];
-	u8         reserved_at_26[0x2];
+	u8         reserved_at_26[0x1];
+	u8         dp_ordering_force[0x1];
 	u8         pd[0x18];
 
 	u8         mtu[0x3];
@@ -3597,7 +3604,8 @@ struct mlx5_ifc_qpc_bits {
 	u8         rae[0x1];
 	u8         reserved_at_493[0x1];
 	u8         page_offset[0x6];
-	u8         reserved_at_49a[0x3];
+	u8         reserved_at_49a[0x2];
+	u8         dp_ordering_1[0x1];
 	u8         cd_slave_receive[0x1];
 	u8         cd_slave_send[0x1];
 	u8         cd_master[0x1];
@@ -4507,7 +4515,8 @@ struct mlx5_ifc_dctc_bits {
 	u8         state[0x4];
 	u8         reserved_at_8[0x18];
 
-	u8         reserved_at_20[0x8];
+	u8         reserved_at_20[0x7];
+	u8         dp_ordering_force[0x1];
 	u8         user_index[0x18];
 
 	u8         reserved_at_40[0x8];
@@ -4522,7 +4531,9 @@ struct mlx5_ifc_dctc_bits {
 	u8         latency_sensitive[0x1];
 	u8         rlky[0x1];
 	u8         free_ar[0x1];
-	u8         reserved_at_73[0xd];
+	u8         reserved_at_73[0x1];
+	u8         dp_ordering_1[0x1];
+	u8         reserved_at_75[0xb];
 
 	u8         reserved_at_80[0x8];
 	u8         cs_res[0x8];
-- 
cgit v1.2.3


From fe5ba6bf91b3e30118c59fe51048cda101de6542 Mon Sep 17 00:00:00 2001
From: Wei Fang <wei.fang@nxp.com>
Date: Wed, 30 Oct 2024 17:39:15 +0800
Subject: net: enetc: add initial netc-blk-ctrl driver support

The netc-blk-ctrl driver is used to configure Integrated Endpoint
Register Block (IERB) and Privileged Register Block (PRB) of NETC.
For i.MX platforms, it is also used to configure the NETCMIX block.

The IERB contains registers that are used for pre-boot initialization,
debug, and non-customer configuration. The PRB controls global reset
and global error handling for NETC. The NETCMIX block is mainly used
to set MII protocol and PCS protocol of the links, it also contains
settings for some other functions.

Note the IERB configuration registers can only be written after being
unlocked by PRB, otherwise, all write operations are inhibited. A warm
reset is performed when the IERB is unlocked, and it results in an FLR
to all NETC devices. Therefore, all NETC device drivers must be probed
or initialized after the warm reset is finished.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/enetc/Kconfig       |  14 +
 drivers/net/ethernet/freescale/enetc/Makefile      |   3 +
 .../net/ethernet/freescale/enetc/netc_blk_ctrl.c   | 445 +++++++++++++++++++++
 include/linux/fsl/netc_global.h                    |  19 +
 4 files changed, 481 insertions(+)
 create mode 100644 drivers/net/ethernet/freescale/enetc/netc_blk_ctrl.c
 create mode 100644 include/linux/fsl/netc_global.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/freescale/enetc/Kconfig b/drivers/net/ethernet/freescale/enetc/Kconfig
index 4d75e6807e92..51d80ea959d4 100644
--- a/drivers/net/ethernet/freescale/enetc/Kconfig
+++ b/drivers/net/ethernet/freescale/enetc/Kconfig
@@ -75,3 +75,17 @@ config FSL_ENETC_QOS
 	  enable/disable from user space via Qos commands(tc). In the kernel
 	  side, it can be loaded by Qos driver. Currently, it is only support
 	  taprio(802.1Qbv) and Credit Based Shaper(802.1Qbu).
+
+config NXP_NETC_BLK_CTRL
+	tristate "NETC blocks control driver"
+	help
+	  This driver configures Integrated Endpoint Register Block (IERB) and
+	  Privileged Register Block (PRB) of NETC. For i.MX platforms, it also
+	  includes the configuration of NETCMIX block.
+	  The IERB contains registers that are used for pre-boot initialization,
+	  debug, and non-customer configuration. The PRB controls global reset
+	  and global error handling for NETC. The NETCMIX block is mainly used
+	  to set MII protocol and PCS protocol of the links, it also contains
+	  settings for some other functions.
+
+	  If compiled as module (M), the module name is nxp-netc-blk-ctrl.
diff --git a/drivers/net/ethernet/freescale/enetc/Makefile b/drivers/net/ethernet/freescale/enetc/Makefile
index b13cbbabb2ea..737c32f83ea5 100644
--- a/drivers/net/ethernet/freescale/enetc/Makefile
+++ b/drivers/net/ethernet/freescale/enetc/Makefile
@@ -19,3 +19,6 @@ fsl-enetc-mdio-y := enetc_pci_mdio.o enetc_mdio.o
 
 obj-$(CONFIG_FSL_ENETC_PTP_CLOCK) += fsl-enetc-ptp.o
 fsl-enetc-ptp-y := enetc_ptp.o
+
+obj-$(CONFIG_NXP_NETC_BLK_CTRL) += nxp-netc-blk-ctrl.o
+nxp-netc-blk-ctrl-y := netc_blk_ctrl.o
diff --git a/drivers/net/ethernet/freescale/enetc/netc_blk_ctrl.c b/drivers/net/ethernet/freescale/enetc/netc_blk_ctrl.c
new file mode 100644
index 000000000000..bcb8eefeb93c
--- /dev/null
+++ b/drivers/net/ethernet/freescale/enetc/netc_blk_ctrl.c
@@ -0,0 +1,445 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+/*
+ * NXP NETC Blocks Control Driver
+ *
+ * Copyright 2024 NXP
+ *
+ * This driver is used for pre-initialization of NETC, such as PCS and MII
+ * protocols, LDID, warm reset, etc. Therefore, all NETC device drivers can
+ * only be probed after the netc-blk-crtl driver has completed initialization.
+ * In addition, when the system enters suspend mode, IERB, PRB, and NETCMIX
+ * will be powered off, except for WOL. Therefore, when the system resumes,
+ * these blocks need to be reinitialized.
+ */
+
+#include <linux/bits.h>
+#include <linux/clk.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/fsl/netc_global.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_net.h>
+#include <linux/of_platform.h>
+#include <linux/phy.h>
+#include <linux/platform_device.h>
+#include <linux/seq_file.h>
+
+/* NETCMIX registers */
+#define IMX95_CFG_LINK_IO_VAR		0x0
+#define  IO_VAR_16FF_16G_SERDES		0x1
+#define  IO_VAR(port, var)		(((var) & 0xf) << ((port) << 2))
+
+#define IMX95_CFG_LINK_MII_PROT		0x4
+#define CFG_LINK_MII_PORT_0		GENMASK(3, 0)
+#define CFG_LINK_MII_PORT_1		GENMASK(7, 4)
+#define  MII_PROT_MII			0x0
+#define  MII_PROT_RMII			0x1
+#define  MII_PROT_RGMII			0x2
+#define  MII_PROT_SERIAL		0x3
+#define  MII_PROT(port, prot)		(((prot) & 0xf) << ((port) << 2))
+
+#define IMX95_CFG_LINK_PCS_PROT(a)	(0x8 + (a) * 4)
+#define PCS_PROT_1G_SGMII		BIT(0)
+#define PCS_PROT_2500M_SGMII		BIT(1)
+#define PCS_PROT_XFI			BIT(3)
+#define PCS_PROT_SFI			BIT(4)
+#define PCS_PROT_10G_SXGMII		BIT(6)
+
+/* NETC privileged register block register */
+#define PRB_NETCRR			0x100
+#define  NETCRR_SR			BIT(0)
+#define  NETCRR_LOCK			BIT(1)
+
+#define PRB_NETCSR			0x104
+#define  NETCSR_ERROR			BIT(0)
+#define  NETCSR_STATE			BIT(1)
+
+/* NETC integrated endpoint register block register */
+#define IERB_EMDIOFAUXR			0x344
+#define IERB_T0FAUXR			0x444
+#define IERB_EFAUXR(a)			(0x3044 + 0x100 * (a))
+#define IERB_VFAUXR(a)			(0x4004 + 0x40 * (a))
+#define FAUXR_LDID			GENMASK(3, 0)
+
+/* Platform information */
+#define IMX95_ENETC0_BUS_DEVFN		0x0
+#define IMX95_ENETC1_BUS_DEVFN		0x40
+#define IMX95_ENETC2_BUS_DEVFN		0x80
+
+/* Flags for different platforms */
+#define NETC_HAS_NETCMIX		BIT(0)
+
+struct netc_devinfo {
+	u32 flags;
+	int (*netcmix_init)(struct platform_device *pdev);
+	int (*ierb_init)(struct platform_device *pdev);
+};
+
+struct netc_blk_ctrl {
+	void __iomem *prb;
+	void __iomem *ierb;
+	void __iomem *netcmix;
+
+	const struct netc_devinfo *devinfo;
+	struct platform_device *pdev;
+	struct dentry *debugfs_root;
+};
+
+static void netc_reg_write(void __iomem *base, u32 offset, u32 val)
+{
+	netc_write(base + offset, val);
+}
+
+static u32 netc_reg_read(void __iomem *base, u32 offset)
+{
+	return netc_read(base + offset);
+}
+
+static int netc_of_pci_get_bus_devfn(struct device_node *np)
+{
+	u32 reg[5];
+	int error;
+
+	error = of_property_read_u32_array(np, "reg", reg, ARRAY_SIZE(reg));
+	if (error)
+		return error;
+
+	return (reg[0] >> 8) & 0xffff;
+}
+
+static int netc_get_link_mii_protocol(phy_interface_t interface)
+{
+	switch (interface) {
+	case PHY_INTERFACE_MODE_MII:
+		return MII_PROT_MII;
+	case PHY_INTERFACE_MODE_RMII:
+		return MII_PROT_RMII;
+	case PHY_INTERFACE_MODE_RGMII:
+	case PHY_INTERFACE_MODE_RGMII_ID:
+	case PHY_INTERFACE_MODE_RGMII_RXID:
+	case PHY_INTERFACE_MODE_RGMII_TXID:
+		return MII_PROT_RGMII;
+	case PHY_INTERFACE_MODE_SGMII:
+	case PHY_INTERFACE_MODE_2500BASEX:
+	case PHY_INTERFACE_MODE_10GBASER:
+	case PHY_INTERFACE_MODE_XGMII:
+	case PHY_INTERFACE_MODE_USXGMII:
+		return MII_PROT_SERIAL;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int imx95_netcmix_init(struct platform_device *pdev)
+{
+	struct netc_blk_ctrl *priv = platform_get_drvdata(pdev);
+	struct device_node *np = pdev->dev.of_node;
+	phy_interface_t interface;
+	int bus_devfn, mii_proto;
+	u32 val;
+	int err;
+
+	/* Default setting of MII protocol */
+	val = MII_PROT(0, MII_PROT_RGMII) | MII_PROT(1, MII_PROT_RGMII) |
+	      MII_PROT(2, MII_PROT_SERIAL);
+
+	/* Update the link MII protocol through parsing phy-mode */
+	for_each_available_child_of_node_scoped(np, child) {
+		for_each_available_child_of_node_scoped(child, gchild) {
+			if (!of_device_is_compatible(gchild, "pci1131,e101"))
+				continue;
+
+			bus_devfn = netc_of_pci_get_bus_devfn(gchild);
+			if (bus_devfn < 0)
+				return -EINVAL;
+
+			if (bus_devfn == IMX95_ENETC2_BUS_DEVFN)
+				continue;
+
+			err = of_get_phy_mode(gchild, &interface);
+			if (err)
+				continue;
+
+			mii_proto = netc_get_link_mii_protocol(interface);
+			if (mii_proto < 0)
+				return -EINVAL;
+
+			switch (bus_devfn) {
+			case IMX95_ENETC0_BUS_DEVFN:
+				val = u32_replace_bits(val, mii_proto,
+						       CFG_LINK_MII_PORT_0);
+				break;
+			case IMX95_ENETC1_BUS_DEVFN:
+				val = u32_replace_bits(val, mii_proto,
+						       CFG_LINK_MII_PORT_1);
+				break;
+			default:
+				return -EINVAL;
+			}
+		}
+	}
+
+	/* Configure Link I/O variant */
+	netc_reg_write(priv->netcmix, IMX95_CFG_LINK_IO_VAR,
+		       IO_VAR(2, IO_VAR_16FF_16G_SERDES));
+	/* Configure Link 2 PCS protocol */
+	netc_reg_write(priv->netcmix, IMX95_CFG_LINK_PCS_PROT(2),
+		       PCS_PROT_10G_SXGMII);
+	netc_reg_write(priv->netcmix, IMX95_CFG_LINK_MII_PROT, val);
+
+	return 0;
+}
+
+static bool netc_ierb_is_locked(struct netc_blk_ctrl *priv)
+{
+	return !!(netc_reg_read(priv->prb, PRB_NETCRR) & NETCRR_LOCK);
+}
+
+static int netc_lock_ierb(struct netc_blk_ctrl *priv)
+{
+	u32 val;
+
+	netc_reg_write(priv->prb, PRB_NETCRR, NETCRR_LOCK);
+
+	return read_poll_timeout(netc_reg_read, val, !(val & NETCSR_STATE),
+				 100, 2000, false, priv->prb, PRB_NETCSR);
+}
+
+static int netc_unlock_ierb_with_warm_reset(struct netc_blk_ctrl *priv)
+{
+	u32 val;
+
+	netc_reg_write(priv->prb, PRB_NETCRR, 0);
+
+	return read_poll_timeout(netc_reg_read, val, !(val & NETCRR_LOCK),
+				 1000, 100000, true, priv->prb, PRB_NETCRR);
+}
+
+static int imx95_ierb_init(struct platform_device *pdev)
+{
+	struct netc_blk_ctrl *priv = platform_get_drvdata(pdev);
+
+	/* EMDIO : No MSI-X intterupt */
+	netc_reg_write(priv->ierb, IERB_EMDIOFAUXR, 0);
+	/* ENETC0 PF */
+	netc_reg_write(priv->ierb, IERB_EFAUXR(0), 0);
+	/* ENETC0 VF0 */
+	netc_reg_write(priv->ierb, IERB_VFAUXR(0), 1);
+	/* ENETC0 VF1 */
+	netc_reg_write(priv->ierb, IERB_VFAUXR(1), 2);
+	/* ENETC1 PF */
+	netc_reg_write(priv->ierb, IERB_EFAUXR(1), 3);
+	/* ENETC1 VF0 */
+	netc_reg_write(priv->ierb, IERB_VFAUXR(2), 5);
+	/* ENETC1 VF1 */
+	netc_reg_write(priv->ierb, IERB_VFAUXR(3), 6);
+	/* ENETC2 PF */
+	netc_reg_write(priv->ierb, IERB_EFAUXR(2), 4);
+	/* ENETC2 VF0 */
+	netc_reg_write(priv->ierb, IERB_VFAUXR(4), 5);
+	/* ENETC2 VF1 */
+	netc_reg_write(priv->ierb, IERB_VFAUXR(5), 6);
+	/* NETC TIMER */
+	netc_reg_write(priv->ierb, IERB_T0FAUXR, 7);
+
+	return 0;
+}
+
+static int netc_ierb_init(struct platform_device *pdev)
+{
+	struct netc_blk_ctrl *priv = platform_get_drvdata(pdev);
+	const struct netc_devinfo *devinfo = priv->devinfo;
+	int err;
+
+	if (netc_ierb_is_locked(priv)) {
+		err = netc_unlock_ierb_with_warm_reset(priv);
+		if (err) {
+			dev_err(&pdev->dev, "Unlock IERB failed.\n");
+			return err;
+		}
+	}
+
+	if (devinfo->ierb_init) {
+		err = devinfo->ierb_init(pdev);
+		if (err)
+			return err;
+	}
+
+	err = netc_lock_ierb(priv);
+	if (err) {
+		dev_err(&pdev->dev, "Lock IERB failed.\n");
+		return err;
+	}
+
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+static int netc_prb_show(struct seq_file *s, void *data)
+{
+	struct netc_blk_ctrl *priv = s->private;
+	u32 val;
+
+	val = netc_reg_read(priv->prb, PRB_NETCRR);
+	seq_printf(s, "[PRB NETCRR] Lock:%d SR:%d\n",
+		   (val & NETCRR_LOCK) ? 1 : 0,
+		   (val & NETCRR_SR) ? 1 : 0);
+
+	val = netc_reg_read(priv->prb, PRB_NETCSR);
+	seq_printf(s, "[PRB NETCSR] State:%d Error:%d\n",
+		   (val & NETCSR_STATE) ? 1 : 0,
+		   (val & NETCSR_ERROR) ? 1 : 0);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(netc_prb);
+
+static void netc_blk_ctrl_create_debugfs(struct netc_blk_ctrl *priv)
+{
+	struct dentry *root;
+
+	root = debugfs_create_dir("netc_blk_ctrl", NULL);
+	if (IS_ERR(root))
+		return;
+
+	priv->debugfs_root = root;
+
+	debugfs_create_file("prb", 0444, root, priv, &netc_prb_fops);
+}
+
+static void netc_blk_ctrl_remove_debugfs(struct netc_blk_ctrl *priv)
+{
+	debugfs_remove_recursive(priv->debugfs_root);
+	priv->debugfs_root = NULL;
+}
+
+#else
+
+static void netc_blk_ctrl_create_debugfs(struct netc_blk_ctrl *priv)
+{
+}
+
+static void netc_blk_ctrl_remove_debugfs(struct netc_blk_ctrl *priv)
+{
+}
+#endif
+
+static int netc_prb_check_error(struct netc_blk_ctrl *priv)
+{
+	if (netc_reg_read(priv->prb, PRB_NETCSR) & NETCSR_ERROR)
+		return -1;
+
+	return 0;
+}
+
+static const struct netc_devinfo imx95_devinfo = {
+	.flags = NETC_HAS_NETCMIX,
+	.netcmix_init = imx95_netcmix_init,
+	.ierb_init = imx95_ierb_init,
+};
+
+static const struct of_device_id netc_blk_ctrl_match[] = {
+	{ .compatible = "nxp,imx95-netc-blk-ctrl", .data = &imx95_devinfo },
+	{},
+};
+MODULE_DEVICE_TABLE(of, netc_blk_ctrl_match);
+
+static int netc_blk_ctrl_probe(struct platform_device *pdev)
+{
+	struct device_node *node = pdev->dev.of_node;
+	const struct netc_devinfo *devinfo;
+	struct device *dev = &pdev->dev;
+	const struct of_device_id *id;
+	struct netc_blk_ctrl *priv;
+	struct clk *ipg_clk;
+	void __iomem *regs;
+	int err;
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->pdev = pdev;
+	ipg_clk = devm_clk_get_optional_enabled(dev, "ipg");
+	if (IS_ERR(ipg_clk))
+		return dev_err_probe(dev, PTR_ERR(ipg_clk),
+				     "Set ipg clock failed\n");
+
+	id = of_match_device(netc_blk_ctrl_match, dev);
+	if (!id)
+		return dev_err_probe(dev, -EINVAL, "Cannot match device\n");
+
+	devinfo = (struct netc_devinfo *)id->data;
+	if (!devinfo)
+		return dev_err_probe(dev, -EINVAL, "No device information\n");
+
+	priv->devinfo = devinfo;
+	regs = devm_platform_ioremap_resource_byname(pdev, "ierb");
+	if (IS_ERR(regs))
+		return dev_err_probe(dev, PTR_ERR(regs),
+				     "Missing IERB resource\n");
+
+	priv->ierb = regs;
+	regs = devm_platform_ioremap_resource_byname(pdev, "prb");
+	if (IS_ERR(regs))
+		return dev_err_probe(dev, PTR_ERR(regs),
+				     "Missing PRB resource\n");
+
+	priv->prb = regs;
+	if (devinfo->flags & NETC_HAS_NETCMIX) {
+		regs = devm_platform_ioremap_resource_byname(pdev, "netcmix");
+		if (IS_ERR(regs))
+			return dev_err_probe(dev, PTR_ERR(regs),
+					     "Missing NETCMIX resource\n");
+		priv->netcmix = regs;
+	}
+
+	platform_set_drvdata(pdev, priv);
+	if (devinfo->netcmix_init) {
+		err = devinfo->netcmix_init(pdev);
+		if (err)
+			return dev_err_probe(dev, err,
+					     "Initializing NETCMIX failed\n");
+	}
+
+	err = netc_ierb_init(pdev);
+	if (err)
+		return dev_err_probe(dev, err, "Initializing IERB failed\n");
+
+	if (netc_prb_check_error(priv) < 0)
+		dev_warn(dev, "The current IERB configuration is invalid\n");
+
+	netc_blk_ctrl_create_debugfs(priv);
+
+	err = of_platform_populate(node, NULL, NULL, dev);
+	if (err) {
+		netc_blk_ctrl_remove_debugfs(priv);
+		return dev_err_probe(dev, err, "of_platform_populate failed\n");
+	}
+
+	return 0;
+}
+
+static void netc_blk_ctrl_remove(struct platform_device *pdev)
+{
+	struct netc_blk_ctrl *priv = platform_get_drvdata(pdev);
+
+	of_platform_depopulate(&pdev->dev);
+	netc_blk_ctrl_remove_debugfs(priv);
+}
+
+static struct platform_driver netc_blk_ctrl_driver = {
+	.driver = {
+		.name = "nxp-netc-blk-ctrl",
+		.of_match_table = netc_blk_ctrl_match,
+	},
+	.probe = netc_blk_ctrl_probe,
+	.remove = netc_blk_ctrl_remove,
+};
+
+module_platform_driver(netc_blk_ctrl_driver);
+
+MODULE_DESCRIPTION("NXP NETC Blocks Control Driver");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/include/linux/fsl/netc_global.h b/include/linux/fsl/netc_global.h
new file mode 100644
index 000000000000..fdecca8c90f0
--- /dev/null
+++ b/include/linux/fsl/netc_global.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/* Copyright 2024 NXP
+ */
+#ifndef __NETC_GLOBAL_H
+#define __NETC_GLOBAL_H
+
+#include <linux/io.h>
+
+static inline u32 netc_read(void __iomem *reg)
+{
+	return ioread32(reg);
+}
+
+static inline void netc_write(void __iomem *reg, u32 val)
+{
+	iowrite32(val, reg);
+}
+
+#endif
-- 
cgit v1.2.3


From bc2940869508b7b956a757a26d3b1ebf9546790e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Wed, 23 Oct 2024 16:34:42 -0400
Subject: nfs_common: fix localio to cope with racing nfs_local_probe()

Fix the possibility of racing nfs_local_probe() resulting in:
  list_add double add: new=ffff8b99707f9f58, prev=ffff8b99707f9f58, next=ffffffffc0f30000.
  ------------[ cut here ]------------
  kernel BUG at lib/list_debug.c:35!

Add nfs_uuid_init() to properly initialize all nfs_uuid_t members
(particularly its list_head).

Switch to returning bool from nfs_uuid_begin(), returns false if
nfs_uuid_t is already in-use (its list_head is on a list). Update
nfs_local_probe() to return early if the nfs_client's cl_uuid
(nfs_uuid_t) is in-use.

Also, switch nfs_uuid_begin() from using list_add_tail_rcu() to
list_add_tail() -- rculist was used in an earlier version of the
localio code that had a lockless nfs_uuid_lookup interface.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 fs/nfs/client.c            |  3 +--
 fs/nfs/localio.c           |  3 ++-
 fs/nfs_common/nfslocalio.c | 23 ++++++++++++++++++-----
 include/linux/nfslocalio.h |  3 ++-
 4 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 114282398716..03ecc7765615 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -181,8 +181,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 #if IS_ENABLED(CONFIG_NFS_LOCALIO)
 	seqlock_init(&clp->cl_boot_lock);
 	ktime_get_real_ts64(&clp->cl_nfssvc_boot);
-	clp->cl_uuid.net = NULL;
-	clp->cl_uuid.dom = NULL;
+	nfs_uuid_init(&clp->cl_uuid);
 	spin_lock_init(&clp->cl_localio_lock);
 #endif /* CONFIG_NFS_LOCALIO */
 
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index d0aa680ec816..8f0ce82a677e 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -205,7 +205,8 @@ void nfs_local_probe(struct nfs_client *clp)
 		nfs_local_disable(clp);
 	}
 
-	nfs_uuid_begin(&clp->cl_uuid);
+	if (!nfs_uuid_begin(&clp->cl_uuid))
+		return;
 	if (nfs_server_uuid_is_local(clp))
 		nfs_local_enable(clp);
 	nfs_uuid_end(&clp->cl_uuid);
diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c
index 5c8ce5066c16..09404d142d1a 100644
--- a/fs/nfs_common/nfslocalio.c
+++ b/fs/nfs_common/nfslocalio.c
@@ -5,7 +5,7 @@
  */
 
 #include <linux/module.h>
-#include <linux/rculist.h>
+#include <linux/list.h>
 #include <linux/nfslocalio.h>
 #include <net/netns/generic.h>
 
@@ -20,15 +20,27 @@ static DEFINE_SPINLOCK(nfs_uuid_lock);
  */
 static LIST_HEAD(nfs_uuids);
 
-void nfs_uuid_begin(nfs_uuid_t *nfs_uuid)
+void nfs_uuid_init(nfs_uuid_t *nfs_uuid)
 {
 	nfs_uuid->net = NULL;
 	nfs_uuid->dom = NULL;
-	uuid_gen(&nfs_uuid->uuid);
+	INIT_LIST_HEAD(&nfs_uuid->list);
+}
+EXPORT_SYMBOL_GPL(nfs_uuid_init);
 
+bool nfs_uuid_begin(nfs_uuid_t *nfs_uuid)
+{
 	spin_lock(&nfs_uuid_lock);
-	list_add_tail_rcu(&nfs_uuid->list, &nfs_uuids);
+	/* Is this nfs_uuid already in use? */
+	if (!list_empty(&nfs_uuid->list)) {
+		spin_unlock(&nfs_uuid_lock);
+		return false;
+	}
+	uuid_gen(&nfs_uuid->uuid);
+	list_add_tail(&nfs_uuid->list, &nfs_uuids);
 	spin_unlock(&nfs_uuid_lock);
+
+	return true;
 }
 EXPORT_SYMBOL_GPL(nfs_uuid_begin);
 
@@ -36,7 +48,8 @@ void nfs_uuid_end(nfs_uuid_t *nfs_uuid)
 {
 	if (nfs_uuid->net == NULL) {
 		spin_lock(&nfs_uuid_lock);
-		list_del_init(&nfs_uuid->list);
+		if (nfs_uuid->net == NULL)
+			list_del_init(&nfs_uuid->list);
 		spin_unlock(&nfs_uuid_lock);
 	}
 }
diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h
index b0dd9b1eef4f..3982fea79919 100644
--- a/include/linux/nfslocalio.h
+++ b/include/linux/nfslocalio.h
@@ -32,7 +32,8 @@ typedef struct {
 	struct auth_domain *dom; /* auth_domain for localio */
 } nfs_uuid_t;
 
-void nfs_uuid_begin(nfs_uuid_t *);
+void nfs_uuid_init(nfs_uuid_t *);
+bool nfs_uuid_begin(nfs_uuid_t *);
 void nfs_uuid_end(nfs_uuid_t *);
 void nfs_uuid_is_local(const uuid_t *, struct list_head *,
 		       struct net *, struct auth_domain *, struct module *);
-- 
cgit v1.2.3


From 2a8f6153e1c2db06a537a5c9d61102eb591776f1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 4 Nov 2024 08:39:32 +0100
Subject: block: pre-calculate max_zone_append_sectors

max_zone_append_sectors differs from all other queue limits in that the
final value used is not stored in the queue_limits but needs to be
obtained using queue_limits_max_zone_append_sectors helper.  This not
only adds (tiny) extra overhead to the I/O path, but also can be easily
forgotten in file system code.

Add a new max_hw_zone_append_sectors value to queue_limits which is
set by the driver, and calculate max_zone_append_sectors from that and
the other inputs in blk_validate_zoned_limits, similar to how
max_sectors is calculated to fix this.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20241104073955.112324-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c               |  2 +-
 block/blk-merge.c              |  3 +--
 block/blk-settings.c           | 25 ++++++++++++-------------
 block/blk-sysfs.c              | 17 +++--------------
 drivers/block/null_blk/zoned.c |  2 +-
 drivers/block/ublk_drv.c       |  2 +-
 drivers/block/virtio_blk.c     |  2 +-
 drivers/md/dm-zone.c           |  4 ++--
 drivers/nvme/host/multipath.c  |  2 +-
 drivers/nvme/host/zns.c        |  2 +-
 drivers/scsi/sd_zbc.c          |  2 --
 include/linux/blkdev.h         | 21 +++------------------
 12 files changed, 27 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 09d10bb95fda..5df4607321ca 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -607,7 +607,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
 		return BLK_STS_IOERR;
 
 	/* Make sure the BIO is small enough and will not get split */
-	if (nr_sectors > queue_max_zone_append_sectors(q))
+	if (nr_sectors > q->limits.max_zone_append_sectors)
 		return BLK_STS_IOERR;
 
 	bio->bi_opf |= REQ_NOMERGE;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index d813d799cee7..7c1375a080ad 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -388,11 +388,10 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 struct bio *bio_split_zone_append(struct bio *bio,
 		const struct queue_limits *lim, unsigned *nr_segs)
 {
-	unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim);
 	int split_sectors;
 
 	split_sectors = bio_split_rw_at(bio, lim, nr_segs,
-			max_sectors << SECTOR_SHIFT);
+			lim->max_zone_append_sectors << SECTOR_SHIFT);
 	if (WARN_ON_ONCE(split_sectors > 0))
 		split_sectors = -EINVAL;
 	return bio_submit_split(bio, split_sectors);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5ee3d6d1448d..5cb69d85af0e 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -91,17 +91,16 @@ static int blk_validate_zoned_limits(struct queue_limits *lim)
 	if (lim->zone_write_granularity < lim->logical_block_size)
 		lim->zone_write_granularity = lim->logical_block_size;
 
-	if (lim->max_zone_append_sectors) {
-		/*
-		 * The Zone Append size is limited by the maximum I/O size
-		 * and the zone size given that it can't span zones.
-		 */
-		lim->max_zone_append_sectors =
-			min3(lim->max_hw_sectors,
-			     lim->max_zone_append_sectors,
-			     lim->chunk_sectors);
-	}
-
+	/*
+	 * The Zone Append size is limited by the maximum I/O size and the zone
+	 * size given that it can't span zones.
+	 *
+	 * If no max_hw_zone_append_sectors limit is provided, the block layer
+	 * will emulated it, else we're also bound by the hardware limit.
+	 */
+	lim->max_zone_append_sectors =
+		min_not_zero(lim->max_hw_zone_append_sectors,
+			min(lim->chunk_sectors, lim->max_hw_sectors));
 	return 0;
 }
 
@@ -527,8 +526,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
 	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
 					b->max_write_zeroes_sectors);
-	t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
-					 queue_limits_max_zone_append_sectors(b));
+	t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
+					b->max_hw_zone_append_sectors);
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
 					    b->seg_boundary_mask);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 741b95dfdbf6..d9f22122ae2f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -131,6 +131,7 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
 
 #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field)			\
 static ssize_t queue_##_field##_show(struct gendisk *disk, char *page)	\
@@ -178,18 +179,6 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
 	return ret;
 }
 
-/*
- * For zone append queue_max_zone_append_sectors does not just return the
- * underlying queue limits, but actually contains a calculation.  Because of
- * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here.
- */
-static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page)
-{
-	return sprintf(page, "%llu\n",
-		(u64)queue_max_zone_append_sectors(disk->queue) <<
-			SECTOR_SHIFT);
-}
-
 static ssize_t
 queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count)
 {
@@ -479,7 +468,7 @@ QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
 
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
-QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
 
 QUEUE_RO_ENTRY(queue_zoned, "zoned");
@@ -607,7 +596,7 @@ static struct attribute *queue_attrs[] = {
 	&queue_atomic_write_unit_max_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_max_write_zeroes_sectors_entry.attr,
-	&queue_zone_append_max_entry.attr,
+	&queue_max_zone_append_sectors_entry.attr,
 	&queue_zone_write_granularity_entry.attr,
 	&queue_rotational_entry.attr,
 	&queue_zoned_entry.attr,
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 9bc768b2ca56..0d5f9bf95229 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -166,7 +166,7 @@ int null_init_zoned_dev(struct nullb_device *dev,
 
 	lim->features |= BLK_FEAT_ZONED;
 	lim->chunk_sectors = dev->zone_size_sects;
-	lim->max_zone_append_sectors = dev->zone_append_max_sectors;
+	lim->max_hw_zone_append_sectors = dev->zone_append_max_sectors;
 	lim->max_open_zones = dev->zone_max_open;
 	lim->max_active_zones = dev->zone_max_active;
 	return 0;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 59951e7c2593..8d938b2b41ee 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2270,7 +2270,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 		lim.features |= BLK_FEAT_ZONED;
 		lim.max_active_zones = p->max_active_zones;
 		lim.max_open_zones =  p->max_open_zones;
-		lim.max_zone_append_sectors = p->max_zone_append_sectors;
+		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
 	}
 
 	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 194417abc105..0e99a4714928 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -784,7 +784,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk *vblk,
 			wg, v);
 		return -ENODEV;
 	}
-	lim->max_zone_append_sectors = v;
+	lim->max_hw_zone_append_sectors = v;
 	dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
 
 	return 0;
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index c0d41c36e06e..20edd3fabbab 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -344,7 +344,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
 		clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
 	} else {
 		set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
-		lim->max_zone_append_sectors = 0;
+		lim->max_hw_zone_append_sectors = 0;
 	}
 
 	/*
@@ -379,7 +379,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
 	if (!zlim.mapped_nr_seq_zones) {
 		lim->max_open_zones = 0;
 		lim->max_active_zones = 0;
-		lim->max_zone_append_sectors = 0;
+		lim->max_hw_zone_append_sectors = 0;
 		lim->zone_write_granularity = 0;
 		lim->chunk_sectors = 0;
 		lim->features &= ~BLK_FEAT_ZONED;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 6a15873055b9..c26cb7d3a2e5 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -636,7 +636,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	if (head->ids.csi == NVME_CSI_ZNS)
 		lim.features |= BLK_FEAT_ZONED;
 	else
-		lim.max_zone_append_sectors = 0;
+		lim.max_hw_zone_append_sectors = 0;
 
 	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
 	if (IS_ERR(head->disk))
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 9a06f9d98cd6..382949e18c6a 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -111,7 +111,7 @@ void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
 	lim->features |= BLK_FEAT_ZONED;
 	lim->max_open_zones = zi->max_open_zones;
 	lim->max_active_zones = zi->max_active_zones;
-	lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
+	lim->max_hw_zone_append_sectors = ns->ctrl->max_zone_append;
 	lim->chunk_sectors = ns->head->zsze =
 		nvme_lba_to_sect(ns->head, zi->zone_size);
 }
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index ee2b74238758..de5c54c057ec 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -634,8 +634,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim,
 		lim->max_open_zones = sdkp->zones_max_open;
 	lim->max_active_zones = 0;
 	lim->chunk_sectors = logical_to_sectors(sdkp->device, zone_blocks);
-	/* Enable block layer zone append emulation */
-	lim->max_zone_append_sectors = 0;
 
 	return 0;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7bfc877e159e..6d1413bd69a5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -375,6 +375,7 @@ struct queue_limits {
 	unsigned int		max_user_discard_sectors;
 	unsigned int		max_secure_erase_sectors;
 	unsigned int		max_write_zeroes_sectors;
+	unsigned int		max_hw_zone_append_sectors;
 	unsigned int		max_zone_append_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
@@ -1204,25 +1205,9 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
 	return q->limits.max_segment_size;
 }
 
-static inline unsigned int
-queue_limits_max_zone_append_sectors(const struct queue_limits *l)
-{
-	unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
-
-	return min_not_zero(l->max_zone_append_sectors, max_sectors);
-}
-
-static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
-{
-	if (!blk_queue_is_zoned(q))
-		return 0;
-
-	return queue_limits_max_zone_append_sectors(&q->limits);
-}
-
 static inline bool queue_emulates_zone_append(struct request_queue *q)
 {
-	return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+	return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors;
 }
 
 static inline bool bdev_emulates_zone_append(struct block_device *bdev)
@@ -1233,7 +1218,7 @@ static inline bool bdev_emulates_zone_append(struct block_device *bdev)
 static inline unsigned int
 bdev_max_zone_append_sectors(struct block_device *bdev)
 {
-	return queue_max_zone_append_sectors(bdev_get_queue(bdev));
+	return bdev_limits(bdev)->max_zone_append_sectors;
 }
 
 static inline unsigned int bdev_max_segments(struct block_device *bdev)
-- 
cgit v1.2.3


From 9a783139614fb837da4ccb2f8ec6f0ddc802b3d3 Mon Sep 17 00:00:00 2001
From: Alistair Francis <alistair23@gmail.com>
Date: Mon, 4 Nov 2024 16:03:00 +1000
Subject: bpf: Move btf_type_is_struct_ptr() under CONFIG_BPF_SYSCALL

The static inline btf_type_is_struct_ptr() function calls
btf_type_skip_modifiers() which is guarded by CONFIG_BPF_SYSCALL.
btf_type_is_struct_ptr() is also only called by CONFIG_BPF_SYSCALL
ifdef code, so let's only expose btf_type_is_struct_ptr() if
CONFIG_BPF_SYSCALL is defined.

Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
Link: https://lore.kernel.org/r/20241104060300.421403-1-alistair.francis@wdc.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 631060e3ad14..4214e76c9168 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -582,6 +582,16 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty
 bool btf_types_are_same(const struct btf *btf1, u32 id1,
 			const struct btf *btf2, u32 id2);
 int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx);
+
+static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t)
+{
+	if (!btf_type_is_ptr(t))
+		return false;
+
+	t = btf_type_skip_modifiers(btf, t->type, NULL);
+
+	return btf_type_is_struct(t);
+}
 #else
 static inline const struct btf_type *btf_type_by_id(const struct btf *btf,
 						    u32 type_id)
@@ -661,15 +671,4 @@ static inline int btf_check_iter_arg(struct btf *btf, const struct btf_type *fun
 	return -EOPNOTSUPP;
 }
 #endif
-
-static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t)
-{
-	if (!btf_type_is_ptr(t))
-		return false;
-
-	t = btf_type_skip_modifiers(btf, t->type, NULL);
-
-	return btf_type_is_struct(t);
-}
-
 #endif
-- 
cgit v1.2.3


From cb4158ce8ec8a5bb528cc1693356a5eb8058094d Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 4 Nov 2024 09:19:57 -0800
Subject: bpf: Mark raw_tp arguments with PTR_MAYBE_NULL

Arguments to a raw tracepoint are tagged as trusted, which carries the
semantics that the pointer will be non-NULL.  However, in certain cases,
a raw tracepoint argument may end up being NULL. More context about this
issue is available in [0].

Thus, there is a discrepancy between the reality, that raw_tp arguments
can actually be NULL, and the verifier's knowledge, that they are never
NULL, causing explicit NULL checks to be deleted, and accesses to such
pointers potentially crashing the kernel.

To fix this, mark raw_tp arguments as PTR_MAYBE_NULL, and then special
case the dereference and pointer arithmetic to permit it, and allow
passing them into helpers/kfuncs; these exceptions are made for raw_tp
programs only. Ensure that we don't do this when ref_obj_id > 0, as in
that case this is an acquired object and doesn't need such adjustment.

The reason we do mask_raw_tp_trusted_reg logic is because other will
recheck in places whether the register is a trusted_reg, and then
consider our register as untrusted when detecting the presence of the
PTR_MAYBE_NULL flag.

To allow safe dereference, we enable PROBE_MEM marking when we see loads
into trusted pointers with PTR_MAYBE_NULL.

While trusted raw_tp arguments can also be passed into helpers or kfuncs
where such broken assumption may cause issues, a future patch set will
tackle their case separately, as PTR_TO_BTF_ID (without PTR_TRUSTED) can
already be passed into helpers and causes similar problems. Thus, they
are left alone for now.

It is possible that these checks also permit passing non-raw_tp args
that are trusted PTR_TO_BTF_ID with null marking. In such a case,
allowing dereference when pointer is NULL expands allowed behavior, so
won't regress existing programs, and the case of passing these into
helpers is the same as above and will be dealt with later.

Also update the failure case in tp_btf_nullable selftest to capture the
new behavior, as the verifier will no longer cause an error when
directly dereference a raw tracepoint argument marked as __nullable.

  [0]: https://lore.kernel.org/bpf/ZrCZS6nisraEqehw@jlelli-thinkpadt14gen4.remote.csb

Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Reported-by: Juri Lelli <juri.lelli@redhat.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20241104171959.2938862-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  6 ++
 kernel/bpf/btf.c                                   |  5 +-
 kernel/bpf/verifier.c                              | 79 ++++++++++++++++++++--
 .../selftests/bpf/progs/test_tp_btf_nullable.c     |  6 +-
 4 files changed, 87 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3ba4d475174..1b84613b10ac 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3495,4 +3495,10 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog)
 	return prog->aux->func_idx != 0;
 }
 
+static inline bool bpf_prog_is_raw_tp(const struct bpf_prog *prog)
+{
+	return prog->type == BPF_PROG_TYPE_TRACING &&
+	       prog->expected_attach_type == BPF_TRACE_RAW_TP;
+}
+
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ed3219da7181..e7a59e6462a9 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6588,7 +6588,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	if (prog_args_trusted(prog))
 		info->reg_type |= PTR_TRUSTED;
 
-	if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
+	/* Raw tracepoint arguments always get marked as maybe NULL */
+	if (bpf_prog_is_raw_tp(prog))
+		info->reg_type |= PTR_MAYBE_NULL;
+	else if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
 		info->reg_type |= PTR_MAYBE_NULL;
 
 	if (tgt_prog) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ba800c7611e3..7958d6ff6b73 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -418,6 +418,25 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
 	return rec;
 }
 
+static bool mask_raw_tp_reg_cond(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) {
+	return reg->type == (PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL) &&
+	       bpf_prog_is_raw_tp(env->prog) && !reg->ref_obj_id;
+}
+
+static bool mask_raw_tp_reg(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	if (!mask_raw_tp_reg_cond(env, reg))
+		return false;
+	reg->type &= ~PTR_MAYBE_NULL;
+	return true;
+}
+
+static void unmask_raw_tp_reg(struct bpf_reg_state *reg, bool result)
+{
+	if (result)
+		reg->type |= PTR_MAYBE_NULL;
+}
+
 static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
 {
 	struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
@@ -6622,6 +6641,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	const char *field_name = NULL;
 	enum bpf_type_flag flag = 0;
 	u32 btf_id = 0;
+	bool mask;
 	int ret;
 
 	if (!env->allow_ptr_leaks) {
@@ -6693,7 +6713,21 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 
 	if (ret < 0)
 		return ret;
-
+	/* For raw_tp progs, we allow dereference of PTR_MAYBE_NULL
+	 * trusted PTR_TO_BTF_ID, these are the ones that are possibly
+	 * arguments to the raw_tp. Since internal checks in for trusted
+	 * reg in check_ptr_to_btf_access would consider PTR_MAYBE_NULL
+	 * modifier as problematic, mask it out temporarily for the
+	 * check. Don't apply this to pointers with ref_obj_id > 0, as
+	 * those won't be raw_tp args.
+	 *
+	 * We may end up applying this relaxation to other trusted
+	 * PTR_TO_BTF_ID with maybe null flag, since we cannot
+	 * distinguish PTR_MAYBE_NULL tagged for arguments vs normal
+	 * tagging, but that should expand allowed behavior, and not
+	 * cause regression for existing behavior.
+	 */
+	mask = mask_raw_tp_reg(env, reg);
 	if (ret != PTR_TO_BTF_ID) {
 		/* just mark; */
 
@@ -6754,8 +6788,13 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		clear_trusted_flags(&flag);
 	}
 
-	if (atype == BPF_READ && value_regno >= 0)
+	if (atype == BPF_READ && value_regno >= 0) {
 		mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
+		/* We've assigned a new type to regno, so don't undo masking. */
+		if (regno == value_regno)
+			mask = false;
+	}
+	unmask_raw_tp_reg(reg, mask);
 
 	return 0;
 }
@@ -7140,7 +7179,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else if (base_type(reg->type) == PTR_TO_BTF_ID &&
-		   !type_may_be_null(reg->type)) {
+		   (mask_raw_tp_reg_cond(env, reg) || !type_may_be_null(reg->type))) {
 		err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
 					      value_regno);
 	} else if (reg->type == CONST_PTR_TO_MAP) {
@@ -8833,6 +8872,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	enum bpf_reg_type type = reg->type;
 	u32 *arg_btf_id = NULL;
 	int err = 0;
+	bool mask;
 
 	if (arg_type == ARG_DONTCARE)
 		return 0;
@@ -8873,11 +8913,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	    base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
 		arg_btf_id = fn->arg_btf_id[arg];
 
+	mask = mask_raw_tp_reg(env, reg);
 	err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
-	if (err)
-		return err;
 
-	err = check_func_arg_reg_off(env, reg, regno, arg_type);
+	err = err ?: check_func_arg_reg_off(env, reg, regno, arg_type);
+	unmask_raw_tp_reg(reg, mask);
 	if (err)
 		return err;
 
@@ -9672,14 +9712,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 				return ret;
 		} else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
 			struct bpf_call_arg_meta meta;
+			bool mask;
 			int err;
 
 			if (register_is_null(reg) && type_may_be_null(arg->arg_type))
 				continue;
 
 			memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
+			mask = mask_raw_tp_reg(env, reg);
 			err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
 			err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
+			unmask_raw_tp_reg(reg, mask);
 			if (err)
 				return err;
 		} else {
@@ -12007,6 +12050,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		enum bpf_arg_type arg_type = ARG_DONTCARE;
 		u32 regno = i + 1, ref_id, type_size;
 		bool is_ret_buf_sz = false;
+		bool mask = false;
 		int kf_arg_type;
 
 		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
@@ -12065,12 +12109,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			return -EINVAL;
 		}
 
+		mask = mask_raw_tp_reg(env, reg);
 		if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
 		    (register_is_null(reg) || type_may_be_null(reg->type)) &&
 			!is_kfunc_arg_nullable(meta->btf, &args[i])) {
 			verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+			unmask_raw_tp_reg(reg, mask);
 			return -EACCES;
 		}
+		unmask_raw_tp_reg(reg, mask);
 
 		if (reg->ref_obj_id) {
 			if (is_kfunc_release(meta) && meta->ref_obj_id) {
@@ -12128,16 +12175,24 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
 				break;
 
+			/* Allow passing maybe NULL raw_tp arguments to
+			 * kfuncs for compatibility. Don't apply this to
+			 * arguments with ref_obj_id > 0.
+			 */
+			mask = mask_raw_tp_reg(env, reg);
 			if (!is_trusted_reg(reg)) {
 				if (!is_kfunc_rcu(meta)) {
 					verbose(env, "R%d must be referenced or trusted\n", regno);
+					unmask_raw_tp_reg(reg, mask);
 					return -EINVAL;
 				}
 				if (!is_rcu_reg(reg)) {
 					verbose(env, "R%d must be a rcu pointer\n", regno);
+					unmask_raw_tp_reg(reg, mask);
 					return -EINVAL;
 				}
 			}
+			unmask_raw_tp_reg(reg, mask);
 			fallthrough;
 		case KF_ARG_PTR_TO_CTX:
 		case KF_ARG_PTR_TO_DYNPTR:
@@ -12160,7 +12215,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 
 		if (is_kfunc_release(meta) && reg->ref_obj_id)
 			arg_type |= OBJ_RELEASE;
+		mask = mask_raw_tp_reg(env, reg);
 		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+		unmask_raw_tp_reg(reg, mask);
 		if (ret < 0)
 			return ret;
 
@@ -12337,6 +12394,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			ref_tname = btf_name_by_offset(btf, ref_t->name_off);
 			fallthrough;
 		case KF_ARG_PTR_TO_BTF_ID:
+			mask = mask_raw_tp_reg(env, reg);
 			/* Only base_type is checked, further checks are done here */
 			if ((base_type(reg->type) != PTR_TO_BTF_ID ||
 			     (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
@@ -12345,9 +12403,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				verbose(env, "expected %s or socket\n",
 					reg_type_str(env, base_type(reg->type) |
 							  (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
+				unmask_raw_tp_reg(reg, mask);
 				return -EINVAL;
 			}
 			ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+			unmask_raw_tp_reg(reg, mask);
 			if (ret < 0)
 				return ret;
 			break;
@@ -13320,7 +13380,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
  */
 static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 				   struct bpf_insn *insn,
-				   const struct bpf_reg_state *ptr_reg,
+				   struct bpf_reg_state *ptr_reg,
 				   const struct bpf_reg_state *off_reg)
 {
 	struct bpf_verifier_state *vstate = env->cur_state;
@@ -13334,6 +13394,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	struct bpf_sanitize_info info = {};
 	u8 opcode = BPF_OP(insn->code);
 	u32 dst = insn->dst_reg;
+	bool mask;
 	int ret;
 
 	dst_reg = &regs[dst];
@@ -13360,11 +13421,14 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
+	mask = mask_raw_tp_reg(env, ptr_reg);
 	if (ptr_reg->type & PTR_MAYBE_NULL) {
 		verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
 			dst, reg_type_str(env, ptr_reg->type));
+		unmask_raw_tp_reg(ptr_reg, mask);
 		return -EACCES;
 	}
+	unmask_raw_tp_reg(ptr_reg, mask);
 
 	switch (base_type(ptr_reg->type)) {
 	case PTR_TO_CTX:
@@ -19866,6 +19930,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		 * for this case.
 		 */
 		case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
+		case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL:
 			if (type == BPF_READ) {
 				if (BPF_MODE(insn->code) == BPF_MEM)
 					insn->code = BPF_LDX | BPF_PROBE_MEM |
diff --git a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c
index bba3e37f749b..5aaf2b065f86 100644
--- a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c
+++ b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c
@@ -7,7 +7,11 @@
 #include "bpf_misc.h"
 
 SEC("tp_btf/bpf_testmod_test_nullable_bare")
-__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'")
+/* This used to be a failure test, but raw_tp nullable arguments can now
+ * directly be dereferenced, whether they have nullable annotation or not,
+ * and don't need to be explicitly checked.
+ */
+__success
 int BPF_PROG(handle_tp_btf_nullable_bare1, struct bpf_testmod_test_read_ctx *nullable_ctx)
 {
 	return nullable_ctx->len;
-- 
cgit v1.2.3


From ad37bcd965fda43f34cf5cc051f5d310880bd1e7 Mon Sep 17 00:00:00 2001
From: Alice Ryhl <aliceryhl@google.com>
Date: Wed, 30 Oct 2024 16:04:25 +0000
Subject: rust: add tracepoint support

Make it possible to have Rust code call into tracepoints defined by C
code. It is still required that the tracepoint is declared in a C
header, and that this header is included in the input to bindgen.

Instead of calling __DO_TRACE directly, the exported rust_do_trace_
function calls an inline helper function. This is because the `cond`
argument does not exist at the callsite of DEFINE_RUST_DO_TRACE.

__DECLARE_TRACE always emits an inline static and an extern declaration
that is only used when CREATE_RUST_TRACE_POINTS is set. These should not
end up in the final binary so it is not a problem that they sometimes
are emitted without a user.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Cc: " =?utf-8?q?Bj=C3=B6rn_Roy_Baron?= " <bjorn3_gh@protonmail.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Uros Bizjak <ubizjak@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Fuad Tabba <tabba@google.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Anup Patel <apatel@ventanamicro.com>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Conor Dooley <conor.dooley@microchip.com>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Bibo Mao <maobibo@loongson.cn>
Cc: Tiezhu Yang <yangtiezhu@loongson.cn>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tianrui Zhao <zhaotianrui@loongson.cn>
Link: https://lore.kernel.org/20241030-tracepoint-v12-2-eec7f0f8ad22@google.com
Reviewed-by: Carlos Llamas <cmllamas@google.com>
Reviewed-by: Gary Guo <gary@garyguo.net>
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h      | 28 ++++++++++++++++++++++-
 include/trace/define_trace.h    | 12 ++++++++++
 rust/bindings/bindings_helper.h |  1 +
 rust/kernel/lib.rs              |  1 +
 rust/kernel/tracepoint.rs       | 49 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 rust/kernel/tracepoint.rs

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 0dc67fad706c..84c4924e499f 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -225,6 +225,18 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			preempt_enable_notrace();			\
 	} while (0)
 
+/*
+ * Declare an exported function that Rust code can call to trigger this
+ * tracepoint. This function does not include the static branch; that is done
+ * in Rust to avoid a function call when the tracepoint is disabled.
+ */
+#define DEFINE_RUST_DO_TRACE(name, proto, args)
+#define __DEFINE_RUST_DO_TRACE(name, proto, args)			\
+	notrace void rust_do_trace_##name(proto)			\
+	{								\
+		__rust_do_trace_##name(args);				\
+	}
+
 /*
  * Make sure the alignment of the structure in the __tracepoints section will
  * not add unwanted padding between the beginning of the section and the
@@ -240,6 +252,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	extern int __traceiter_##name(data_proto);			\
 	DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name);	\
 	extern struct tracepoint __tracepoint_##name;			\
+	extern void rust_do_trace_##name(proto);			\
 	static inline int						\
 	register_trace_##name(void (*probe)(data_proto), void *data)	\
 	{								\
@@ -271,6 +284,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 
 #define __DECLARE_TRACE(name, proto, args, cond, data_proto)		\
 	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
+	static inline void __rust_do_trace_##name(proto)		\
+	{								\
+		__DO_TRACE(name,					\
+			TP_ARGS(args),					\
+			TP_CONDITION(cond), 0);				\
+	}								\
 	static inline void trace_##name(proto)				\
 	{								\
 		if (static_branch_unlikely(&__tracepoint_##name.key))	\
@@ -285,6 +304,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 
 #define __DECLARE_TRACE_SYSCALL(name, proto, args, cond, data_proto)	\
 	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
+	static inline void __rust_do_trace_##name(proto)		\
+	{								\
+		__DO_TRACE(name,					\
+			TP_ARGS(args),					\
+			TP_CONDITION(cond), 1);				\
+	}								\
 	static inline void trace_##name(proto)				\
 	{								\
 		if (static_branch_unlikely(&__tracepoint_##name.key))	\
@@ -339,7 +364,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	void __probestub_##_name(void *__data, proto)			\
 	{								\
 	}								\
-	DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);
+	DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);	\
+	DEFINE_RUST_DO_TRACE(_name, TP_PROTO(proto), TP_ARGS(args))
 
 #define DEFINE_TRACE(name, proto, args)		\
 	DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args));
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index ff5fa17a6259..0557626b6f6a 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -76,6 +76,13 @@
 #define DECLARE_TRACE(name, proto, args)	\
 	DEFINE_TRACE(name, PARAMS(proto), PARAMS(args))
 
+/* If requested, create helpers for calling these tracepoints from Rust. */
+#ifdef CREATE_RUST_TRACE_POINTS
+#undef DEFINE_RUST_DO_TRACE
+#define DEFINE_RUST_DO_TRACE(name, proto, args)	\
+	__DEFINE_RUST_DO_TRACE(name, PARAMS(proto), PARAMS(args))
+#endif
+
 #undef TRACE_INCLUDE
 #undef __TRACE_INCLUDE
 
@@ -134,6 +141,11 @@
 # undef UNDEF_TRACE_INCLUDE_PATH
 #endif
 
+#ifdef CREATE_RUST_TRACE_POINTS
+# undef DEFINE_RUST_DO_TRACE
+# define DEFINE_RUST_DO_TRACE(name, proto, args)
+#endif
+
 /* We may be processing more files */
 #define CREATE_TRACE_POINTS
 
diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index e0846e7e93e6..752572e638a6 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -20,6 +20,7 @@
 #include <linux/refcount.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/tracepoint.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
index 708ff817ccc3..55f81f49024e 100644
--- a/rust/kernel/lib.rs
+++ b/rust/kernel/lib.rs
@@ -54,6 +54,7 @@ pub mod str;
 pub mod sync;
 pub mod task;
 pub mod time;
+pub mod tracepoint;
 pub mod types;
 pub mod uaccess;
 pub mod workqueue;
diff --git a/rust/kernel/tracepoint.rs b/rust/kernel/tracepoint.rs
new file mode 100644
index 000000000000..c6e80aa99e8e
--- /dev/null
+++ b/rust/kernel/tracepoint.rs
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2024 Google LLC.
+
+//! Logic for tracepoints.
+
+/// Declare the Rust entry point for a tracepoint.
+///
+/// This macro generates an unsafe function that calls into C, and its safety requirements will be
+/// whatever the relevant C code requires. To document these safety requirements, you may add
+/// doc-comments when invoking the macro.
+#[macro_export]
+macro_rules! declare_trace {
+    ($($(#[$attr:meta])* $pub:vis unsafe fn $name:ident($($argname:ident : $argtyp:ty),* $(,)?);)*) => {$(
+        $( #[$attr] )*
+        #[inline(always)]
+        $pub unsafe fn $name($($argname : $argtyp),*) {
+            #[cfg(CONFIG_TRACEPOINTS)]
+            {
+                // SAFETY: It's always okay to query the static key for a tracepoint.
+                let should_trace = unsafe {
+                    $crate::macros::paste! {
+                        $crate::jump_label::static_branch_unlikely!(
+                            $crate::bindings::[< __tracepoint_ $name >],
+                            $crate::bindings::tracepoint,
+                            key
+                        )
+                    }
+                };
+
+                if should_trace {
+                    $crate::macros::paste! {
+                        // SAFETY: The caller guarantees that it is okay to call this tracepoint.
+                        unsafe { $crate::bindings::[< rust_do_trace_ $name >]($($argname),*) };
+                    }
+                }
+            }
+
+            #[cfg(not(CONFIG_TRACEPOINTS))]
+            {
+                // If tracepoints are disabled, insert a trivial use of each argument
+                // to avoid unused argument warnings.
+                $( let _unused = $argname; )*
+            }
+        }
+    )*}
+}
+
+pub use declare_trace;
-- 
cgit v1.2.3


From 5609296750afd6462a4d994b6803ccc5e8bf1d4e Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 30 Oct 2024 16:39:51 +0000
Subject: PM: EM: Add min/max available performance state limits

On some devices there are HW dependencies for shared frequency and voltage
between devices. It will impact Energy Aware Scheduler (EAS) decision,
where CPUs share the voltage & frequency domain with other CPUs or devices
e.g.
 - Mid CPUs + Big CPU
 - Little CPU + L3 cache in DSU
 - some other device + Little CPUs

Detailed explanation of one example:
When the L3 cache frequency is increased, the affected Little CPUs might
run at higher voltage and frequency. That higher voltage causes higher CPU
power and thus more energy is used for running the tasks. This is
important for background running tasks, which try to run on energy
efficient CPUs.

Therefore, add performance state limits which are applied for the device
(in this case CPU). This is important on SoCs with HW dependencies
mentioned above so that the Energy Aware Scheduler (EAS) does not use
performance states outside the valid min-max range for energy calculation.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://patch.msgid.link/20241030164126.1263793-2-lukasz.luba@arm.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/energy_model.h | 29 +++++++++++++++++-------
 kernel/power/energy_model.c  | 52 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 1ff52020cf75..752e0b297582 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -55,6 +55,8 @@ struct em_perf_table {
  * struct em_perf_domain - Performance domain
  * @em_table:		Pointer to the runtime modifiable em_perf_table
  * @nr_perf_states:	Number of performance states
+ * @min_perf_state:	Minimum allowed Performance State index
+ * @max_perf_state:	Maximum allowed Performance State index
  * @flags:		See "em_perf_domain flags"
  * @cpus:		Cpumask covering the CPUs of the domain. It's here
  *			for performance reasons to avoid potential cache
@@ -70,6 +72,8 @@ struct em_perf_table {
 struct em_perf_domain {
 	struct em_perf_table __rcu *em_table;
 	int nr_perf_states;
+	int min_perf_state;
+	int max_perf_state;
 	unsigned long flags;
 	unsigned long cpus[];
 };
@@ -173,13 +177,14 @@ void em_table_free(struct em_perf_table __rcu *table);
 int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
 			 int nr_states);
 int em_dev_update_chip_binning(struct device *dev);
+int em_update_performance_limits(struct em_perf_domain *pd,
+		unsigned long freq_min_khz, unsigned long freq_max_khz);
 
 /**
  * em_pd_get_efficient_state() - Get an efficient performance state from the EM
  * @table:		List of performance states, in ascending order
- * @nr_perf_states:	Number of performance states
+ * @pd:			performance domain for which this must be done
  * @max_util:		Max utilization to map with the EM
- * @pd_flags:		Performance Domain flags
  *
  * It is called from the scheduler code quite frequently and as a consequence
  * doesn't implement any check.
@@ -188,13 +193,16 @@ int em_dev_update_chip_binning(struct device *dev);
  * requirement.
  */
 static inline int
-em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
-			  unsigned long max_util, unsigned long pd_flags)
+em_pd_get_efficient_state(struct em_perf_state *table,
+			  struct em_perf_domain *pd, unsigned long max_util)
 {
+	unsigned long pd_flags = pd->flags;
+	int min_ps = pd->min_perf_state;
+	int max_ps = pd->max_perf_state;
 	struct em_perf_state *ps;
 	int i;
 
-	for (i = 0; i < nr_perf_states; i++) {
+	for (i = min_ps; i <= max_ps; i++) {
 		ps = &table[i];
 		if (ps->performance >= max_util) {
 			if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
@@ -204,7 +212,7 @@ em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
 		}
 	}
 
-	return nr_perf_states - 1;
+	return max_ps;
 }
 
 /**
@@ -253,8 +261,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 * requested performance.
 	 */
 	em_table = rcu_dereference(pd->em_table);
-	i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
-				      max_util, pd->flags);
+	i = em_pd_get_efficient_state(em_table->state, pd, max_util);
 	ps = &em_table->state[i];
 
 	/*
@@ -391,6 +398,12 @@ static inline int em_dev_update_chip_binning(struct device *dev)
 {
 	return -EINVAL;
 }
+static inline
+int em_update_performance_limits(struct em_perf_domain *pd,
+		unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 927cc55ba0b3..d07faf42eace 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -628,6 +628,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 		goto unlock;
 
 	dev->em_pd->flags |= flags;
+	dev->em_pd->min_perf_state = 0;
+	dev->em_pd->max_perf_state = nr_states - 1;
 
 	em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
 
@@ -856,3 +858,53 @@ int em_dev_update_chip_binning(struct device *dev)
 	return em_recalc_and_update(dev, pd, em_table);
 }
 EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
+
+
+/**
+ * em_update_performance_limits() - Update Energy Model with performance
+ *				limits information.
+ * @pd			: Performance Domain with EM that has to be updated.
+ * @freq_min_khz	: New minimum allowed frequency for this device.
+ * @freq_max_khz	: New maximum allowed frequency for this device.
+ *
+ * This function allows to update the EM with information about available
+ * performance levels. It takes the minimum and maximum frequency in kHz
+ * and does internal translation to performance levels.
+ * Returns 0 on success or -EINVAL when failed.
+ */
+int em_update_performance_limits(struct em_perf_domain *pd,
+		unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+	struct em_perf_state *table;
+	int min_ps = -1;
+	int max_ps = -1;
+	int i;
+
+	if (!pd)
+		return -EINVAL;
+
+	rcu_read_lock();
+	table = em_perf_state_from_pd(pd);
+
+	for (i = 0; i < pd->nr_perf_states; i++) {
+		if (freq_min_khz == table[i].frequency)
+			min_ps = i;
+		if (freq_max_khz == table[i].frequency)
+			max_ps = i;
+	}
+	rcu_read_unlock();
+
+	/* Only update when both are found and sane */
+	if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
+		return -EINVAL;
+
+
+	/* Guard simultaneous updates and make them atomic */
+	mutex_lock(&em_pd_mutex);
+	pd->min_perf_state = min_ps;
+	pd->max_perf_state = max_ps;
+	mutex_unlock(&em_pd_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(em_update_performance_limits);
-- 
cgit v1.2.3


From 9e0933c21c128d6d8ac4d8aae0babaf9a43100b8 Mon Sep 17 00:00:00 2001
From: John Garry <john.g.garry@oracle.com>
Date: Mon, 4 Nov 2024 16:14:02 -0800
Subject: fs: iomap: Atomic write support

Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC
flag set.

Initially FSes (XFS) should only support writing a single FS block
atomically.

As with any atomic write, we should produce a single bio which covers the
complete write length.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
[djwong: clarify a couple of things in the docs]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 Documentation/filesystems/iomap/operations.rst | 15 ++++++++++
 fs/iomap/direct-io.c                           | 38 +++++++++++++++++++++++---
 fs/iomap/trace.h                               |  3 +-
 include/linux/iomap.h                          |  1 +
 4 files changed, 52 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 8e6c721d2330..ee790f843cfa 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -513,6 +513,21 @@ IOMAP_WRITE`` with any combination of the following enhancements:
    if the mapping is unwritten and the filesystem cannot handle zeroing
    the unaligned regions without exposing stale contents.
 
+ * ``IOMAP_ATOMIC``: This write is being issued with torn-write
+   protection.
+   Only a single bio can be created for the write, and the write must
+   not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
+   set.
+   The file range to write must be aligned to satisfy the requirements
+   of both the filesystem and the underlying block device's atomic
+   commit capabilities.
+   If filesystem metadata updates are required (e.g. unwritten extent
+   conversion or copy on write), all updates for the entire file range
+   must be committed atomically as well.
+   Only one space mapping is allowed per untorn write.
+   Untorn writes must be aligned to, and must not be longer than, a
+   single file block.
+
 Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
 calling this function.
 
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f637aa0706a3..ed4764e3b8f0 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
  * clearing the WRITE_THROUGH flag in the dio request.
  */
 static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
-		const struct iomap *iomap, bool use_fua)
+		const struct iomap *iomap, bool use_fua, bool atomic)
 {
 	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
 
@@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
 		opflags |= REQ_FUA;
 	else
 		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+	if (atomic)
+		opflags |= REQ_ATOMIC;
 
 	return opflags;
 }
@@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	const struct iomap *iomap = &iter->iomap;
 	struct inode *inode = iter->inode;
 	unsigned int fs_block_size = i_blocksize(inode), pad;
-	loff_t length = iomap_length(iter);
+	const loff_t length = iomap_length(iter);
+	bool atomic = iter->flags & IOMAP_ATOMIC;
 	loff_t pos = iter->pos;
 	blk_opf_t bio_opf;
 	struct bio *bio;
@@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	size_t copied = 0;
 	size_t orig_count;
 
+	if (atomic && length != fs_block_size)
+		return -EINVAL;
+
 	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
 	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
 		return -EINVAL;
@@ -382,7 +388,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	 * can set up the page vector appropriately for a ZONE_APPEND
 	 * operation.
 	 */
-	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
 
 	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
 	do {
@@ -415,6 +421,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		}
 
 		n = bio->bi_iter.bi_size;
+		if (WARN_ON_ONCE(atomic && n != length)) {
+			/*
+			 * This bio should have covered the complete length,
+			 * which it doesn't, so error. We may need to zero out
+			 * the tail (complete FS block), similar to when
+			 * bio_iov_iter_get_pages() returns an error, above.
+			 */
+			ret = -EINVAL;
+			bio_put(bio);
+			goto zero_tail;
+		}
 		if (dio->flags & IOMAP_DIO_WRITE) {
 			task_io_account_write(n);
 		} else {
@@ -598,6 +615,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		iomi.flags |= IOMAP_NOWAIT;
 
+	if (iocb->ki_flags & IOCB_ATOMIC)
+		iomi.flags |= IOMAP_ATOMIC;
+
 	if (iov_iter_rw(iter) == READ) {
 		/* reads can always complete inline */
 		dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -659,7 +679,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			if (ret != -EAGAIN) {
 				trace_iomap_dio_invalidate_fail(inode, iomi.pos,
 								iomi.len);
-				ret = -ENOTBLK;
+				if (iocb->ki_flags & IOCB_ATOMIC) {
+					/*
+					 * folio invalidation failed, maybe
+					 * this is transient, unlock and see if
+					 * the caller tries again.
+					 */
+					ret = -EAGAIN;
+				} else {
+					/* fall back to buffered write */
+					ret = -ENOTBLK;
+				}
 			}
 			goto out_free_dio;
 		}
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 0a991c4ce87d..4118a42cdab0 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
 	{ IOMAP_REPORT,		"REPORT" }, \
 	{ IOMAP_FAULT,		"FAULT" }, \
 	{ IOMAP_DIRECT,		"DIRECT" }, \
-	{ IOMAP_NOWAIT,		"NOWAIT" }
+	{ IOMAP_NOWAIT,		"NOWAIT" }, \
+	{ IOMAP_ATOMIC,		"ATOMIC" }
 
 #define IOMAP_F_FLAGS_STRINGS \
 	{ IOMAP_F_NEW,		"NEW" }, \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4ad12a3c8bae..c7644bdcfca3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -178,6 +178,7 @@ struct iomap_folio_ops {
 #else
 #define IOMAP_DAX		0
 #endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC		(1 << 9)
 
 struct iomap_ops {
 	/*
-- 
cgit v1.2.3


From 0a2cdeeae9ddc14d752173be6af98bc9fb45c6ad Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Mon, 4 Nov 2024 15:00:41 +0800
Subject: net: tcp: replace the document for "lsndtime" in tcp_sock

Commit d5fed5addb2b ("tcp: reorganize tcp_sock fast path variables")
moved the fields around and misplaced the documentation for "lsndtime".
So, let's replace it in the proper place.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20241104070041.64302-1-dongml2@chinatelecom.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/tcp.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 6a5e08b937b3..f88daaa76d83 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -200,7 +200,6 @@ struct tcp_sock {
 
 	/* TX read-mostly hotpath cache lines */
 	__cacheline_group_begin(tcp_sock_read_tx);
-	/* timestamp of last sent data packet (for restart window) */
 	u32	max_window;	/* Maximal window ever seen from peer	*/
 	u32	rcv_ssthresh;	/* Current window clamp			*/
 	u32	reordering;	/* Packet reordering metric.		*/
@@ -263,7 +262,7 @@ struct tcp_sock {
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
 	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
-	u32	lsndtime;
+	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
 	u32	mdev_us;	/* medium deviation			*/
 	u32	rtt_seq;	/* sequence number to update rttvar	*/
 	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
-- 
cgit v1.2.3


From f248ff14b7589306c8af922465aefedf9b10fa9e Mon Sep 17 00:00:00 2001
From: Desnes Nunes <desnesn@redhat.com>
Date: Thu, 31 Oct 2024 11:28:00 -0300
Subject: misc: rtsx: Cleanup on DRV_NAME cardreader variables

The rtsx_pci_ms memstick driver has been dropped, thus there is no more
need for DRV_NAME_RTSX_PCI_MS variable. Additionally, this also stand-
arizes DRV_NAME variables on alcor_pci and rtsx_usb drivers.

Fixes: d0f459259c13 ("memstick: rtsx_pci_ms: Remove Realtek PCI memstick driver")
Signed-off-by: Desnes Nunes <desnesn@redhat.com>
Link: https://lore.kernel.org/r/20241031142801.1141680-1-desnesn@redhat.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/cardreader/alcor_pci.c | 2 --
 drivers/misc/cardreader/rtsx_usb.c  | 6 +++---
 include/linux/alcor_pci.h           | 1 +
 include/linux/rtsx_common.h         | 1 -
 include/linux/rtsx_usb.h            | 4 ++++
 5 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/cardreader/alcor_pci.c b/drivers/misc/cardreader/alcor_pci.c
index 0142c4bf4f42..a5549eaf52d0 100644
--- a/drivers/misc/cardreader/alcor_pci.c
+++ b/drivers/misc/cardreader/alcor_pci.c
@@ -17,8 +17,6 @@
 
 #include <linux/alcor_pci.h>
 
-#define DRV_NAME_ALCOR_PCI			"alcor_pci"
-
 static DEFINE_IDA(alcor_pci_idr);
 
 static struct mfd_cell alcor_pci_cells[] = {
diff --git a/drivers/misc/cardreader/rtsx_usb.c b/drivers/misc/cardreader/rtsx_usb.c
index f150d8769f19..77b0490a1b38 100644
--- a/drivers/misc/cardreader/rtsx_usb.c
+++ b/drivers/misc/cardreader/rtsx_usb.c
@@ -20,11 +20,11 @@ MODULE_PARM_DESC(polling_pipe, "polling pipe (0: ctl, 1: bulk)");
 
 static const struct mfd_cell rtsx_usb_cells[] = {
 	[RTSX_USB_SD_CARD] = {
-		.name = "rtsx_usb_sdmmc",
+		.name = DRV_NAME_RTSX_USB_SDMMC,
 		.pdata_size = 0,
 	},
 	[RTSX_USB_MS_CARD] = {
-		.name = "rtsx_usb_ms",
+		.name = DRV_NAME_RTSX_USB_MS,
 		.pdata_size = 0,
 	},
 };
@@ -780,7 +780,7 @@ static const struct usb_device_id rtsx_usb_usb_ids[] = {
 MODULE_DEVICE_TABLE(usb, rtsx_usb_usb_ids);
 
 static struct usb_driver rtsx_usb_driver = {
-	.name			= "rtsx_usb",
+	.name			= DRV_NAME_RTSX_USB,
 	.probe			= rtsx_usb_probe,
 	.disconnect		= rtsx_usb_disconnect,
 	.suspend		= rtsx_usb_suspend,
diff --git a/include/linux/alcor_pci.h b/include/linux/alcor_pci.h
index c4a0b23846d8..dcb1d37dabc2 100644
--- a/include/linux/alcor_pci.h
+++ b/include/linux/alcor_pci.h
@@ -11,6 +11,7 @@
 #define ALCOR_SD_CARD 0
 #define ALCOR_MS_CARD 1
 
+#define DRV_NAME_ALCOR_PCI			"alcor_pci"
 #define DRV_NAME_ALCOR_PCI_SDMMC		"alcor_sdmmc"
 #define DRV_NAME_ALCOR_PCI_MS			"alcor_ms"
 
diff --git a/include/linux/rtsx_common.h b/include/linux/rtsx_common.h
index bf290ad14c57..da9c8c6b5d50 100644
--- a/include/linux/rtsx_common.h
+++ b/include/linux/rtsx_common.h
@@ -12,7 +12,6 @@
 
 #define DRV_NAME_RTSX_PCI		"rtsx_pci"
 #define DRV_NAME_RTSX_PCI_SDMMC		"rtsx_pci_sdmmc"
-#define DRV_NAME_RTSX_PCI_MS		"rtsx_pci_ms"
 
 #define RTSX_REG_PAIR(addr, val)	(((u32)(addr) << 16) | (u8)(val))
 
diff --git a/include/linux/rtsx_usb.h b/include/linux/rtsx_usb.h
index 3247ed8e9ff0..f267a06c6b1e 100644
--- a/include/linux/rtsx_usb.h
+++ b/include/linux/rtsx_usb.h
@@ -12,6 +12,10 @@
 
 #include <linux/usb.h>
 
+#define DRV_NAME_RTSX_USB		"rtsx_usb"
+#define DRV_NAME_RTSX_USB_SDMMC		"rtsx_usb_sdmmc"
+#define DRV_NAME_RTSX_USB_MS		"rtsx_usb_ms"
+
 /* related module names */
 #define RTSX_USB_SD_CARD	0
 #define RTSX_USB_MS_CARD	1
-- 
cgit v1.2.3


From 7fce207af5ec074a9a50e90eb866b17ca4a90f06 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 24 Oct 2024 10:18:07 -0700
Subject: mm/writeback: add folio_mark_dirty_lock()

Add a new convenience helper folio_mark_dirty_lock() that grabs the
folio lock before calling folio_mark_dirty().

Refactor set_page_dirty_lock() to directly use folio_mark_dirty_lock().

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 include/linux/mm.h  |  1 +
 mm/folio-compat.c   |  6 ++++++
 mm/page-writeback.c | 22 +++++++++++-----------
 3 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecf63d2b0582..446d7096c48f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2539,6 +2539,7 @@ struct kvec;
 struct page *get_dump_page(unsigned long addr);
 
 bool folio_mark_dirty(struct folio *folio);
+bool folio_mark_dirty_lock(struct folio *folio);
 bool set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
 
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 80746182e9e8..1d1832e2a599 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -52,6 +52,12 @@ bool set_page_dirty(struct page *page)
 }
 EXPORT_SYMBOL(set_page_dirty);
 
+int set_page_dirty_lock(struct page *page)
+{
+	return folio_mark_dirty_lock(page_folio(page));
+}
+EXPORT_SYMBOL(set_page_dirty_lock);
+
 bool clear_page_dirty_for_io(struct page *page)
 {
 	return folio_clear_dirty_for_io(page_folio(page));
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index fcd4c1439cb9..db00a66d8b84 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2914,25 +2914,25 @@ bool folio_mark_dirty(struct folio *folio)
 EXPORT_SYMBOL(folio_mark_dirty);
 
 /*
- * set_page_dirty() is racy if the caller has no reference against
- * page->mapping->host, and if the page is unlocked.  This is because another
- * CPU could truncate the page off the mapping and then free the mapping.
+ * folio_mark_dirty() is racy if the caller has no reference against
+ * folio->mapping->host, and if the folio is unlocked.  This is because another
+ * CPU could truncate the folio off the mapping and then free the mapping.
  *
- * Usually, the page _is_ locked, or the caller is a user-space process which
+ * Usually, the folio _is_ locked, or the caller is a user-space process which
  * holds a reference on the inode by having an open file.
  *
- * In other cases, the page should be locked before running set_page_dirty().
+ * In other cases, the folio should be locked before running folio_mark_dirty().
  */
-int set_page_dirty_lock(struct page *page)
+bool folio_mark_dirty_lock(struct folio *folio)
 {
-	int ret;
+	bool ret;
 
-	lock_page(page);
-	ret = set_page_dirty(page);
-	unlock_page(page);
+	folio_lock(folio);
+	ret = folio_mark_dirty(folio);
+	folio_unlock(folio);
 	return ret;
 }
-EXPORT_SYMBOL(set_page_dirty_lock);
+EXPORT_SYMBOL(folio_mark_dirty_lock);
 
 /*
  * This cancels just the dirty bit on the kernel page itself, it does NOT
-- 
cgit v1.2.3


From 35890f85573c2ebbbf3491dc66f7ee2ad63055af Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 30 Oct 2024 21:20:45 -0300
Subject: vfio: Remove VFIO_TYPE1_NESTING_IOMMU

This control causes the ARM SMMU drivers to choose a stage 2
implementation for the IO pagetable (vs the stage 1 usual default),
however this choice has no significant visible impact to the VFIO
user. Further qemu never implemented this and no other userspace user is
known.

The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
SMMU translation services to the guest operating system" however the rest
of the API to set the guest table pointer for the stage 1 and manage
invalidation was never completed, or at least never upstreamed, rendering
this part useless dead code.

Upstream has now settled on iommufd as the uAPI for controlling nested
translation. Choosing the stage 2 implementation should be done by through
the IOMMU_HWPT_ALLOC_NEST_PARENT flag during domain allocation.

Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
enable_nesting iommu_domain_op.

Just in-case there is some userspace using this continue to treat
requesting it as a NOP, but do not advertise support any more.

Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Mostafa Saleh <smostafa@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Reviewed-by: Donald Dutile <ddutile@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/1-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 ----------------
 drivers/iommu/arm/arm-smmu/arm-smmu.c       | 16 ----------------
 drivers/iommu/iommu.c                       | 10 ----------
 drivers/iommu/iommufd/vfio_compat.c         |  7 +------
 drivers/vfio/vfio_iommu_type1.c             | 12 +-----------
 include/linux/iommu.h                       |  3 ---
 include/uapi/linux/vfio.h                   |  2 +-
 7 files changed, 3 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 737c5b882355..acf250aeb18b 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3378,21 +3378,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
 	return group;
 }
 
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
-	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-	int ret = 0;
-
-	mutex_lock(&smmu_domain->init_mutex);
-	if (smmu_domain->smmu)
-		ret = -EPERM;
-	else
-		smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
-	mutex_unlock(&smmu_domain->init_mutex);
-
-	return ret;
-}
-
 static int arm_smmu_of_xlate(struct device *dev,
 			     const struct of_phandle_args *args)
 {
@@ -3514,7 +3499,6 @@ static struct iommu_ops arm_smmu_ops = {
 		.flush_iotlb_all	= arm_smmu_flush_iotlb_all,
 		.iotlb_sync		= arm_smmu_iotlb_sync,
 		.iova_to_phys		= arm_smmu_iova_to_phys,
-		.enable_nesting		= arm_smmu_enable_nesting,
 		.free			= arm_smmu_domain_free_paging,
 	}
 };
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 8321962b3714..12b173eec454 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1558,21 +1558,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
 	return group;
 }
 
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
-	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-	int ret = 0;
-
-	mutex_lock(&smmu_domain->init_mutex);
-	if (smmu_domain->smmu)
-		ret = -EPERM;
-	else
-		smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
-	mutex_unlock(&smmu_domain->init_mutex);
-
-	return ret;
-}
-
 static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirks)
 {
@@ -1656,7 +1641,6 @@ static struct iommu_ops arm_smmu_ops = {
 		.flush_iotlb_all	= arm_smmu_flush_iotlb_all,
 		.iotlb_sync		= arm_smmu_iotlb_sync,
 		.iova_to_phys		= arm_smmu_iova_to_phys,
-		.enable_nesting		= arm_smmu_enable_nesting,
 		.set_pgtable_quirks	= arm_smmu_set_pgtable_quirks,
 		.free			= arm_smmu_domain_free,
 	}
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 83c8e617a2c5..dbd70d5a4702 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2723,16 +2723,6 @@ static int __init iommu_init(void)
 }
 core_initcall(iommu_init);
 
-int iommu_enable_nesting(struct iommu_domain *domain)
-{
-	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
-		return -EINVAL;
-	if (!domain->ops->enable_nesting)
-		return -EINVAL;
-	return domain->ops->enable_nesting(domain);
-}
-EXPORT_SYMBOL_GPL(iommu_enable_nesting);
-
 int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirk)
 {
diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
index a3ad5f0b6c59..514aacd64009 100644
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
 	case VFIO_DMA_CC_IOMMU:
 		return iommufd_vfio_cc_iommu(ictx);
 
-	/*
-	 * This is obsolete, and to be removed from VFIO. It was an incomplete
-	 * idea that got merged.
-	 * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
-	 */
-	case VFIO_TYPE1_NESTING_IOMMU:
+	case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
 		return 0;
 
 	/*
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index bf391b40e576..50ebc9593c9d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,7 +72,6 @@ struct vfio_iommu {
 	uint64_t		pgsize_bitmap;
 	uint64_t		num_non_pinned_groups;
 	bool			v2;
-	bool			nesting;
 	bool			dirty_page_tracking;
 	struct list_head	emulated_iommu_groups;
 };
@@ -2195,12 +2194,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		goto out_free_domain;
 	}
 
-	if (iommu->nesting) {
-		ret = iommu_enable_nesting(domain->domain);
-		if (ret)
-			goto out_domain;
-	}
-
 	ret = iommu_attach_group(domain->domain, group->iommu_group);
 	if (ret)
 		goto out_domain;
@@ -2541,9 +2534,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 	switch (arg) {
 	case VFIO_TYPE1_IOMMU:
 		break;
-	case VFIO_TYPE1_NESTING_IOMMU:
-		iommu->nesting = true;
-		fallthrough;
+	case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
 	case VFIO_TYPE1v2_IOMMU:
 		iommu->v2 = true;
 		break;
@@ -2638,7 +2629,6 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
 	switch (arg) {
 	case VFIO_TYPE1_IOMMU:
 	case VFIO_TYPE1v2_IOMMU:
-	case VFIO_TYPE1_NESTING_IOMMU:
 	case VFIO_UNMAP_ALL:
 		return 1;
 	case VFIO_UPDATE_VADDR:
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bd722f473635..c88d18d2c928 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -635,7 +635,6 @@ struct iommu_ops {
  * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE,
  *                           including no-snoop TLPs on PCIe or other platform
  *                           specific mechanisms.
- * @enable_nesting: Enable nesting
  * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
  * @free: Release the domain after use.
  */
@@ -663,7 +662,6 @@ struct iommu_domain_ops {
 				    dma_addr_t iova);
 
 	bool (*enforce_cache_coherency)(struct iommu_domain *domain);
-	int (*enable_nesting)(struct iommu_domain *domain);
 	int (*set_pgtable_quirks)(struct iommu_domain *domain,
 				  unsigned long quirks);
 
@@ -844,7 +842,6 @@ extern void iommu_group_put(struct iommu_group *group);
 extern int iommu_group_id(struct iommu_group *group);
 extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
 
-int iommu_enable_nesting(struct iommu_domain *domain);
 int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirks);
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 2b68e6cdf190..c8dbf8219c4f 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -35,7 +35,7 @@
 #define VFIO_EEH			5
 
 /* Two-stage IOMMU */
-#define VFIO_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
+#define __VFIO_RESERVED_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
 
 #define VFIO_SPAPR_TCE_v2_IOMMU		7
 
-- 
cgit v1.2.3


From 807404d66fcf898d4bcc6a3e3edb07ffd5b88400 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 30 Oct 2024 21:20:47 -0300
Subject: ACPI/IORT: Support CANWBS memory access flag

The IORT spec, Issue E.f (April 2024), adds a new CANWBS bit to the Memory
Access Flag field in the Memory Access Properties table, mainly for a PCI
Root Complex.

This CANWBS defines the coherency of memory accesses to be not marked IOWB
cacheable/shareable. Its value further implies the coherency impact from a
pair of mismatched memory attributes (e.g. in a nested translation case):
  0x0: Use of mismatched memory attributes for accesses made by this
       device may lead to a loss of coherency.
  0x1: Coherency of accesses made by this device to locations in
       Conventional memory are ensured as follows, even if the memory
       attributes for the accesses presented by the device or provided by
       the SMMU are different from Inner and Outer Write-back cacheable,
       Shareable.

Note that the loss of coherency on a CANWBS-unsupported HW typically could
occur to an SMMU that doesn't implement the S2FWB feature where additional
cache flush operations would be required to prevent that from happening.

Add a new ACPI_IORT_MF_CANWBS flag and set IOMMU_FWSPEC_PCI_RC_CANWBS upon
the presence of this new flag.

CANWBS and S2FWB are similar features, in that they both guarantee the VM
can not violate coherency, however S2FWB can be bypassed by PCI No Snoop
TLPs, while CANWBS cannot. Thus CANWBS meets the requirements to set
IOMMU_CAP_ENFORCE_CACHE_COHERENCY.

Architecturally ARM has expected that VFIO would disable No Snoop through
PCI Config space, if this is done then the two would have the same
protections.

Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Reviewed-by: Donald Dutile <ddutile@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/3-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/acpi/arm64/iort.c | 13 +++++++++++++
 include/linux/iommu.h     |  2 ++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 4c745a26226b..1f7e4c691d9e 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1218,6 +1218,17 @@ static bool iort_pci_rc_supports_ats(struct acpi_iort_node *node)
 	return pci_rc->ats_attribute & ACPI_IORT_ATS_SUPPORTED;
 }
 
+static bool iort_pci_rc_supports_canwbs(struct acpi_iort_node *node)
+{
+	struct acpi_iort_memory_access *memory_access;
+	struct acpi_iort_root_complex *pci_rc;
+
+	pci_rc = (struct acpi_iort_root_complex *)node->node_data;
+	memory_access =
+		(struct acpi_iort_memory_access *)&pci_rc->memory_properties;
+	return memory_access->memory_flags & ACPI_IORT_MF_CANWBS;
+}
+
 static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node,
 			    u32 streamid)
 {
@@ -1335,6 +1346,8 @@ int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 		fwspec = dev_iommu_fwspec_get(dev);
 		if (fwspec && iort_pci_rc_supports_ats(node))
 			fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS;
+		if (fwspec && iort_pci_rc_supports_canwbs(node))
+			fwspec->flags |= IOMMU_FWSPEC_PCI_RC_CANWBS;
 	} else {
 		node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT,
 				      iort_match_node_callback, dev);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c88d18d2c928..4ad9b9ec6c9b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -991,6 +991,8 @@ struct iommu_fwspec {
 
 /* ATS is supported */
 #define IOMMU_FWSPEC_PCI_RC_ATS			(1 << 0)
+/* CANWBS is supported */
+#define IOMMU_FWSPEC_PCI_RC_CANWBS		(1 << 1)
 
 /*
  * An iommu attach handle represents a relationship between an iommu domain
-- 
cgit v1.2.3


From 5c1806c41ce0a0110db5dd4c483cf2dc28b3ddf0 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Mon, 4 Nov 2024 16:43:07 +0100
Subject: kcsan, seqlock: Support seqcount_latch_t

While fuzzing an arm64 kernel, Alexander Potapenko reported:

| BUG: KCSAN: data-race in ktime_get_mono_fast_ns / timekeeping_update
|
| write to 0xffffffc082e74248 of 56 bytes by interrupt on cpu 0:
|  update_fast_timekeeper kernel/time/timekeeping.c:430 [inline]
|  timekeeping_update+0x1d8/0x2d8 kernel/time/timekeeping.c:768
|  timekeeping_advance+0x9e8/0xb78 kernel/time/timekeeping.c:2344
|  update_wall_time+0x18/0x38 kernel/time/timekeeping.c:2360
|  [...]
|
| read to 0xffffffc082e74258 of 8 bytes by task 5260 on cpu 1:
|  __ktime_get_fast_ns kernel/time/timekeeping.c:372 [inline]
|  ktime_get_mono_fast_ns+0x88/0x174 kernel/time/timekeeping.c:489
|  init_srcu_struct_fields+0x40c/0x530 kernel/rcu/srcutree.c:263
|  init_srcu_struct+0x14/0x20 kernel/rcu/srcutree.c:311
|  [...]
|
| value changed: 0x000002f875d33266 -> 0x000002f877416866
|
| Reported by Kernel Concurrency Sanitizer on:
| CPU: 1 UID: 0 PID: 5260 Comm: syz.2.7483 Not tainted 6.12.0-rc3-dirty #78

This is a false positive data race between a seqcount latch writer and a reader
accessing stale data. Since its introduction, KCSAN has never understood the
seqcount_latch interface (due to being unannotated).

Unlike the regular seqlock interface, the seqcount_latch interface for latch
writers never has had a well-defined critical section, making it difficult to
teach tooling where the critical section starts and ends.

Introduce an instrumentable (non-raw) seqcount_latch interface, with
which we can clearly denote writer critical sections. This both helps
readability and tooling like KCSAN to understand when the writer is done
updating all latch copies.

Fixes: 88ecd153be95 ("seqlock, kcsan: Add annotations for KCSAN")
Reported-by: Alexander Potapenko <glider@google.com>
Co-developed-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241104161910.780003-4-elver@google.com
---
 Documentation/locking/seqlock.rst |  2 +-
 include/linux/seqlock.h           | 86 ++++++++++++++++++++++++++++++++-------
 2 files changed, 72 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/locking/seqlock.rst b/Documentation/locking/seqlock.rst
index bfda1a5fecad..ec6411d02ac8 100644
--- a/Documentation/locking/seqlock.rst
+++ b/Documentation/locking/seqlock.rst
@@ -153,7 +153,7 @@ Use seqcount_latch_t when the write side sections cannot be protected
 from interruption by readers. This is typically the case when the read
 side can be invoked from NMI handlers.
 
-Check `raw_write_seqcount_latch()` for more information.
+Check `write_seqcount_latch()` for more information.
 
 
 .. _seqlock_t:
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index fffeb754880f..45eee0e5dca0 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -621,6 +621,23 @@ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *
 	return READ_ONCE(s->seqcount.sequence);
 }
 
+/**
+ * read_seqcount_latch() - pick even/odd latch data copy
+ * @s: Pointer to seqcount_latch_t
+ *
+ * See write_seqcount_latch() for details and a full reader/writer usage
+ * example.
+ *
+ * Return: sequence counter raw value. Use the lowest bit as an index for
+ * picking which data copy to read. The full counter must then be checked
+ * with read_seqcount_latch_retry().
+ */
+static __always_inline unsigned read_seqcount_latch(const seqcount_latch_t *s)
+{
+	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
+	return raw_read_seqcount_latch(s);
+}
+
 /**
  * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section
  * @s:		Pointer to seqcount_latch_t
@@ -635,9 +652,34 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
 	return unlikely(READ_ONCE(s->seqcount.sequence) != start);
 }
 
+/**
+ * read_seqcount_latch_retry() - end a seqcount_latch_t read section
+ * @s:		Pointer to seqcount_latch_t
+ * @start:	count, from read_seqcount_latch()
+ *
+ * Return: true if a read section retry is required, else false
+ */
+static __always_inline int
+read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
+{
+	kcsan_atomic_next(0);
+	return raw_read_seqcount_latch_retry(s, start);
+}
+
 /**
  * raw_write_seqcount_latch() - redirect latch readers to even/odd copy
  * @s: Pointer to seqcount_latch_t
+ */
+static __always_inline void raw_write_seqcount_latch(seqcount_latch_t *s)
+{
+	smp_wmb();	/* prior stores before incrementing "sequence" */
+	s->seqcount.sequence++;
+	smp_wmb();      /* increment "sequence" before following stores */
+}
+
+/**
+ * write_seqcount_latch_begin() - redirect latch readers to odd copy
+ * @s: Pointer to seqcount_latch_t
  *
  * The latch technique is a multiversion concurrency control method that allows
  * queries during non-atomic modifications. If you can guarantee queries never
@@ -665,17 +707,11 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
  *
  *	void latch_modify(struct latch_struct *latch, ...)
  *	{
- *		smp_wmb();	// Ensure that the last data[1] update is visible
- *		latch->seq.sequence++;
- *		smp_wmb();	// Ensure that the seqcount update is visible
- *
+ *		write_seqcount_latch_begin(&latch->seq);
  *		modify(latch->data[0], ...);
- *
- *		smp_wmb();	// Ensure that the data[0] update is visible
- *		latch->seq.sequence++;
- *		smp_wmb();	// Ensure that the seqcount update is visible
- *
+ *		write_seqcount_latch(&latch->seq);
  *		modify(latch->data[1], ...);
+ *		write_seqcount_latch_end(&latch->seq);
  *	}
  *
  * The query will have a form like::
@@ -686,13 +722,13 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
  *		unsigned seq, idx;
  *
  *		do {
- *			seq = raw_read_seqcount_latch(&latch->seq);
+ *			seq = read_seqcount_latch(&latch->seq);
  *
  *			idx = seq & 0x01;
  *			entry = data_query(latch->data[idx], ...);
  *
  *		// This includes needed smp_rmb()
- *		} while (raw_read_seqcount_latch_retry(&latch->seq, seq));
+ *		} while (read_seqcount_latch_retry(&latch->seq, seq));
  *
  *		return entry;
  *	}
@@ -716,11 +752,31 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
  *	When data is a dynamic data structure; one should use regular RCU
  *	patterns to manage the lifetimes of the objects within.
  */
-static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
+static __always_inline void write_seqcount_latch_begin(seqcount_latch_t *s)
 {
-	smp_wmb();	/* prior stores before incrementing "sequence" */
-	s->seqcount.sequence++;
-	smp_wmb();      /* increment "sequence" before following stores */
+	kcsan_nestable_atomic_begin();
+	raw_write_seqcount_latch(s);
+}
+
+/**
+ * write_seqcount_latch() - redirect latch readers to even copy
+ * @s: Pointer to seqcount_latch_t
+ */
+static __always_inline void write_seqcount_latch(seqcount_latch_t *s)
+{
+	raw_write_seqcount_latch(s);
+}
+
+/**
+ * write_seqcount_latch_end() - end a seqcount_latch_t write section
+ * @s:		Pointer to seqcount_latch_t
+ *
+ * Marks the end of a seqcount_latch_t writer section, after all copies of the
+ * latch-protected data have been updated.
+ */
+static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s)
+{
+	kcsan_nestable_atomic_end();
 }
 
 #define __SEQLOCK_UNLOCKED(lockname)					\
-- 
cgit v1.2.3


From 93190bc35d6d4364a4d8c38ac8961dabecbff4ed Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Mon, 4 Nov 2024 16:43:08 +0100
Subject: seqlock, treewide: Switch to non-raw seqcount_latch interface

Switch all instrumentable users of the seqcount_latch interface over to
the non-raw interface.

Co-developed-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241104161910.780003-5-elver@google.com
---
 arch/x86/kernel/tsc.c        |  5 +++--
 include/linux/rbtree_latch.h | 20 +++++++++++---------
 kernel/printk/printk.c       |  9 +++++----
 kernel/time/sched_clock.c    | 12 +++++++-----
 kernel/time/timekeeping.c    | 12 +++++++-----
 5 files changed, 33 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index dfe6847fd99e..67aeaba4ba9c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -174,10 +174,11 @@ static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long ts
 
 	c2n = per_cpu_ptr(&cyc2ns, cpu);
 
-	raw_write_seqcount_latch(&c2n->seq);
+	write_seqcount_latch_begin(&c2n->seq);
 	c2n->data[0] = data;
-	raw_write_seqcount_latch(&c2n->seq);
+	write_seqcount_latch(&c2n->seq);
 	c2n->data[1] = data;
+	write_seqcount_latch_end(&c2n->seq);
 }
 
 static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h
index 6a0999c26c7c..2f630eb8307e 100644
--- a/include/linux/rbtree_latch.h
+++ b/include/linux/rbtree_latch.h
@@ -14,7 +14,7 @@
  *
  * If we need to allow unconditional lookups (say as required for NMI context
  * usage) we need a more complex setup; this data structure provides this by
- * employing the latch technique -- see @raw_write_seqcount_latch -- to
+ * employing the latch technique -- see @write_seqcount_latch_begin -- to
  * implement a latched RB-tree which does allow for unconditional lookups by
  * virtue of always having (at least) one stable copy of the tree.
  *
@@ -132,7 +132,7 @@ __lt_find(void *key, struct latch_tree_root *ltr, int idx,
  * @ops: operators defining the node order
  *
  * It inserts @node into @root in an ordered fashion such that we can always
- * observe one complete tree. See the comment for raw_write_seqcount_latch().
+ * observe one complete tree. See the comment for write_seqcount_latch_begin().
  *
  * The inserts use rcu_assign_pointer() to publish the element such that the
  * tree structure is stored before we can observe the new @node.
@@ -145,10 +145,11 @@ latch_tree_insert(struct latch_tree_node *node,
 		  struct latch_tree_root *root,
 		  const struct latch_tree_ops *ops)
 {
-	raw_write_seqcount_latch(&root->seq);
+	write_seqcount_latch_begin(&root->seq);
 	__lt_insert(node, root, 0, ops->less);
-	raw_write_seqcount_latch(&root->seq);
+	write_seqcount_latch(&root->seq);
 	__lt_insert(node, root, 1, ops->less);
+	write_seqcount_latch_end(&root->seq);
 }
 
 /**
@@ -159,7 +160,7 @@ latch_tree_insert(struct latch_tree_node *node,
  *
  * Removes @node from the trees @root in an ordered fashion such that we can
  * always observe one complete tree. See the comment for
- * raw_write_seqcount_latch().
+ * write_seqcount_latch_begin().
  *
  * It is assumed that @node will observe one RCU quiescent state before being
  * reused of freed.
@@ -172,10 +173,11 @@ latch_tree_erase(struct latch_tree_node *node,
 		 struct latch_tree_root *root,
 		 const struct latch_tree_ops *ops)
 {
-	raw_write_seqcount_latch(&root->seq);
+	write_seqcount_latch_begin(&root->seq);
 	__lt_erase(node, root, 0);
-	raw_write_seqcount_latch(&root->seq);
+	write_seqcount_latch(&root->seq);
 	__lt_erase(node, root, 1);
+	write_seqcount_latch_end(&root->seq);
 }
 
 /**
@@ -204,9 +206,9 @@ latch_tree_find(void *key, struct latch_tree_root *root,
 	unsigned int seq;
 
 	do {
-		seq = raw_read_seqcount_latch(&root->seq);
+		seq = read_seqcount_latch(&root->seq);
 		node = __lt_find(key, root, seq & 1, ops->comp);
-	} while (raw_read_seqcount_latch_retry(&root->seq, seq));
+	} while (read_seqcount_latch_retry(&root->seq, seq));
 
 	return node;
 }
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index beb808f4c367..19911c8fa7b6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -560,10 +560,11 @@ bool printk_percpu_data_ready(void)
 /* Must be called under syslog_lock. */
 static void latched_seq_write(struct latched_seq *ls, u64 val)
 {
-	raw_write_seqcount_latch(&ls->latch);
+	write_seqcount_latch_begin(&ls->latch);
 	ls->val[0] = val;
-	raw_write_seqcount_latch(&ls->latch);
+	write_seqcount_latch(&ls->latch);
 	ls->val[1] = val;
+	write_seqcount_latch_end(&ls->latch);
 }
 
 /* Can be called from any context. */
@@ -574,10 +575,10 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls)
 	u64 val;
 
 	do {
-		seq = raw_read_seqcount_latch(&ls->latch);
+		seq = read_seqcount_latch(&ls->latch);
 		idx = seq & 0x1;
 		val = ls->val[idx];
-	} while (raw_read_seqcount_latch_retry(&ls->latch, seq));
+	} while (read_seqcount_latch_retry(&ls->latch, seq));
 
 	return val;
 }
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 29bdf309dae8..fcca4e72f1ef 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -71,13 +71,13 @@ static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 
 notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
 {
-	*seq = raw_read_seqcount_latch(&cd.seq);
+	*seq = read_seqcount_latch(&cd.seq);
 	return cd.read_data + (*seq & 1);
 }
 
 notrace int sched_clock_read_retry(unsigned int seq)
 {
-	return raw_read_seqcount_latch_retry(&cd.seq, seq);
+	return read_seqcount_latch_retry(&cd.seq, seq);
 }
 
 static __always_inline unsigned long long __sched_clock(void)
@@ -132,16 +132,18 @@ unsigned long long notrace sched_clock(void)
 static void update_clock_read_data(struct clock_read_data *rd)
 {
 	/* steer readers towards the odd copy */
-	raw_write_seqcount_latch(&cd.seq);
+	write_seqcount_latch_begin(&cd.seq);
 
 	/* now its safe for us to update the normal (even) copy */
 	cd.read_data[0] = *rd;
 
 	/* switch readers back to the even copy */
-	raw_write_seqcount_latch(&cd.seq);
+	write_seqcount_latch(&cd.seq);
 
 	/* update the backup (odd) copy with the new data */
 	cd.read_data[1] = *rd;
+
+	write_seqcount_latch_end(&cd.seq);
 }
 
 /*
@@ -279,7 +281,7 @@ void __init generic_sched_clock_init(void)
  */
 static u64 notrace suspended_sched_clock_read(void)
 {
-	unsigned int seq = raw_read_seqcount_latch(&cd.seq);
+	unsigned int seq = read_seqcount_latch(&cd.seq);
 
 	return cd.read_data[seq & 1].epoch_cyc;
 }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e6f409bf311..18752983e834 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -411,7 +411,7 @@ static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
  * We want to use this from any context including NMI and tracing /
  * instrumenting the timekeeping code itself.
  *
- * Employ the latch technique; see @raw_write_seqcount_latch.
+ * Employ the latch technique; see @write_seqcount_latch.
  *
  * So if a NMI hits the update of base[0] then it will use base[1]
  * which is still consistent. In the worst case this can result is a
@@ -424,16 +424,18 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
 	struct tk_read_base *base = tkf->base;
 
 	/* Force readers off to base[1] */
-	raw_write_seqcount_latch(&tkf->seq);
+	write_seqcount_latch_begin(&tkf->seq);
 
 	/* Update base[0] */
 	memcpy(base, tkr, sizeof(*base));
 
 	/* Force readers back to base[0] */
-	raw_write_seqcount_latch(&tkf->seq);
+	write_seqcount_latch(&tkf->seq);
 
 	/* Update base[1] */
 	memcpy(base + 1, base, sizeof(*base));
+
+	write_seqcount_latch_end(&tkf->seq);
 }
 
 static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
@@ -443,11 +445,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	u64 now;
 
 	do {
-		seq = raw_read_seqcount_latch(&tkf->seq);
+		seq = read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
 		now = ktime_to_ns(tkr->base);
 		now += __timekeeping_get_ns(tkr);
-	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
+	} while (read_seqcount_latch_retry(&tkf->seq, seq));
 
 	return now;
 }
-- 
cgit v1.2.3


From 183ec5f26b2fc97a4a9871865bfe9b33c41fddb2 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Mon, 4 Nov 2024 16:43:09 +0100
Subject: kcsan, seqlock: Fix incorrect assumption in read_seqbegin()

During testing of the preceding changes, I noticed that in some cases,
current->kcsan_ctx.in_flat_atomic remained true until task exit. This is
obviously wrong, because _all_ accesses for the given task will be
treated as atomic, resulting in false negatives i.e. missed data races.

Debugging led to fs/dcache.c, where we can see this usage of seqlock:

	struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
	{
		struct dentry *dentry;
		unsigned seq;

		do {
			seq = read_seqbegin(&rename_lock);
			dentry = __d_lookup(parent, name);
			if (dentry)
				break;
		} while (read_seqretry(&rename_lock, seq));
	[...]

As can be seen, read_seqretry() is never called if dentry != NULL;
consequently, current->kcsan_ctx.in_flat_atomic will never be reset to
false by read_seqretry().

Give up on the wrong assumption of "assume closing read_seqretry()", and
rely on the already-present annotations in read_seqcount_begin/retry().

Fixes: 88ecd153be95 ("seqlock, kcsan: Add annotations for KCSAN")
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241104161910.780003-6-elver@google.com
---
 include/linux/seqlock.h | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 45eee0e5dca0..5298765d6ca4 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -810,11 +810,7 @@ static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s)
  */
 static inline unsigned read_seqbegin(const seqlock_t *sl)
 {
-	unsigned ret = read_seqcount_begin(&sl->seqcount);
-
-	kcsan_atomic_next(0);  /* non-raw usage, assume closing read_seqretry() */
-	kcsan_flat_atomic_begin();
-	return ret;
+	return read_seqcount_begin(&sl->seqcount);
 }
 
 /**
@@ -830,12 +826,6 @@ static inline unsigned read_seqbegin(const seqlock_t *sl)
  */
 static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
 {
-	/*
-	 * Assume not nested: read_seqretry() may be called multiple times when
-	 * completing read critical section.
-	 */
-	kcsan_flat_atomic_end();
-
 	return read_seqcount_retry(&sl->seqcount, start);
 }
 
-- 
cgit v1.2.3


From 0f0d1b8e5010bfe1feeb4d78d137e41946a5370d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 28 Oct 2024 14:20:35 +0100
Subject: sched/ext: Remove sched_fork() hack

Instead of solving the underlying problem of the double invocation of
__sched_fork() for idle tasks, sched-ext decided to hack around the issue
by partially clearing out the entity struct to preserve the already
enqueued node. A provided analysis and solution has been ignored for four
months.

Now that someone else has taken care of cleaning it up, remove the
disgusting hack and clear out the full structure. Remove the comment in the
structure declaration as well, as there is no requirement for @node being
the last element anymore.

Fixes: f0e1a0643a59 ("sched_ext: Implement BPF extensible scheduler class")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/87ldy82wkc.ffs@tglx
---
 include/linux/sched/ext.h | 1 -
 kernel/sched/ext.c        | 7 +------
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 1ddbde64a31b..2799e7284fff 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -199,7 +199,6 @@ struct sched_ext_entity {
 #ifdef CONFIG_EXT_GROUP_SCHED
 	struct cgroup		*cgrp_moving_from;
 #endif
-	/* must be the last field, see init_scx_entity() */
 	struct list_head	tasks_node;
 };
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5900b06fd036..f6e9a14042d5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3548,12 +3548,7 @@ static void scx_ops_exit_task(struct task_struct *p)
 
 void init_scx_entity(struct sched_ext_entity *scx)
 {
-	/*
-	 * init_idle() calls this function again after fork sequence is
-	 * complete. Don't touch ->tasks_node as it's already linked.
-	 */
-	memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node));
-
+	memset(scx, 0, sizeof(*scx));
 	INIT_LIST_HEAD(&scx->dsq_list.node);
 	RB_CLEAR_NODE(&scx->dsq_priq);
 	scx->sticky_cpu = -1;
-- 
cgit v1.2.3


From 26baa1f1c4bdc34b8d698c1900b407d863ad0e69 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 4 Oct 2024 14:47:02 +0200
Subject: sched: Add TIF_NEED_RESCHED_LAZY infrastructure

Add the basic infrastructure to split the TIF_NEED_RESCHED bit in two.
Either bit will cause a resched on return-to-user, but only
TIF_NEED_RESCHED will drive IRQ preemption.

No behavioural change intended.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20241007075055.219540785@infradead.org
---
 include/linux/entry-common.h |  3 ++-
 include/linux/entry-kvm.h    |  5 +++--
 include/linux/sched.h        |  3 ++-
 include/linux/thread_info.h  | 21 +++++++++++++++++----
 kernel/entry/common.c        |  2 +-
 kernel/entry/kvm.c           |  4 ++--
 kernel/sched/core.c          | 34 +++++++++++++++++++++-------------
 7 files changed, 48 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 1e50cdb83ae5..fc61d0205c97 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -64,7 +64,8 @@
 
 #define EXIT_TO_USER_MODE_WORK						\
 	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
-	 _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |	\
+	 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY |			\
+	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |			\
 	 ARCH_EXIT_TO_USER_MODE_WORK)
 
 /**
diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 6813171afccb..16149f6625e4 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -17,8 +17,9 @@
 #endif
 
 #define XFER_TO_GUEST_MODE_WORK						\
-	(_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL |	\
-	 _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK)
+	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \
+	 _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME |			\
+	 ARCH_XFER_TO_GUEST_MODE_WORK)
 
 struct kvm_vcpu;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a76e3d074a2a..1d5cc3e50884 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2002,7 +2002,8 @@ static inline void set_tsk_need_resched(struct task_struct *tsk)
 
 static inline void clear_tsk_need_resched(struct task_struct *tsk)
 {
-	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+	atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY,
+			   (atomic_long_t *)&task_thread_info(tsk)->flags);
 }
 
 static inline int test_tsk_need_resched(struct task_struct *tsk)
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9ea0b28068f4..cf2446c9c30d 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -59,6 +59,14 @@ enum syscall_work_bit {
 
 #include <asm/thread_info.h>
 
+#ifndef TIF_NEED_RESCHED_LAZY
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+#error Inconsistent PREEMPT_LAZY
+#endif
+#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
+#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
+#endif
+
 #ifdef __KERNEL__
 
 #ifndef arch_set_restart_data
@@ -179,22 +187,27 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti
 
 #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
 
-static __always_inline bool tif_need_resched(void)
+static __always_inline bool tif_test_bit(int bit)
 {
-	return arch_test_bit(TIF_NEED_RESCHED,
+	return arch_test_bit(bit,
 			     (unsigned long *)(&current_thread_info()->flags));
 }
 
 #else
 
-static __always_inline bool tif_need_resched(void)
+static __always_inline bool tif_test_bit(int bit)
 {
-	return test_bit(TIF_NEED_RESCHED,
+	return test_bit(bit,
 			(unsigned long *)(&current_thread_info()->flags));
 }
 
 #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
 
+static __always_inline bool tif_need_resched(void)
+{
+	return tif_test_bit(TIF_NEED_RESCHED);
+}
+
 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
 static inline int arch_within_stack_frames(const void * const stack,
 					   const void * const stackend,
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 5b6934e23c21..e33691d5adf7 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 
 		local_irq_enable_exit_to_user(ti_work);
 
-		if (ti_work & _TIF_NEED_RESCHED)
+		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
 			schedule();
 
 		if (ti_work & _TIF_UPROBE)
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 2e0f75bcb7fd..8485f63863af 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 			return -EINTR;
 		}
 
-		if (ti_work & _TIF_NEED_RESCHED)
+		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
 			schedule();
 
 		if (ti_work & _TIF_NOTIFY_RESUME)
@@ -24,7 +24,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 			return ret;
 
 		ti_work = read_thread_flags();
-	} while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
+	} while (ti_work & XFER_TO_GUEST_MODE_WORK);
 	return 0;
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index aad48850c1ef..0cd05e36b6b6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -941,10 +941,9 @@ static inline void hrtick_rq_init(struct rq *rq)
  * this avoids any races wrt polling state changes and thereby avoids
  * spurious IPIs.
  */
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
 {
-	struct thread_info *ti = task_thread_info(p);
-	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+	return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
 }
 
 /*
@@ -969,9 +968,9 @@ static bool set_nr_if_polling(struct task_struct *p)
 }
 
 #else
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
 {
-	set_tsk_need_resched(p);
+	set_ti_thread_flag(ti, tif);
 	return true;
 }
 
@@ -1076,28 +1075,37 @@ void wake_up_q(struct wake_q_head *head)
  * might also involve a cross-CPU call to trigger the scheduler on
  * the target CPU.
  */
-void resched_curr(struct rq *rq)
+static void __resched_curr(struct rq *rq, int tif)
 {
 	struct task_struct *curr = rq->curr;
+	struct thread_info *cti = task_thread_info(curr);
 	int cpu;
 
 	lockdep_assert_rq_held(rq);
 
-	if (test_tsk_need_resched(curr))
+	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
 		return;
 
 	cpu = cpu_of(rq);
 
 	if (cpu == smp_processor_id()) {
-		set_tsk_need_resched(curr);
-		set_preempt_need_resched();
+		set_ti_thread_flag(cti, tif);
+		if (tif == TIF_NEED_RESCHED)
+			set_preempt_need_resched();
 		return;
 	}
 
-	if (set_nr_and_not_polling(curr))
-		smp_send_reschedule(cpu);
-	else
+	if (set_nr_and_not_polling(cti, tif)) {
+		if (tif == TIF_NEED_RESCHED)
+			smp_send_reschedule(cpu);
+	} else {
 		trace_sched_wake_idle_without_ipi(cpu);
+	}
+}
+
+void resched_curr(struct rq *rq)
+{
+	__resched_curr(rq, TIF_NEED_RESCHED);
 }
 
 void resched_cpu(int cpu)
@@ -1192,7 +1200,7 @@ static void wake_up_idle_cpu(int cpu)
 	 * and testing of the above solutions didn't appear to report
 	 * much benefits.
 	 */
-	if (set_nr_and_not_polling(rq->idle))
+	if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
 		smp_send_reschedule(cpu);
 	else
 		trace_sched_wake_idle_without_ipi(cpu);
-- 
cgit v1.2.3


From 7c70cb94d29cd325fabe4a818c18613e3b9919a1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 4 Oct 2024 14:46:58 +0200
Subject: sched: Add Lazy preemption model

Change fair to use resched_curr_lazy(), which, when the lazy
preemption model is selected, will set TIF_NEED_RESCHED_LAZY.

This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
As such, the average delay between setting LAZY and actually
rescheduling will be TICK_NSEC/2.

In short, Lazy preemption will delay preemption for fair class but
will function as Full preemption for all the other classes, most
notably the realtime (RR/FIFO/DEADLINE) classes.

The goal is to bridge the performance gap with Voluntary, such that we
might eventually remove that option entirely.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20241007075055.331243614@infradead.org
---
 include/linux/preempt.h |  8 ++++-
 kernel/Kconfig.preempt  | 15 ++++++++++
 kernel/sched/core.c     | 80 +++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/debug.c    |  5 ++--
 kernel/sched/fair.c     |  6 ++--
 kernel/sched/sched.h    |  1 +
 6 files changed, 107 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index ce76f1a45722..ca86235ac15c 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
 extern bool preempt_model_none(void);
 extern bool preempt_model_voluntary(void);
 extern bool preempt_model_full(void);
+extern bool preempt_model_lazy(void);
 
 #else
 
@@ -502,6 +503,11 @@ static inline bool preempt_model_full(void)
 	return IS_ENABLED(CONFIG_PREEMPT);
 }
 
+static inline bool preempt_model_lazy(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+
 #endif
 
 static inline bool preempt_model_rt(void)
@@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void)
  */
 static inline bool preempt_model_preemptible(void)
 {
-	return preempt_model_full() || preempt_model_rt();
+	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
 }
 
 #endif /* __LINUX_PREEMPT_H */
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index fe782cd77388..09f06d8964cf 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -11,6 +11,9 @@ config PREEMPT_BUILD
 	select PREEMPTION
 	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
 
+config ARCH_HAS_PREEMPT_LAZY
+	bool
+
 choice
 	prompt "Preemption Model"
 	default PREEMPT_NONE
@@ -67,6 +70,18 @@ config PREEMPT
 	  embedded system with latency requirements in the milliseconds
 	  range.
 
+config PREEMPT_LAZY
+	bool "Scheduler controlled preemption model"
+	depends on !ARCH_NO_PREEMPT
+	depends on ARCH_HAS_PREEMPT_LAZY
+	select PREEMPT_BUILD
+	help
+	  This option provides a scheduler driven preemption model that
+	  is fundamentally similar to full preemption, but is less
+	  eager to preempt SCHED_NORMAL tasks in an attempt to
+	  reduce lock holder preemption and recover some of the performance
+	  gains seen from using Voluntary preemption.
+
 config PREEMPT_RT
 	bool "Fully Preemptible Kernel (Real-Time)"
 	depends on EXPERT && ARCH_SUPPORTS_RT
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0cd05e36b6b6..df6a34d27d2b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1083,6 +1083,13 @@ static void __resched_curr(struct rq *rq, int tif)
 
 	lockdep_assert_rq_held(rq);
 
+	/*
+	 * Always immediately preempt the idle task; no point in delaying doing
+	 * actual work.
+	 */
+	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
+		tif = TIF_NEED_RESCHED;
+
 	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
 		return;
 
@@ -1108,6 +1115,32 @@ void resched_curr(struct rq *rq)
 	__resched_curr(rq, TIF_NEED_RESCHED);
 }
 
+#ifdef CONFIG_PREEMPT_DYNAMIC
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
+}
+#else
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+#endif
+
+static __always_inline int get_lazy_tif_bit(void)
+{
+	if (dynamic_preempt_lazy())
+		return TIF_NEED_RESCHED_LAZY;
+
+	return TIF_NEED_RESCHED;
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+	__resched_curr(rq, get_lazy_tif_bit());
+}
+
 void resched_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -5612,6 +5645,10 @@ void sched_tick(void)
 	update_rq_clock(rq);
 	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
 	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
+
+	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+		resched_curr(rq);
+
 	donor->sched_class->task_tick(rq, donor, 0);
 	if (sched_feat(LATENCY_WARN))
 		resched_latency = cpu_resched_latency(rq);
@@ -7374,6 +7411,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
  *   preempt_schedule           <- NOP
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
+ *   dynamic_preempt_lazy       <- false
  *
  * VOLUNTARY:
  *   cond_resched               <- __cond_resched
@@ -7381,6 +7419,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
  *   preempt_schedule           <- NOP
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
+ *   dynamic_preempt_lazy       <- false
  *
  * FULL:
  *   cond_resched               <- RET0
@@ -7388,6 +7427,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
  *   preempt_schedule           <- preempt_schedule
  *   preempt_schedule_notrace   <- preempt_schedule_notrace
  *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ *   dynamic_preempt_lazy       <- false
+ *
+ * LAZY:
+ *   cond_resched               <- RET0
+ *   might_resched              <- RET0
+ *   preempt_schedule           <- preempt_schedule
+ *   preempt_schedule_notrace   <- preempt_schedule_notrace
+ *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ *   dynamic_preempt_lazy       <- true
  */
 
 enum {
@@ -7395,6 +7443,7 @@ enum {
 	preempt_dynamic_none,
 	preempt_dynamic_voluntary,
 	preempt_dynamic_full,
+	preempt_dynamic_lazy,
 };
 
 int preempt_dynamic_mode = preempt_dynamic_undefined;
@@ -7410,15 +7459,23 @@ int sched_dynamic_mode(const char *str)
 	if (!strcmp(str, "full"))
 		return preempt_dynamic_full;
 
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+	if (!strcmp(str, "lazy"))
+		return preempt_dynamic_lazy;
+#endif
+
 	return -EINVAL;
 }
 
+#define preempt_dynamic_key_enable(f)	static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_key_disable(f)	static_key_disable(&sk_dynamic_##f.key)
+
 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 #define preempt_dynamic_enable(f)	static_call_update(f, f##_dynamic_enabled)
 #define preempt_dynamic_disable(f)	static_call_update(f, f##_dynamic_disabled)
 #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f)	static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_disable(f)	static_key_disable(&sk_dynamic_##f.key)
+#define preempt_dynamic_enable(f)	preempt_dynamic_key_enable(f)
+#define preempt_dynamic_disable(f)	preempt_dynamic_key_disable(f)
 #else
 #error "Unsupported PREEMPT_DYNAMIC mechanism"
 #endif
@@ -7438,6 +7495,7 @@ static void __sched_dynamic_update(int mode)
 	preempt_dynamic_enable(preempt_schedule);
 	preempt_dynamic_enable(preempt_schedule_notrace);
 	preempt_dynamic_enable(irqentry_exit_cond_resched);
+	preempt_dynamic_key_disable(preempt_lazy);
 
 	switch (mode) {
 	case preempt_dynamic_none:
@@ -7447,6 +7505,7 @@ static void __sched_dynamic_update(int mode)
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: none\n");
 		break;
@@ -7458,6 +7517,7 @@ static void __sched_dynamic_update(int mode)
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: voluntary\n");
 		break;
@@ -7469,9 +7529,22 @@ static void __sched_dynamic_update(int mode)
 		preempt_dynamic_enable(preempt_schedule);
 		preempt_dynamic_enable(preempt_schedule_notrace);
 		preempt_dynamic_enable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: full\n");
 		break;
+
+	case preempt_dynamic_lazy:
+		if (!klp_override)
+			preempt_dynamic_disable(cond_resched);
+		preempt_dynamic_disable(might_resched);
+		preempt_dynamic_enable(preempt_schedule);
+		preempt_dynamic_enable(preempt_schedule_notrace);
+		preempt_dynamic_enable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_enable(preempt_lazy);
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: lazy\n");
+		break;
 	}
 
 	preempt_dynamic_mode = mode;
@@ -7534,6 +7607,8 @@ static void __init preempt_dynamic_init(void)
 			sched_dynamic_update(preempt_dynamic_none);
 		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
 			sched_dynamic_update(preempt_dynamic_voluntary);
+		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+			sched_dynamic_update(preempt_dynamic_lazy);
 		} else {
 			/* Default static call setting, nothing to do */
 			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
@@ -7554,6 +7629,7 @@ static void __init preempt_dynamic_init(void)
 PREEMPT_MODEL_ACCESSOR(none);
 PREEMPT_MODEL_ACCESSOR(voluntary);
 PREEMPT_MODEL_ACCESSOR(full);
+PREEMPT_MODEL_ACCESSOR(lazy);
 
 #else /* !CONFIG_PREEMPT_DYNAMIC: */
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f4035c7a0fa1..44a49f90b05f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
 static int sched_dynamic_show(struct seq_file *m, void *v)
 {
 	static const char * preempt_modes[] = {
-		"none", "voluntary", "full"
+		"none", "voluntary", "full", "lazy",
 	};
+	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+	for (i = 0; i < j; i++) {
 		if (preempt_dynamic_mode == i)
 			seq_puts(m, "(");
 		seq_puts(m, preempt_modes[i]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6512258dc71f..3356315d7e64 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		return;
 
 	if (resched || did_preempt_short(cfs_rq, curr)) {
-		resched_curr(rq);
+		resched_curr_lazy(rq);
 		clear_buddies(cfs_rq, curr);
 	}
 }
@@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * validating it and just reschedule.
 	 */
 	if (queued) {
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 		return;
 	}
 #endif
@@ -8829,7 +8829,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
 	return;
 
 preempt:
-	resched_curr(rq);
+	resched_curr_lazy(rq);
 }
 
 static struct task_struct *pick_task_fair(struct rq *rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e51bf5a344d3..090dd4b38fa2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2689,6 +2689,7 @@ extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
 
 extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
 extern void resched_cpu(int cpu);
 
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
-- 
cgit v1.2.3


From 18d92bb57c39504d9da11c6ef604f58eb1d5a117 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 22 Oct 2024 18:59:08 +0300
Subject: perf/core: Add aux_pause, aux_resume, aux_start_paused

Hardware traces, such as instruction traces, can produce a vast amount of
trace data, so being able to reduce tracing to more specific circumstances
can be useful.

The ability to pause or resume tracing when another event happens, can do
that.

Add ability for an event to "pause" or "resume" AUX area tracing.

Add aux_pause bit to perf_event_attr to indicate that, if the event
happens, the associated AUX area tracing should be paused. Ditto
aux_resume. Do not allow aux_pause and aux_resume to be set together.

Add aux_start_paused bit to perf_event_attr to indicate to an AUX area
event that it should start in a "paused" state.

Add aux_paused to struct hw_perf_event for AUX area events to keep track of
the "paused" state. aux_paused is initialized to aux_start_paused.

Add PERF_EF_PAUSE and PERF_EF_RESUME modes for ->stop() and ->start()
callbacks. Call as needed, during __perf_event_output(). Add
aux_in_pause_resume to struct perf_buffer to prevent races with the NMI
handler. Pause/resume in NMI context will miss out if it coincides with
another pause/resume.

To use aux_pause or aux_resume, an event must be in a group with the AUX
area event as the group leader.

Example (requires Intel PT and tools patches also):

 $ perf record --kcore -e intel_pt/aux-action=start-paused/k,syscalls:sys_enter_newuname/aux-action=resume/,syscalls:sys_exit_newuname/aux-action=pause/ uname
 Linux
 [ perf record: Woken up 1 times to write data ]
 [ perf record: Captured and wrote 0.043 MB perf.data ]
 $ perf script --call-trace
 uname   30805 [000] 24001.058782799: name: 0x7ffc9c1865b0
 uname   30805 [000] 24001.058784424:  psb offs: 0
 uname   30805 [000] 24001.058784424:  cbr: 39 freq: 3904 MHz (139%)
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])        debug_smp_processor_id
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])        __x64_sys_newuname
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])            down_read
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                __cond_resched
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                preempt_count_add
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                    in_lock_functions
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                preempt_count_sub
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])            up_read
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                preempt_count_add
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                    in_lock_functions
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                preempt_count_sub
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])            _copy_to_user
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])        syscall_exit_to_user_mode
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])            syscall_exit_work
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                perf_syscall_exit
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                    debug_smp_processor_id
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                    perf_trace_buf_alloc
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        perf_swevent_get_recursion_context
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            debug_smp_processor_id
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        debug_smp_processor_id
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                    perf_tp_event
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        perf_trace_buf_update
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            tracing_gen_ctx_irq_test
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        perf_swevent_event
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            __perf_event_account_interrupt
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                __this_cpu_preempt_check
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            perf_event_output_forward
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                perf_event_aux_pause
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                    ring_buffer_get
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                        __rcu_read_lock
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                        __rcu_read_unlock
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                    pt_event_stop
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                        debug_smp_processor_id
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                        debug_smp_processor_id
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                        native_write_msr
 uname   30805 [000] 24001.058785463: ([kernel.kallsyms])                                        native_write_msr
 uname   30805 [000] 24001.058785639: 0x0

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: James Clark <james.clark@arm.com>
Link: https://lkml.kernel.org/r/20241022155920.17511-3-adrian.hunter@intel.com
---
 include/linux/perf_event.h      | 28 +++++++++++++++
 include/uapi/linux/perf_event.h | 11 +++++-
 kernel/events/core.c            | 75 ++++++++++++++++++++++++++++++++++++++---
 kernel/events/internal.h        |  1 +
 4 files changed, 110 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb908843f209..91b310052a7c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -170,6 +170,12 @@ struct hw_perf_event {
 		};
 		struct { /* aux / Intel-PT */
 			u64		aux_config;
+			/*
+			 * For AUX area events, aux_paused cannot be a state
+			 * flag because it can be updated asynchronously to
+			 * state.
+			 */
+			unsigned int	aux_paused;
 		};
 		struct { /* software */
 			struct hrtimer	hrtimer;
@@ -294,6 +300,7 @@ struct perf_event_pmu_context;
 #define PERF_PMU_CAP_NO_EXCLUDE			0x0040
 #define PERF_PMU_CAP_AUX_OUTPUT			0x0080
 #define PERF_PMU_CAP_EXTENDED_HW_TYPE		0x0100
+#define PERF_PMU_CAP_AUX_PAUSE			0x0200
 
 /**
  * pmu::scope
@@ -384,6 +391,8 @@ struct pmu {
 #define PERF_EF_START	0x01		/* start the counter when adding    */
 #define PERF_EF_RELOAD	0x02		/* reload the counter when starting */
 #define PERF_EF_UPDATE	0x04		/* update the counter when stopping */
+#define PERF_EF_PAUSE	0x08		/* AUX area event, pause tracing */
+#define PERF_EF_RESUME	0x10		/* AUX area event, resume tracing */
 
 	/*
 	 * Adds/Removes a counter to/from the PMU, can be done inside a
@@ -423,6 +432,18 @@ struct pmu {
 	 *
 	 * ->start() with PERF_EF_RELOAD will reprogram the counter
 	 *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
+	 *
+	 * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not
+	 * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with
+	 * PERF_EF_RESUME.
+	 *
+	 * ->start() with PERF_EF_RESUME will start as simply as possible but
+	 * only if the counter is not otherwise stopped. Will not overlap
+	 * another ->start() with PERF_EF_RESUME nor ->stop() with
+	 * PERF_EF_PAUSE.
+	 *
+	 * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other
+	 * ->stop()/->start() invocations, just not itself.
 	 */
 	void (*start)			(struct perf_event *event, int flags);
 	void (*stop)			(struct perf_event *event, int flags);
@@ -1679,6 +1700,13 @@ static inline bool has_aux(struct perf_event *event)
 	return event->pmu->setup_aux;
 }
 
+static inline bool has_aux_action(struct perf_event *event)
+{
+	return event->attr.aux_sample_size ||
+	       event->attr.aux_pause ||
+	       event->attr.aux_resume;
+}
+
 static inline bool is_write_backward(struct perf_event *event)
 {
 	return !!event->attr.write_backward;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 4842c36fdf80..0524d541d4e3 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -511,7 +511,16 @@ struct perf_event_attr {
 	__u16	sample_max_stack;
 	__u16	__reserved_2;
 	__u32	aux_sample_size;
-	__u32	__reserved_3;
+
+	union {
+		__u32	aux_action;
+		struct {
+			__u32	aux_start_paused :  1, /* start AUX area tracing paused */
+				aux_pause        :  1, /* on overflow, pause AUX area tracing */
+				aux_resume       :  1, /* on overflow, resume AUX area tracing */
+				__reserved_3     : 29;
+		};
+	};
 
 	/*
 	 * User provided data if sigtrap=1, passed back to user via
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e3589c4287cb..0e9cfe6f3535 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2146,7 +2146,7 @@ static void perf_put_aux_event(struct perf_event *event)
 
 static bool perf_need_aux_event(struct perf_event *event)
 {
-	return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+	return event->attr.aux_output || has_aux_action(event);
 }
 
 static int perf_get_aux_event(struct perf_event *event,
@@ -2171,6 +2171,10 @@ static int perf_get_aux_event(struct perf_event *event,
 	    !perf_aux_output_match(event, group_leader))
 		return 0;
 
+	if ((event->attr.aux_pause || event->attr.aux_resume) &&
+	    !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+		return 0;
+
 	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
 		return 0;
 
@@ -8016,6 +8020,49 @@ void perf_prepare_header(struct perf_event_header *header,
 	WARN_ON_ONCE(header->size & 7);
 }
 
+static void __perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+	if (pause) {
+		if (!event->hw.aux_paused) {
+			event->hw.aux_paused = 1;
+			event->pmu->stop(event, PERF_EF_PAUSE);
+		}
+	} else {
+		if (event->hw.aux_paused) {
+			event->hw.aux_paused = 0;
+			event->pmu->start(event, PERF_EF_RESUME);
+		}
+	}
+}
+
+static void perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+	struct perf_buffer *rb;
+
+	if (WARN_ON_ONCE(!event))
+		return;
+
+	rb = ring_buffer_get(event);
+	if (!rb)
+		return;
+
+	scoped_guard (irqsave) {
+		/*
+		 * Guard against self-recursion here. Another event could trip
+		 * this same from NMI context.
+		 */
+		if (READ_ONCE(rb->aux_in_pause_resume))
+			break;
+
+		WRITE_ONCE(rb->aux_in_pause_resume, 1);
+		barrier();
+		__perf_event_aux_pause(event, pause);
+		barrier();
+		WRITE_ONCE(rb->aux_in_pause_resume, 0);
+	}
+	ring_buffer_put(rb);
+}
+
 static __always_inline int
 __perf_event_output(struct perf_event *event,
 		    struct perf_sample_data *data,
@@ -9818,9 +9865,12 @@ static int __perf_event_overflow(struct perf_event *event,
 
 	ret = __perf_event_account_interrupt(event, throttle);
 
+	if (event->attr.aux_pause)
+		perf_event_aux_pause(event->aux_event, true);
+
 	if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
 	    !bpf_overflow_handler(event, data, regs))
-		return ret;
+		goto out;
 
 	/*
 	 * XXX event_limit might not quite work as expected on inherited
@@ -9882,6 +9932,9 @@ static int __perf_event_overflow(struct perf_event *event,
 		event->pending_wakeup = 1;
 		irq_work_queue(&event->pending_irq);
 	}
+out:
+	if (event->attr.aux_resume)
+		perf_event_aux_pause(event->aux_event, false);
 
 	return ret;
 }
@@ -12273,11 +12326,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	}
 
 	if (event->attr.aux_output &&
-	    !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
+	    (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
+	     event->attr.aux_pause || event->attr.aux_resume)) {
 		err = -EOPNOTSUPP;
 		goto err_pmu;
 	}
 
+	if (event->attr.aux_pause && event->attr.aux_resume) {
+		err = -EINVAL;
+		goto err_pmu;
+	}
+
+	if (event->attr.aux_start_paused) {
+		if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) {
+			err = -EOPNOTSUPP;
+			goto err_pmu;
+		}
+		event->hw.aux_paused = 1;
+	}
+
 	if (cgroup_fd != -1) {
 		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
 		if (err)
@@ -13073,7 +13140,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	 * Grouping is not supported for kernel events, neither is 'AUX',
 	 * make sure the caller's intentions are adjusted.
 	 */
-	if (attr->aux_output)
+	if (attr->aux_output || attr->aux_action)
 		return ERR_PTR(-EINVAL);
 
 	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index e072d995d670..249288d82b8d 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -52,6 +52,7 @@ struct perf_buffer {
 	void				(*free_aux)(void *);
 	refcount_t			aux_refcount;
 	int				aux_in_sampling;
+	int				aux_in_pause_resume;
 	void				**aux_pages;
 	void				*aux_priv;
 
-- 
cgit v1.2.3


From 95e2eaf5b91aae6c3a433cd7882733bd806fa3c8 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Mon, 4 Nov 2024 09:40:30 +0800
Subject: iommu/vt-d: Remove unused dmar_msi_read

dmar_msi_read() has been unused since 2022 in
commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture")

Remove it.

(dmar_msi_write still exists and is used once).

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Link: https://lore.kernel.org/r/20241022002702.302728-1-linux@treblig.org
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/dmar.c | 13 -------------
 include/linux/dmar.h       |  1 -
 2 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index e16c2b1d7633..9f424acf474e 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -1895,19 +1895,6 @@ void dmar_msi_write(int irq, struct msi_msg *msg)
 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
-void dmar_msi_read(int irq, struct msi_msg *msg)
-{
-	struct intel_iommu *iommu = irq_get_handler_data(irq);
-	int reg = dmar_msi_reg(iommu, irq);
-	unsigned long flag;
-
-	raw_spin_lock_irqsave(&iommu->register_lock, flag);
-	msg->data = readl(iommu->reg + reg + 4);
-	msg->address_lo = readl(iommu->reg + reg + 8);
-	msg->address_hi = readl(iommu->reg + reg + 12);
-	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
 static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
 		u8 fault_reason, u32 pasid, u16 source_id,
 		unsigned long long addr)
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 499bb2c63483..692b2b445761 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -292,7 +292,6 @@ static inline void dmar_copy_shared_irte(struct irte *dst, struct irte *src)
 struct irq_data;
 extern void dmar_msi_unmask(struct irq_data *data);
 extern void dmar_msi_mask(struct irq_data *data);
-extern void dmar_msi_read(int irq, struct msi_msg *msg);
 extern void dmar_msi_write(int irq, struct msi_msg *msg);
 extern int dmar_set_interrupt(struct intel_iommu *iommu);
 extern irqreturn_t dmar_fault(int irq, void *dev_id);
-- 
cgit v1.2.3


From bebf29b18f34620e25f7e2bd9e4e4d8e34a8977d Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 3 Nov 2024 17:03:31 +0000
Subject: sysfs: introduce callback attribute_group::bin_size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Several drivers need to dynamically calculate the size of an binary
attribute. Currently this is done by assigning attr->size from the
is_bin_visible() callback.

This has drawbacks:
* It is not documented.
* A single attribute can be instantiated multiple times, overwriting the
  shared size field.
* It prevents the structure to be moved to read-only memory.

Introduce a new dedicated callback to calculate the size of the
attribute.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Krzysztof Wilczyński <kw@linux.com>
Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-2-71110628844c@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/group.c      | 2 ++
 include/linux/sysfs.h | 8 ++++++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 45b2e92941da..8b01a7eda5fb 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -98,6 +98,8 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 				if (!mode)
 					continue;
 			}
+			if (grp->bin_size)
+				size = grp->bin_size(kobj, *bin_attr, i);
 
 			WARN(mode & ~(SYSFS_PREALLOC | 0664),
 			     "Attribute %s: Invalid permissions 0%o\n",
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index c4e64dc11206..4746cccb9589 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -87,6 +87,11 @@ do {							\
  *		SYSFS_GROUP_VISIBLE() when assigning this callback to
  *		specify separate _group_visible() and _attr_visible()
  *		handlers.
+ * @bin_size:
+ *		Optional: Function to return the size of a binary attribute
+ *		of the group. Will be called repeatedly for each binary
+ *		attribute in the group. Overwrites the size field embedded
+ *		inside the attribute itself.
  * @attrs:	Pointer to NULL terminated list of attributes.
  * @bin_attrs:	Pointer to NULL terminated list of binary attributes.
  *		Either attrs or bin_attrs or both must be provided.
@@ -97,6 +102,9 @@ struct attribute_group {
 					      struct attribute *, int);
 	umode_t			(*is_bin_visible)(struct kobject *,
 						  struct bin_attribute *, int);
+	size_t			(*bin_size)(struct kobject *,
+					    const struct bin_attribute *,
+					    int);
 	struct attribute	**attrs;
 	struct bin_attribute	**bin_attrs;
 };
-- 
cgit v1.2.3


From b626816fdd7f9beb841856ba049396cff46e99aa Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 3 Nov 2024 17:03:34 +0000
Subject: sysfs: treewide: constify attribute callback of bin_is_visible()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The is_bin_visible() callbacks should not modify the struct
bin_attribute passed as argument.
Enforce this by marking the argument as const.

As there are not many callback implementers perform this change
throughout the tree at once.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Ira Weiny <ira.weiny@intel.com>
Acked-by: Krzysztof Wilczyński <kw@linux.com>
Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-5-71110628844c@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/cxl/port.c                      |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c |  2 +-
 drivers/infiniband/hw/qib/qib_sysfs.c   |  2 +-
 drivers/mtd/spi-nor/sysfs.c             |  2 +-
 drivers/nvmem/core.c                    |  3 ++-
 drivers/pci/pci-sysfs.c                 |  2 +-
 drivers/pci/vpd.c                       |  2 +-
 drivers/platform/x86/amd/hsmp.c         |  2 +-
 drivers/platform/x86/intel/sdsi.c       |  2 +-
 drivers/scsi/scsi_sysfs.c               |  2 +-
 drivers/usb/core/sysfs.c                |  2 +-
 include/linux/sysfs.h                   | 30 +++++++++++++++---------------
 12 files changed, 27 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 9dc394295e1f..24041cf85cfb 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -173,7 +173,7 @@ static ssize_t CDAT_read(struct file *filp, struct kobject *kobj,
 static BIN_ATTR_ADMIN_RO(CDAT, 0);
 
 static umode_t cxl_port_bin_attr_is_visible(struct kobject *kobj,
-					    struct bin_attribute *attr, int i)
+					    const struct bin_attribute *attr, int i)
 {
 	struct device *dev = kobj_to_dev(kobj);
 	struct cxl_port *port = to_cxl_port(dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 0b28b2cf1517..c1c329eb920b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -3999,7 +3999,7 @@ static umode_t amdgpu_flash_attr_is_visible(struct kobject *kobj, struct attribu
 }
 
 static umode_t amdgpu_bin_flash_attr_is_visible(struct kobject *kobj,
-						struct bin_attribute *attr,
+						const struct bin_attribute *attr,
 						int idx)
 {
 	struct device *dev = kobj_to_dev(kobj);
diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c
index 53ec7510e4eb..ba2cd68b53e6 100644
--- a/drivers/infiniband/hw/qib/qib_sysfs.c
+++ b/drivers/infiniband/hw/qib/qib_sysfs.c
@@ -283,7 +283,7 @@ static struct bin_attribute *port_ccmgta_attributes[] = {
 };
 
 static umode_t qib_ccmgta_is_bin_visible(struct kobject *kobj,
-				 struct bin_attribute *attr, int n)
+				 const struct bin_attribute *attr, int n)
 {
 	struct qib_pportdata *ppd = qib_get_pportdata_kobj(kobj);
 
diff --git a/drivers/mtd/spi-nor/sysfs.c b/drivers/mtd/spi-nor/sysfs.c
index 96064e4babf0..5e9eb268073d 100644
--- a/drivers/mtd/spi-nor/sysfs.c
+++ b/drivers/mtd/spi-nor/sysfs.c
@@ -87,7 +87,7 @@ static umode_t spi_nor_sysfs_is_visible(struct kobject *kobj,
 }
 
 static umode_t spi_nor_sysfs_is_bin_visible(struct kobject *kobj,
-					    struct bin_attribute *attr, int n)
+					    const struct bin_attribute *attr, int n)
 {
 	struct spi_device *spi = to_spi_device(kobj_to_dev(kobj));
 	struct spi_mem *spimem = spi_get_drvdata(spi);
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 63370c76394e..73e44d724f90 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -298,7 +298,8 @@ static umode_t nvmem_bin_attr_get_umode(struct nvmem_device *nvmem)
 }
 
 static umode_t nvmem_bin_attr_is_visible(struct kobject *kobj,
-					 struct bin_attribute *attr, int i)
+					 const struct bin_attribute *attr,
+					 int i)
 {
 	struct device *dev = kobj_to_dev(kobj);
 	struct nvmem_device *nvmem = to_nvmem_device(dev);
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 040f01b2b999..13912940ed2b 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1326,7 +1326,7 @@ static struct bin_attribute *pci_dev_rom_attrs[] = {
 };
 
 static umode_t pci_dev_rom_attr_is_visible(struct kobject *kobj,
-					   struct bin_attribute *a, int n)
+					   const struct bin_attribute *a, int n)
 {
 	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
 
diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c
index e4300f5f304f..a469bcbc0da7 100644
--- a/drivers/pci/vpd.c
+++ b/drivers/pci/vpd.c
@@ -325,7 +325,7 @@ static struct bin_attribute *vpd_attrs[] = {
 };
 
 static umode_t vpd_attr_is_visible(struct kobject *kobj,
-				   struct bin_attribute *a, int n)
+				   const struct bin_attribute *a, int n)
 {
 	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
 
diff --git a/drivers/platform/x86/amd/hsmp.c b/drivers/platform/x86/amd/hsmp.c
index 8fcf38eed7f0..8f00850c139f 100644
--- a/drivers/platform/x86/amd/hsmp.c
+++ b/drivers/platform/x86/amd/hsmp.c
@@ -620,7 +620,7 @@ static int hsmp_get_tbl_dram_base(u16 sock_ind)
 }
 
 static umode_t hsmp_is_sock_attr_visible(struct kobject *kobj,
-					 struct bin_attribute *battr, int id)
+					 const struct bin_attribute *battr, int id)
 {
 	if (plat_dev.proto_ver == HSMP_PROTO_VER6)
 		return battr->attr.mode;
diff --git a/drivers/platform/x86/intel/sdsi.c b/drivers/platform/x86/intel/sdsi.c
index 9d137621f0e6..33f33b1070fd 100644
--- a/drivers/platform/x86/intel/sdsi.c
+++ b/drivers/platform/x86/intel/sdsi.c
@@ -541,7 +541,7 @@ static struct bin_attribute *sdsi_bin_attrs[] = {
 };
 
 static umode_t
-sdsi_battr_is_visible(struct kobject *kobj, struct bin_attribute *attr, int n)
+sdsi_battr_is_visible(struct kobject *kobj, const struct bin_attribute *attr, int n)
 {
 	struct device *dev = kobj_to_dev(kobj);
 	struct sdsi_priv *priv = dev_get_drvdata(dev);
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 32f94db6d6bf..f3a1ecb42128 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -1274,7 +1274,7 @@ static umode_t scsi_sdev_attr_is_visible(struct kobject *kobj,
 }
 
 static umode_t scsi_sdev_bin_attr_is_visible(struct kobject *kobj,
-					     struct bin_attribute *attr, int i)
+					     const struct bin_attribute *attr, int i)
 {
 	struct device *dev = kobj_to_dev(kobj);
 	struct scsi_device *sdev = to_scsi_device(dev);
diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c
index 61b6d978892c..b4cba23831ac 100644
--- a/drivers/usb/core/sysfs.c
+++ b/drivers/usb/core/sysfs.c
@@ -925,7 +925,7 @@ static struct bin_attribute *dev_bin_attrs[] = {
 };
 
 static umode_t dev_bin_attrs_are_visible(struct kobject *kobj,
-		struct bin_attribute *a, int n)
+		const struct bin_attribute *a, int n)
 {
 	struct device *dev = kobj_to_dev(kobj);
 	struct usb_device *udev = to_usb_device(dev);
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 4746cccb9589..d1b22d56198b 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -101,7 +101,7 @@ struct attribute_group {
 	umode_t			(*is_visible)(struct kobject *,
 					      struct attribute *, int);
 	umode_t			(*is_bin_visible)(struct kobject *,
-						  struct bin_attribute *, int);
+						  const struct bin_attribute *, int);
 	size_t			(*bin_size)(struct kobject *,
 					    const struct bin_attribute *,
 					    int);
@@ -199,22 +199,22 @@ struct attribute_group {
  * attributes, the group visibility is determined by the function
  * specified to is_visible() not is_bin_visible()
  */
-#define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name)                             \
-	static inline umode_t sysfs_group_visible_##name(                \
-		struct kobject *kobj, struct bin_attribute *attr, int n) \
-	{                                                                \
-		if (n == 0 && !name##_group_visible(kobj))               \
-			return SYSFS_GROUP_INVISIBLE;                    \
-		return name##_attr_visible(kobj, attr, n);               \
+#define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name)                                   \
+	static inline umode_t sysfs_group_visible_##name(                      \
+		struct kobject *kobj, const struct bin_attribute *attr, int n) \
+	{                                                                      \
+		if (n == 0 && !name##_group_visible(kobj))                     \
+			return SYSFS_GROUP_INVISIBLE;                          \
+		return name##_attr_visible(kobj, attr, n);                     \
 	}
 
-#define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name)                   \
-	static inline umode_t sysfs_group_visible_##name(             \
-		struct kobject *kobj, struct bin_attribute *a, int n) \
-	{                                                             \
-		if (n == 0 && !name##_group_visible(kobj))            \
-			return SYSFS_GROUP_INVISIBLE;                 \
-		return a->mode;                                       \
+#define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name)                         \
+	static inline umode_t sysfs_group_visible_##name(                   \
+		struct kobject *kobj, const struct bin_attribute *a, int n) \
+	{                                                                   \
+		if (n == 0 && !name##_group_visible(kobj))                  \
+			return SYSFS_GROUP_INVISIBLE;                       \
+		return a->mode;                                             \
 	}
 
 #define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn
-- 
cgit v1.2.3


From 94a20fb9af16417ab5fd17bcde3d906926f15ef6 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 3 Nov 2024 17:03:35 +0000
Subject: sysfs: treewide: constify attribute callback of bin_attribute::mmap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The mmap() callbacks should not modify the struct
bin_attribute passed as argument.
Enforce this by marking the argument as const.

As there are not many callback implementers perform this change
throughout the tree at once.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Andrew Donnellan <ajd@linux.ibm.com> # ocxl
Acked-by: Krzysztof Wilczyński <kw@linux.com>
Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-6-71110628844c@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/alpha/kernel/pci-sysfs.c          |  6 +++---
 drivers/misc/ocxl/sysfs.c              |  2 +-
 drivers/pci/p2pdma.c                   |  2 +-
 drivers/pci/pci-sysfs.c                | 10 +++++-----
 drivers/platform/x86/intel/pmt/class.c |  2 +-
 drivers/uio/uio_hv_generic.c           |  2 +-
 include/linux/sysfs.h                  |  2 +-
 7 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c
index 5808a66e2a81..3048758304b5 100644
--- a/arch/alpha/kernel/pci-sysfs.c
+++ b/arch/alpha/kernel/pci-sysfs.c
@@ -64,7 +64,7 @@ static int __pci_mmap_fits(struct pci_dev *pdev, int num,
  * Return: %0 on success, negative error code otherwise
  */
 static int pci_mmap_resource(struct kobject *kobj,
-			     struct bin_attribute *attr,
+			     const struct bin_attribute *attr,
 			     struct vm_area_struct *vma, int sparse)
 {
 	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
@@ -93,14 +93,14 @@ static int pci_mmap_resource(struct kobject *kobj,
 }
 
 static int pci_mmap_resource_sparse(struct file *filp, struct kobject *kobj,
-				    struct bin_attribute *attr,
+				    const struct bin_attribute *attr,
 				    struct vm_area_struct *vma)
 {
 	return pci_mmap_resource(kobj, attr, vma, 1);
 }
 
 static int pci_mmap_resource_dense(struct file *filp, struct kobject *kobj,
-				   struct bin_attribute *attr,
+				   const struct bin_attribute *attr,
 				   struct vm_area_struct *vma)
 {
 	return pci_mmap_resource(kobj, attr, vma, 0);
diff --git a/drivers/misc/ocxl/sysfs.c b/drivers/misc/ocxl/sysfs.c
index 405180d47d9b..07520d6e6dc5 100644
--- a/drivers/misc/ocxl/sysfs.c
+++ b/drivers/misc/ocxl/sysfs.c
@@ -125,7 +125,7 @@ static const struct vm_operations_struct global_mmio_vmops = {
 };
 
 static int global_mmio_mmap(struct file *filp, struct kobject *kobj,
-			struct bin_attribute *bin_attr,
+			const struct bin_attribute *bin_attr,
 			struct vm_area_struct *vma)
 {
 	struct ocxl_afu *afu = to_afu(kobj_to_dev(kobj));
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 4f47a13cb500..7abd4f546d3c 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -90,7 +90,7 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr,
 static DEVICE_ATTR_RO(published);
 
 static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
-		struct bin_attribute *attr, struct vm_area_struct *vma)
+		const struct bin_attribute *attr, struct vm_area_struct *vma)
 {
 	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
 	size_t len = vma->vm_end - vma->vm_start;
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 13912940ed2b..0ad3427228b1 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -910,7 +910,7 @@ static ssize_t pci_write_legacy_io(struct file *filp, struct kobject *kobj,
  * memory space.
  */
 static int pci_mmap_legacy_mem(struct file *filp, struct kobject *kobj,
-			       struct bin_attribute *attr,
+			       const struct bin_attribute *attr,
 			       struct vm_area_struct *vma)
 {
 	struct pci_bus *bus = to_pci_bus(kobj_to_dev(kobj));
@@ -930,7 +930,7 @@ static int pci_mmap_legacy_mem(struct file *filp, struct kobject *kobj,
  * memory space. Returns -ENOSYS if the operation isn't supported
  */
 static int pci_mmap_legacy_io(struct file *filp, struct kobject *kobj,
-			      struct bin_attribute *attr,
+			      const struct bin_attribute *attr,
 			      struct vm_area_struct *vma)
 {
 	struct pci_bus *bus = to_pci_bus(kobj_to_dev(kobj));
@@ -1034,7 +1034,7 @@ void pci_remove_legacy_files(struct pci_bus *b)
  *
  * Use the regular PCI mapping routines to map a PCI resource into userspace.
  */
-static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr,
+static int pci_mmap_resource(struct kobject *kobj, const struct bin_attribute *attr,
 			     struct vm_area_struct *vma, int write_combine)
 {
 	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
@@ -1059,14 +1059,14 @@ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr,
 }
 
 static int pci_mmap_resource_uc(struct file *filp, struct kobject *kobj,
-				struct bin_attribute *attr,
+				const struct bin_attribute *attr,
 				struct vm_area_struct *vma)
 {
 	return pci_mmap_resource(kobj, attr, vma, 0);
 }
 
 static int pci_mmap_resource_wc(struct file *filp, struct kobject *kobj,
-				struct bin_attribute *attr,
+				const struct bin_attribute *attr,
 				struct vm_area_struct *vma)
 {
 	return pci_mmap_resource(kobj, attr, vma, 1);
diff --git a/drivers/platform/x86/intel/pmt/class.c b/drivers/platform/x86/intel/pmt/class.c
index c04bb7f97a4d..f9afa23e754b 100644
--- a/drivers/platform/x86/intel/pmt/class.c
+++ b/drivers/platform/x86/intel/pmt/class.c
@@ -103,7 +103,7 @@ intel_pmt_read(struct file *filp, struct kobject *kobj,
 
 static int
 intel_pmt_mmap(struct file *filp, struct kobject *kobj,
-		struct bin_attribute *attr, struct vm_area_struct *vma)
+		const struct bin_attribute *attr, struct vm_area_struct *vma)
 {
 	struct intel_pmt_entry *entry = container_of(attr,
 						     struct intel_pmt_entry,
diff --git a/drivers/uio/uio_hv_generic.c b/drivers/uio/uio_hv_generic.c
index 870409599411..3976360d0096 100644
--- a/drivers/uio/uio_hv_generic.c
+++ b/drivers/uio/uio_hv_generic.c
@@ -135,7 +135,7 @@ static void hv_uio_rescind(struct vmbus_channel *channel)
  * The ring buffer is allocated as contiguous memory by vmbus_open
  */
 static int hv_uio_ring_mmap(struct file *filp, struct kobject *kobj,
-			    struct bin_attribute *attr,
+			    const struct bin_attribute *attr,
 			    struct vm_area_struct *vma)
 {
 	struct vmbus_channel *channel
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index d1b22d56198b..9fcdc8cd3118 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -309,7 +309,7 @@ struct bin_attribute {
 			 char *, loff_t, size_t);
 	loff_t (*llseek)(struct file *, struct kobject *, struct bin_attribute *,
 			 loff_t, int);
-	int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr,
+	int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr,
 		    struct vm_area_struct *vma);
 };
 
-- 
cgit v1.2.3


From 699e7b85afb5d94b99b0a3edca7e9e93ea320c8f Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 3 Nov 2024 17:03:36 +0000
Subject: sysfs: treewide: constify attribute callback of
 bin_attribute::llseek()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The llseek() callbacks should not modify the struct
bin_attribute passed as argument.
Enforce this by marking the argument as const.

As there are not many callback implementers perform this change
throughout the tree at once.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Krzysztof Wilczyński <kw@linux.com>
Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-7-71110628844c@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/pci/pci-sysfs.c | 2 +-
 include/linux/sysfs.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 0ad3427228b1..49bee70f7d37 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -841,7 +841,7 @@ static const struct attribute_group pci_dev_config_attr_group = {
 static __maybe_unused loff_t
 pci_llseek_resource(struct file *filep,
 		    struct kobject *kobj __always_unused,
-		    struct bin_attribute *attr,
+		    const struct bin_attribute *attr,
 		    loff_t offset, int whence)
 {
 	return fixed_size_llseek(filep, offset, whence, attr->size);
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 9fcdc8cd3118..cb2a5e277c23 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -307,7 +307,7 @@ struct bin_attribute {
 			char *, loff_t, size_t);
 	ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *,
 			 char *, loff_t, size_t);
-	loff_t (*llseek)(struct file *, struct kobject *, struct bin_attribute *,
+	loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *,
 			 loff_t, int);
 	int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr,
 		    struct vm_area_struct *vma);
-- 
cgit v1.2.3


From ae587a509903cca138e910445d8c21fe73b45c80 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 3 Nov 2024 17:03:37 +0000
Subject: sysfs: implement all BIN_ATTR_* macros in terms of __BIN_ATTR()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The preparations for the upcoming constification of struct bin_attribute
requires some logic in the structure definition macros.
To avoid duplication of that logic in multiple macros, reimplement all
other macros in terms of __BIN_ATTR().

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Krzysztof Wilczyński <kw@linux.com>
Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-8-71110628844c@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index cb2a5e277c23..d17c473c1ef2 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -333,17 +333,11 @@ struct bin_attribute {
 	.size	= _size,						\
 }
 
-#define __BIN_ATTR_RO(_name, _size) {					\
-	.attr	= { .name = __stringify(_name), .mode = 0444 },		\
-	.read	= _name##_read,						\
-	.size	= _size,						\
-}
+#define __BIN_ATTR_RO(_name, _size)					\
+	__BIN_ATTR(_name, 0444, _name##_read, NULL, _size)
 
-#define __BIN_ATTR_WO(_name, _size) {					\
-	.attr	= { .name = __stringify(_name), .mode = 0200 },		\
-	.write	= _name##_write,					\
-	.size	= _size,						\
-}
+#define __BIN_ATTR_WO(_name, _size)					\
+	__BIN_ATTR(_name, 0200, NULL, _name##_write, _size)
 
 #define __BIN_ATTR_RW(_name, _size)					\
 	__BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size)
@@ -364,11 +358,8 @@ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size)
 struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size)
 
 
-#define __BIN_ATTR_ADMIN_RO(_name, _size) {					\
-	.attr	= { .name = __stringify(_name), .mode = 0400 },		\
-	.read	= _name##_read,						\
-	.size	= _size,						\
-}
+#define __BIN_ATTR_ADMIN_RO(_name, _size)				\
+	__BIN_ATTR(_name, 0400, _name##_read, NULL, _size)
 
 #define __BIN_ATTR_ADMIN_RW(_name, _size)					\
 	__BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size)
@@ -379,10 +370,8 @@ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size)
 #define BIN_ATTR_ADMIN_RW(_name, _size)					\
 struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size)
 
-#define __BIN_ATTR_SIMPLE_RO(_name, _mode) {				\
-	.attr	= { .name = __stringify(_name), .mode = _mode },	\
-	.read	= sysfs_bin_attr_simple_read,				\
-}
+#define __BIN_ATTR_SIMPLE_RO(_name, _mode)				\
+	__BIN_ATTR(_name, _mode, sysfs_bin_attr_simple_read, NULL, 0)
 
 #define BIN_ATTR_SIMPLE_RO(_name)					\
 struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0444)
-- 
cgit v1.2.3


From eb2e6c3a8d66ff37b2ee26cd32334ae0e05fd596 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 3 Nov 2024 17:03:38 +0000
Subject: sysfs: bin_attribute: add const read/write callback variants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To make it possible to put struct bin_attribute into read-only memory,
the sysfs core has to stop passing mutable pointers to the read() and
write() callbacks.
As there are numerous implementors of these callbacks throughout the
tree it's not possible to change all of them at once.
To enable a step-by-step transition, add new variants of the read() and
write() callbacks which differ only in the constness of the struct
bin_attribute argument.

As most binary attributes are defined through macros, extend these
macros to transparently handle both variants of callbacks to minimize
the churn during the transition.
As soon as all handlers are switch to the const variant, the non-const
one can be removed together with the transition machinery.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Krzysztof Wilczyński <kw@linux.com>
Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-9-71110628844c@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c       | 22 +++++++++++++++++-----
 include/linux/sysfs.h | 25 +++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 6d39696b4306..785408861c01 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -91,9 +91,12 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
 			count = size - pos;
 	}
 
-	if (!battr->read)
+	if (!battr->read && !battr->read_new)
 		return -EIO;
 
+	if (battr->read_new)
+		return battr->read_new(of->file, kobj, battr, buf, pos, count);
+
 	return battr->read(of->file, kobj, battr, buf, pos, count);
 }
 
@@ -152,9 +155,12 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
 	if (!count)
 		return 0;
 
-	if (!battr->write)
+	if (!battr->write && !battr->write_new)
 		return -EIO;
 
+	if (battr->write_new)
+		return battr->write_new(of->file, kobj, battr, buf, pos, count);
+
 	return battr->write(of->file, kobj, battr, buf, pos, count);
 }
 
@@ -323,13 +329,19 @@ int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent,
 	const struct kernfs_ops *ops;
 	struct kernfs_node *kn;
 
+	if (battr->read && battr->read_new)
+		return -EINVAL;
+
+	if (battr->write && battr->write_new)
+		return -EINVAL;
+
 	if (battr->mmap)
 		ops = &sysfs_bin_kfops_mmap;
-	else if (battr->read && battr->write)
+	else if ((battr->read || battr->read_new) && (battr->write || battr->write_new))
 		ops = &sysfs_bin_kfops_rw;
-	else if (battr->read)
+	else if (battr->read || battr->read_new)
 		ops = &sysfs_bin_kfops_ro;
-	else if (battr->write)
+	else if (battr->write || battr->write_new)
 		ops = &sysfs_bin_kfops_wo;
 	else
 		ops = &sysfs_file_kfops_empty;
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index d17c473c1ef2..d713a6445a62 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -305,8 +305,12 @@ struct bin_attribute {
 	struct address_space *(*f_mapping)(void);
 	ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *,
 			char *, loff_t, size_t);
+	ssize_t (*read_new)(struct file *, struct kobject *, const struct bin_attribute *,
+			    char *, loff_t, size_t);
 	ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *,
 			 char *, loff_t, size_t);
+	ssize_t (*write_new)(struct file *, struct kobject *,
+			     const struct bin_attribute *, char *, loff_t, size_t);
 	loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *,
 			 loff_t, int);
 	int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr,
@@ -325,11 +329,28 @@ struct bin_attribute {
  */
 #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr)
 
+typedef ssize_t __sysfs_bin_rw_handler_new(struct file *, struct kobject *,
+					   const struct bin_attribute *, char *, loff_t, size_t);
+
 /* macros to create static binary attributes easier */
 #define __BIN_ATTR(_name, _mode, _read, _write, _size) {		\
 	.attr = { .name = __stringify(_name), .mode = _mode },		\
-	.read	= _read,						\
-	.write	= _write,						\
+	.read = _Generic(_read,						\
+		__sysfs_bin_rw_handler_new * : NULL,			\
+		default : _read						\
+	),								\
+	.read_new = _Generic(_read,					\
+		__sysfs_bin_rw_handler_new * : _read,			\
+		default : NULL						\
+	),								\
+	.write = _Generic(_write,					\
+		__sysfs_bin_rw_handler_new * : NULL,			\
+		default : _write					\
+	),								\
+	.write_new = _Generic(_write,					\
+		__sysfs_bin_rw_handler_new * : _write,			\
+		default : NULL						\
+	),								\
 	.size	= _size,						\
 }
 
-- 
cgit v1.2.3


From 64c58d7c99343a910edf995e15d8037e19ec5777 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 3 Nov 2024 20:19:16 -0800
Subject: iomap: add a merge boundary flag

File systems might have boundaries over which merges aren't possible.
In fact these are very common, although most of the time some kind of
header at the beginning of this region (e.g. XFS alloation groups, ext4
block groups) automatically create a merge barrier.  But if that is
not present, say for a device purely used for data we need to manually
communicate that to iomap.

Add a IOMAP_F_BOUNDARY flag to never merge I/O into a previous mapping.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/iomap/buffered-io.c | 6 ++++++
 include/linux/iomap.h  | 4 ++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ef0b68bccbb6..fcadd31017d1 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1601,6 +1601,8 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
 {
 	if (ioend->io_bio.bi_status != next->io_bio.bi_status)
 		return false;
+	if (next->io_flags & IOMAP_F_BOUNDARY)
+		return false;
 	if ((ioend->io_flags & IOMAP_F_SHARED) ^
 	    (next->io_flags & IOMAP_F_SHARED))
 		return false;
@@ -1720,6 +1722,8 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
 	INIT_LIST_HEAD(&ioend->io_list);
 	ioend->io_type = wpc->iomap.type;
 	ioend->io_flags = wpc->iomap.flags;
+	if (pos > wpc->iomap.offset)
+		wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
 	ioend->io_inode = inode;
 	ioend->io_size = 0;
 	ioend->io_offset = pos;
@@ -1731,6 +1735,8 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
 
 static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
 {
+	if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+		return false;
 	if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
 	    (wpc->ioend->io_flags & IOMAP_F_SHARED))
 		return false;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index f61407e3b121..9ecb8ea7714c 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -53,6 +53,9 @@ struct vm_fault;
  *
  * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent
  * rather than a file data extent.
+ *
+ * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must
+ * never be merged with the mapping before it.
  */
 #define IOMAP_F_NEW		(1U << 0)
 #define IOMAP_F_DIRTY		(1U << 1)
@@ -64,6 +67,7 @@ struct vm_fault;
 #define IOMAP_F_BUFFER_HEAD	0
 #endif /* CONFIG_BUFFER_HEAD */
 #define IOMAP_F_XATTR		(1U << 5)
+#define IOMAP_F_BOUNDARY	(1U << 6)
 
 /*
  * Flags set by the core iomap code during operations:
-- 
cgit v1.2.3


From 57a063632df8db6cb20d64ee52a06d4e2049235a Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Wed, 23 Oct 2024 11:45:24 -0700
Subject: Input: introduce notion of passive observers for input handlers

Sometimes it is useful to observe (and maybe modify) data coming from
an input device, but only do that if there are other users of such input
device. An example is touchpad switching functionality on Lenovo IdeaPad
Z570 where it is desirable to suppress events coming from the touchpad
if user toggles touchpad on/off button (on this laptop the firmware does
not stop the device).

Introduce notion of passive observers for input handlers to solve this
issue. An input handler marked as passive observer behaves exactly like
any other input handler or filter, but with one exception: it does not
open/start underlying input device when attaching to it.

Link: https://lore.kernel.org/r/ZxlEROX7bMo5cbZP@google.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/input.c | 15 ++++++++++-----
 include/linux/input.h |  5 +++++
 2 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/input.c b/drivers/input/input.c
index 3c321671793f..3b1e88ead97e 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -605,6 +605,9 @@ int input_open_device(struct input_handle *handle)
 
 	handle->open++;
 
+	if (handle->handler->passive_observer)
+		goto out;
+
 	if (dev->users++ || dev->inhibited) {
 		/*
 		 * Device is already opened and/or inhibited,
@@ -668,11 +671,13 @@ void input_close_device(struct input_handle *handle)
 
 	__input_release_device(handle);
 
-	if (!--dev->users && !dev->inhibited) {
-		if (dev->poller)
-			input_dev_poller_stop(dev->poller);
-		if (dev->close)
-			dev->close(dev);
+	if (!handle->handler->passive_observer) {
+		if (!--dev->users && !dev->inhibited) {
+			if (dev->poller)
+				input_dev_poller_stop(dev->poller);
+			if (dev->close)
+				dev->close(dev);
+		}
 	}
 
 	if (!--handle->open) {
diff --git a/include/linux/input.h b/include/linux/input.h
index 89a0be6ee0e2..6437c35f0796 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -286,6 +286,10 @@ struct input_handle;
  * @start: starts handler for given handle. This function is called by
  *	input core right after connect() method and also when a process
  *	that "grabbed" a device releases it
+ * @passive_observer: set to %true by drivers only interested in observing
+ *	data stream from devices if there are other users present. Such
+ *	drivers will not result in starting underlying hardware device
+ *	when input_open_device() is called for their handles
  * @legacy_minors: set to %true by drivers using legacy minor ranges
  * @minor: beginning of range of 32 legacy minors for devices this driver
  *	can provide
@@ -321,6 +325,7 @@ struct input_handler {
 	void (*disconnect)(struct input_handle *handle);
 	void (*start)(struct input_handle *handle);
 
+	bool passive_observer;
 	bool legacy_minors;
 	int minor;
 	const char *name;
-- 
cgit v1.2.3


From 0fb4a7ad270b3b209e510eb9dc5b07bf02b7edaf Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 29 Oct 2024 18:11:46 +0000
Subject: mm: refactor map_deny_write_exec()

Refactor the map_deny_write_exec() to not unnecessarily require a VMA
parameter but rather to accept VMA flags parameters, which allows us to
use this function early in mmap_region() in a subsequent commit.

While we're here, we refactor the function to be more readable and add
some additional documentation.

Link: https://lkml.kernel.org/r/6be8bb59cd7c68006ebb006eb9d8dc27104b1f70.1730224667.git.lorenzo.stoakes@oracle.com
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: Jann Horn <jannh@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Jann Horn <jannh@google.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Helge Deller <deller@gmx.de>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mman.h | 21 ++++++++++++++++++---
 mm/mmap.c            |  2 +-
 mm/mprotect.c        |  2 +-
 mm/vma.h             |  2 +-
 4 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mman.h b/include/linux/mman.h
index bcb201ab7a41..8ddca62d6460 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -188,16 +188,31 @@ static inline bool arch_memory_deny_write_exec_supported(void)
  *
  *	d)	mmap(PROT_READ | PROT_EXEC)
  *		mmap(PROT_READ | PROT_EXEC | PROT_BTI)
+ *
+ * This is only applicable if the user has set the Memory-Deny-Write-Execute
+ * (MDWE) protection mask for the current process.
+ *
+ * @old specifies the VMA flags the VMA originally possessed, and @new the ones
+ * we propose to set.
+ *
+ * Return: false if proposed change is OK, true if not ok and should be denied.
  */
-static inline bool map_deny_write_exec(struct vm_area_struct *vma,  unsigned long vm_flags)
+static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
 {
+	/* If MDWE is disabled, we have nothing to deny. */
 	if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
 		return false;
 
-	if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
+	/* If the new VMA is not executable, we have nothing to deny. */
+	if (!(new & VM_EXEC))
+		return false;
+
+	/* Under MDWE we do not accept newly writably executable VMAs... */
+	if (new & VM_WRITE)
 		return true;
 
-	if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
+	/* ...nor previously non-executable VMAs becoming executable. */
+	if (!(old & VM_EXEC))
 		return true;
 
 	return false;
diff --git a/mm/mmap.c b/mm/mmap.c
index ac0604f146f6..ab71d4c3464c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1505,7 +1505,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		vma_set_anonymous(vma);
 	}
 
-	if (map_deny_write_exec(vma, vma->vm_flags)) {
+	if (map_deny_write_exec(vma->vm_flags, vma->vm_flags)) {
 		error = -EACCES;
 		goto close_and_free_vma;
 	}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0c5d6d06107d..6f450af3252e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -810,7 +810,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
 			break;
 		}
 
-		if (map_deny_write_exec(vma, newflags)) {
+		if (map_deny_write_exec(vma->vm_flags, newflags)) {
 			error = -EACCES;
 			break;
 		}
diff --git a/mm/vma.h b/mm/vma.h
index 75558b5e9c8c..d58068c0ff2e 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -42,7 +42,7 @@ struct vma_munmap_struct {
 	int vma_count;                  /* Number of vmas that will be removed */
 	bool unlock;                    /* Unlock after the munmap */
 	bool clear_ptes;                /* If there are outstanding PTE to be cleared */
-	/* 1 byte hole */
+	/* 2 byte hole */
 	unsigned long nr_pages;         /* Number of pages being removed */
 	unsigned long locked_vm;        /* Number of locked pages */
 	unsigned long nr_accounted;     /* Number of VM_ACCOUNT pages */
-- 
cgit v1.2.3


From 5baf8b037debf4ec60108ccfeccb8636d1dbad81 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 29 Oct 2024 18:11:47 +0000
Subject: mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling

Currently MTE is permitted in two circumstances (desiring to use MTE
having been specified by the VM_MTE flag) - where MAP_ANONYMOUS is
specified, as checked by arch_calc_vm_flag_bits() and actualised by
setting the VM_MTE_ALLOWED flag, or if the file backing the mapping is
shmem, in which case we set VM_MTE_ALLOWED in shmem_mmap() when the mmap
hook is activated in mmap_region().

The function that checks that, if VM_MTE is set, VM_MTE_ALLOWED is also
set is the arm64 implementation of arch_validate_flags().

Unfortunately, we intend to refactor mmap_region() to perform this check
earlier, meaning that in the case of a shmem backing we will not have
invoked shmem_mmap() yet, causing the mapping to fail spuriously.

It is inappropriate to set this architecture-specific flag in general mm
code anyway, so a sensible resolution of this issue is to instead move the
check somewhere else.

We resolve this by setting VM_MTE_ALLOWED much earlier in do_mmap(), via
the arch_calc_vm_flag_bits() call.

This is an appropriate place to do this as we already check for the
MAP_ANONYMOUS case here, and the shmem file case is simply a variant of
the same idea - we permit RAM-backed memory.

This requires a modification to the arch_calc_vm_flag_bits() signature to
pass in a pointer to the struct file associated with the mapping, however
this is not too egregious as this is only used by two architectures anyway
- arm64 and parisc.

So this patch performs this adjustment and removes the unnecessary
assignment of VM_MTE_ALLOWED in shmem_mmap().

[akpm@linux-foundation.org: fix whitespace, per Catalin]
Link: https://lkml.kernel.org/r/ec251b20ba1964fb64cf1607d2ad80c47f3873df.1730224667.git.lorenzo.stoakes@oracle.com
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Reported-by: Jann Horn <jannh@google.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Helge Deller <deller@gmx.de>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/mman.h  | 10 +++++++---
 arch/parisc/include/asm/mman.h |  5 +++--
 include/linux/mman.h           |  7 ++++---
 mm/mmap.c                      |  2 +-
 mm/nommu.c                     |  2 +-
 mm/shmem.c                     |  3 ---
 6 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index 9e39217b4afb..798d965760d4 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -6,6 +6,8 @@
 
 #ifndef BUILD_VDSO
 #include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/shmem_fs.h>
 #include <linux/types.h>
 
 static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
@@ -31,19 +33,21 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
 }
 #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
 
-static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
+static inline unsigned long arch_calc_vm_flag_bits(struct file *file,
+						   unsigned long flags)
 {
 	/*
 	 * Only allow MTE on anonymous mappings as these are guaranteed to be
 	 * backed by tags-capable memory. The vm_flags may be overridden by a
 	 * filesystem supporting MTE (RAM-based).
 	 */
-	if (system_supports_mte() && (flags & MAP_ANONYMOUS))
+	if (system_supports_mte() &&
+	    ((flags & MAP_ANONYMOUS) || shmem_file(file)))
 		return VM_MTE_ALLOWED;
 
 	return 0;
 }
-#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
+#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
 
 static inline bool arch_validate_prot(unsigned long prot,
 	unsigned long addr __always_unused)
diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h
index 89b6beeda0b8..663f587dc789 100644
--- a/arch/parisc/include/asm/mman.h
+++ b/arch/parisc/include/asm/mman.h
@@ -2,6 +2,7 @@
 #ifndef __ASM_MMAN_H__
 #define __ASM_MMAN_H__
 
+#include <linux/fs.h>
 #include <uapi/asm/mman.h>
 
 /* PARISC cannot allow mdwe as it needs writable stacks */
@@ -11,7 +12,7 @@ static inline bool arch_memory_deny_write_exec_supported(void)
 }
 #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported
 
-static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
+static inline unsigned long arch_calc_vm_flag_bits(struct file *file, unsigned long flags)
 {
 	/*
 	 * The stack on parisc grows upwards, so if userspace requests memory
@@ -23,6 +24,6 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
 
 	return 0;
 }
-#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
+#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
 
 #endif /* __ASM_MMAN_H__ */
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 8ddca62d6460..a842783ffa62 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_MMAN_H
 #define _LINUX_MMAN_H
 
+#include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/percpu_counter.h>
 
@@ -94,7 +95,7 @@ static inline void vm_unacct_memory(long pages)
 #endif
 
 #ifndef arch_calc_vm_flag_bits
-#define arch_calc_vm_flag_bits(flags) 0
+#define arch_calc_vm_flag_bits(file, flags) 0
 #endif
 
 #ifndef arch_validate_prot
@@ -151,13 +152,13 @@ calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
  * Combine the mmap "flags" argument into "vm_flags" used internally.
  */
 static inline unsigned long
-calc_vm_flag_bits(unsigned long flags)
+calc_vm_flag_bits(struct file *file, unsigned long flags)
 {
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
 	       _calc_vm_trans(flags, MAP_SYNC,	     VM_SYNC      ) |
 	       _calc_vm_trans(flags, MAP_STACK,	     VM_NOHUGEPAGE) |
-	       arch_calc_vm_flag_bits(flags);
+	       arch_calc_vm_flag_bits(file, flags);
 }
 
 unsigned long vm_commit_limit(void);
diff --git a/mm/mmap.c b/mm/mmap.c
index ab71d4c3464c..aee5fa08ae5d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -344,7 +344,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	 * to. we assume access permissions have been handled by the open
 	 * of the memory object, so we don't do any here.
 	 */
-	vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+	vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) |
 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
 	/* Obtain the address to map to. we verify (or select) it and ensure
diff --git a/mm/nommu.c b/mm/nommu.c
index 635d028d647b..e9b5f527ab5b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -842,7 +842,7 @@ static unsigned long determine_vm_flags(struct file *file,
 {
 	unsigned long vm_flags;
 
-	vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
+	vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags);
 
 	if (!file) {
 		/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 4ba1d00fabda..e87f5d6799a7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2733,9 +2733,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	if (ret)
 		return ret;
 
-	/* arm64 - allow memory tagging on RAM-based files */
-	vm_flags_set(vma, VM_MTE_ALLOWED);
-
 	file_accessed(file);
 	/* This is anonymous shared memory if it is unlinked at the time of mmap */
 	if (inode->i_nlink)
-- 
cgit v1.2.3


From d2d243df445a88c26e91eac02b041213c7a32e9e Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Sun, 22 Sep 2024 12:32:13 +0800
Subject: mm: shmem: fix khugepaged activation policy for shmem

Shmem has a separate interface (different from anonymous pages) to control
huge page allocation, that means shmem THP can be enabled while anonymous
THP is disabled.  However, in this case, khugepaged will not start to
collapse shmem THP, which is unreasonable.

To fix this issue, we should call start_stop_khugepaged() to activate or
deactivate the khugepaged thread when setting shmem mTHP interfaces.
Moreover, add a new helper shmem_hpage_pmd_enabled() to help to check
whether shmem THP is enabled, which will determine if khugepaged should be
activated.

Link: https://lkml.kernel.org/r/9b9c6cbc4499bf44c6455367fd9e0f6036525680.1726978977.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reported-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h |  6 ++++++
 mm/khugepaged.c          |  6 +++++-
 mm/shmem.c               | 29 +++++++++++++++++++++++++++--
 3 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 515a9a6a3c6f..ee6635052383 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -114,6 +114,7 @@ int shmem_unuse(unsigned int type);
 unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
 				loff_t write_end, bool shmem_huge_force);
+bool shmem_hpage_pmd_enabled(void);
 #else
 static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
@@ -121,6 +122,11 @@ static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
 {
 	return 0;
 }
+
+static inline bool shmem_hpage_pmd_enabled(void)
+{
+	return false;
+}
 #endif
 
 #ifdef CONFIG_SHMEM
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b538c3d48386..0bd80e134010 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -416,9 +416,11 @@ static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
 static bool hugepage_pmd_enabled(void)
 {
 	/*
-	 * We cover both the anon and the file-backed case here; file-backed
+	 * We cover the anon, shmem and the file-backed case here; file-backed
 	 * hugepages, when configured in, are determined by the global control.
 	 * Anon pmd-sized hugepages are determined by the pmd-size control.
+	 * Shmem pmd-sized hugepages are also determined by its pmd-size control,
+	 * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
 	 */
 	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
 	    hugepage_global_enabled())
@@ -430,6 +432,8 @@ static bool hugepage_pmd_enabled(void)
 	if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
 	    hugepage_global_enabled())
 		return true;
+	if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
+		return true;
 	return false;
 }
 
diff --git a/mm/shmem.c b/mm/shmem.c
index e87f5d6799a7..6ad50ba60d8e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1655,6 +1655,23 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+bool shmem_hpage_pmd_enabled(void)
+{
+	if (shmem_huge == SHMEM_HUGE_DENY)
+		return false;
+	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always))
+		return true;
+	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise))
+		return true;
+	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size))
+		return true;
+	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
+	    shmem_huge != SHMEM_HUGE_NEVER)
+		return true;
+
+	return false;
+}
+
 unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
 				loff_t write_end, bool shmem_huge_force)
@@ -5024,7 +5041,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
 		struct kobj_attribute *attr, const char *buf, size_t count)
 {
 	char tmp[16];
-	int huge;
+	int huge, err;
 
 	if (count + 1 > sizeof(tmp))
 		return -EINVAL;
@@ -5048,7 +5065,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
 	shmem_huge = huge;
 	if (shmem_huge > SHMEM_HUGE_DENY)
 		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
-	return count;
+
+	err = start_stop_khugepaged();
+	return err ? err : count;
 }
 
 struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
@@ -5125,6 +5144,12 @@ static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
 		ret = -EINVAL;
 	}
 
+	if (ret > 0) {
+		int err = start_stop_khugepaged();
+
+		if (err)
+			ret = err;
+	}
 	return ret;
 }
 
-- 
cgit v1.2.3


From 9e9e085effe9b7e342138fde3cf8577d22509932 Mon Sep 17 00:00:00 2001
From: Adrian Huang <ahuang12@lenovo.com>
Date: Sat, 27 Jul 2024 00:52:46 +0800
Subject: mm/vmalloc: combine all TLB flush operations of KASAN shadow virtual
 address into one operation

When compiling kernel source 'make -j $(nproc)' with the up-and-running
KASAN-enabled kernel on a 256-core machine, the following soft lockup is
shown:

watchdog: BUG: soft lockup - CPU#28 stuck for 22s! [kworker/28:1:1760]
CPU: 28 PID: 1760 Comm: kworker/28:1 Kdump: loaded Not tainted 6.10.0-rc5 #95
Workqueue: events drain_vmap_area_work
RIP: 0010:smp_call_function_many_cond+0x1d8/0xbb0
Code: 38 c8 7c 08 84 c9 0f 85 49 08 00 00 8b 45 08 a8 01 74 2e 48 89 f1 49 89 f7 48 c1 e9 03 41 83 e7 07 4c 01 e9 41 83 c7 03 f3 90 <0f> b6 01 41 38 c7 7c 08 84 c0 0f 85 d4 06 00 00 8b 45 08 a8 01 75
RSP: 0018:ffffc9000cb3fb60 EFLAGS: 00000202
RAX: 0000000000000011 RBX: ffff8883bc4469c0 RCX: ffffed10776e9949
RDX: 0000000000000002 RSI: ffff8883bb74ca48 RDI: ffffffff8434dc50
RBP: ffff8883bb74ca40 R08: ffff888103585dc0 R09: ffff8884533a1800
R10: 0000000000000004 R11: ffffffffffffffff R12: ffffed1077888d39
R13: dffffc0000000000 R14: ffffed1077888d38 R15: 0000000000000003
FS:  0000000000000000(0000) GS:ffff8883bc400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005577b5c8d158 CR3: 0000000004850000 CR4: 0000000000350ef0
Call Trace:
 <IRQ>
 ? watchdog_timer_fn+0x2cd/0x390
 ? __pfx_watchdog_timer_fn+0x10/0x10
 ? __hrtimer_run_queues+0x300/0x6d0
 ? sched_clock_cpu+0x69/0x4e0
 ? __pfx___hrtimer_run_queues+0x10/0x10
 ? srso_return_thunk+0x5/0x5f
 ? ktime_get_update_offsets_now+0x7f/0x2a0
 ? srso_return_thunk+0x5/0x5f
 ? srso_return_thunk+0x5/0x5f
 ? hrtimer_interrupt+0x2ca/0x760
 ? __sysvec_apic_timer_interrupt+0x8c/0x2b0
 ? sysvec_apic_timer_interrupt+0x6a/0x90
 </IRQ>
 <TASK>
 ? asm_sysvec_apic_timer_interrupt+0x16/0x20
 ? smp_call_function_many_cond+0x1d8/0xbb0
 ? __pfx_do_kernel_range_flush+0x10/0x10
 on_each_cpu_cond_mask+0x20/0x40
 flush_tlb_kernel_range+0x19b/0x250
 ? srso_return_thunk+0x5/0x5f
 ? kasan_release_vmalloc+0xa7/0xc0
 purge_vmap_node+0x357/0x820
 ? __pfx_purge_vmap_node+0x10/0x10
 __purge_vmap_area_lazy+0x5b8/0xa10
 drain_vmap_area_work+0x21/0x30
 process_one_work+0x661/0x10b0
 worker_thread+0x844/0x10e0
 ? srso_return_thunk+0x5/0x5f
 ? __kthread_parkme+0x82/0x140
 ? __pfx_worker_thread+0x10/0x10
 kthread+0x2a5/0x370
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x30/0x70
 ? __pfx_kthread+0x10/0x10
 ret_from_fork_asm+0x1a/0x30
 </TASK>

Debugging Analysis:

  1. The following ftrace log shows that the lockup CPU spends too much
     time iterating vmap_nodes and flushing TLB when purging vm_area
     structures. (Some info is trimmed).

     kworker: funcgraph_entry:              |  drain_vmap_area_work() {
     kworker: funcgraph_entry:              |   mutex_lock() {
     kworker: funcgraph_entry:  1.092 us    |     __cond_resched();
     kworker: funcgraph_exit:   3.306 us    |   }
     ...                                        ...
     kworker: funcgraph_entry:              |    flush_tlb_kernel_range() {
     ...                                          ...
     kworker: funcgraph_exit: # 7533.649 us |    }
     ...                                         ...
     kworker: funcgraph_entry:  2.344 us    |   mutex_unlock();
     kworker: funcgraph_exit: $ 23871554 us | }

     The drain_vmap_area_work() spends over 23 seconds.

     There are 2805 flush_tlb_kernel_range() calls in the ftrace log.
       * One is called in __purge_vmap_area_lazy().
       * Others are called by purge_vmap_node->kasan_release_vmalloc.
         purge_vmap_node() iteratively releases kasan vmalloc
         allocations and flushes TLB for each vmap_area.
           - [Rough calculation] Each flush_tlb_kernel_range() runs
             about 7.5ms.
               -- 2804 * 7.5ms = 21.03 seconds.
               -- That's why a soft lock is triggered.

  2. Extending the soft lockup time can work around the issue (For example,
     # echo 60 > /proc/sys/kernel/watchdog_thresh). This confirms the
     above-mentioned speculation: drain_vmap_area_work() spends too much
     time.

If we combine all TLB flush operations of the KASAN shadow virtual
address into one operation in the call path
'purge_vmap_node()->kasan_release_vmalloc()', the running time of
drain_vmap_area_work() can be saved greatly. The idea is from the
flush_tlb_kernel_range() call in __purge_vmap_area_lazy(). And, the
soft lockup won't be triggered.

Here is the test result based on 6.10:

[6.10 wo/ the patch]
  1. ftrace latency profiling (record a trace if the latency > 20s).
     echo 20000000 > /sys/kernel/debug/tracing/tracing_thresh
     echo drain_vmap_area_work > /sys/kernel/debug/tracing/set_graph_function
     echo function_graph > /sys/kernel/debug/tracing/current_tracer
     echo 1 > /sys/kernel/debug/tracing/tracing_on

  2. Run `make -j $(nproc)` to compile the kernel source

  3. Once the soft lockup is reproduced, check the ftrace log:
     cat /sys/kernel/debug/tracing/trace
        # tracer: function_graph
        #
        # CPU  DURATION                  FUNCTION CALLS
        # |     |   |                     |   |   |   |
          76) $ 50412985 us |    } /* __purge_vmap_area_lazy */
          76) $ 50412997 us |  } /* drain_vmap_area_work */
          76) $ 29165911 us |    } /* __purge_vmap_area_lazy */
          76) $ 29165926 us |  } /* drain_vmap_area_work */
          91) $ 53629423 us |    } /* __purge_vmap_area_lazy */
          91) $ 53629434 us |  } /* drain_vmap_area_work */
          91) $ 28121014 us |    } /* __purge_vmap_area_lazy */
          91) $ 28121026 us |  } /* drain_vmap_area_work */

[6.10 w/ the patch]
  1. Repeat step 1-2 in "[6.10 wo/ the patch]"

  2. The soft lockup is not triggered and ftrace log is empty.
     cat /sys/kernel/debug/tracing/trace
     # tracer: function_graph
     #
     # CPU  DURATION                  FUNCTION CALLS
     # |     |   |                     |   |   |   |

  3. Setting 'tracing_thresh' to 10/5 seconds does not get any ftrace
     log.

  4. Setting 'tracing_thresh' to 1 second gets ftrace log.
     cat /sys/kernel/debug/tracing/trace
     # tracer: function_graph
     #
     # CPU  DURATION                  FUNCTION CALLS
     # |     |   |                     |   |   |   |
       23) $ 1074942 us  |    } /* __purge_vmap_area_lazy */
       23) $ 1074950 us  |  } /* drain_vmap_area_work */

  The worst execution time of drain_vmap_area_work() is about 1 second.

Link: https://lore.kernel.org/lkml/ZqFlawuVnOMY2k3E@pc638.lan/
Link: https://lkml.kernel.org/r/20240726165246.31326-1-ahuang12@lenovo.com
Fixes: 282631cb2447 ("mm: vmalloc: remove global purge_vmap_area_root rb-tree")
Signed-off-by: Adrian Huang <ahuang12@lenovo.com>
Co-developed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Tested-by: Jiwei Sun <sunjw10@lenovo.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 12 +++++++++---
 mm/kasan/shadow.c     | 14 ++++++++++----
 mm/vmalloc.c          | 34 ++++++++++++++++++++++++++--------
 3 files changed, 45 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 00a3bf7c0d8f..6bbfc8aa42e8 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -29,6 +29,9 @@ typedef unsigned int __bitwise kasan_vmalloc_flags_t;
 #define KASAN_VMALLOC_VM_ALLOC		((__force kasan_vmalloc_flags_t)0x02u)
 #define KASAN_VMALLOC_PROT_NORMAL	((__force kasan_vmalloc_flags_t)0x04u)
 
+#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */
+#define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
+
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 
 #include <linux/pgtable.h>
@@ -564,7 +567,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
 int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
-			   unsigned long free_region_end);
+			   unsigned long free_region_end,
+			   unsigned long flags);
 
 #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
@@ -579,7 +583,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
 static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long end,
 					 unsigned long free_region_start,
-					 unsigned long free_region_end) { }
+					 unsigned long free_region_end,
+					 unsigned long flags) { }
 
 #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
@@ -614,7 +619,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
 static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long end,
 					 unsigned long free_region_start,
-					 unsigned long free_region_end) { }
+					 unsigned long free_region_end,
+					 unsigned long flags) { }
 
 static inline void *kasan_unpoison_vmalloc(const void *start,
 					   unsigned long size,
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d6210ca48dda..88d1c9dcb507 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
  */
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
-			   unsigned long free_region_end)
+			   unsigned long free_region_end,
+			   unsigned long flags)
 {
 	void *shadow_start, *shadow_end;
 	unsigned long region_start, region_end;
@@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
 			return;
 		}
-		apply_to_existing_page_range(&init_mm,
+
+
+		if (flags & KASAN_VMALLOC_PAGE_RANGE)
+			apply_to_existing_page_range(&init_mm,
 					     (unsigned long)shadow_start,
 					     size, kasan_depopulate_vmalloc_pte,
 					     NULL);
-		flush_tlb_kernel_range((unsigned long)shadow_start,
-				       (unsigned long)shadow_end);
+
+		if (flags & KASAN_VMALLOC_TLB_FLUSH)
+			flush_tlb_kernel_range((unsigned long)shadow_start,
+					       (unsigned long)shadow_end);
 	}
 }
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 634162271c00..5480b77f4167 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2182,6 +2182,25 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
 	reclaim_list_global(&decay_list);
 }
 
+static void
+kasan_release_vmalloc_node(struct vmap_node *vn)
+{
+	struct vmap_area *va;
+	unsigned long start, end;
+
+	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
+	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
+
+	list_for_each_entry(va, &vn->purge_list, list) {
+		if (is_vmalloc_or_module_addr((void *) va->va_start))
+			kasan_release_vmalloc(va->va_start, va->va_end,
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE);
+	}
+
+	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
+}
+
 static void purge_vmap_node(struct work_struct *work)
 {
 	struct vmap_node *vn = container_of(work,
@@ -2190,20 +2209,17 @@ static void purge_vmap_node(struct work_struct *work)
 	struct vmap_area *va, *n_va;
 	LIST_HEAD(local_list);
 
+	if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+		kasan_release_vmalloc_node(vn);
+
 	vn->nr_purged = 0;
 
 	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
 		unsigned long nr = va_size(va) >> PAGE_SHIFT;
-		unsigned long orig_start = va->va_start;
-		unsigned long orig_end = va->va_end;
 		unsigned int vn_id = decode_vn_id(va->flags);
 
 		list_del_init(&va->list);
 
-		if (is_vmalloc_or_module_addr((void *)orig_start))
-			kasan_release_vmalloc(orig_start, orig_end,
-					      va->va_start, va->va_end);
-
 		nr_purged_pages += nr;
 		vn->nr_purged++;
 
@@ -4784,7 +4800,8 @@ recovery:
 				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
-				va->va_start, va->va_end);
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
 		vas[area] = NULL;
 	}
 
@@ -4834,7 +4851,8 @@ err_free_shadow:
 				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
-				va->va_start, va->va_end);
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
 		vas[area] = NULL;
 		kfree(vms[area]);
 	}
-- 
cgit v1.2.3


From 1cd1a4e71b61eaf8cadd15372b67ccd60a2e1a99 Mon Sep 17 00:00:00 2001
From: Tanya Agarwal <tanyaagarwal25699@gmail.com>
Date: Fri, 27 Sep 2024 00:05:16 +0530
Subject: mm/mempolicy: fix comments for better documentation

Fix typo in mempolicy.h and Correct the number of allowed memory policy

Link: https://lkml.kernel.org/r/20240926183516.4034-2-tanyaagarwal25699@gmail.com
Signed-off-by: Tanya Agarwal <tanyaagarwal25699@gmail.com>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Cc: Anup Sharma <anupnewsmail@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h | 2 +-
 mm/mempolicy.c            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1add16f21612..ce9885e0178a 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -47,7 +47,7 @@ struct mempolicy {
 	atomic_t refcnt;
 	unsigned short mode; 	/* See MPOL_* above */
 	unsigned short flags;	/* See set_mempolicy() MPOL_F_* above */
-	nodemask_t nodes;	/* interleave/bind/perfer */
+	nodemask_t nodes;	/* interleave/bind/preferred/etc */
 	int home_node;		/* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
 
 	union {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b646fab3e45e..9e18a6fc3061 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -8,7 +8,7 @@
  * NUMA policy allows the user to give hints in which node(s) memory should
  * be allocated.
  *
- * Support four policies per VMA and per process:
+ * Support six policies per VMA and per process:
  *
  * The VMA policy has priority over the process policy for a page fault.
  *
-- 
cgit v1.2.3


From f2f484085ef1a2bb5aea861a06bc6b4dc50d2ab8 Mon Sep 17 00:00:00 2001
From: Nanyong Sun <sunnanyong@huawei.com>
Date: Thu, 26 Sep 2024 15:49:22 +0800
Subject: mm: move mm flags to mm_types.h

The types of mm flags are now far beyond the core dump related features.
This patch moves mm flags from linux/sched/coredump.h to linux/mm_types.h.
The linux/sched/coredump.h has include the mm_types.h, so the C files
related to coredump does not need to change head file inclusion.  In
addition, the inclusion of sched/coredump.h now can be deleted from the C
files that irrelevant to core dump.

Link: https://lkml.kernel.org/r/20240926074922.2721274-1-sunnanyong@huawei.com
Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h        |  1 -
 include/linux/khugepaged.h     |  2 -
 include/linux/ksm.h            |  1 -
 include/linux/mm_types.h       | 84 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/oom.h            |  1 -
 include/linux/sched/coredump.h | 82 -----------------------------------------
 kernel/events/uprobes.c        |  1 -
 kernel/fork.c                  |  1 -
 mm/huge_memory.c               |  1 -
 mm/khugepaged.c                |  1 -
 mm/ksm.c                       |  1 -
 mm/memory.c                    |  1 -
 mm/oom_kill.c                  |  1 -
 13 files changed, 84 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ef5b80e48599..8afe09a2cf03 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -2,7 +2,6 @@
 #ifndef _LINUX_HUGE_MM_H
 #define _LINUX_HUGE_MM_H
 
-#include <linux/sched/coredump.h>
 #include <linux/mm_types.h>
 
 #include <linux/fs.h> /* only for vma_is_dax() */
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 30baae91b225..1f46046080f5 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -2,8 +2,6 @@
 #ifndef _LINUX_KHUGEPAGED_H
 #define _LINUX_KHUGEPAGED_H
 
-#include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
-
 extern unsigned int khugepaged_max_ptes_none __read_mostly;
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern struct attribute_group khugepaged_attr_group;
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index ec9c05044d4f..29022e71a074 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -13,7 +13,6 @@
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/sched.h>
-#include <linux/sched/coredump.h>
 
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6e3bdf8e38bc..ff8627acbaa7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1499,4 +1499,88 @@ enum {
 	/* See also internal only FOLL flags in mm/internal.h */
 };
 
+/* mm flags */
+
+/*
+ * The first two bits represent core dump modes for set-user-ID,
+ * the modes are SUID_DUMP_* defined in linux/sched/coredump.h
+ */
+#define MMF_DUMPABLE_BITS 2
+#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
+/* coredump filter bits */
+#define MMF_DUMP_ANON_PRIVATE	2
+#define MMF_DUMP_ANON_SHARED	3
+#define MMF_DUMP_MAPPED_PRIVATE	4
+#define MMF_DUMP_MAPPED_SHARED	5
+#define MMF_DUMP_ELF_HEADERS	6
+#define MMF_DUMP_HUGETLB_PRIVATE 7
+#define MMF_DUMP_HUGETLB_SHARED  8
+#define MMF_DUMP_DAX_PRIVATE	9
+#define MMF_DUMP_DAX_SHARED	10
+
+#define MMF_DUMP_FILTER_SHIFT	MMF_DUMPABLE_BITS
+#define MMF_DUMP_FILTER_BITS	9
+#define MMF_DUMP_FILTER_MASK \
+	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
+#define MMF_DUMP_FILTER_DEFAULT \
+	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED) |\
+	 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+
+#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
+# define MMF_DUMP_MASK_DEFAULT_ELF	(1 << MMF_DUMP_ELF_HEADERS)
+#else
+# define MMF_DUMP_MASK_DEFAULT_ELF	0
+#endif
+					/* leave room for more dump flags */
+#define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
+#define MMF_VM_HUGEPAGE		17	/* set when mm is available for khugepaged */
+
+/*
+ * This one-shot flag is dropped due to necessity of changing exe once again
+ * on NFS restore
+ */
+//#define MMF_EXE_FILE_CHANGED	18	/* see prctl_set_mm_exe_file() */
+
+#define MMF_HAS_UPROBES		19	/* has uprobes */
+#define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
+#define MMF_OOM_SKIP		21	/* mm is of no interest for the OOM killer */
+#define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
+#define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
+#define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
+#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
+#define MMF_OOM_REAP_QUEUED	25	/* mm was queued for oom_reaper */
+#define MMF_MULTIPROCESS	26	/* mm is shared between processes */
+/*
+ * MMF_HAS_PINNED: Whether this mm has pinned any pages.  This can be either
+ * replaced in the future by mm.pinned_vm when it becomes stable, or grow into
+ * a counter on its own. We're aggresive on this bit for now: even if the
+ * pinned pages were unpinned later on, we'll still keep this bit set for the
+ * lifecycle of this mm, just for simplicity.
+ */
+#define MMF_HAS_PINNED		27	/* FOLL_PIN has run, never cleared */
+
+#define MMF_HAS_MDWE		28
+#define MMF_HAS_MDWE_MASK	(1 << MMF_HAS_MDWE)
+
+
+#define MMF_HAS_MDWE_NO_INHERIT	29
+
+#define MMF_VM_MERGE_ANY	30
+#define MMF_VM_MERGE_ANY_MASK	(1 << MMF_VM_MERGE_ANY)
+
+#define MMF_TOPDOWN		31	/* mm searches top down by default */
+#define MMF_TOPDOWN_MASK	(1 << MMF_TOPDOWN)
+
+#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
+				 MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
+
+static inline unsigned long mmf_init_flags(unsigned long flags)
+{
+	if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
+		flags &= ~((1UL << MMF_HAS_MDWE) |
+			   (1UL << MMF_HAS_MDWE_NO_INHERIT));
+	return flags & MMF_INIT_MASK;
+}
+
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7d0c9c48a0c5..1e0fc6931ce9 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -7,7 +7,6 @@
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <uapi/linux/oom.h>
-#include <linux/sched/coredump.h> /* MMF_* */
 #include <linux/mm.h> /* VM_FAULT* */
 
 struct zonelist;
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index e62ff805cfc9..6eb65ceed213 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -8,12 +8,6 @@
 #define SUID_DUMP_USER		1	/* Dump as user of process */
 #define SUID_DUMP_ROOT		2	/* Dump as root */
 
-/* mm flags */
-
-/* for SUID_DUMP_* above */
-#define MMF_DUMPABLE_BITS 2
-#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
-
 extern void set_dumpable(struct mm_struct *mm, int value);
 /*
  * This returns the actual value of the suid_dumpable flag. For things
@@ -31,80 +25,4 @@ static inline int get_dumpable(struct mm_struct *mm)
 	return __get_dumpable(mm->flags);
 }
 
-/* coredump filter bits */
-#define MMF_DUMP_ANON_PRIVATE	2
-#define MMF_DUMP_ANON_SHARED	3
-#define MMF_DUMP_MAPPED_PRIVATE	4
-#define MMF_DUMP_MAPPED_SHARED	5
-#define MMF_DUMP_ELF_HEADERS	6
-#define MMF_DUMP_HUGETLB_PRIVATE 7
-#define MMF_DUMP_HUGETLB_SHARED  8
-#define MMF_DUMP_DAX_PRIVATE	9
-#define MMF_DUMP_DAX_SHARED	10
-
-#define MMF_DUMP_FILTER_SHIFT	MMF_DUMPABLE_BITS
-#define MMF_DUMP_FILTER_BITS	9
-#define MMF_DUMP_FILTER_MASK \
-	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
-#define MMF_DUMP_FILTER_DEFAULT \
-	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED) |\
-	 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
-
-#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
-# define MMF_DUMP_MASK_DEFAULT_ELF	(1 << MMF_DUMP_ELF_HEADERS)
-#else
-# define MMF_DUMP_MASK_DEFAULT_ELF	0
-#endif
-					/* leave room for more dump flags */
-#define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
-#define MMF_VM_HUGEPAGE		17	/* set when mm is available for
-					   khugepaged */
-/*
- * This one-shot flag is dropped due to necessity of changing exe once again
- * on NFS restore
- */
-//#define MMF_EXE_FILE_CHANGED	18	/* see prctl_set_mm_exe_file() */
-
-#define MMF_HAS_UPROBES		19	/* has uprobes */
-#define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
-#define MMF_OOM_SKIP		21	/* mm is of no interest for the OOM killer */
-#define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
-#define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
-#define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
-#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
-#define MMF_OOM_REAP_QUEUED	25	/* mm was queued for oom_reaper */
-#define MMF_MULTIPROCESS	26	/* mm is shared between processes */
-/*
- * MMF_HAS_PINNED: Whether this mm has pinned any pages.  This can be either
- * replaced in the future by mm.pinned_vm when it becomes stable, or grow into
- * a counter on its own. We're aggresive on this bit for now: even if the
- * pinned pages were unpinned later on, we'll still keep this bit set for the
- * lifecycle of this mm, just for simplicity.
- */
-#define MMF_HAS_PINNED		27	/* FOLL_PIN has run, never cleared */
-
-#define MMF_HAS_MDWE		28
-#define MMF_HAS_MDWE_MASK	(1 << MMF_HAS_MDWE)
-
-
-#define MMF_HAS_MDWE_NO_INHERIT	29
-
-#define MMF_VM_MERGE_ANY	30
-#define MMF_VM_MERGE_ANY_MASK	(1 << MMF_VM_MERGE_ANY)
-
-#define MMF_TOPDOWN		31	/* mm searches top down by default */
-#define MMF_TOPDOWN_MASK	(1 << MMF_TOPDOWN)
-
-#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
-				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
-				 MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
-
-static inline unsigned long mmf_init_flags(unsigned long flags)
-{
-	if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
-		flags &= ~((1UL << MMF_HAS_MDWE) |
-			   (1UL << MMF_HAS_MDWE_NO_INHERIT));
-	return flags & MMF_INIT_MASK;
-}
-
 #endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4b52cb2ae6d6..75ac18a3ac0f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
 #include <linux/export.h>
 #include <linux/rmap.h>		/* anon_vma_prepare */
 #include <linux/mmu_notifier.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index b2ab422f6230..61a4abd628f3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -16,7 +16,6 @@
 #include <linux/slab.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
 #include <linux/sched/user.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/stat.h>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 03fd4bc39ea1..e71b58d84cba 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -8,7 +8,6 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0bd80e134010..ed1a225dd198 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -4,7 +4,6 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
diff --git a/mm/ksm.c b/mm/ksm.c
index a2e2a521df0a..dec536d6d91a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -20,7 +20,6 @@
 #include <linux/mman.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
 #include <linux/sched/cputime.h>
 #include <linux/rwsem.h>
 #include <linux/pagemap.h>
diff --git a/mm/memory.c b/mm/memory.c
index bdf77a3ec47b..c8d5d040d6ab 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -44,7 +44,6 @@
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
 #include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/task.h>
 #include <linux/hugetlb.h>
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4d7a0004df2c..1c485beb0b93 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -24,7 +24,6 @@
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
 #include <linux/sched/task.h>
 #include <linux/sched/debug.h>
 #include <linux/swap.h>
-- 
cgit v1.2.3


From 66efef9b1a7d6cc725efa9395fb390483ad5b555 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Thu, 26 Sep 2024 14:46:14 +0800
Subject: mm: pgtable: introduce pte_offset_map_{ro|rw}_nolock()

Patch series "introduce pte_offset_map_{ro|rw}_nolock()", v5.

As proposed by David Hildenbrand [1], this series introduces the following
two new helper functions to replace pte_offset_map_nolock().

1. pte_offset_map_ro_nolock()
2. pte_offset_map_rw_nolock()

As the name suggests, pte_offset_map_ro_nolock() is used for read-only
case.  In this case, only read-only operations will be performed on PTE
page after the PTL is held.  The RCU lock in pte_offset_map_nolock() will
ensure that the PTE page will not be freed, and there is no need to worry
about whether the pmd entry is modified.  Therefore
pte_offset_map_ro_nolock() is just a renamed version of
pte_offset_map_nolock().

pte_offset_map_rw_nolock() is used for may-write case.  In this case, the
pte or pmd entry may be modified after the PTL is held, so we need to
ensure that the pmd entry has not been modified concurrently.  So in
addition to the name change, it also outputs the pmdval when successful.
The users should make sure the page table is stable like checking
pte_same() or checking pmd_same() by using the output pmdval before
performing the write operations.

This series will convert all pte_offset_map_nolock() into the above two
helper functions one by one, and finally completely delete it.

This also a preparation for reclaiming the empty user PTE page table
pages.


This patch (of 13):

Currently, the usage of pte_offset_map_nolock() can be divided into the
following two cases:

1) After acquiring PTL, only read-only operations are performed on the PTE
   page. In this case, the RCU lock in pte_offset_map_nolock() will ensure
   that the PTE page will not be freed, and there is no need to worry
   about whether the pmd entry is modified.

2) After acquiring PTL, the pte or pmd entries may be modified. At this
   time, we need to ensure that the pmd entry has not been modified
   concurrently.

To more clearing distinguish between these two cases, this commit
introduces two new helper functions to replace pte_offset_map_nolock().
For 1), just rename it to pte_offset_map_ro_nolock().  For 2), in addition
to changing the name to pte_offset_map_rw_nolock(), it also outputs the
pmdval when successful.  It is applicable for may-write cases where any
modification operations to the page table may happen after the
corresponding spinlock is held afterwards.  But the users should make sure
the page table is stable like checking pte_same() or checking pmd_same()
by using the output pmdval before performing the write operations.

Note: "RO" / "RW" expresses the intended semantics, not that the *kmap*
will be read-only/read-write protected.

Subsequent commits will convert pte_offset_map_nolock() into the above
two functions one by one, and finally completely delete it.

Link: https://lkml.kernel.org/r/cover.1727332572.git.zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/5aeecfa131600a454b1f3a038a1a54282ca3b856.1727332572.git.zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/split_page_table_lock.rst |  7 +++++
 include/linux/mm.h                         |  5 ++++
 mm/pgtable-generic.c                       | 48 ++++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst
index e4f6972eb6c0..08d0e706a32d 100644
--- a/Documentation/mm/split_page_table_lock.rst
+++ b/Documentation/mm/split_page_table_lock.rst
@@ -19,6 +19,13 @@ There are helpers to lock/unlock a table and other accessor functions:
  - pte_offset_map_nolock()
 	maps PTE, returns pointer to PTE with pointer to its PTE table
 	lock (not taken), or returns NULL if no PTE table;
+ - pte_offset_map_ro_nolock()
+	maps PTE, returns pointer to PTE with pointer to its PTE table
+	lock (not taken), or returns NULL if no PTE table;
+ - pte_offset_map_rw_nolock()
+	maps PTE, returns pointer to PTE with pointer to its PTE table
+	lock (not taken) and the value of its pmd entry, or returns NULL
+	if no PTE table;
  - pte_offset_map()
 	maps PTE, returns pointer to PTE, or returns NULL if no PTE table;
  - pte_unmap()
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 61fff5d34ed5..0cf45d4b7286 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3017,6 +3017,11 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
 
 pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
 			unsigned long addr, spinlock_t **ptlp);
+pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long addr, spinlock_t **ptlp);
+pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long addr, pmd_t *pmdvalp,
+				spinlock_t **ptlp);
 
 #define pte_unmap_unlock(pte, ptl)	do {		\
 	spin_unlock(ptl);				\
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index a78a4adf711a..daa08b91ab6b 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -317,6 +317,31 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
 	return pte;
 }
 
+pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long addr, spinlock_t **ptlp)
+{
+	pmd_t pmdval;
+	pte_t *pte;
+
+	pte = __pte_offset_map(pmd, addr, &pmdval);
+	if (likely(pte))
+		*ptlp = pte_lockptr(mm, &pmdval);
+	return pte;
+}
+
+pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long addr, pmd_t *pmdvalp,
+				spinlock_t **ptlp)
+{
+	pte_t *pte;
+
+	VM_WARN_ON_ONCE(!pmdvalp);
+	pte = __pte_offset_map(pmd, addr, pmdvalp);
+	if (likely(pte))
+		*ptlp = pte_lockptr(mm, pmdvalp);
+	return pte;
+}
+
 /*
  * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
  * __pte_offset_map_lock() below, is usually called with the pmd pointer for
@@ -356,6 +381,29 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
  * recheck *pmd once the lock is taken; in practice, no callsite needs that -
  * either the mmap_lock for write, or pte_same() check on contents, is enough.
  *
+ * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
+ * but when successful, it also outputs a pointer to the spinlock in ptlp - as
+ * pte_offset_map_lock() does, but in this case without locking it.  This helps
+ * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
+ * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock
+ * pointer for the page table that it returns. Even after grabbing the spinlock,
+ * we might be looking either at a page table that is still mapped or one that
+ * was unmapped and is about to get freed. But for R/O access this is sufficient.
+ * So it is only applicable for read-only cases where any modification operations
+ * to the page table are not allowed even if the corresponding spinlock is held
+ * afterwards.
+ *
+ * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like
+ * pte_offset_map_ro_nolock(); but when successful, it also outputs the pdmval.
+ * It is applicable for may-write cases where any modification operations to the
+ * page table may happen after the corresponding spinlock is held afterwards.
+ * But the users should make sure the page table is stable like checking pte_same()
+ * or checking pmd_same() by using the output pmdval before performing the write
+ * operations.
+ *
+ * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will
+ * be read-only/read-write protected.
+ *
  * Note that free_pgtables(), used after unmapping detached vmas, or when
  * exiting the whole mm, does not take page table lock before freeing a page
  * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
-- 
cgit v1.2.3


From 583e66debd1d5aa8c401aebe924c7406e15579a7 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Thu, 26 Sep 2024 14:46:26 +0800
Subject: mm: pgtable: remove pte_offset_map_nolock()

Now no users are using the pte_offset_map_nolock(), remove it.

Link: https://lkml.kernel.org/r/d04f9bbbcde048fb6ffa6f2bdbc6f9b22d5286f9.1727332572.git.zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/split_page_table_lock.rst |  3 ---
 include/linux/mm.h                         |  2 --
 mm/pgtable-generic.c                       | 21 ---------------------
 3 files changed, 26 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst
index 08d0e706a32d..581446d4a4eb 100644
--- a/Documentation/mm/split_page_table_lock.rst
+++ b/Documentation/mm/split_page_table_lock.rst
@@ -16,9 +16,6 @@ There are helpers to lock/unlock a table and other accessor functions:
  - pte_offset_map_lock()
 	maps PTE and takes PTE table lock, returns pointer to PTE with
 	pointer to its PTE table lock, or returns NULL if no PTE table;
- - pte_offset_map_nolock()
-	maps PTE, returns pointer to PTE with pointer to its PTE table
-	lock (not taken), or returns NULL if no PTE table;
  - pte_offset_map_ro_nolock()
 	maps PTE, returns pointer to PTE with pointer to its PTE table
 	lock (not taken), or returns NULL if no PTE table;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0cf45d4b7286..8f5394d75ce2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3015,8 +3015,6 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
 	return pte;
 }
 
-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
-			unsigned long addr, spinlock_t **ptlp);
 pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
 				unsigned long addr, spinlock_t **ptlp);
 pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index daa08b91ab6b..5297dcc38c37 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -305,18 +305,6 @@ nomap:
 	return NULL;
 }
 
-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
-			     unsigned long addr, spinlock_t **ptlp)
-{
-	pmd_t pmdval;
-	pte_t *pte;
-
-	pte = __pte_offset_map(pmd, addr, &pmdval);
-	if (likely(pte))
-		*ptlp = pte_lockptr(mm, &pmdval);
-	return pte;
-}
-
 pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
 				unsigned long addr, spinlock_t **ptlp)
 {
@@ -372,15 +360,6 @@ pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
  * and disconnected table.  Until pte_unmap(pte) unmaps and rcu_read_unlock()s
  * afterwards.
  *
- * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
- * but when successful, it also outputs a pointer to the spinlock in ptlp - as
- * pte_offset_map_lock() does, but in this case without locking it.  This helps
- * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
- * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
- * pointer for the page table that it returns.  In principle, the caller should
- * recheck *pmd once the lock is taken; in practice, no callsite needs that -
- * either the mmap_lock for write, or pte_same() check on contents, is enough.
- *
  * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
  * but when successful, it also outputs a pointer to the spinlock in ptlp - as
  * pte_offset_map_lock() does, but in this case without locking it.  This helps
-- 
cgit v1.2.3


From 473c371254d2c9906c286c939eaa99d0fac13e38 Mon Sep 17 00:00:00 2001
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Date: Thu, 26 Sep 2024 13:06:47 +0800
Subject: mm: migrate LRU_REFS_MASK bits in folio_migrate_flags

Bits of LRU_REFS_MASK are not inherited during migration which lead to new
folio start from tier0 when MGLRU enabled.  Try to bring as much bits of
folio->flags as possible since compaction and alloc_contig_range which
introduce migration do happen at times.

Link: https://lkml.kernel.org/r/20240926050647.5653-1-zhaoyang.huang@unisoc.com
Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Suggested-by: Yu Zhao <yuzhao@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_inline.h | 10 ++++++++++
 mm/migrate.c              |  1 +
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index f4fe593c1400..6f801c7b36e2 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -291,6 +291,12 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
 	return true;
 }
 
+static inline void folio_migrate_refs(struct folio *new, struct folio *old)
+{
+	unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK;
+
+	set_mask_bits(&new->flags, LRU_REFS_MASK, refs);
+}
 #else /* !CONFIG_LRU_GEN */
 
 static inline bool lru_gen_enabled(void)
@@ -313,6 +319,10 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
 	return false;
 }
 
+static inline void folio_migrate_refs(struct folio *new, struct folio *old)
+{
+
+}
 #endif /* CONFIG_LRU_GEN */
 
 static __always_inline
diff --git a/mm/migrate.c b/mm/migrate.c
index dfa24e41e8f9..72c6657f4f72 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -695,6 +695,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
 	if (folio_test_idle(folio))
 		folio_set_idle(newfolio);
 
+	folio_migrate_refs(newfolio, folio);
 	/*
 	 * Copy NUMA information to the new page, to prevent over-eager
 	 * future migrations of this same page.
-- 
cgit v1.2.3


From 4cc0473d7754d387680bdf0728eb29f0ec8834bf Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Mon, 7 Oct 2024 22:49:05 +0800
Subject: get rid of __get_task_comm()

Patch series "Improve the copy of task comm", v8.

Using {memcpy,strncpy,strcpy,kstrdup} to copy the task comm relies on the
length of task comm.  Changes in the task comm could result in a
destination string that is overflow.  Therefore, we should explicitly
ensure the destination string is always NUL-terminated, regardless of the
task comm.  This approach will facilitate future extensions to the task
comm.

As suggested by Linus [0], we can identify all relevant code with the
following git grep command:

  git grep 'memcpy.*->comm\>'
  git grep 'kstrdup.*->comm\>'
  git grep 'strncpy.*->comm\>'
  git grep 'strcpy.*->comm\>'

PATCH #2~#4:   memcpy
PATCH #5~#6:   kstrdup
PATCH #7:      strcpy

Please note that strncpy() is not included in this series as it is being
tracked by another effort. [1]


This patch (of 7):

We want to eliminate the use of __get_task_comm() for the following
reasons:

- The task_lock() is unnecessary
  Quoted from Linus [0]:
  : Since user space can randomly change their names anyway, using locking
  : was always wrong for readers (for writers it probably does make sense
  : to have some lock - although practically speaking nobody cares there
  : either, but at least for a writer some kind of race could have
  : long-term mixed results

Link: https://lkml.kernel.org/r/20241007144911.27693-1-laoar.shao@gmail.com
Link: https://lkml.kernel.org/r/20241007144911.27693-2-laoar.shao@gmail.com
Link: https://lore.kernel.org/all/CAHk-=wivfrF0_zvf+oj6==Sh=-npJooP8chLPEfaFV0oNYTTBA@mail.gmail.com [0]
Link: https://lore.kernel.org/all/CAHk-=whWtUC-AjmGJveAETKOMeMFSTwKwu99v7+b6AyHMmaDFA@mail.gmail.com/
Link: https://lore.kernel.org/all/CAHk-=wjAmmHUg6vho1KjzQi2=psR30+CogFd4aXrThr2gsiS4g@mail.gmail.com/ [0]
Link: https://github.com/KSPP/linux/issues/90 [1]
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Matus Jokay <matus.jokay@stuba.sk>
Cc: Alejandro Colomar <alx@kernel.org>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Andy Shevchenko <andy.shevchenko@gmail.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Airlie <airlied@gmail.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: James Morris <jmorris@namei.org>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Ondrej Mosnacek <omosnace@redhat.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Quentin Monnet <qmo@kernel.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Stephen Smalley <stephen.smalley.work@gmail.com>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/exec.c             | 10 ----------
 fs/proc/array.c       |  2 +-
 include/linux/sched.h | 28 ++++++++++++++++++++++------
 kernel/kthread.c      |  2 +-
 4 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 6c53920795c2..77364806b48d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1189,16 +1189,6 @@ static int unshare_sighand(struct task_struct *me)
 	return 0;
 }
 
-char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
-{
-	task_lock(tsk);
-	/* Always NUL terminated and zero-padded */
-	strscpy_pad(buf, tsk->comm, buf_size);
-	task_unlock(tsk);
-	return buf;
-}
-EXPORT_SYMBOL_GPL(__get_task_comm);
-
 /*
  * These functions flushes out all traces of the currently running executable
  * so that a new one can be started
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 34a47fb0c57f..55ed3510d2bb 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -109,7 +109,7 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
 	else if (p->flags & PF_KTHREAD)
 		get_kthread_comm(tcomm, sizeof(tcomm), p);
 	else
-		__get_task_comm(tcomm, sizeof(tcomm), p);
+		get_task_comm(tcomm, p);
 
 	if (escape)
 		seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bb343136ddd0..67718d5591dd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1121,9 +1121,12 @@ struct task_struct {
 	/*
 	 * executable name, excluding path.
 	 *
-	 * - normally initialized setup_new_exec()
-	 * - access it with [gs]et_task_comm()
-	 * - lock it with task_lock()
+	 * - normally initialized begin_new_exec()
+	 * - set it with set_task_comm()
+	 *   - strscpy_pad() to ensure it is always NUL-terminated and
+	 *     zero-padded
+	 *   - task_lock() to ensure the operation is atomic and the name is
+	 *     fully updated.
 	 */
 	char				comm[TASK_COMM_LEN];
 
@@ -1938,10 +1941,23 @@ static inline void set_task_comm(struct task_struct *tsk, const char *from)
 	__set_task_comm(tsk, from, false);
 }
 
-extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
+/*
+ * - Why not use task_lock()?
+ *   User space can randomly change their names anyway, so locking for readers
+ *   doesn't make sense. For writers, locking is probably necessary, as a race
+ *   condition could lead to long-term mixed results.
+ *   The strscpy_pad() in __set_task_comm() can ensure that the task comm is
+ *   always NUL-terminated and zero-padded. Therefore the race condition between
+ *   reader and writer is not an issue.
+ *
+ * - BUILD_BUG_ON() can help prevent the buf from being truncated.
+ *   Since the callers don't perform any return value checks, this safeguard is
+ *   necessary.
+ */
 #define get_task_comm(buf, tsk) ({			\
-	BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN);	\
-	__get_task_comm(buf, sizeof(buf), tsk);		\
+	BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN);	\
+	strscpy_pad(buf, (tsk)->comm);			\
+	buf;						\
 })
 
 #ifdef CONFIG_SMP
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9bb36897b6c6..a5ac612b1609 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
 	struct kthread *kthread = to_kthread(tsk);
 
 	if (!kthread || !kthread->full_name) {
-		__get_task_comm(buf, buf_size, tsk);
+		strscpy(buf, tsk->comm, buf_size);
 		return;
 	}
 
-- 
cgit v1.2.3


From f2fa0fd4e7db8326a77618962714924b64f5f889 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 12 Oct 2024 19:52:53 +0200
Subject: reboot: move reboot_notifier_list to kernel/reboot.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All the functions related to the reboot notifier list are in
kernel/reboot.c.  Move the list itself, too.  As there are no direct users
anymore, make the declaration static.

Link: https://lkml.kernel.org/r/20241012-reboot_notifier_list-v1-1-6093bb9455ce@weissschuh.net
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/notifier.h | 2 --
 kernel/notifier.c        | 8 --------
 kernel/reboot.c          | 7 +++++++
 3 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 45702bdcbceb..b42e64734968 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -237,7 +237,5 @@ static inline int notifier_to_errno(int ret)
 #define KBD_KEYSYM		0x0004 /* Keyboard keysym */
 #define KBD_POST_KEYSYM		0x0005 /* Called after keyboard keysym interpretation */
 
-extern struct blocking_notifier_head reboot_notifier_list;
-
 #endif /* __KERNEL__ */
 #endif /* _LINUX_NOTIFIER_H */
diff --git a/kernel/notifier.c b/kernel/notifier.c
index b3ce28f39eb6..2f9fe7c30287 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -5,18 +5,10 @@
 #include <linux/notifier.h>
 #include <linux/rcupdate.h>
 #include <linux/vmalloc.h>
-#include <linux/reboot.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/notifier.h>
 
-/*
- *	Notifier list for kernel code which wants to be called
- *	at shutdown. This is used to stop any idling DMA operations
- *	and the like.
- */
-BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
-
 /*
  *	Notifier chain core routines.  The exported routines below
  *	are layered on top of these, with appropriate locking added.
diff --git a/kernel/reboot.c b/kernel/reboot.c
index f05dbde2c93f..ffdf86b717ab 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -72,6 +72,13 @@ static bool poweroff_fallback_to_halt;
  */
 void __weak (*pm_power_off)(void);
 
+/*
+ *	Notifier list for kernel code which wants to be called
+ *	at shutdown. This is used to stop any idling DMA operations
+ *	and the like.
+ */
+static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
+
 /**
  *	emergency_restart - reboot the system
  *
-- 
cgit v1.2.3


From a9d38bcd7337f051912174ebfc500e1cef73982e Mon Sep 17 00:00:00 2001
From: Sui Jingfeng <sui.jingfeng@linux.dev>
Date: Sat, 12 Oct 2024 18:08:17 +0800
Subject: scatterlist: fix a typo

Replace the 'One' with 'On'.

Link: https://lkml.kernel.org/r/20241012100817.323007-1-sui.jingfeng@linux.dev
Fixes: af2880ec4402 ("scatterlist: add dedicated config for DMA flags")
Signed-off-by: Sui Jingfeng <sui.jingfeng@linux.dev>
Reviewed-by: Petr Tesarik <petr@tesarici.cz>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Michael Kelley <mhklinux@outlook.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/scatterlist.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index e61d164622db..c5e2239b550e 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -273,7 +273,7 @@ static inline void sg_unmark_end(struct scatterlist *sg)
 }
 
 /*
- * One 64-bit architectures there is a 4-byte padding in struct scatterlist
+ * On 64-bit architectures there is a 4-byte padding in struct scatterlist
  * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). Use this padding for DMA
  * flags bits to indicate when a specific dma address is a bus address or the
  * buffer may have been bounced via SWIOTLB.
-- 
cgit v1.2.3


From 74ef070e325465a1b364db6a5c6859785537f835 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 21 Oct 2024 10:07:36 +0200
Subject: percpu: merge VERIFY_PERCPU_PTR() into its only user

Merge VERIFY_PERCPU_PTR() into non-CONFIG_SMP per_cpu_ptr() to make macro
similar to CONFIG_SMP per_cpu_ptr().  This will allow a follow-up patch to
refactor common code to a macro.

No functional changes, non-CONFIG_SMP per_cpu_ptr() was the only user of
VERIFY_PERCPU_PTR().

Link: https://lkml.kernel.org/r/20241021080856.48746-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu-defs.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8efce7414fad..7fa88c5f4b26 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -254,13 +254,13 @@ do {									\
 
 #else	/* CONFIG_SMP */
 
-#define VERIFY_PERCPU_PTR(__p)						\
+#define per_cpu_ptr(ptr, cpu)						\
 ({									\
-	__verify_pcpu_ptr(__p);						\
-	(typeof(*(__p)) __kernel __force *)(__p);			\
+	(void)(cpu);							\
+	__verify_pcpu_ptr(ptr);						\
+	(typeof(*(ptr)) __kernel __force *)(ptr);			\
 })
 
-#define per_cpu_ptr(ptr, cpu)	({ (void)(cpu); VERIFY_PERCPU_PTR(ptr); })
 #define raw_cpu_ptr(ptr)	per_cpu_ptr(ptr, 0)
 #define this_cpu_ptr(ptr)	raw_cpu_ptr(ptr)
 
-- 
cgit v1.2.3


From 001217defda86d0d6a5a9e6cf77a6b813857e7e3 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 21 Oct 2024 10:07:37 +0200
Subject: percpu: introduce PERCPU_PTR() macro

Introduce PERCPU_PTR() macro to cast the percpu pointer from the percpu
address space to a generic (kernel) address space.  Use it in
per_cpu_ptr() and related SHIFT_PERCPU_PTR() macros.

Also remove common knowledge from SHIFT_PERCPU_PTR() comment, "weird cast"
is just a standard way to inform sparse of a cast from the percpu address
space to a generic address space.

Link: https://lkml.kernel.org/r/20241021080856.48746-2-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu-defs.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 7fa88c5f4b26..e1cf7982424f 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -220,15 +220,17 @@ do {									\
 	(void)__vpp_verify;						\
 } while (0)
 
+#define PERCPU_PTR(__p)							\
+	(typeof(*(__p)) __force __kernel *)(__p);
+
 #ifdef CONFIG_SMP
 
 /*
- * Add an offset to a pointer but keep the pointer as-is.  Use RELOC_HIDE()
- * to prevent the compiler from making incorrect assumptions about the
- * pointer value.  The weird cast keeps both GCC and sparse happy.
+ * Add an offset to a pointer.  Use RELOC_HIDE() to prevent the compiler
+ * from making incorrect assumptions about the pointer value.
  */
 #define SHIFT_PERCPU_PTR(__p, __offset)					\
-	RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset))
+	RELOC_HIDE(PERCPU_PTR(__p), (__offset))
 
 #define per_cpu_ptr(ptr, cpu)						\
 ({									\
@@ -258,7 +260,7 @@ do {									\
 ({									\
 	(void)(cpu);							\
 	__verify_pcpu_ptr(ptr);						\
-	(typeof(*(ptr)) __kernel __force *)(ptr);			\
+	PERCPU_PTR(ptr);						\
 })
 
 #define raw_cpu_ptr(ptr)	per_cpu_ptr(ptr, 0)
-- 
cgit v1.2.3


From dabddd687c9e1a06241d6b4d1f66b9f2b60b3ad1 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 21 Oct 2024 10:07:38 +0200
Subject: percpu: cast percpu pointer in PERCPU_PTR() via unsigned long

Cast pointer from percpu address space to generic (kernel) address space
in PERCPU_PTR() macro via unsigned long intermediate cast [1].  This
intermediate cast is also required to avoid build failure when GCC's
strict named address space checks for x86 targets [2] are enabled.

Found by GCC's named address space checks.

[1] https://sparse.docs.kernel.org/en/latest/annotations.html#address-space-name
[2] https://gcc.gnu.org/onlinedocs/gcc/Named-Address-Spaces.html#x86-Named-Address-Spaces

Link: https://lkml.kernel.org/r/20241021080856.48746-3-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu-defs.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index e1cf7982424f..35842d1e3879 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -221,7 +221,10 @@ do {									\
 } while (0)
 
 #define PERCPU_PTR(__p)							\
-	(typeof(*(__p)) __force __kernel *)(__p);
+({									\
+	unsigned long __pcpu_ptr = (__force unsigned long)(__p);	\
+	(typeof(*(__p)) __force __kernel *)(__pcpu_ptr);		\
+})
 
 #ifdef CONFIG_SMP
 
-- 
cgit v1.2.3


From 92a8b224b833e82d286d2100432adbac8cf8a2a1 Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Sun, 20 Oct 2024 12:01:51 +0800
Subject: lib/min_heap: introduce non-inline versions of min heap API functions

Patch series "Enhance min heap API with non-inline functions and
optimizations", v2.

Add non-inline versions of the min heap API functions in lib/min_heap.c
and updates all users outside of kernel/events/core.c to use these
non-inline versions.  To mitigate the performance impact of indirect
function calls caused by the non-inline versions of the swap and compare
functions, a builtin swap has been introduced that swaps elements based on
their size.  Additionally, it micro-optimizes the efficiency of the min
heap by pre-scaling the counter, following the same approach as in
lib/sort.c.  Documentation for the min heap API has also been added to the
core-api section.


This patch (of 10):

All current min heap API functions are marked with '__always_inline'.
However, as the number of users increases, inlining these functions
everywhere leads to a increase in kernel size.

In performance-critical paths, such as when perf events are enabled and
min heap functions are called on every context switch, it is important to
retain the inline versions for optimal performance.  To balance this, the
original inline functions are kept, and additional non-inline versions of
the functions have been added in lib/min_heap.c.

Link: https://lkml.kernel.org/r/20241020040200.939973-1-visitorckw@gmail.com
Link: https://lore.kernel.org/20240522161048.8d8bbc7b153b4ecd92c50666@linux-foundation.org
Link: https://lkml.kernel.org/r/20241020040200.939973-2-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
Cc: Coly Li <colyli@suse.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: "Liang, Kan" <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Sakai <msakai@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/md/bcache/Kconfig |   1 +
 drivers/md/dm-vdo/Kconfig |   1 +
 fs/bcachefs/Kconfig       |   1 +
 include/linux/min_heap.h  | 129 ++++++++++++++++++++++++++++++----------------
 kernel/events/core.c      |   6 +--
 lib/Kconfig               |   3 ++
 lib/Kconfig.debug         |   1 +
 lib/Makefile              |   1 +
 lib/min_heap.c            |  70 +++++++++++++++++++++++++
 9 files changed, 167 insertions(+), 46 deletions(-)
 create mode 100644 lib/min_heap.c

(limited to 'include/linux')

diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index b2d10063d35f..d4697e79d5a3 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -5,6 +5,7 @@ config BCACHE
 	select BLOCK_HOLDER_DEPRECATED if SYSFS
 	select CRC64
 	select CLOSURES
+	select MIN_HEAP
 	help
 	Allows a block device to be used as cache for other devices; uses
 	a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/dm-vdo/Kconfig b/drivers/md/dm-vdo/Kconfig
index 111ecd2c2a24..2400b2bc4bc7 100644
--- a/drivers/md/dm-vdo/Kconfig
+++ b/drivers/md/dm-vdo/Kconfig
@@ -7,6 +7,7 @@ config DM_VDO
 	select DM_BUFIO
 	select LZ4_COMPRESS
 	select LZ4_DECOMPRESS
+	select MIN_HEAP
 	help
 	  This device mapper target presents a block device with
 	  deduplication, compression and thin-provisioning.
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 5bac803ea367..ab6c95b895b3 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -24,6 +24,7 @@ config BCACHEFS_FS
 	select XXHASH
 	select SRCU
 	select SYMBOLIC_ERRNAME
+	select MIN_HEAP
 	help
 	The bcachefs filesystem - a modern, copy on write filesystem, with
 	support for multiple devices, compression, checksumming, etc.
diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h
index 43a7b9dcf15e..0abb21173979 100644
--- a/include/linux/min_heap.h
+++ b/include/linux/min_heap.h
@@ -40,7 +40,7 @@ struct min_heap_callbacks {
 
 /* Initialize a min-heap. */
 static __always_inline
-void __min_heap_init(min_heap_char *heap, void *data, int size)
+void __min_heap_init_inline(min_heap_char *heap, void *data, int size)
 {
 	heap->nr = 0;
 	heap->size = size;
@@ -50,33 +50,33 @@ void __min_heap_init(min_heap_char *heap, void *data, int size)
 		heap->data = heap->preallocated;
 }
 
-#define min_heap_init(_heap, _data, _size)	\
-	__min_heap_init((min_heap_char *)_heap, _data, _size)
+#define min_heap_init_inline(_heap, _data, _size)	\
+	__min_heap_init_inline((min_heap_char *)_heap, _data, _size)
 
 /* Get the minimum element from the heap. */
 static __always_inline
-void *__min_heap_peek(struct min_heap_char *heap)
+void *__min_heap_peek_inline(struct min_heap_char *heap)
 {
 	return heap->nr ? heap->data : NULL;
 }
 
-#define min_heap_peek(_heap)	\
-	(__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap))
+#define min_heap_peek_inline(_heap)	\
+	(__minheap_cast(_heap) __min_heap_peek_inline((min_heap_char *)_heap))
 
 /* Check if the heap is full. */
 static __always_inline
-bool __min_heap_full(min_heap_char *heap)
+bool __min_heap_full_inline(min_heap_char *heap)
 {
 	return heap->nr == heap->size;
 }
 
-#define min_heap_full(_heap)	\
-	__min_heap_full((min_heap_char *)_heap)
+#define min_heap_full_inline(_heap)	\
+	__min_heap_full_inline((min_heap_char *)_heap)
 
 /* Sift the element at pos down the heap. */
 static __always_inline
-void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size,
-		const struct min_heap_callbacks *func, void *args)
+void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size,
+				 const struct min_heap_callbacks *func, void *args)
 {
 	void *left, *right;
 	void *data = heap->data;
@@ -108,13 +108,14 @@ void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size,
 	}
 }
 
-#define min_heap_sift_down(_heap, _pos, _func, _args)	\
-	__min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args)
+#define min_heap_sift_down_inline(_heap, _pos, _func, _args)	\
+	__min_heap_sift_down_inline((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap),	\
+				    _func, _args)
 
 /* Sift up ith element from the heap, O(log2(nr)). */
 static __always_inline
-void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
-		const struct min_heap_callbacks *func, void *args)
+void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx,
+			       const struct min_heap_callbacks *func, void *args)
 {
 	void *data = heap->data;
 	size_t parent;
@@ -128,27 +129,28 @@ void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
 	}
 }
 
-#define min_heap_sift_up(_heap, _idx, _func, _args)	\
-	__min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args)
+#define min_heap_sift_up_inline(_heap, _idx, _func, _args)	\
+	__min_heap_sift_up_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx,	\
+				  _func, _args)
 
 /* Floyd's approach to heapification that is O(nr). */
 static __always_inline
-void __min_heapify_all(min_heap_char *heap, size_t elem_size,
-		const struct min_heap_callbacks *func, void *args)
+void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size,
+			      const struct min_heap_callbacks *func, void *args)
 {
 	int i;
 
 	for (i = heap->nr / 2 - 1; i >= 0; i--)
-		__min_heap_sift_down(heap, i, elem_size, func, args);
+		__min_heap_sift_down_inline(heap, i, elem_size, func, args);
 }
 
-#define min_heapify_all(_heap, _func, _args)	\
-	__min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args)
+#define min_heapify_all_inline(_heap, _func, _args)	\
+	__min_heapify_all_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args)
 
 /* Remove minimum element from the heap, O(log2(nr)). */
 static __always_inline
-bool __min_heap_pop(min_heap_char *heap, size_t elem_size,
-		const struct min_heap_callbacks *func, void *args)
+bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size,
+			   const struct min_heap_callbacks *func, void *args)
 {
 	void *data = heap->data;
 
@@ -158,13 +160,13 @@ bool __min_heap_pop(min_heap_char *heap, size_t elem_size,
 	/* Place last element at the root (position 0) and then sift down. */
 	heap->nr--;
 	memcpy(data, data + (heap->nr * elem_size), elem_size);
-	__min_heap_sift_down(heap, 0, elem_size, func, args);
+	__min_heap_sift_down_inline(heap, 0, elem_size, func, args);
 
 	return true;
 }
 
-#define min_heap_pop(_heap, _func, _args)	\
-	__min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args)
+#define min_heap_pop_inline(_heap, _func, _args)	\
+	__min_heap_pop_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args)
 
 /*
  * Remove the minimum element and then push the given element. The
@@ -172,22 +174,21 @@ bool __min_heap_pop(min_heap_char *heap, size_t elem_size,
  * efficient than a pop followed by a push that does 2.
  */
 static __always_inline
-void __min_heap_pop_push(min_heap_char *heap,
-		const void *element, size_t elem_size,
-		const struct min_heap_callbacks *func,
-		void *args)
+void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t elem_size,
+				const struct min_heap_callbacks *func, void *args)
 {
 	memcpy(heap->data, element, elem_size);
-	__min_heap_sift_down(heap, 0, elem_size, func, args);
+	__min_heap_sift_down_inline(heap, 0, elem_size, func, args);
 }
 
-#define min_heap_pop_push(_heap, _element, _func, _args)	\
-	__min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args)
+#define min_heap_pop_push_inline(_heap, _element, _func, _args)	\
+	__min_heap_pop_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap),	\
+				   _func, _args)
 
 /* Push an element on to the heap, O(log2(nr)). */
 static __always_inline
-bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
-		const struct min_heap_callbacks *func, void *args)
+bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t elem_size,
+			    const struct min_heap_callbacks *func, void *args)
 {
 	void *data = heap->data;
 	int pos;
@@ -201,18 +202,19 @@ bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
 	heap->nr++;
 
 	/* Sift child at pos up. */
-	__min_heap_sift_up(heap, elem_size, pos, func, args);
+	__min_heap_sift_up_inline(heap, elem_size, pos, func, args);
 
 	return true;
 }
 
-#define min_heap_push(_heap, _element, _func, _args)	\
-	__min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args)
+#define min_heap_push_inline(_heap, _element, _func, _args)	\
+	__min_heap_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap),	\
+			       _func, _args)
 
 /* Remove ith element from the heap, O(log2(nr)). */
 static __always_inline
-bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
-		const struct min_heap_callbacks *func, void *args)
+bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx,
+			   const struct min_heap_callbacks *func, void *args)
 {
 	void *data = heap->data;
 
@@ -224,12 +226,53 @@ bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
 	if (idx == heap->nr)
 		return true;
 	func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args);
-	__min_heap_sift_up(heap, elem_size, idx, func, args);
-	__min_heap_sift_down(heap, idx, elem_size, func, args);
+	__min_heap_sift_up_inline(heap, elem_size, idx, func, args);
+	__min_heap_sift_down_inline(heap, idx, elem_size, func, args);
 
 	return true;
 }
 
+#define min_heap_del_inline(_heap, _idx, _func, _args)	\
+	__min_heap_del_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx,	\
+			      _func, _args)
+
+void __min_heap_init(min_heap_char *heap, void *data, int size);
+void *__min_heap_peek(struct min_heap_char *heap);
+bool __min_heap_full(min_heap_char *heap);
+void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size,
+			  const struct min_heap_callbacks *func, void *args);
+void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
+			const struct min_heap_callbacks *func, void *args);
+void __min_heapify_all(min_heap_char *heap, size_t elem_size,
+		       const struct min_heap_callbacks *func, void *args);
+bool __min_heap_pop(min_heap_char *heap, size_t elem_size,
+		    const struct min_heap_callbacks *func, void *args);
+void __min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size,
+			 const struct min_heap_callbacks *func, void *args);
+bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
+		     const struct min_heap_callbacks *func, void *args);
+bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
+		    const struct min_heap_callbacks *func, void *args);
+
+#define min_heap_init(_heap, _data, _size)	\
+	__min_heap_init((min_heap_char *)_heap, _data, _size)
+#define min_heap_peek(_heap)	\
+	(__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap))
+#define min_heap_full(_heap)	\
+	__min_heap_full((min_heap_char *)_heap)
+#define min_heap_sift_down(_heap, _pos, _func, _args)	\
+	__min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args)
+#define min_heap_sift_up(_heap, _idx, _func, _args)	\
+	__min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args)
+#define min_heapify_all(_heap, _func, _args)	\
+	__min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args)
+#define min_heap_pop(_heap, _func, _args)	\
+	__min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args)
+#define min_heap_pop_push(_heap, _element, _func, _args)	\
+	__min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap),	\
+			    _func, _args)
+#define min_heap_push(_heap, _element, _func, _args)	\
+	__min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args)
 #define min_heap_del(_heap, _idx, _func, _args)	\
 	__min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args)
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index df27d08a7232..1b3c1198b2af 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3870,7 +3870,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
 		perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
 	}
 
-	min_heapify_all(&event_heap, &perf_min_heap, NULL);
+	min_heapify_all_inline(&event_heap, &perf_min_heap, NULL);
 
 	while (event_heap.nr) {
 		ret = func(*evt, data);
@@ -3879,9 +3879,9 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
 
 		*evt = perf_event_groups_next(*evt, pmu);
 		if (*evt)
-			min_heap_sift_down(&event_heap, 0, &perf_min_heap, NULL);
+			min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL);
 		else
-			min_heap_pop(&event_heap, &perf_min_heap, NULL);
+			min_heap_pop_inline(&event_heap, &perf_min_heap, NULL);
 	}
 
 	return 0;
diff --git a/lib/Kconfig b/lib/Kconfig
index cf303bd91dda..f5a2781669ea 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -780,3 +780,6 @@ config FIRMWARE_TABLE
 
 config UNION_FIND
 	bool
+
+config MIN_HEAP
+	bool
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index eda319e9d569..2549b64b2280 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2279,6 +2279,7 @@ config TEST_LIST_SORT
 config TEST_MIN_HEAP
 	tristate "Min heap test"
 	depends on DEBUG_KERNEL || m
+	select MIN_HEAP
 	help
 	  Enable this to turn on min heap function tests. This test is
 	  executed only once during system boot (so affects only boot time),
diff --git a/lib/Makefile b/lib/Makefile
index feebed74fc7a..1eb89962daef 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -40,6 +40,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 lib-$(CONFIG_UNION_FIND) += union_find.o
 lib-$(CONFIG_PRINTK) += dump_stack.o
 lib-$(CONFIG_SMP) += cpumask.o
+lib-$(CONFIG_MIN_HEAP) += min_heap.o
 
 lib-y	+= kobject.o klist.o
 obj-y	+= lockref.o
diff --git a/lib/min_heap.c b/lib/min_heap.c
new file mode 100644
index 000000000000..4485372ff3b1
--- /dev/null
+++ b/lib/min_heap.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+#include <linux/min_heap.h>
+
+void __min_heap_init(min_heap_char *heap, void *data, int size)
+{
+	__min_heap_init_inline(heap, data, size);
+}
+EXPORT_SYMBOL(__min_heap_init);
+
+void *__min_heap_peek(struct min_heap_char *heap)
+{
+	return __min_heap_peek_inline(heap);
+}
+EXPORT_SYMBOL(__min_heap_peek);
+
+bool __min_heap_full(min_heap_char *heap)
+{
+	return __min_heap_full_inline(heap);
+}
+EXPORT_SYMBOL(__min_heap_full);
+
+void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size,
+			  const struct min_heap_callbacks *func, void *args)
+{
+	__min_heap_sift_down_inline(heap, pos, elem_size, func, args);
+}
+EXPORT_SYMBOL(__min_heap_sift_down);
+
+void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
+			const struct min_heap_callbacks *func, void *args)
+{
+	__min_heap_sift_up_inline(heap, elem_size, idx, func, args);
+}
+EXPORT_SYMBOL(__min_heap_sift_up);
+
+void __min_heapify_all(min_heap_char *heap, size_t elem_size,
+		       const struct min_heap_callbacks *func, void *args)
+{
+	__min_heapify_all_inline(heap, elem_size, func, args);
+}
+EXPORT_SYMBOL(__min_heapify_all);
+
+bool __min_heap_pop(min_heap_char *heap, size_t elem_size,
+		    const struct min_heap_callbacks *func, void *args)
+{
+	return __min_heap_pop_inline(heap, elem_size, func, args);
+}
+EXPORT_SYMBOL(__min_heap_pop);
+
+void __min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size,
+			 const struct min_heap_callbacks *func, void *args)
+{
+	__min_heap_pop_push_inline(heap, element, elem_size, func, args);
+}
+EXPORT_SYMBOL(__min_heap_pop_push);
+
+bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
+		     const struct min_heap_callbacks *func, void *args)
+{
+	return __min_heap_push_inline(heap, element, elem_size, func, args);
+}
+EXPORT_SYMBOL(__min_heap_push);
+
+bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
+		    const struct min_heap_callbacks *func, void *args)
+{
+	return __min_heap_del_inline(heap, elem_size, idx, func, args);
+}
+EXPORT_SYMBOL(__min_heap_del);
-- 
cgit v1.2.3


From aa5888afc2347ebb394c2c4b694fa3026775009e Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Sun, 20 Oct 2024 12:01:52 +0800
Subject: lib min_heap: optimize min heap by prescaling counters for better
 performance

Improve the efficiency of the min heap by prescaling counters, eliminating
the need to repeatedly compute 'index * element_size' when accessing
elements.  By doing so, we avoid the overhead associated with
recalculating the byte offset for each heap operation.

However, with prescaling, the calculation for the parent element's
location is no longer as simple as '(i - 1) / 2'.  To address this, we
copy the parent function from 'lib/sort.c', which calculates the parent
offset in a branchless manner without using any division instructions.

This optimization should result in a more efficient heap implementation by
reducing the computational overhead of finding parent and child offsets.

Link: https://lkml.kernel.org/r/20241020040200.939973-3-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
Cc: Coly Li <colyli@suse.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: "Liang, Kan" <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Sakai <msakai@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/min_heap.h | 73 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h
index 0abb21173979..41b092a14238 100644
--- a/include/linux/min_heap.h
+++ b/include/linux/min_heap.h
@@ -38,6 +38,32 @@ struct min_heap_callbacks {
 	void (*swp)(void *lhs, void *rhs, void *args);
 };
 
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought.  Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2.  But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit.  That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+	i -= size;
+	i -= size & -(i & lsbit);
+	return i / 2;
+}
+
 /* Initialize a min-heap. */
 static __always_inline
 void __min_heap_init_inline(min_heap_char *heap, void *data, int size)
@@ -78,33 +104,30 @@ static __always_inline
 void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size,
 				 const struct min_heap_callbacks *func, void *args)
 {
-	void *left, *right;
+	const unsigned long lsbit = elem_size & -elem_size;
 	void *data = heap->data;
-	void *root = data + pos * elem_size;
-	int i = pos, j;
+	/* pre-scale counters for performance */
+	size_t a = pos * elem_size;
+	size_t b, c, d;
+	size_t n = heap->nr * elem_size;
 
 	/* Find the sift-down path all the way to the leaves. */
-	for (;;) {
-		if (i * 2 + 2 >= heap->nr)
-			break;
-		left = data + (i * 2 + 1) * elem_size;
-		right = data + (i * 2 + 2) * elem_size;
-		i = func->less(left, right, args) ? i * 2 + 1 : i * 2 + 2;
-	}
+	for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;)
+		b = func->less(data + c, data + d, args) ? c : d;
 
 	/* Special case for the last leaf with no sibling. */
-	if (i * 2 + 2 == heap->nr)
-		i = i * 2 + 1;
+	if (d == n)
+		b = c;
 
 	/* Backtrack to the correct location. */
-	while (i != pos && func->less(root, data + i * elem_size, args))
-		i = (i - 1) / 2;
+	while (b != a && func->less(data + a, data + b, args))
+		b = parent(b, lsbit, elem_size);
 
 	/* Shift the element into its correct place. */
-	j = i;
-	while (i != pos) {
-		i = (i - 1) / 2;
-		func->swp(data + i * elem_size, data + j * elem_size, args);
+	c = b;
+	while (b != a) {
+		b = parent(b, lsbit, elem_size);
+		func->swp(data + b, data + c, args);
 	}
 }
 
@@ -117,15 +140,17 @@ static __always_inline
 void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx,
 			       const struct min_heap_callbacks *func, void *args)
 {
+	const unsigned long lsbit = elem_size & -elem_size;
 	void *data = heap->data;
-	size_t parent;
+	/* pre-scale counters for performance */
+	size_t a = idx * elem_size, b;
 
-	while (idx) {
-		parent = (idx - 1) / 2;
-		if (func->less(data + parent * elem_size, data + idx * elem_size, args))
+	while (a) {
+		b = parent(a, lsbit, elem_size);
+		if (func->less(data + b, data + a, args))
 			break;
-		func->swp(data + parent * elem_size, data + idx * elem_size, args);
-		idx = parent;
+		func->swp(data + a, data + b, args);
+		a = b;
 	}
 }
 
-- 
cgit v1.2.3


From 03ec56d084611b5a4dc06ffa74db0928616e4d7f Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Sun, 20 Oct 2024 12:01:53 +0800
Subject: lib min_heap: avoid indirect function call by providing default swap

The non-inline min heap API can result in an indirect function call to the
custom swap function.  This becomes particularly costly when
CONFIG_MITIGATION_RETPOLINE is enabled, as indirect function calls are
expensive in this case.

To address this, copy the code from lib/sort.c and provide a default
builtin swap implementation that performs element swaps based on the
element size.  This change allows most users to avoid the overhead of
indirect function calls, improving efficiency.

Link: https://lkml.kernel.org/r/20241020040200.939973-4-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
Cc: Coly Li <colyli@suse.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: "Liang, Kan" <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Sakai <msakai@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/min_heap.h | 159 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 156 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h
index 41b092a14238..e781727c8916 100644
--- a/include/linux/min_heap.h
+++ b/include/linux/min_heap.h
@@ -38,6 +38,147 @@ struct min_heap_callbacks {
 	void (*swp)(void *lhs, void *rhs, void *args);
 };
 
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+	unsigned char lsbits = (unsigned char)size;
+
+	(void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+	return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static __always_inline
+void swap_words_32(void *a, void *b, size_t n)
+{
+	do {
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+	} while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+static __always_inline
+void swap_words_64(void *a, void *b, size_t n)
+{
+	do {
+#ifdef CONFIG_64BIT
+		u64 t = *(u64 *)(a + (n -= 8));
+		*(u64 *)(a + n) = *(u64 *)(b + n);
+		*(u64 *)(b + n) = t;
+#else
+		/* Use two 32-bit transfers to avoid base+index+4 addressing */
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+
+		t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+#endif
+	} while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static __always_inline
+void swap_bytes(void *a, void *b, size_t n)
+{
+	do {
+		char t = ((char *)a)[--n];
+		((char *)a)[n] = ((char *)b)[n];
+		((char *)b)[n] = t;
+	} while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0)
+#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1)
+#define SWAP_BYTES    ((void (*)(void *, void *, void *))2)
+
+/*
+ * Selects the appropriate swap function based on the element size.
+ */
+static __always_inline
+void *select_swap_func(const void *base, size_t size)
+{
+	if (is_aligned(base, size, 8))
+		return SWAP_WORDS_64;
+	else if (is_aligned(base, size, 4))
+		return SWAP_WORDS_32;
+	else
+		return SWAP_BYTES;
+}
+
+static __always_inline
+void do_swap(void *a, void *b, size_t size, void (*swap_func)(void *lhs, void *rhs, void *args),
+	     void *priv)
+{
+	if (swap_func == SWAP_WORDS_64)
+		swap_words_64(a, b, size);
+	else if (swap_func == SWAP_WORDS_32)
+		swap_words_32(a, b, size);
+	else if (swap_func == SWAP_BYTES)
+		swap_bytes(a, b, size);
+	else
+		swap_func(a, b, priv);
+}
+
 /**
  * parent - given the offset of the child, find the offset of the parent.
  * @i: the offset of the heap element whose parent is sought.  Non-zero.
@@ -106,11 +247,15 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size,
 {
 	const unsigned long lsbit = elem_size & -elem_size;
 	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
 	/* pre-scale counters for performance */
 	size_t a = pos * elem_size;
 	size_t b, c, d;
 	size_t n = heap->nr * elem_size;
 
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
 	/* Find the sift-down path all the way to the leaves. */
 	for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;)
 		b = func->less(data + c, data + d, args) ? c : d;
@@ -127,7 +272,7 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size,
 	c = b;
 	while (b != a) {
 		b = parent(b, lsbit, elem_size);
-		func->swp(data + b, data + c, args);
+		do_swap(data + b, data + c, elem_size, swp, args);
 	}
 }
 
@@ -142,14 +287,18 @@ void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx
 {
 	const unsigned long lsbit = elem_size & -elem_size;
 	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
 	/* pre-scale counters for performance */
 	size_t a = idx * elem_size, b;
 
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
 	while (a) {
 		b = parent(a, lsbit, elem_size);
 		if (func->less(data + b, data + a, args))
 			break;
-		func->swp(data + a, data + b, args);
+		do_swap(data + a, data + b, elem_size, swp, args);
 		a = b;
 	}
 }
@@ -242,15 +391,19 @@ bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx,
 			   const struct min_heap_callbacks *func, void *args)
 {
 	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
 
 	if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
 		return false;
 
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
 	/* Place last element at the root (position 0) and then sift down. */
 	heap->nr--;
 	if (idx == heap->nr)
 		return true;
-	func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args);
+	do_swap(data + (idx * elem_size), data + (heap->nr * elem_size), elem_size, swp, args);
 	__min_heap_sift_up_inline(heap, elem_size, idx, func, args);
 	__min_heap_sift_down_inline(heap, idx, elem_size, func, args);
 
-- 
cgit v1.2.3


From e1ef630c56d36770e180f0d0bf7b61b5289f5c48 Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Date: Fri, 1 Nov 2024 11:57:13 +0200
Subject: clk: Add devm_clk_hw_register_gate_parent_hw()

Add devm_clk_hw_register_gate_parent_hw() macro to allow registering
devres managed gate clocks providing struct clk_hw object as parent.

Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Link: https://lore.kernel.org/20241101095720.2247815-3-claudiu.beznea.uj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/linux/clk-provider.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 7e43caabb54b..28cf7d103e92 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -622,6 +622,24 @@ struct clk *clk_register_gate(struct device *dev, const char *name,
 	__devm_clk_hw_register_gate((dev), NULL, (name), (parent_name), NULL, \
 			       NULL, (flags), (reg), (bit_idx),		      \
 			       (clk_gate_flags), (lock))
+/**
+ * devm_clk_hw_register_gate_parent_hw - register a gate clock with the clock
+ * framework
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_hw: pointer to parent clk
+ * @flags: framework-specific flags for this clock
+ * @reg: register address to control gating of this clock
+ * @bit_idx: which bit in the register controls gating of this clock
+ * @clk_gate_flags: gate-specific flags for this clock
+ * @lock: shared register lock for this clock
+ */
+#define devm_clk_hw_register_gate_parent_hw(dev, name, parent_hw, flags,      \
+					    reg, bit_idx, clk_gate_flags,     \
+					    lock)			      \
+	__devm_clk_hw_register_gate((dev), NULL, (name), NULL, (parent_hw),   \
+				    NULL, (flags), (reg), (bit_idx),	      \
+				    (clk_gate_flags), (lock))
 /**
  * devm_clk_hw_register_gate_parent_data - register a gate clock with the
  * clock framework
-- 
cgit v1.2.3


From 32360bf6a5d4016669c3545e7b0ec939937f5331 Mon Sep 17 00:00:00 2001
From: Dmitry Rokosov <ddrokosov@salutedevices.com>
Date: Wed, 4 Sep 2024 01:39:30 +0300
Subject: leds: Introduce ordered workqueue for LEDs events instead of
 system_wq

This allows to setup ordered workqueue for LEDs events. This may be
useful, because default 'system_wq' does not guarantee execution order
of each work_struct, thus for several brightness update requests (for
multiple LEDs), real brightness switch could be in random order.

Yes, for sysfs-based LEDs we have flush_work() call inside
brightness_store() operation, but it's blocking call, so userspace
caller can be blocked at a long time, which means LEDs animation stream
can be broken.

Ordered workqueue has the same behaviour as system_wq + flush_work(),
but all scheduled works are async and userspace caller is not blocked,
which it better for userspace animation scheduling.

Signed-off-by: Alexey Romanov <avromanov@salutedevices.com>
Signed-off-by: Dmitry Rokosov <ddrokosov@salutedevices.com>
Link: https://lore.kernel.org/r/20240903223936.21292-1-ddrokosov@salutedevices.com
[Lee: Couple of style fix-ups]
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/leds/led-class.c | 12 +++++++++++-
 drivers/leds/led-core.c  |  6 +++---
 include/linux/leds.h     |  1 +
 3 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 06b97fd49ad9..83e512df47dc 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -25,6 +25,8 @@
 static DEFINE_MUTEX(leds_lookup_lock);
 static LIST_HEAD(leds_lookup_list);
 
+static struct workqueue_struct *leds_wq;
+
 static ssize_t brightness_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -57,7 +59,6 @@ static ssize_t brightness_store(struct device *dev,
 	if (state == LED_OFF)
 		led_trigger_remove(led_cdev);
 	led_set_brightness(led_cdev, state);
-	flush_work(&led_cdev->set_brightness_work);
 
 	ret = size;
 unlock:
@@ -549,6 +550,8 @@ int led_classdev_register_ext(struct device *parent,
 
 	led_update_brightness(led_cdev);
 
+	led_cdev->wq = leds_wq;
+
 	led_init_core(led_cdev);
 
 #ifdef CONFIG_LEDS_TRIGGERS
@@ -667,12 +670,19 @@ EXPORT_SYMBOL_GPL(devm_led_classdev_unregister);
 
 static int __init leds_init(void)
 {
+	leds_wq = alloc_ordered_workqueue("leds", 0);
+	if (!leds_wq) {
+		pr_err("Failed to create LEDs ordered workqueue\n");
+		return -ENOMEM;
+	}
+
 	return class_register(&leds_class);
 }
 
 static void __exit leds_exit(void)
 {
 	class_unregister(&leds_class);
+	destroy_workqueue(leds_wq);
 }
 
 subsys_initcall(leds_init);
diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index 001c290bc07b..f6c46d2e5276 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -273,7 +273,7 @@ void led_blink_set_nosleep(struct led_classdev *led_cdev, unsigned long delay_on
 		led_cdev->delayed_delay_on = delay_on;
 		led_cdev->delayed_delay_off = delay_off;
 		set_bit(LED_SET_BLINK, &led_cdev->work_flags);
-		schedule_work(&led_cdev->set_brightness_work);
+		queue_work(led_cdev->wq, &led_cdev->set_brightness_work);
 		return;
 	}
 
@@ -304,7 +304,7 @@ void led_set_brightness(struct led_classdev *led_cdev, unsigned int brightness)
 		 */
 		if (!brightness) {
 			set_bit(LED_BLINK_DISABLE, &led_cdev->work_flags);
-			schedule_work(&led_cdev->set_brightness_work);
+			queue_work(led_cdev->wq, &led_cdev->set_brightness_work);
 		} else {
 			set_bit(LED_BLINK_BRIGHTNESS_CHANGE,
 				&led_cdev->work_flags);
@@ -340,7 +340,7 @@ void led_set_brightness_nopm(struct led_classdev *led_cdev, unsigned int value)
 		set_bit(LED_SET_BRIGHTNESS_OFF, &led_cdev->work_flags);
 	}
 
-	schedule_work(&led_cdev->set_brightness_work);
+	queue_work(led_cdev->wq, &led_cdev->set_brightness_work);
 }
 EXPORT_SYMBOL_GPL(led_set_brightness_nopm);
 
diff --git a/include/linux/leds.h b/include/linux/leds.h
index e5968c3ed4ae..9f34b9facaad 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -171,6 +171,7 @@ struct led_classdev {
 	int			 new_blink_brightness;
 	void			(*flash_resume)(struct led_classdev *led_cdev);
 
+	struct workqueue_struct *wq;			/* LED workqueue */
 	struct work_struct	set_brightness_work;
 	int			delayed_set_value;
 	unsigned long		delayed_delay_on;
-- 
cgit v1.2.3


From 4ca7cd938725a4050dcd62ae9472e931d603118d Mon Sep 17 00:00:00 2001
From: Mukesh Ojha <quic_mojha@quicinc.com>
Date: Sun, 3 Nov 2024 21:35:27 +0530
Subject: leds: class: Protect brightness_show() with led_cdev->led_access
 mutex

There is NULL pointer issue observed if from Process A where hid device
being added which results in adding a led_cdev addition and later a
another call to access of led_cdev attribute from Process B can result
in NULL pointer issue.

Use mutex led_cdev->led_access to protect access to led->cdev and its
attribute inside brightness_show() and max_brightness_show() and also
update the comment for mutex that it should be used to protect the led
class device fields.

	Process A 				Process B

 kthread+0x114
 worker_thread+0x244
 process_scheduled_works+0x248
 uhid_device_add_worker+0x24
 hid_add_device+0x120
 device_add+0x268
 bus_probe_device+0x94
 device_initial_probe+0x14
 __device_attach+0xfc
 bus_for_each_drv+0x10c
 __device_attach_driver+0x14c
 driver_probe_device+0x3c
 __driver_probe_device+0xa0
 really_probe+0x190
 hid_device_probe+0x130
 ps_probe+0x990
 ps_led_register+0x94
 devm_led_classdev_register_ext+0x58
 led_classdev_register_ext+0x1f8
 device_create_with_groups+0x48
 device_create_groups_vargs+0xc8
 device_add+0x244
 kobject_uevent+0x14
 kobject_uevent_env[jt]+0x224
 mutex_unlock[jt]+0xc4
 __mutex_unlock_slowpath+0xd4
 wake_up_q+0x70
 try_to_wake_up[jt]+0x48c
 preempt_schedule_common+0x28
 __schedule+0x628
 __switch_to+0x174
						el0t_64_sync+0x1a8/0x1ac
						el0t_64_sync_handler+0x68/0xbc
						el0_svc+0x38/0x68
						do_el0_svc+0x1c/0x28
						el0_svc_common+0x80/0xe0
						invoke_syscall+0x58/0x114
						__arm64_sys_read+0x1c/0x2c
						ksys_read+0x78/0xe8
						vfs_read+0x1e0/0x2c8
						kernfs_fop_read_iter+0x68/0x1b4
						seq_read_iter+0x158/0x4ec
						kernfs_seq_show+0x44/0x54
						sysfs_kf_seq_show+0xb4/0x130
						dev_attr_show+0x38/0x74
						brightness_show+0x20/0x4c
						dualshock4_led_get_brightness+0xc/0x74

[ 3313.874295][ T4013] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000060
[ 3313.874301][ T4013] Mem abort info:
[ 3313.874303][ T4013]   ESR = 0x0000000096000006
[ 3313.874305][ T4013]   EC = 0x25: DABT (current EL), IL = 32 bits
[ 3313.874307][ T4013]   SET = 0, FnV = 0
[ 3313.874309][ T4013]   EA = 0, S1PTW = 0
[ 3313.874311][ T4013]   FSC = 0x06: level 2 translation fault
[ 3313.874313][ T4013] Data abort info:
[ 3313.874314][ T4013]   ISV = 0, ISS = 0x00000006, ISS2 = 0x00000000
[ 3313.874316][ T4013]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[ 3313.874318][ T4013]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[ 3313.874320][ T4013] user pgtable: 4k pages, 39-bit VAs, pgdp=00000008f2b0a000
..

[ 3313.874332][ T4013] Dumping ftrace buffer:
[ 3313.874334][ T4013]    (ftrace buffer empty)
..
..
[ dd3313.874639][ T4013] CPU: 6 PID: 4013 Comm: InputReader
[ 3313.874648][ T4013] pc : dualshock4_led_get_brightness+0xc/0x74
[ 3313.874653][ T4013] lr : led_update_brightness+0x38/0x60
[ 3313.874656][ T4013] sp : ffffffc0b910bbd0
..
..
[ 3313.874685][ T4013] Call trace:
[ 3313.874687][ T4013]  dualshock4_led_get_brightness+0xc/0x74
[ 3313.874690][ T4013]  brightness_show+0x20/0x4c
[ 3313.874692][ T4013]  dev_attr_show+0x38/0x74
[ 3313.874696][ T4013]  sysfs_kf_seq_show+0xb4/0x130
[ 3313.874700][ T4013]  kernfs_seq_show+0x44/0x54
[ 3313.874703][ T4013]  seq_read_iter+0x158/0x4ec
[ 3313.874705][ T4013]  kernfs_fop_read_iter+0x68/0x1b4
[ 3313.874708][ T4013]  vfs_read+0x1e0/0x2c8
[ 3313.874711][ T4013]  ksys_read+0x78/0xe8
[ 3313.874714][ T4013]  __arm64_sys_read+0x1c/0x2c
[ 3313.874718][ T4013]  invoke_syscall+0x58/0x114
[ 3313.874721][ T4013]  el0_svc_common+0x80/0xe0
[ 3313.874724][ T4013]  do_el0_svc+0x1c/0x28
[ 3313.874727][ T4013]  el0_svc+0x38/0x68
[ 3313.874730][ T4013]  el0t_64_sync_handler+0x68/0xbc
[ 3313.874732][ T4013]  el0t_64_sync+0x1a8/0x1ac

Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
Reviewed-by: Anish Kumar <yesanishhere@gmail.com>
Link: https://lore.kernel.org/r/20241103160527.82487-1-quic_mojha@quicinc.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/leds/led-class.c | 14 +++++++++++---
 include/linux/leds.h     |  2 +-
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 83e512df47dc..2a04ac61574d 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -31,11 +31,14 @@ static ssize_t brightness_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
 	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	unsigned int brightness;
 
-	/* no lock needed for this */
+	mutex_lock(&led_cdev->led_access);
 	led_update_brightness(led_cdev);
+	brightness = led_cdev->brightness;
+	mutex_unlock(&led_cdev->led_access);
 
-	return sprintf(buf, "%u\n", led_cdev->brightness);
+	return sprintf(buf, "%u\n", brightness);
 }
 
 static ssize_t brightness_store(struct device *dev,
@@ -71,8 +74,13 @@ static ssize_t max_brightness_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
 	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	unsigned int max_brightness;
+
+	mutex_lock(&led_cdev->led_access);
+	max_brightness = led_cdev->max_brightness;
+	mutex_unlock(&led_cdev->led_access);
 
-	return sprintf(buf, "%u\n", led_cdev->max_brightness);
+	return sprintf(buf, "%u\n", max_brightness);
 }
 static DEVICE_ATTR_RO(max_brightness);
 
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 9f34b9facaad..98f9719c924c 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -239,7 +239,7 @@ struct led_classdev {
 	struct kernfs_node	*brightness_hw_changed_kn;
 #endif
 
-	/* Ensures consistent access to the LED Flash Class device */
+	/* Ensures consistent access to the LED class device */
 	struct mutex		led_access;
 };
 
-- 
cgit v1.2.3


From 33b091c08ed85e023c21376e6f787355fd46b440 Mon Sep 17 00:00:00 2001
From: André Almeida <andrealmeid@igalia.com>
Date: Fri, 1 Nov 2024 13:42:49 -0300
Subject: libfs: Fix kernel-doc warning in generic_ci_validate_strict_name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the indentation of the return values from
generic_ci_validate_strict_name() to properly render the comment and to
address a `make htmldocs` warning:

Documentation/filesystems/api-summary:14: include/linux/fs.h:3504:
WARNING: Bullet list ends without a blank line; unexpected unindent.

Fixes: 0e152beb5aa1 ("libfs: Create the helper function generic_ci_validate_strict_name()")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/lkml/20241030162435.05425f60@canb.auug.org.au/
Signed-off-by: André Almeida <andrealmeid@igalia.com>
Link: https://lore.kernel.org/r/20241101164251.327884-2-andrealmeid@igalia.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b277369672a1..001d580af862 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3477,12 +3477,12 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
  * @name: name of the new file
  *
  * Return:
- * * True if the filename is suitable for this directory. It can be
- * true if a given name is not suitable for a strict encoding
- * directory, but the directory being used isn't strict
+ * * True: if the filename is suitable for this directory. It can be
+ *   true if a given name is not suitable for a strict encoding
+ *   directory, but the directory being used isn't strict
  * * False if the filename isn't suitable for this directory. This only
- * happens when a directory is casefolded and the filesystem is strict
- * about its encoding.
+ *   happens when a directory is casefolded and the filesystem is strict
+ *   about its encoding.
  */
 static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
 {
-- 
cgit v1.2.3


From 6140be90ec70c39fa844741ca3cc807dd0866394 Mon Sep 17 00:00:00 2001
From: Christian Göttsche <cgzones@googlemail.com>
Date: Fri, 26 Apr 2024 18:20:14 +0200
Subject: fs/xattr: add *at family syscalls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the four syscalls setxattrat(), getxattrat(), listxattrat() and
removexattrat().  Those can be used to operate on extended attributes,
especially security related ones, either relative to a pinned directory
or on a file descriptor without read access, avoiding a
/proc/<pid>/fd/<fd> detour, requiring a mounted procfs.

One use case will be setfiles(8) setting SELinux file contexts
("security.selinux") without race conditions and without a file
descriptor opened with read access requiring SELinux read permission.

Use the do_{name}at() pattern from fs/open.c.

Pass the value of the extended attribute, its length, and for
setxattrat(2) the command (XATTR_CREATE or XATTR_REPLACE) via an added
struct xattr_args to not exceed six syscall arguments and not
merging the AT_* and XATTR_* flags.

[AV: fixes by Christian Brauner folded in, the entire thing rebased on
top of {filename,file}_...xattr() primitives, treatment of empty
pathnames regularized.  As the result, AT_EMPTY_PATH+NULL handling
is cheap, so f...(2) can use it]

Signed-off-by: Christian Göttsche <cgzones@googlemail.com>
Link: https://lore.kernel.org/r/20240426162042.191916-1-cgoettsche@seltendoof.de
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
CC: x86@kernel.org
CC: linux-alpha@vger.kernel.org
CC: linux-kernel@vger.kernel.org
CC: linux-arm-kernel@lists.infradead.org
CC: linux-ia64@vger.kernel.org
CC: linux-m68k@lists.linux-m68k.org
CC: linux-mips@vger.kernel.org
CC: linux-parisc@vger.kernel.org
CC: linuxppc-dev@lists.ozlabs.org
CC: linux-s390@vger.kernel.org
CC: linux-sh@vger.kernel.org
CC: sparclinux@vger.kernel.org
CC: linux-fsdevel@vger.kernel.org
CC: audit@vger.kernel.org
CC: linux-arch@vger.kernel.org
CC: linux-api@vger.kernel.org
CC: linux-security-module@vger.kernel.org
CC: selinux@vger.kernel.org
[brauner: slight tweaks]
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/syscalls/syscall.tbl      |   4 +
 arch/arm/tools/syscall.tbl                  |   4 +
 arch/arm64/tools/syscall_32.tbl             |   4 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   4 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   4 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   4 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   4 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   4 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   4 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   4 +
 arch/s390/kernel/syscalls/syscall.tbl       |   4 +
 arch/sh/kernel/syscalls/syscall.tbl         |   4 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   4 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   4 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   4 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   4 +
 fs/xattr.c                                  | 245 ++++++++++++++++++----------
 include/asm-generic/audit_change_attr.h     |   6 +
 include/linux/syscalls.h                    |  13 ++
 include/linux/xattr.h                       |   4 +
 include/uapi/asm-generic/unistd.h           |  11 +-
 include/uapi/linux/xattr.h                  |   7 +
 scripts/syscall.tbl                         |   4 +
 23 files changed, 268 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 74720667fe09..c59d53d6d3f3 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -502,3 +502,7 @@
 570	common	lsm_set_self_attr		sys_lsm_set_self_attr
 571	common	lsm_list_modules		sys_lsm_list_modules
 572	common  mseal				sys_mseal
+573	common	setxattrat			sys_setxattrat
+574	common	getxattrat			sys_getxattrat
+575	common	listxattrat			sys_listxattrat
+576	common	removexattrat			sys_removexattrat
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 23c98203c40f..49eeb2ad8dbd 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -477,3 +477,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 9a37930d4e26..69a829912a05 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -474,3 +474,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 22a3cbd4c602..f5ed71f1910d 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -462,3 +462,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 2b81a6bd78b2..680f568b77f2 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -468,3 +468,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 953f5b7dc723..0b9b7e25b69a 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -401,3 +401,7 @@
 460	n32	lsm_set_self_attr		sys_lsm_set_self_attr
 461	n32	lsm_list_modules		sys_lsm_list_modules
 462	n32	mseal				sys_mseal
+463	n32	setxattrat			sys_setxattrat
+464	n32	getxattrat			sys_getxattrat
+465	n32	listxattrat			sys_listxattrat
+466	n32	removexattrat			sys_removexattrat
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 1464c6be6eb3..c844cd5cda62 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -377,3 +377,7 @@
 460	n64	lsm_set_self_attr		sys_lsm_set_self_attr
 461	n64	lsm_list_modules		sys_lsm_list_modules
 462	n64	mseal				sys_mseal
+463	n64	setxattrat			sys_setxattrat
+464	n64	getxattrat			sys_getxattrat
+465	n64	listxattrat			sys_listxattrat
+466	n64	removexattrat			sys_removexattrat
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 2439a2491cff..349b8aad1159 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -450,3 +450,7 @@
 460	o32	lsm_set_self_attr		sys_lsm_set_self_attr
 461	o32	lsm_list_modules		sys_lsm_list_modules
 462	o32	mseal				sys_mseal
+463	o32	setxattrat			sys_setxattrat
+464	o32	getxattrat			sys_getxattrat
+465	o32	listxattrat			sys_listxattrat
+466	o32	removexattrat			sys_removexattrat
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 66dc406b12e4..d9fc94c86965 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -461,3 +461,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index ebae8415dfbb..d8b4ab78bef0 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -553,3 +553,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 01071182763e..e9115b4d8b63 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -465,3 +465,7 @@
 460  common	lsm_set_self_attr	sys_lsm_set_self_attr		sys_lsm_set_self_attr
 461  common	lsm_list_modules	sys_lsm_list_modules		sys_lsm_list_modules
 462  common	mseal			sys_mseal			sys_mseal
+463  common	setxattrat		sys_setxattrat			sys_setxattrat
+464  common	getxattrat		sys_getxattrat			sys_getxattrat
+465  common	listxattrat		sys_listxattrat			sys_listxattrat
+466  common	removexattrat		sys_removexattrat		sys_removexattrat
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index c55fd7696d40..c8cad33bf250 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -466,3 +466,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index cfdfb3707c16..727f99d333b3 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -508,3 +508,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal 				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 534c74b14fab..4d0fb2fba7e2 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -468,3 +468,7 @@
 460	i386	lsm_set_self_attr	sys_lsm_set_self_attr
 461	i386	lsm_list_modules	sys_lsm_list_modules
 462	i386	mseal 			sys_mseal
+463	i386	setxattrat		sys_setxattrat
+464	i386	getxattrat		sys_getxattrat
+465	i386	listxattrat		sys_listxattrat
+466	i386	removexattrat		sys_removexattrat
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7093ee21c0d1..5eb708bff1c7 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -386,6 +386,10 @@
 460	common	lsm_set_self_attr	sys_lsm_set_self_attr
 461	common	lsm_list_modules	sys_lsm_list_modules
 462 	common  mseal			sys_mseal
+463	common	setxattrat		sys_setxattrat
+464	common	getxattrat		sys_getxattrat
+465	common	listxattrat		sys_listxattrat
+466	common	removexattrat		sys_removexattrat
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 67083fc1b2f5..37effc1b134e 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -433,3 +433,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal 				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/fs/xattr.c b/fs/xattr.c
index b76911b23293..deb336b821c9 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -676,69 +676,90 @@ out:
 	return error;
 }
 
-static int path_setxattr(const char __user *pathname,
-			 const char __user *name, const void __user *value,
-			 size_t size, int flags, unsigned int lookup_flags)
+static int path_setxattrat(int dfd, const char __user *pathname,
+			   unsigned int at_flags, const char __user *name,
+			   const void __user *value, size_t size, int flags)
 {
 	struct xattr_name kname;
 	struct kernel_xattr_ctx ctx = {
-		.cvalue   = value,
-		.kvalue   = NULL,
-		.size     = size,
-		.kname    = &kname,
-		.flags    = flags,
+		.cvalue	= value,
+		.kvalue	= NULL,
+		.size	= size,
+		.kname	= &kname,
+		.flags	= flags,
 	};
+	struct filename *filename;
+	unsigned int lookup_flags = 0;
 	int error;
 
+	if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+		lookup_flags = LOOKUP_FOLLOW;
+
 	error = setxattr_copy(name, &ctx);
 	if (error)
 		return error;
 
-	error = filename_setxattr(AT_FDCWD, getname(pathname), lookup_flags,
-				  &ctx);
+	filename = getname_maybe_null(pathname, at_flags);
+	if (!filename) {
+		CLASS(fd, f)(dfd);
+		if (fd_empty(f))
+			error = -EBADF;
+		else
+			error = file_setxattr(fd_file(f), &ctx);
+	} else {
+		error = filename_setxattr(dfd, filename, lookup_flags, &ctx);
+	}
 	kvfree(ctx.kvalue);
 	return error;
 }
 
+SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
+		const char __user *, name, const struct xattr_args __user *, uargs,
+		size_t, usize)
+{
+	struct xattr_args args = {};
+	int error;
+
+	BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
+	BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);
+
+	if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
+		return -EINVAL;
+	if (usize > PAGE_SIZE)
+		return -E2BIG;
+
+	error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+	if (error)
+		return error;
+
+	return path_setxattrat(dfd, pathname, at_flags, name,
+			       u64_to_user_ptr(args.value), args.size,
+			       args.flags);
+}
+
 SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
 		const char __user *, name, const void __user *, value,
 		size_t, size, int, flags)
 {
-	return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW);
+	return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags);
 }
 
 SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
 		const char __user *, name, const void __user *, value,
 		size_t, size, int, flags)
 {
-	return path_setxattr(pathname, name, value, size, flags, 0);
+	return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
+			       value, size, flags);
 }
 
 SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
 		const void __user *,value, size_t, size, int, flags)
 {
-	struct xattr_name kname;
-	struct kernel_xattr_ctx ctx = {
-		.cvalue   = value,
-		.kvalue   = NULL,
-		.size     = size,
-		.kname    = &kname,
-		.flags    = flags,
-	};
-	int error;
-
-	CLASS(fd, f)(fd);
-
-	if (fd_empty(f))
-		return -EBADF;
-
-	error = setxattr_copy(name, &ctx);
-	if (error)
-		return error;
-
-	error = file_setxattr(fd_file(f), &ctx);
-	kvfree(ctx.kvalue);
-	return error;
+	return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name,
+			       value, size, flags);
 }
 
 /*
@@ -804,11 +825,10 @@ out:
 	return error;
 }
 
-static ssize_t path_getxattr(const char __user *pathname,
-			     const char __user *name, void __user *value,
-			     size_t size, unsigned int lookup_flags)
+static ssize_t path_getxattrat(int dfd, const char __user *pathname,
+			       unsigned int at_flags, const char __user *name,
+			       void __user *value, size_t size)
 {
-	ssize_t error;
 	struct xattr_name kname;
 	struct kernel_xattr_ctx ctx = {
 		.value    = value,
@@ -816,44 +836,72 @@ static ssize_t path_getxattr(const char __user *pathname,
 		.kname    = &kname,
 		.flags    = 0,
 	};
+	struct filename *filename;
+	ssize_t error;
+
+	if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
 
 	error = import_xattr_name(&kname, name);
 	if (error)
 		return error;
-	return filename_getxattr(AT_FDCWD, getname(pathname), lookup_flags, &ctx);
+
+	filename = getname_maybe_null(pathname, at_flags);
+	if (!filename) {
+		CLASS(fd, f)(dfd);
+		if (fd_empty(f))
+			return -EBADF;
+		return file_getxattr(fd_file(f), &ctx);
+	} else {
+		int lookup_flags = 0;
+		if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+			lookup_flags = LOOKUP_FOLLOW;
+		return filename_getxattr(dfd, filename, lookup_flags, &ctx);
+	}
+}
+
+SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
+		const char __user *, name, struct xattr_args __user *, uargs, size_t, usize)
+{
+	struct xattr_args args = {};
+	int error;
+
+	BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
+	BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);
+
+	if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
+		return -EINVAL;
+	if (usize > PAGE_SIZE)
+		return -E2BIG;
+
+	error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+	if (error)
+		return error;
+
+	if (args.flags != 0)
+		return -EINVAL;
+
+	return path_getxattrat(dfd, pathname, at_flags, name,
+			       u64_to_user_ptr(args.value), args.size);
 }
 
 SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
 		const char __user *, name, void __user *, value, size_t, size)
 {
-	return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW);
+	return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size);
 }
 
 SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
 		const char __user *, name, void __user *, value, size_t, size)
 {
-	return path_getxattr(pathname, name, value, size, 0);
+	return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
+			       value, size);
 }
 
 SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
 		void __user *, value, size_t, size)
 {
-	ssize_t error;
-	struct xattr_name kname;
-	struct kernel_xattr_ctx ctx = {
-		.value    = value,
-		.size     = size,
-		.kname    = &kname,
-		.flags    = 0,
-	};
-	CLASS(fd, f)(fd);
-
-	if (fd_empty(f))
-		return -EBADF;
-	error = import_xattr_name(&kname, name);
-	if (error)
-		return error;
-	return file_getxattr(fd_file(f), &ctx);
+	return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size);
 }
 
 /*
@@ -918,32 +966,50 @@ out:
 	return error;
 }
 
-static ssize_t path_listxattr(const char __user *pathname, char __user *list,
-			      size_t size, unsigned int lookup_flags)
+static ssize_t path_listxattrat(int dfd, const char __user *pathname,
+				unsigned int at_flags, char __user *list,
+				size_t size)
+{
+	struct filename *filename;
+	int lookup_flags;
+
+	if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	filename = getname_maybe_null(pathname, at_flags);
+	if (!filename) {
+		CLASS(fd, f)(dfd);
+		if (fd_empty(f))
+			return -EBADF;
+		return file_listxattr(fd_file(f), list, size);
+	}
+
+	lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+	return filename_listxattr(dfd, filename, lookup_flags, list, size);
+}
+
+SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname,
+		unsigned int, at_flags,
+		char __user *, list, size_t, size)
 {
-	return filename_listxattr(AT_FDCWD, getname(pathname), lookup_flags,
-				  list, size);
+	return path_listxattrat(dfd, pathname, at_flags, list, size);
 }
 
 SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
 		size_t, size)
 {
-	return path_listxattr(pathname, list, size, LOOKUP_FOLLOW);
+	return path_listxattrat(AT_FDCWD, pathname, 0, list, size);
 }
 
 SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
 		size_t, size)
 {
-	return path_listxattr(pathname, list, size, 0);
+	return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size);
 }
 
 SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
 {
-	CLASS(fd, f)(fd);
-
-	if (fd_empty(f))
-		return -EBADF;
-	return file_listxattr(fd_file(f), list, size);
+	return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size);
 }
 
 /*
@@ -996,44 +1062,53 @@ out:
 	return error;
 }
 
-static int path_removexattr(const char __user *pathname,
-			    const char __user *name, unsigned int lookup_flags)
+static int path_removexattrat(int dfd, const char __user *pathname,
+			      unsigned int at_flags, const char __user *name)
 {
 	struct xattr_name kname;
+	struct filename *filename;
+	unsigned int lookup_flags;
 	int error;
 
+	if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
 	error = import_xattr_name(&kname, name);
 	if (error)
 		return error;
-	return filename_removexattr(AT_FDCWD, getname(pathname), lookup_flags,
-				    &kname);
+
+	filename = getname_maybe_null(pathname, at_flags);
+	if (!filename) {
+		CLASS(fd, f)(dfd);
+		if (fd_empty(f))
+			return -EBADF;
+		return file_removexattr(fd_file(f), &kname);
+	}
+	lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+	return filename_removexattr(dfd, filename, lookup_flags, &kname);
+}
+
+SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname,
+		unsigned int, at_flags, const char __user *, name)
+{
+	return path_removexattrat(dfd, pathname, at_flags, name);
 }
 
 SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
 		const char __user *, name)
 {
-	return path_removexattr(pathname, name, LOOKUP_FOLLOW);
+	return path_removexattrat(AT_FDCWD, pathname, 0, name);
 }
 
 SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
 		const char __user *, name)
 {
-	return path_removexattr(pathname, name, 0);
+	return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name);
 }
 
 SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 {
-	CLASS(fd, f)(fd);
-	struct xattr_name kname;
-	int error;
-
-	if (fd_empty(f))
-		return -EBADF;
-
-	error = import_xattr_name(&kname, name);
-	if (error)
-		return error;
-	return file_removexattr(fd_file(f), &kname);
+	return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name);
 }
 
 int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
diff --git a/include/asm-generic/audit_change_attr.h b/include/asm-generic/audit_change_attr.h
index 331670807cf0..cc840537885f 100644
--- a/include/asm-generic/audit_change_attr.h
+++ b/include/asm-generic/audit_change_attr.h
@@ -11,9 +11,15 @@ __NR_lchown,
 __NR_fchown,
 #endif
 __NR_setxattr,
+#ifdef __NR_setxattrat
+__NR_setxattrat,
+#endif
 __NR_lsetxattr,
 __NR_fsetxattr,
 __NR_removexattr,
+#ifdef __NR_removexattrat
+__NR_removexattrat,
+#endif
 __NR_lremovexattr,
 __NR_fremovexattr,
 #ifdef __NR_fchownat
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 5758104921e6..c6333204d451 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -77,6 +77,7 @@ struct cachestat_range;
 struct cachestat;
 struct statmount;
 struct mnt_id_req;
+struct xattr_args;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -338,23 +339,35 @@ asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op,
 				void __user *arg, unsigned int nr_args);
 asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
 			     const void __user *value, size_t size, int flags);
+asmlinkage long sys_setxattrat(int dfd, const char __user *path, unsigned int at_flags,
+			       const char __user *name,
+			       const struct xattr_args __user *args, size_t size);
 asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name,
 			      const void __user *value, size_t size, int flags);
 asmlinkage long sys_fsetxattr(int fd, const char __user *name,
 			      const void __user *value, size_t size, int flags);
 asmlinkage long sys_getxattr(const char __user *path, const char __user *name,
 			     void __user *value, size_t size);
+asmlinkage long sys_getxattrat(int dfd, const char __user *path, unsigned int at_flags,
+			       const char __user *name,
+			       struct xattr_args __user *args, size_t size);
 asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name,
 			      void __user *value, size_t size);
 asmlinkage long sys_fgetxattr(int fd, const char __user *name,
 			      void __user *value, size_t size);
 asmlinkage long sys_listxattr(const char __user *path, char __user *list,
 			      size_t size);
+asmlinkage long sys_listxattrat(int dfd, const char __user *path,
+				unsigned int at_flags,
+				char __user *list, size_t size);
 asmlinkage long sys_llistxattr(const char __user *path, char __user *list,
 			       size_t size);
 asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size);
 asmlinkage long sys_removexattr(const char __user *path,
 				const char __user *name);
+asmlinkage long sys_removexattrat(int dfd, const char __user *path,
+				  unsigned int at_flags,
+				  const char __user *name);
 asmlinkage long sys_lremovexattr(const char __user *path,
 				 const char __user *name);
 asmlinkage long sys_fremovexattr(int fd, const char __user *name);
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index d20051865800..86b0d47984a1 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -19,6 +19,10 @@
 #include <linux/user_namespace.h>
 #include <uapi/linux/xattr.h>
 
+/* List of all open_how "versions". */
+#define XATTR_ARGS_SIZE_VER0	16 /* sizeof first published struct */
+#define XATTR_ARGS_SIZE_LATEST	XATTR_ARGS_SIZE_VER0
+
 struct inode;
 struct dentry;
 
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 5bf6148cac2b..88dc393c2bca 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -841,8 +841,17 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
 #define __NR_mseal 462
 __SYSCALL(__NR_mseal, sys_mseal)
 
+#define __NR_setxattrat 463
+__SYSCALL(__NR_setxattrat, sys_setxattrat)
+#define __NR_getxattrat 464
+__SYSCALL(__NR_getxattrat, sys_getxattrat)
+#define __NR_listxattrat 465
+__SYSCALL(__NR_listxattrat, sys_listxattrat)
+#define __NR_removexattrat 466
+__SYSCALL(__NR_removexattrat, sys_removexattrat)
+
 #undef __NR_syscalls
-#define __NR_syscalls 463
+#define __NR_syscalls 467
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
index 9463db2dfa9d..9854f9cff3c6 100644
--- a/include/uapi/linux/xattr.h
+++ b/include/uapi/linux/xattr.h
@@ -11,6 +11,7 @@
 */
 
 #include <linux/libc-compat.h>
+#include <linux/types.h>
 
 #ifndef _UAPI_LINUX_XATTR_H
 #define _UAPI_LINUX_XATTR_H
@@ -20,6 +21,12 @@
 
 #define XATTR_CREATE	0x1	/* set value, fail if attr already exists */
 #define XATTR_REPLACE	0x2	/* set value, fail if attr does not exist */
+
+struct xattr_args {
+	__aligned_u64 __user value;
+	__u32 size;
+	__u32 flags;
+};
 #endif
 
 /* Namespaces */
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index 845e24eb372e..ebbdb3c42e9f 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -403,3 +403,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
-- 
cgit v1.2.3


From b79276dcac9124a79c8cf7cc8fbdd3d4c3c9a7c7 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 4 Nov 2024 16:28:55 -0600
Subject: ACPI: processor: Move arch_init_invariance_cppc() call later

arch_init_invariance_cppc() is called at the end of
acpi_cppc_processor_probe() in order to configure frequency invariance
based upon the values from _CPC.

This however doesn't work on AMD CPPC shared memory designs that have
AMD preferred cores enabled because _CPC needs to be analyzed from all
cores to judge if preferred cores are enabled.

This issue manifests to users as a warning since commit 21fb59ab4b97
("ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn"):
```
Could not retrieve highest performance (-19)
```

However the warning isn't the cause of this, it was actually
commit 279f838a61f9 ("x86/amd: Detect preferred cores in
amd_get_boost_ratio_numerator()") which exposed the issue.

To fix this problem, change arch_init_invariance_cppc() into a new weak
symbol that is called at the end of acpi_processor_driver_init().
Each architecture that supports it can declare the symbol to override
the weak one.

Define it for x86, in arch/x86/kernel/acpi/cppc.c, and for all of the
architectures using the generic arch_topology.c code.

Fixes: 279f838a61f9 ("x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()")
Reported-by: Ivan Shapovalov <intelfx@intelfx.name>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219431
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Link: https://patch.msgid.link/20241104222855.3959267-1-superm1@kernel.org
[ rjw: Changelog edit ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/arm64/include/asm/topology.h | 4 ----
 arch/x86/include/asm/topology.h   | 5 -----
 arch/x86/kernel/acpi/cppc.c       | 7 ++++++-
 drivers/acpi/cppc_acpi.c          | 6 ------
 drivers/acpi/processor_driver.c   | 9 +++++++++
 drivers/base/arch_topology.c      | 6 +++++-
 include/acpi/processor.h          | 2 ++
 include/linux/arch_topology.h     | 4 ----
 8 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 5fc3af9f8f29..341174bf9106 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -26,10 +26,6 @@ void update_freq_counters_refs(void);
 #define arch_scale_freq_invariant topology_scale_freq_invariant
 #define arch_scale_freq_ref topology_get_freq_ref
 
-#ifdef CONFIG_ACPI_CPPC_LIB
-#define arch_init_invariance_cppc topology_init_cpu_capacity_cppc
-#endif
-
 /* Replace task scheduler's default cpu-invariant accounting */
 #define arch_scale_cpu_capacity topology_get_cpu_scale
 
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index aef70336d624..92f3664dd933 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -305,9 +305,4 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled
 extern void arch_scale_freq_tick(void);
 #define arch_scale_freq_tick arch_scale_freq_tick
 
-#ifdef CONFIG_ACPI_CPPC_LIB
-void init_freq_invariance_cppc(void);
-#define arch_init_invariance_cppc init_freq_invariance_cppc
-#endif
-
 #endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 956984054bf3..aab9d0570841 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -110,7 +110,7 @@ static void amd_set_max_freq_ratio(void)
 
 static DEFINE_MUTEX(freq_invariance_lock);
 
-void init_freq_invariance_cppc(void)
+static inline void init_freq_invariance_cppc(void)
 {
 	static bool init_done;
 
@@ -127,6 +127,11 @@ void init_freq_invariance_cppc(void)
 	mutex_unlock(&freq_invariance_lock);
 }
 
+void acpi_processor_init_invariance_cppc(void)
+{
+	init_freq_invariance_cppc();
+}
+
 /*
  * Get the highest performance register value.
  * @cpu: CPU from which to get highest performance.
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 1a40f0514eaa..5c0cc7aae872 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -671,10 +671,6 @@ static int pcc_data_alloc(int pcc_ss_id)
  *  )
  */
 
-#ifndef arch_init_invariance_cppc
-static inline void arch_init_invariance_cppc(void) { }
-#endif
-
 /**
  * acpi_cppc_processor_probe - Search for per CPU _CPC objects.
  * @pr: Ptr to acpi_processor containing this CPU's logical ID.
@@ -905,8 +901,6 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr)
 		goto out_free;
 	}
 
-	arch_init_invariance_cppc();
-
 	kfree(output.pointer);
 	return 0;
 
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index cb52dd000b95..3b281bc1e73c 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -237,6 +237,9 @@ static struct notifier_block acpi_processor_notifier_block = {
 	.notifier_call = acpi_processor_notifier,
 };
 
+void __weak acpi_processor_init_invariance_cppc(void)
+{ }
+
 /*
  * We keep the driver loaded even when ACPI is not running.
  * This is needed for the powernow-k8 driver, that works even without
@@ -270,6 +273,12 @@ static int __init acpi_processor_driver_init(void)
 				  NULL, acpi_soft_cpu_dead);
 
 	acpi_processor_throttling_init();
+
+	/*
+	 * Frequency invariance calculations on AMD platforms can't be run until
+	 * after acpi_cppc_processor_probe() has been called for all online CPUs
+	 */
+	acpi_processor_init_invariance_cppc();
 	return 0;
 err:
 	driver_unregister(&acpi_processor_driver);
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 75fcb75d5515..3ebe77566788 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -366,7 +366,7 @@ void __weak freq_inv_set_max_ratio(int cpu, u64 max_rate)
 #ifdef CONFIG_ACPI_CPPC_LIB
 #include <acpi/cppc_acpi.h>
 
-void topology_init_cpu_capacity_cppc(void)
+static inline void topology_init_cpu_capacity_cppc(void)
 {
 	u64 capacity, capacity_scale = 0;
 	struct cppc_perf_caps perf_caps;
@@ -417,6 +417,10 @@ void topology_init_cpu_capacity_cppc(void)
 exit:
 	free_raw_capacity();
 }
+void acpi_processor_init_invariance_cppc(void)
+{
+	topology_init_cpu_capacity_cppc();
+}
 #endif
 
 #ifdef CONFIG_CPU_FREQ
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index e6f6074eadbf..a17e97e634a6 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -465,4 +465,6 @@ extern int acpi_processor_ffh_lpi_probe(unsigned int cpu);
 extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi);
 #endif
 
+void acpi_processor_init_invariance_cppc(void);
+
 #endif
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index b721f360d759..4a952c4885ed 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -11,10 +11,6 @@
 void topology_normalize_cpu_scale(void);
 int topology_update_cpu_topology(void);
 
-#ifdef CONFIG_ACPI_CPPC_LIB
-void topology_init_cpu_capacity_cppc(void);
-#endif
-
 struct device_node;
 bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu);
 
-- 
cgit v1.2.3


From 6f94cbc29adacc15007c5a16295052e674099282 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 3 Nov 2024 08:46:07 -0700
Subject: io_uring/rsrc: split io_kiocb node type assignments

Currently the io_rsrc_node assignment in io_kiocb is an array of two
pointers, as two nodes may be assigned to a request - one file node,
and one buffer node. However, the buffer node can co-exist with the
provided buffers, as currently it's not supported to use both provided
and registered buffers at the same time.

This crucially brings struct io_kiocb down to 4 cache lines again, as
before it spilled into the 5th cacheline.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  7 ++++++-
 io_uring/io_uring.c            |  6 +++---
 io_uring/net.c                 |  3 ++-
 io_uring/nop.c                 |  3 ++-
 io_uring/notif.c               |  4 ++--
 io_uring/rsrc.h                | 16 ++++++++++------
 io_uring/rw.c                  |  3 ++-
 io_uring/uring_cmd.c           |  4 ++--
 8 files changed, 29 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d52fec533c51..01e7fb9fcfe2 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -475,6 +475,7 @@ enum {
 	REQ_F_BL_EMPTY_BIT,
 	REQ_F_BL_NO_RECYCLE_BIT,
 	REQ_F_BUFFERS_COMMIT_BIT,
+	REQ_F_BUF_NODE_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -553,6 +554,8 @@ enum {
 	REQ_F_BL_NO_RECYCLE	= IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
 	/* buffer ring head needs incrementing on put */
 	REQ_F_BUFFERS_COMMIT	= IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
+	/* buf node is valid */
+	REQ_F_BUF_NODE		= IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -633,6 +636,8 @@ struct io_kiocb {
 		 * REQ_F_BUFFER_RING is set.
 		 */
 		struct io_buffer_list	*buf_list;
+
+		struct io_rsrc_node	*buf_node;
 	};
 
 	union {
@@ -642,7 +647,7 @@ struct io_kiocb {
 		__poll_t apoll_events;
 	};
 
-	struct io_rsrc_node		*rsrc_nodes[2];
+	struct io_rsrc_node		*file_node;
 
 	atomic_t			refs;
 	bool				cancel_seq_set;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index f08ea7fd5998..5bab8a3b0456 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -948,8 +948,8 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
 static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
 {
 	req->ctx = ctx;
-	req->rsrc_nodes[IORING_RSRC_FILE] = NULL;
-	req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
+	req->buf_node = NULL;
+	req->file_node = NULL;
 	req->link = NULL;
 	req->async_data = NULL;
 	/* not necessary, but safer to zero */
@@ -1882,7 +1882,7 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 	io_ring_submit_lock(ctx, issue_flags);
 	node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
 	if (node) {
-		io_req_assign_rsrc_node(req, node);
+		io_req_assign_rsrc_node(&req->file_node, node);
 		req->flags |= io_slot_flags(node);
 		file = io_slot_file(node);
 	}
diff --git a/io_uring/net.c b/io_uring/net.c
index 2f7b334ed708..2ccc2b409431 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1348,7 +1348,8 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 		io_ring_submit_lock(ctx, issue_flags);
 		node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index);
 		if (node) {
-			io_req_assign_rsrc_node(sr->notif, node);
+			io_req_assign_rsrc_node(&sr->notif->buf_node, node);
+			sr->notif->flags |= REQ_F_BUF_NODE;
 			ret = 0;
 		}
 		io_ring_submit_unlock(ctx, issue_flags);
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 149dbdc53607..bc22bcc739f3 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -67,7 +67,8 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 		io_ring_submit_lock(ctx, issue_flags);
 		node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer);
 		if (node) {
-			io_req_assign_rsrc_node(req, node);
+			io_req_assign_rsrc_node(&req->buf_node, node);
+			req->flags |= REQ_F_BUF_NODE;
 			ret = 0;
 		}
 		io_ring_submit_unlock(ctx, issue_flags);
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 4f02e969cf08..8dfbb0bd8e4d 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -117,8 +117,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
 	notif->file = NULL;
 	notif->task = current;
 	io_get_task_refs(1);
-	notif->rsrc_nodes[IORING_RSRC_FILE] = NULL;
-	notif->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
+	notif->file_node = NULL;
+	notif->buf_node = NULL;
 
 	nd = io_notif_to_data(notif);
 	nd->zc_report = false;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 9a8fac31fa49..bc3a863b14bb 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -95,10 +95,14 @@ static inline bool io_reset_rsrc_node(struct io_rsrc_data *data, int index)
 
 static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
 {
-	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_FILE]);
-	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_BUFFER]);
-	req->rsrc_nodes[IORING_RSRC_FILE] = NULL;
-	req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
+	if (req->file_node) {
+		io_put_rsrc_node(req->file_node);
+		req->file_node = NULL;
+	}
+	if (req->flags & REQ_F_BUF_NODE) {
+		io_put_rsrc_node(req->buf_node);
+		req->buf_node = NULL;
+	}
 }
 
 static inline struct io_ring_ctx *io_rsrc_node_ctx(struct io_rsrc_node *node)
@@ -111,11 +115,11 @@ static inline int io_rsrc_node_type(struct io_rsrc_node *node)
 	return node->ctx_ptr & IORING_RSRC_TYPE_MASK;
 }
 
-static inline void io_req_assign_rsrc_node(struct io_kiocb *req,
+static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node,
 					   struct io_rsrc_node *node)
 {
 	node->refs++;
-	req->rsrc_nodes[io_rsrc_node_type(node)] = node;
+	*dst_node = node;
 }
 
 int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1ea6be2ccc90..144730344c0f 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -341,7 +341,8 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
 	if (!node)
 		return -EFAULT;
-	io_req_assign_rsrc_node(req, node);
+	io_req_assign_rsrc_node(&req->buf_node, node);
+	req->flags |= REQ_F_BUF_NODE;
 
 	io = req->async_data;
 	ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 535909a38e76..88a73d21fc0b 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -219,7 +219,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		 * being called. This prevents destruction of the mapped buffer
 		 * we'll need at actual import time.
 		 */
-		io_req_assign_rsrc_node(req, node);
+		io_req_assign_rsrc_node(&req->buf_node, node);
 	}
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 
@@ -275,7 +275,7 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
 {
 	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
-	struct io_rsrc_node *node = req->rsrc_nodes[IORING_RSRC_BUFFER];
+	struct io_rsrc_node *node = req->buf_node;
 
 	/* Must have had rsrc_node assigned at prep time */
 	if (node)
-- 
cgit v1.2.3


From b6f58a3f4aa8dba424356c7a69388a81f4459300 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 3 Nov 2024 10:23:38 -0700
Subject: io_uring: move struct io_kiocb from task_struct to io_uring_task

Rather than store the task_struct itself in struct io_kiocb, store
the io_uring specific task_struct. The life times are the same in terms
of io_uring, and this avoids doing some dereferences through the
task_struct. For the hot path of putting local task references, we can
deref req->tctx instead, which we'll need anyway in that function
regardless of whether it's local or remote references.

This is mostly straight forward, except the original task PF_EXITING
check needs a bit of tweaking. task_work is _always_ run from the
originating task, except in the fallback case, where it's run from a
kernel thread. Replace the potentially racy (in case of fallback work)
checks for req->task->flags with current->flags. It's either the still
the original task, in which case PF_EXITING will be sane, or it has
PF_KTHREAD set, in which case it's fallback work. Both cases should
prevent moving forward with the given request.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring/cmd.h   |  2 +-
 include/linux/io_uring_types.h |  3 ++-
 io_uring/cancel.c              |  2 +-
 io_uring/fdinfo.c              |  2 +-
 io_uring/io_uring.c            | 34 +++++++++++++++-------------------
 io_uring/io_uring.h            | 13 +++++++++++++
 io_uring/msg_ring.c            |  4 ++--
 io_uring/notif.c               |  4 ++--
 io_uring/poll.c                |  3 +--
 io_uring/rw.c                  |  2 +-
 io_uring/tctx.c                |  1 +
 io_uring/timeout.c             | 10 ++++++----
 io_uring/uring_cmd.c           |  2 +-
 io_uring/waitid.c              |  2 +-
 14 files changed, 48 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index c189d36ad55e..578a3fdf5c71 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -110,7 +110,7 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
 
 static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
 {
-	return cmd_to_io_kiocb(cmd)->task;
+	return cmd_to_io_kiocb(cmd)->tctx->task;
 }
 
 #endif /* _LINUX_IO_URING_CMD_H */
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 01e7fb9fcfe2..fba2988accc3 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -84,6 +84,7 @@ struct io_uring_task {
 	/* submission side */
 	int				cached_refs;
 	const struct io_ring_ctx 	*last;
+	struct task_struct		*task;
 	struct io_wq			*io_wq;
 	struct file			*registered_rings[IO_RINGFD_REG_MAX];
 
@@ -625,7 +626,7 @@ struct io_kiocb {
 	struct io_cqe			cqe;
 
 	struct io_ring_ctx		*ctx;
-	struct task_struct		*task;
+	struct io_uring_task		*tctx;
 
 	union {
 		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index bbca5cb69cb5..484193567839 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -205,7 +205,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 		.opcode	= cancel->opcode,
 		.seq	= atomic_inc_return(&req->ctx->cancel_seq),
 	};
-	struct io_uring_task *tctx = req->task->io_uring;
+	struct io_uring_task *tctx = req->tctx;
 	int ret;
 
 	if (cd.flags & IORING_ASYNC_CANCEL_FD) {
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 8da0d9e4533a..efbec34ccb18 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -203,7 +203,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 
 		hlist_for_each_entry(req, &hb->list, hash_node)
 			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
-					task_work_pending(req->task));
+					task_work_pending(req->tctx->task));
 	}
 
 	if (has_lock)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 43afd9da7d07..7c1ca36b117b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -206,7 +206,7 @@ bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx,
 {
 	bool matched;
 
-	if (tctx && head->task->io_uring != tctx)
+	if (tctx && head->tctx != tctx)
 		return false;
 	if (cancel_all)
 		return true;
@@ -407,11 +407,8 @@ static void io_clean_op(struct io_kiocb *req)
 		kfree(req->apoll);
 		req->apoll = NULL;
 	}
-	if (req->flags & REQ_F_INFLIGHT) {
-		struct io_uring_task *tctx = req->task->io_uring;
-
-		atomic_dec(&tctx->inflight_tracked);
-	}
+	if (req->flags & REQ_F_INFLIGHT)
+		atomic_dec(&req->tctx->inflight_tracked);
 	if (req->flags & REQ_F_CREDS)
 		put_cred(req->creds);
 	if (req->flags & REQ_F_ASYNC_DATA) {
@@ -425,7 +422,7 @@ static inline void io_req_track_inflight(struct io_kiocb *req)
 {
 	if (!(req->flags & REQ_F_INFLIGHT)) {
 		req->flags |= REQ_F_INFLIGHT;
-		atomic_inc(&req->task->io_uring->inflight_tracked);
+		atomic_inc(&req->tctx->inflight_tracked);
 	}
 }
 
@@ -514,7 +511,7 @@ static void io_prep_async_link(struct io_kiocb *req)
 static void io_queue_iowq(struct io_kiocb *req)
 {
 	struct io_kiocb *link = io_prep_linked_timeout(req);
-	struct io_uring_task *tctx = req->task->io_uring;
+	struct io_uring_task *tctx = req->tctx;
 
 	BUG_ON(!tctx);
 	BUG_ON(!tctx->io_wq);
@@ -529,7 +526,7 @@ static void io_queue_iowq(struct io_kiocb *req)
 	 * procedure rather than attempt to run this request (or create a new
 	 * worker for it).
 	 */
-	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
+	if (WARN_ON_ONCE(!same_thread_group(tctx->task, current)))
 		atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags);
 
 	trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
@@ -678,17 +675,17 @@ static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
 }
 
 /* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task)
+static inline void io_put_task(struct io_kiocb *req)
 {
-	struct io_uring_task *tctx = task->io_uring;
+	struct io_uring_task *tctx = req->tctx;
 
-	if (likely(task == current)) {
+	if (likely(tctx->task == current)) {
 		tctx->cached_refs++;
 	} else {
 		percpu_counter_sub(&tctx->inflight, 1);
 		if (unlikely(atomic_read(&tctx->in_cancel)))
 			wake_up(&tctx->wait);
-		put_task_struct(task);
+		put_task_struct(tctx->task);
 	}
 }
 
@@ -1207,7 +1204,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req,
 
 static void io_req_normal_work_add(struct io_kiocb *req)
 {
-	struct io_uring_task *tctx = req->task->io_uring;
+	struct io_uring_task *tctx = req->tctx;
 	struct io_ring_ctx *ctx = req->ctx;
 
 	/* task_work already pending, we're done */
@@ -1226,7 +1223,7 @@ static void io_req_normal_work_add(struct io_kiocb *req)
 		return;
 	}
 
-	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
+	if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method)))
 		return;
 
 	io_fallback_tw(tctx, false);
@@ -1343,8 +1340,7 @@ static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts)
 void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	io_tw_lock(req->ctx, ts);
-	/* req->task == current here, checking PF_EXITING is safe */
-	if (unlikely(req->task->flags & PF_EXITING))
+	if (unlikely(io_should_terminate_tw()))
 		io_req_defer_failed(req, -EFAULT);
 	else if (req->flags & REQ_F_FORCE_ASYNC)
 		io_queue_iowq(req);
@@ -1403,7 +1399,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
 		}
 		io_put_file(req);
 		io_req_put_rsrc_nodes(req);
-		io_put_task(req->task);
+		io_put_task(req);
 
 		node = req->comp_list.next;
 		io_req_add_to_cache(req, ctx);
@@ -2019,7 +2015,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->flags = (__force io_req_flags_t) sqe_flags;
 	req->cqe.user_data = READ_ONCE(sqe->user_data);
 	req->file = NULL;
-	req->task = current;
+	req->tctx = current->io_uring;
 	req->cancel_seq_set = false;
 
 	if (unlikely(opcode >= IORING_OP_LAST)) {
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 17ffdb1e41c5..702c8e987430 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -426,6 +426,19 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
 		      ctx->submitter_task == current);
 }
 
+/*
+ * Terminate the request if either of these conditions are true:
+ *
+ * 1) It's being executed by the original task, but that task is marked
+ *    with PF_EXITING as it's exiting.
+ * 2) PF_KTHREAD is set, in which case the invoker of the task_work is
+ *    our fallback task_work.
+ */
+static inline bool io_should_terminate_tw(void)
+{
+	return current->flags & (PF_KTHREAD | PF_EXITING);
+}
+
 static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
 {
 	io_req_set_res(req, res, 0);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 99af39e1d0fb..e63af34004b7 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -89,8 +89,8 @@ static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts)
 static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			      int res, u32 cflags, u64 user_data)
 {
-	req->task = READ_ONCE(ctx->submitter_task);
-	if (!req->task) {
+	req->tctx = READ_ONCE(ctx->submitter_task->io_uring);
+	if (!req->tctx) {
 		kmem_cache_free(req_cachep, req);
 		return -EOWNERDEAD;
 	}
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 8dfbb0bd8e4d..ee3a33510b3c 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -89,7 +89,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg)
 
 	/* make sure all noifications can be finished in the same task_work */
 	if (unlikely(notif->ctx != prev_notif->ctx ||
-		     notif->task != prev_notif->task))
+		     notif->tctx != prev_notif->tctx))
 		return -EEXIST;
 
 	nd->head = prev_nd->head;
@@ -115,7 +115,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
 	notif->opcode = IORING_OP_NOP;
 	notif->flags = 0;
 	notif->file = NULL;
-	notif->task = current;
+	notif->tctx = current->io_uring;
 	io_get_task_refs(1);
 	notif->file_node = NULL;
 	notif->buf_node = NULL;
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 7db3010b5733..bced9edd5233 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -224,8 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	int v;
 
-	/* req->task == current here, checking PF_EXITING is safe */
-	if (unlikely(req->task->flags & PF_EXITING))
+	if (unlikely(io_should_terminate_tw()))
 		return -ECANCELED;
 
 	do {
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 144730344c0f..e368b9afde03 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -435,7 +435,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
 	 * Play it safe and assume not safe to re-import and reissue if we're
 	 * not in the original thread group (or in task context).
 	 */
-	if (!same_thread_group(req->task, current) || !in_task())
+	if (!same_thread_group(req->tctx->task, current) || !in_task())
 		return false;
 	return true;
 }
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index c043fe93a3f2..503f3ff8bc4f 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -81,6 +81,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 		return ret;
 	}
 
+	tctx->task = task;
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
 	atomic_set(&tctx->in_cancel, 0);
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 18286cb53a69..5b12bd6a804c 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -300,16 +300,18 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t
 {
 	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_kiocb *prev = timeout->prev;
-	int ret = -ENOENT;
+	int ret;
 
 	if (prev) {
-		if (!(req->task->flags & PF_EXITING)) {
+		if (!io_should_terminate_tw()) {
 			struct io_cancel_data cd = {
 				.ctx		= req->ctx,
 				.data		= prev->cqe.user_data,
 			};
 
-			ret = io_try_cancel(req->task->io_uring, &cd, 0);
+			ret = io_try_cancel(req->tctx, &cd, 0);
+		} else {
+			ret = -ECANCELED;
 		}
 		io_req_set_res(req, ret ?: -ETIME, 0);
 		io_req_task_complete(req, ts);
@@ -643,7 +645,7 @@ static bool io_match_task(struct io_kiocb *head, struct io_uring_task *tctx,
 {
 	struct io_kiocb *req;
 
-	if (tctx && head->task->io_uring != tctx)
+	if (tctx && head->tctx != tctx)
 		return false;
 	if (cancel_all)
 		return true;
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index f88fbc9869d0..40b8b777ba12 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -61,7 +61,7 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
 				struct io_uring_cmd);
 		struct file *file = req->file;
 
-		if (!cancel_all && req->task->io_uring != tctx)
+		if (!cancel_all && req->tctx != tctx)
 			continue;
 
 		if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index 9b7c23f96c47..daef5dd644f0 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -331,7 +331,7 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
 	hlist_add_head(&req->hash_node, &ctx->waitid_list);
 
 	init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait);
-	iwa->wo.child_wait.private = req->task;
+	iwa->wo.child_wait.private = req->tctx->task;
 	iw->head = &current->signal->wait_chldexit;
 	add_wait_queue(iw->head, &iwa->wo.child_wait);
 
-- 
cgit v1.2.3


From 6bf90bd8c58a305994948eb3409d91a7d8f2edae Mon Sep 17 00:00:00 2001
From: Olivier Langlois <olivier@trillion01.com>
Date: Sun, 13 Oct 2024 14:29:24 -0400
Subject: io_uring/napi: add static napi tracking strategy

Add the static napi tracking strategy. That allows the user to manually
manage the napi ids list for busy polling, and eliminate the overhead of
dynamically updating the list from the fast path.

Signed-off-by: Olivier Langlois <olivier@trillion01.com>
Link: https://lore.kernel.org/r/96943de14968c35a5c599352259ad98f3c0770ba.1728828877.git.olivier@trillion01.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 +-
 include/uapi/linux/io_uring.h  | 32 +++++++++++++-
 io_uring/fdinfo.c              | 54 +++++++++++++++++------
 io_uring/napi.c                | 97 +++++++++++++++++++++++++++++++++++++-----
 io_uring/napi.h                |  2 +-
 5 files changed, 160 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index fba2988accc3..072e65e93105 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -408,7 +408,7 @@ struct io_ring_ctx {
 	/* napi busy poll default timeout */
 	ktime_t			napi_busy_poll_dt;
 	bool			napi_prefer_busy_poll;
-	bool			napi_enabled;
+	u8			napi_track_mode;
 
 	DECLARE_HASHTABLE(napi_ht, 4);
 #endif
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 47977a5c65f5..5d08435b95a8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -790,12 +790,40 @@ struct io_uring_buf_status {
 	__u32	resv[8];
 };
 
+enum io_uring_napi_op {
+	/* register/ungister backward compatible opcode */
+	IO_URING_NAPI_REGISTER_OP = 0,
+
+	/* opcodes to update napi_list when static tracking is used */
+	IO_URING_NAPI_STATIC_ADD_ID = 1,
+	IO_URING_NAPI_STATIC_DEL_ID = 2
+};
+
+enum io_uring_napi_tracking_strategy {
+	/* value must be 0 for backward compatibility */
+	IO_URING_NAPI_TRACKING_DYNAMIC = 0,
+	IO_URING_NAPI_TRACKING_STATIC = 1,
+	IO_URING_NAPI_TRACKING_INACTIVE = 255
+};
+
 /* argument for IORING_(UN)REGISTER_NAPI */
 struct io_uring_napi {
 	__u32	busy_poll_to;
 	__u8	prefer_busy_poll;
-	__u8	pad[3];
-	__u64	resv;
+
+	/* a io_uring_napi_op value */
+	__u8	opcode;
+	__u8	pad[2];
+
+	/*
+	 * for IO_URING_NAPI_REGISTER_OP, it is a
+	 * io_uring_napi_tracking_strategy value.
+	 *
+	 * for IO_URING_NAPI_STATIC_ADD_ID/IO_URING_NAPI_STATIC_DEL_ID
+	 * it is the napi id to add/del from napi_list.
+	 */
+	__u32	op_param;
+	__u32	resv;
 };
 
 /*
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index efbec34ccb18..b214e5a407b5 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -46,6 +46,46 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
 	return 0;
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
+					       struct seq_file *m,
+					       const char *tracking_strategy)
+{
+	seq_puts(m, "NAPI:\tenabled\n");
+	seq_printf(m, "napi tracking:\t%s\n", tracking_strategy);
+	seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
+	if (ctx->napi_prefer_busy_poll)
+		seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
+	else
+		seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
+}
+
+static __cold void napi_show_fdinfo(struct io_ring_ctx *ctx,
+				    struct seq_file *m)
+{
+	unsigned int mode = READ_ONCE(ctx->napi_track_mode);
+
+	switch (mode) {
+	case IO_URING_NAPI_TRACKING_INACTIVE:
+		seq_puts(m, "NAPI:\tdisabled\n");
+		break;
+	case IO_URING_NAPI_TRACKING_DYNAMIC:
+		common_tracking_show_fdinfo(ctx, m, "dynamic");
+		break;
+	case IO_URING_NAPI_TRACKING_STATIC:
+		common_tracking_show_fdinfo(ctx, m, "static");
+		break;
+	default:
+		seq_printf(m, "NAPI:\tunknown mode (%u)\n", mode);
+	}
+}
+#else
+static inline void napi_show_fdinfo(struct io_ring_ctx *ctx,
+				    struct seq_file *m)
+{
+}
+#endif
+
 /*
  * Caller holds a reference to the file already, we don't need to do
  * anything else to get an extra reference.
@@ -219,18 +259,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 
 	}
 	spin_unlock(&ctx->completion_lock);
-
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	if (ctx->napi_enabled) {
-		seq_puts(m, "NAPI:\tenabled\n");
-		seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
-		if (ctx->napi_prefer_busy_poll)
-			seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
-		else
-			seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
-	} else {
-		seq_puts(m, "NAPI:\tdisabled\n");
-	}
-#endif
+	napi_show_fdinfo(ctx, m);
 }
 #endif
diff --git a/io_uring/napi.c b/io_uring/napi.c
index 1de1543d8034..b1ade3fda30f 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -81,6 +81,27 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
 	return 0;
 }
 
+static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
+{
+	struct hlist_head *hash_list;
+	struct io_napi_entry *e;
+
+	/* Non-NAPI IDs can be rejected. */
+	if (napi_id < MIN_NAPI_ID)
+		return -EINVAL;
+
+	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
+	guard(spinlock)(&ctx->napi_lock);
+	e = io_napi_hash_find(hash_list, napi_id);
+	if (!e)
+		return -ENOENT;
+
+	list_del_rcu(&e->list);
+	hash_del_rcu(&e->node);
+	kfree_rcu(e, rcu);
+	return 0;
+}
+
 static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
 {
 	struct io_napi_entry *e;
@@ -136,9 +157,25 @@ static bool io_napi_busy_loop_should_end(void *data,
 	return false;
 }
 
-static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
-				   bool (*loop_end)(void *, unsigned long),
-				   void *loop_end_arg)
+/*
+ * never report stale entries
+ */
+static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+					 bool (*loop_end)(void *, unsigned long),
+					 void *loop_end_arg)
+{
+	struct io_napi_entry *e;
+
+	list_for_each_entry_rcu(e, &ctx->napi_list, list)
+		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
+				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
+	return false;
+}
+
+static bool
+dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+			      bool (*loop_end)(void *, unsigned long),
+			      void *loop_end_arg)
 {
 	struct io_napi_entry *e;
 	bool is_stale = false;
@@ -154,6 +191,16 @@ static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
 	return is_stale;
 }
 
+static inline bool
+__io_napi_do_busy_loop(struct io_ring_ctx *ctx,
+		       bool (*loop_end)(void *, unsigned long),
+		       void *loop_end_arg)
+{
+	if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
+		return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+	return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+}
+
 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
 				       struct io_wait_queue *iowq)
 {
@@ -195,6 +242,7 @@ void io_napi_init(struct io_ring_ctx *ctx)
 	spin_lock_init(&ctx->napi_lock);
 	ctx->napi_prefer_busy_poll = false;
 	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
+	ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE;
 }
 
 /*
@@ -215,6 +263,24 @@ void io_napi_free(struct io_ring_ctx *ctx)
 	INIT_LIST_HEAD_RCU(&ctx->napi_list);
 }
 
+static int io_napi_register_napi(struct io_ring_ctx *ctx,
+				 struct io_uring_napi *napi)
+{
+	switch (napi->op_param) {
+	case IO_URING_NAPI_TRACKING_DYNAMIC:
+	case IO_URING_NAPI_TRACKING_STATIC:
+		break;
+	default:
+		return -EINVAL;
+	}
+	/* clean the napi list for new settings */
+	io_napi_free(ctx);
+	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
+	WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
+	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
+	return 0;
+}
+
 /*
  * io_napi_register() - Register napi with io-uring
  * @ctx: pointer to io-uring context structure
@@ -226,7 +292,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 {
 	const struct io_uring_napi curr = {
 		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
-		.prefer_busy_poll = ctx->napi_prefer_busy_poll
+		.prefer_busy_poll = ctx->napi_prefer_busy_poll,
+		.op_param	  = ctx->napi_track_mode
 	};
 	struct io_uring_napi napi;
 
@@ -234,16 +301,26 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 		return -EINVAL;
 	if (copy_from_user(&napi, arg, sizeof(napi)))
 		return -EFAULT;
-	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
+	if (napi.pad[0] || napi.pad[1] || napi.resv)
 		return -EINVAL;
 
 	if (copy_to_user(arg, &curr, sizeof(curr)))
 		return -EFAULT;
 
-	WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
-	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
-	WRITE_ONCE(ctx->napi_enabled, true);
-	return 0;
+	switch (napi.opcode) {
+	case IO_URING_NAPI_REGISTER_OP:
+		return io_napi_register_napi(ctx, &napi);
+	case IO_URING_NAPI_STATIC_ADD_ID:
+		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+			return -EINVAL;
+		return __io_napi_add_id(ctx, napi.op_param);
+	case IO_URING_NAPI_STATIC_DEL_ID:
+		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+			return -EINVAL;
+		return __io_napi_del_id(ctx, napi.op_param);
+	default:
+		return -EINVAL;
+	}
 }
 
 /*
@@ -266,7 +343,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
 
 	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
-	WRITE_ONCE(ctx->napi_enabled, false);
+	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
 	return 0;
 }
 
diff --git a/io_uring/napi.h b/io_uring/napi.h
index 4ae622f37b30..fa742f42e09b 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -44,7 +44,7 @@ static inline void io_napi_add(struct io_kiocb *req)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct socket *sock;
 
-	if (!READ_ONCE(ctx->napi_enabled))
+	if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC)
 		return;
 
 	sock = sock_from_file(req->file);
-- 
cgit v1.2.3


From 3b96b895127b7c0aed63d82c974b46340e8466c1 Mon Sep 17 00:00:00 2001
From: Esther Shimanovich <eshimanovich@chromium.org>
Date: Tue, 10 Sep 2024 17:57:45 +0000
Subject: PCI: Detect and trust built-in Thunderbolt chips

Some computers with CPUs that lack Thunderbolt features use discrete
Thunderbolt chips to add Thunderbolt functionality. These Thunderbolt
chips are located within the chassis; between the Root Port labeled
ExternalFacingPort and the USB-C port.

These Thunderbolt PCIe devices should be labeled as fixed and trusted, as
they are built into the computer. Otherwise, security policies that rely on
those flags may have unintended results, such as preventing USB-C ports
from enumerating.

Detect the above scenario through the process of elimination.

  1) Integrated Thunderbolt host controllers already have Thunderbolt
     implemented, so anything outside their external facing Root Port is
     removable and untrusted.

     Detect them using the following properties:

       - Most integrated host controllers have the "usb4-host-interface"
         ACPI property, as described here:

         https://learn.microsoft.com/en-us/windows-hardware/drivers/pci/dsd-for-pcie-root-ports#mapping-native-protocols-pcie-displayport-tunneled-through-usb4-to-usb4-host-routers

       - Integrated Thunderbolt PCIe Root Ports before Alder Lake do not
         have the "usb4-host-interface" ACPI property. Identify those by
         their PCI IDs instead.

  2) If a Root Port does not have integrated Thunderbolt capabilities, but
     has the "ExternalFacingPort" ACPI property, that means the
     manufacturer has opted to use a discrete Thunderbolt host controller
     that is built into the computer.

     This host controller can be identified by virtue of being located
     directly below an external-facing Root Port that lacks integrated
     Thunderbolt. Label it as trusted and fixed.

     Everything downstream from it is untrusted and removable.

The "ExternalFacingPort" ACPI property is described here:
https://learn.microsoft.com/en-us/windows-hardware/drivers/pci/dsd-for-pcie-root-ports#identifying-externally-exposed-pcie-root-ports

Link: https://lore.kernel.org/r/20240910-trust-tbt-fix-v5-1-7a7a42a5f496@chromium.org
Suggested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Esther Shimanovich <eshimanovich@chromium.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Tested-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
 arch/x86/pci/acpi.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/probe.c |  30 +++++++++----
 include/linux/pci.h |   6 +++
 3 files changed, 148 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 55c4b07ec1f6..0c316bae1726 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -250,6 +250,125 @@ void __init pci_acpi_crs_quirks(void)
 		pr_info("Please notify linux-pci@vger.kernel.org so future kernels can do this automatically\n");
 }
 
+/*
+ * Check if pdev is part of a PCIe switch that is directly below the
+ * specified bridge.
+ */
+static bool pcie_switch_directly_under(struct pci_dev *bridge,
+				       struct pci_dev *pdev)
+{
+	struct pci_dev *parent = pci_upstream_bridge(pdev);
+
+	/* If the device doesn't have a parent, it's not under anything */
+	if (!parent)
+		return false;
+
+	/*
+	 * If the device has a PCIe type, check if it is below the
+	 * corresponding PCIe switch components (if applicable). Then check
+	 * if its upstream port is directly beneath the specified bridge.
+	 */
+	switch (pci_pcie_type(pdev)) {
+	case PCI_EXP_TYPE_UPSTREAM:
+		return parent == bridge;
+
+	case PCI_EXP_TYPE_DOWNSTREAM:
+		if (pci_pcie_type(parent) != PCI_EXP_TYPE_UPSTREAM)
+			return false;
+		parent = pci_upstream_bridge(parent);
+		return parent == bridge;
+
+	case PCI_EXP_TYPE_ENDPOINT:
+		if (pci_pcie_type(parent) != PCI_EXP_TYPE_DOWNSTREAM)
+			return false;
+		parent = pci_upstream_bridge(parent);
+		if (!parent || pci_pcie_type(parent) != PCI_EXP_TYPE_UPSTREAM)
+			return false;
+		parent = pci_upstream_bridge(parent);
+		return parent == bridge;
+	}
+
+	return false;
+}
+
+static bool pcie_has_usb4_host_interface(struct pci_dev *pdev)
+{
+	struct fwnode_handle *fwnode;
+
+	/*
+	 * For USB4, the tunneled PCIe Root or Downstream Ports are marked
+	 * with the "usb4-host-interface" ACPI property, so we look for
+	 * that first. This should cover most cases.
+	 */
+	fwnode = fwnode_find_reference(dev_fwnode(&pdev->dev),
+				       "usb4-host-interface", 0);
+	if (!IS_ERR(fwnode)) {
+		fwnode_handle_put(fwnode);
+		return true;
+	}
+
+	/*
+	 * Any integrated Thunderbolt 3/4 PCIe Root Ports from Intel
+	 * before Alder Lake do not have the "usb4-host-interface"
+	 * property so we use their PCI IDs instead. All these are
+	 * tunneled. This list is not expected to grow.
+	 */
+	if (pdev->vendor == PCI_VENDOR_ID_INTEL) {
+		switch (pdev->device) {
+		/* Ice Lake Thunderbolt 3 PCIe Root Ports */
+		case 0x8a1d:
+		case 0x8a1f:
+		case 0x8a21:
+		case 0x8a23:
+		/* Tiger Lake-LP Thunderbolt 4 PCIe Root Ports */
+		case 0x9a23:
+		case 0x9a25:
+		case 0x9a27:
+		case 0x9a29:
+		/* Tiger Lake-H Thunderbolt 4 PCIe Root Ports */
+		case 0x9a2b:
+		case 0x9a2d:
+		case 0x9a2f:
+		case 0x9a31:
+			return true;
+		}
+	}
+
+	return false;
+}
+
+bool arch_pci_dev_is_removable(struct pci_dev *pdev)
+{
+	struct pci_dev *parent, *root;
+
+	/* pdev without a parent or Root Port is never tunneled */
+	parent = pci_upstream_bridge(pdev);
+	if (!parent)
+		return false;
+	root = pcie_find_root_port(pdev);
+	if (!root)
+		return false;
+
+	/* Internal PCIe devices are not tunneled */
+	if (!root->external_facing)
+		return false;
+
+	/* Anything directly behind a "usb4-host-interface" is tunneled */
+	if (pcie_has_usb4_host_interface(parent))
+		return true;
+
+	/*
+	 * Check if this is a discrete Thunderbolt/USB4 controller that is
+	 * directly behind the non-USB4 PCIe Root Port marked as
+	 * "ExternalFacingPort". Those are not behind a PCIe tunnel.
+	 */
+	if (pcie_switch_directly_under(root, pdev))
+		return false;
+
+	/* PCIe devices after the discrete chip are tunneled */
+	return true;
+}
+
 #ifdef	CONFIG_PCI_MMCONFIG
 static int check_segment(u16 seg, struct device *dev, char *estr)
 {
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4f68414c3086..924b23962c45 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1633,23 +1633,33 @@ static void set_pcie_thunderbolt(struct pci_dev *dev)
 
 static void set_pcie_untrusted(struct pci_dev *dev)
 {
-	struct pci_dev *parent;
+	struct pci_dev *parent = pci_upstream_bridge(dev);
 
+	if (!parent)
+		return;
 	/*
-	 * If the upstream bridge is untrusted we treat this device
+	 * If the upstream bridge is untrusted we treat this device as
 	 * untrusted as well.
 	 */
-	parent = pci_upstream_bridge(dev);
-	if (parent && (parent->untrusted || parent->external_facing))
+	if (parent->untrusted) {
+		dev->untrusted = true;
+		return;
+	}
+
+	if (arch_pci_dev_is_removable(dev)) {
+		pci_dbg(dev, "marking as untrusted\n");
 		dev->untrusted = true;
+	}
 }
 
 static void pci_set_removable(struct pci_dev *dev)
 {
 	struct pci_dev *parent = pci_upstream_bridge(dev);
 
+	if (!parent)
+		return;
 	/*
-	 * We (only) consider everything downstream from an external_facing
+	 * We (only) consider everything tunneled below an external_facing
 	 * device to be removable by the user. We're mainly concerned with
 	 * consumer platforms with user accessible thunderbolt ports that are
 	 * vulnerable to DMA attacks, and we expect those ports to be marked by
@@ -1659,9 +1669,15 @@ static void pci_set_removable(struct pci_dev *dev)
 	 * accessible to user / may not be removed by end user, and thus not
 	 * exposed as "removable" to userspace.
 	 */
-	if (parent &&
-	    (parent->external_facing || dev_is_removable(&parent->dev)))
+	if (dev_is_removable(&parent->dev)) {
+		dev_set_removable(&dev->dev, DEVICE_REMOVABLE);
+		return;
+	}
+
+	if (arch_pci_dev_is_removable(dev)) {
+		pci_dbg(dev, "marking as removable\n");
 		dev_set_removable(&dev->dev, DEVICE_REMOVABLE);
+	}
 }
 
 /**
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 573b4c4c2be6..4e77c4230c0a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2609,6 +2609,12 @@ pci_host_bridge_acpi_msi_domain(struct pci_bus *bus) { return NULL; }
 static inline bool pci_pr3_present(struct pci_dev *pdev) { return false; }
 #endif
 
+#if defined(CONFIG_X86) && defined(CONFIG_ACPI)
+bool arch_pci_dev_is_removable(struct pci_dev *pdev);
+#else
+static inline bool arch_pci_dev_is_removable(struct pci_dev *pdev) { return false; }
+#endif
+
 #ifdef CONFIG_EEH
 static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev)
 {
-- 
cgit v1.2.3


From 25caea955cc950507d179f3ef456404b475e8c23 Mon Sep 17 00:00:00 2001
From: Inochi Amaoto <inochiama@gmail.com>
Date: Thu, 31 Oct 2024 14:08:58 +0800
Subject: irqchip: Add T-HEAD C900 ACLINT SSWI driver

Add a driver for the T-HEAD C900 ACLINT SSWI device. This device allows
the system with T-HEAD cpus to send ipi via fast device interface.

Signed-off-by: Inochi Amaoto <inochiama@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20241031060859.722258-3-inochiama@gmail.com
---
 drivers/irqchip/Kconfig                      |  12 ++
 drivers/irqchip/Makefile                     |   1 +
 drivers/irqchip/irq-thead-c900-aclint-sswi.c | 176 +++++++++++++++++++++++++++
 include/linux/cpuhotplug.h                   |   1 +
 4 files changed, 190 insertions(+)
 create mode 100644 drivers/irqchip/irq-thead-c900-aclint-sswi.c

(limited to 'include/linux')

diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index ef0fa6975dab..9cac13663871 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -619,6 +619,18 @@ config STARFIVE_JH8100_INTC
 
 	  If you don't know what to do here, say Y.
 
+config THEAD_C900_ACLINT_SSWI
+	bool "THEAD C9XX ACLINT S-mode IPI Interrupt Controller"
+	depends on RISCV
+	depends on SMP
+	select IRQ_DOMAIN_HIERARCHY
+	select GENERIC_IRQ_IPI_MUX
+	help
+	  This enables support for T-HEAD specific ACLINT SSWI device
+	  support.
+
+	  If you don't know what to do here, say Y.
+
 config EXYNOS_IRQ_COMBINER
 	bool "Samsung Exynos IRQ combiner support" if COMPILE_TEST
 	depends on (ARCH_EXYNOS && ARM) || COMPILE_TEST
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 15ed6e952ecd..25e9ad29b8c4 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -103,6 +103,7 @@ obj-$(CONFIG_RISCV_APLIC_MSI)		+= irq-riscv-aplic-msi.o
 obj-$(CONFIG_RISCV_IMSIC)		+= irq-riscv-imsic-state.o irq-riscv-imsic-early.o irq-riscv-imsic-platform.o
 obj-$(CONFIG_SIFIVE_PLIC)		+= irq-sifive-plic.o
 obj-$(CONFIG_STARFIVE_JH8100_INTC)	+= irq-starfive-jh8100-intc.o
+obj-$(CONFIG_THEAD_C900_ACLINT_SSWI)	+= irq-thead-c900-aclint-sswi.o
 obj-$(CONFIG_IMX_IRQSTEER)		+= irq-imx-irqsteer.o
 obj-$(CONFIG_IMX_INTMUX)		+= irq-imx-intmux.o
 obj-$(CONFIG_IMX_MU_MSI)		+= irq-imx-mu-msi.o
diff --git a/drivers/irqchip/irq-thead-c900-aclint-sswi.c b/drivers/irqchip/irq-thead-c900-aclint-sswi.c
new file mode 100644
index 000000000000..b0e366ade427
--- /dev/null
+++ b/drivers/irqchip/irq-thead-c900-aclint-sswi.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Inochi Amaoto <inochiama@gmail.com>
+ */
+
+#define pr_fmt(fmt) "thead-c900-aclint-sswi: " fmt
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/string_choices.h>
+#include <asm/sbi.h>
+#include <asm/vendorid_list.h>
+
+#define THEAD_ACLINT_xSWI_REGISTER_SIZE		4
+
+#define THEAD_C9XX_CSR_SXSTATUS			0x5c0
+#define THEAD_C9XX_SXSTATUS_CLINTEE		BIT(17)
+
+static int sswi_ipi_virq __ro_after_init;
+static DEFINE_PER_CPU(void __iomem *, sswi_cpu_regs);
+
+static void thead_aclint_sswi_ipi_send(unsigned int cpu)
+{
+	writel_relaxed(0x1, per_cpu(sswi_cpu_regs, cpu));
+}
+
+static void thead_aclint_sswi_ipi_clear(void)
+{
+	writel_relaxed(0x0, this_cpu_read(sswi_cpu_regs));
+}
+
+static void thead_aclint_sswi_ipi_handle(struct irq_desc *desc)
+{
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+
+	chained_irq_enter(chip, desc);
+
+	csr_clear(CSR_IP, IE_SIE);
+	thead_aclint_sswi_ipi_clear();
+
+	ipi_mux_process();
+
+	chained_irq_exit(chip, desc);
+}
+
+static int thead_aclint_sswi_starting_cpu(unsigned int cpu)
+{
+	enable_percpu_irq(sswi_ipi_virq, irq_get_trigger_type(sswi_ipi_virq));
+
+	return 0;
+}
+
+static int thead_aclint_sswi_dying_cpu(unsigned int cpu)
+{
+	thead_aclint_sswi_ipi_clear();
+
+	disable_percpu_irq(sswi_ipi_virq);
+
+	return 0;
+}
+
+static int __init thead_aclint_sswi_parse_irq(struct fwnode_handle *fwnode,
+					      void __iomem *reg)
+{
+	struct of_phandle_args parent;
+	unsigned long hartid;
+	u32 contexts, i;
+	int rc, cpu;
+
+	contexts = of_irq_count(to_of_node(fwnode));
+	if (!(contexts)) {
+		pr_err("%pfwP: no ACLINT SSWI context available\n", fwnode);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < contexts; i++) {
+		rc = of_irq_parse_one(to_of_node(fwnode), i, &parent);
+		if (rc)
+			return rc;
+
+		rc = riscv_of_parent_hartid(parent.np, &hartid);
+		if (rc)
+			return rc;
+
+		if (parent.args[0] != RV_IRQ_SOFT)
+			return -ENOTSUPP;
+
+		cpu = riscv_hartid_to_cpuid(hartid);
+
+		per_cpu(sswi_cpu_regs, cpu) = reg + i * THEAD_ACLINT_xSWI_REGISTER_SIZE;
+	}
+
+	pr_info("%pfwP: register %u CPU%s\n", fwnode, contexts, str_plural(contexts));
+
+	return 0;
+}
+
+static int __init thead_aclint_sswi_probe(struct fwnode_handle *fwnode)
+{
+	struct irq_domain *domain;
+	void __iomem *reg;
+	int virq, rc;
+
+	/* If it is T-HEAD CPU, check whether SSWI is enabled */
+	if (riscv_cached_mvendorid(0) == THEAD_VENDOR_ID &&
+	    !(csr_read(THEAD_C9XX_CSR_SXSTATUS) & THEAD_C9XX_SXSTATUS_CLINTEE))
+		return -ENOTSUPP;
+
+	if (!is_of_node(fwnode))
+		return -EINVAL;
+
+	reg = of_iomap(to_of_node(fwnode), 0);
+	if (!reg)
+		return -ENOMEM;
+
+	/* Parse SSWI setting */
+	rc = thead_aclint_sswi_parse_irq(fwnode, reg);
+	if (rc < 0)
+		return rc;
+
+	/* If mulitple SSWI devices are present, do not register irq again */
+	if (sswi_ipi_virq)
+		return 0;
+
+	/* Find riscv intc domain and create IPI irq mapping */
+	domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), DOMAIN_BUS_ANY);
+	if (!domain) {
+		pr_err("%pfwP: Failed to find INTC domain\n", fwnode);
+		return -ENOENT;
+	}
+
+	sswi_ipi_virq = irq_create_mapping(domain, RV_IRQ_SOFT);
+	if (!sswi_ipi_virq) {
+		pr_err("unable to create ACLINT SSWI IRQ mapping\n");
+		return -ENOMEM;
+	}
+
+	/* Register SSWI irq and handler */
+	virq = ipi_mux_create(BITS_PER_BYTE, thead_aclint_sswi_ipi_send);
+	if (virq <= 0) {
+		pr_err("unable to create muxed IPIs\n");
+		irq_dispose_mapping(sswi_ipi_virq);
+		return virq < 0 ? virq : -ENOMEM;
+	}
+
+	irq_set_chained_handler(sswi_ipi_virq, thead_aclint_sswi_ipi_handle);
+
+	cpuhp_setup_state(CPUHP_AP_IRQ_THEAD_ACLINT_SSWI_STARTING,
+			  "irqchip/thead-aclint-sswi:starting",
+			  thead_aclint_sswi_starting_cpu,
+			  thead_aclint_sswi_dying_cpu);
+
+	riscv_ipi_set_virq_range(virq, BITS_PER_BYTE);
+
+	/* Announce that SSWI is providing IPIs */
+	pr_info("providing IPIs using THEAD ACLINT SSWI\n");
+
+	return 0;
+}
+
+static int __init thead_aclint_sswi_early_probe(struct device_node *node,
+						struct device_node *parent)
+{
+	return thead_aclint_sswi_probe(&node->fwnode);
+}
+IRQCHIP_DECLARE(thead_aclint_sswi, "thead,c900-aclint-sswi", thead_aclint_sswi_early_probe);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 2361ed4d2b15..799052249c7b 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -147,6 +147,7 @@ enum cpuhp_state {
 	CPUHP_AP_IRQ_EIOINTC_STARTING,
 	CPUHP_AP_IRQ_AVECINTC_STARTING,
 	CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
+	CPUHP_AP_IRQ_THEAD_ACLINT_SSWI_STARTING,
 	CPUHP_AP_IRQ_RISCV_IMSIC_STARTING,
 	CPUHP_AP_IRQ_RISCV_SBI_IPI_STARTING,
 	CPUHP_AP_ARM_MVEBU_COHERENCY,
-- 
cgit v1.2.3


From 513793bc6ab331b947111e8efaf8fcef33fb83e5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:31 +0100
Subject: posix-timers: Make signal delivery consistent

Signals of timers which are reprogammed, disarmed or deleted can deliver
signals related to the past. The POSIX spec is blury about this:

 - "The effect of disarming or resetting a timer with pending expiration
    notifications is unspecified."

 - "The disposition of pending signals for the deleted timer is
    unspecified."

In both cases it is reasonable to expect that pending signals are
discarded. Especially in the reprogramming case it does not make sense to
account for previous overruns or to deliver a signal for a timer which has
been disarmed. This makes the behaviour consistent and understandable.

Remove the si_sys_private check from the signal delivery code and invoke
posix_timer_deliver_signal() unconditionally for posix timer related
signals.

Change posix_timer_deliver_signal() so it controls the actual signal
delivery via the return value. It now instructs the signal code to drop the
signal when:

  1) The timer does not longer exist in the hash table

  2) The timer signal_seq value is not the same as the si_sys_private value
     which was set when the signal was queued.

This is also a preparatory change to embed the sigqueue into the k_itimer
structure, which in turn allows to remove the si_sys_private magic.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/all/20241105064213.040348644@linutronix.de
---
 include/linux/posix-timers.h   |  2 --
 kernel/signal.c                |  6 ++----
 kernel/time/posix-cpu-timers.c |  2 +-
 kernel/time/posix-timers.c     | 28 ++++++++++++++++------------
 4 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 02afbb4da7f7..8c6d97412526 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -137,8 +137,6 @@ static inline void clear_posix_cputimers_work(struct task_struct *p) { }
 static inline void posix_cputimers_init_work(void) { }
 #endif
 
-#define REQUEUE_PENDING 1
-
 /**
  * struct k_itimer - POSIX.1b interval timer structure.
  * @list:		List head for binding the timer to signals->posix_timers
diff --git a/kernel/signal.c b/kernel/signal.c
index df34aa47181e..68e6bc70ccf2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -550,10 +550,8 @@ still_pending:
 		list_del_init(&first->list);
 		copy_siginfo(info, &first->info);
 
-		*resched_timer =
-			(first->flags & SIGQUEUE_PREALLOC) &&
-			(info->si_code == SI_TIMER) &&
-			(info->si_sys_private);
+		*resched_timer = (first->flags & SIGQUEUE_PREALLOC) &&
+				 (info->si_code == SI_TIMER);
 
 		__sigqueue_free(first);
 	} else {
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 5f444e372464..4305c003c8d4 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -746,7 +746,7 @@ static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *i
 	 *  - Timers which expired, but the signal has not yet been
 	 *    delivered
 	 */
-	if (iv && ((timer->it_signal_seq & REQUEUE_PENDING) || sigev_none))
+	if (iv && timer->it_status != POSIX_TIMER_ARMED)
 		expires = bump_cpu_timer(timer, now);
 	else
 		expires = cpu_timer_getexpires(&timer->it.cpu);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index dd72b8e72697..b380e25d4947 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -269,7 +269,10 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info)
 	if (!timr)
 		goto out;
 
-	if (timr->it_interval && timr->it_signal_seq == info->si_sys_private) {
+	if (timr->it_signal_seq != info->si_sys_private)
+		goto out_unlock;
+
+	if (timr->it_interval && !WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) {
 		timr->kclock->timer_rearm(timr);
 
 		timr->it_status = POSIX_TIMER_ARMED;
@@ -281,6 +284,7 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info)
 	}
 	ret = true;
 
+out_unlock:
 	unlock_timer(timr, flags);
 out:
 	spin_lock(&current->sighand->siglock);
@@ -293,19 +297,18 @@ out:
 int posix_timer_queue_signal(struct k_itimer *timr)
 {
 	enum posix_timer_state state = POSIX_TIMER_DISARMED;
-	int ret, si_private = 0;
 	enum pid_type type;
+	int ret;
 
 	lockdep_assert_held(&timr->it_lock);
 
-	if (timr->it_interval) {
+	if (timr->it_interval)
 		state = POSIX_TIMER_REQUEUE_PENDING;
-		si_private = ++timr->it_signal_seq;
-	}
+
 	timr->it_status = state;
 
 	type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID;
-	ret = send_sigqueue(timr->sigq, timr->it_pid, type, si_private);
+	ret = send_sigqueue(timr->sigq, timr->it_pid, type, timr->it_signal_seq);
 	/* If we failed to send the signal the timer stops. */
 	return ret > 0;
 }
@@ -663,7 +666,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 	 * is a SIGEV_NONE timer move the expiry time forward by intervals,
 	 * so expiry is > now.
 	 */
-	if (iv && (timr->it_signal_seq & REQUEUE_PENDING || sig_none))
+	if (iv && timr->it_status != POSIX_TIMER_ARMED)
 		timr->it_overrun += kc->timer_forward(timr, now);
 
 	remaining = kc->timer_remaining(timr, now);
@@ -863,8 +866,6 @@ void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_set
 	else
 		timer->it_interval = 0;
 
-	/* Prevent reloading in case there is a signal pending */
-	timer->it_signal_seq = (timer->it_signal_seq + 2) & ~REQUEUE_PENDING;
 	/* Reset overrun accounting */
 	timer->it_overrun_last = 0;
 	timer->it_overrun = -1LL;
@@ -882,8 +883,6 @@ int common_timer_set(struct k_itimer *timr, int flags,
 	if (old_setting)
 		common_timer_get(timr, old_setting);
 
-	/* Prevent rearming by clearing the interval */
-	timr->it_interval = 0;
 	/*
 	 * Careful here. On SMP systems the timer expiry function could be
 	 * active and spinning on timr->it_lock.
@@ -933,6 +932,9 @@ retry:
 	if (old_spec64)
 		old_spec64->it_interval = ktime_to_timespec64(timr->it_interval);
 
+	/* Prevent signal delivery and rearming. */
+	timr->it_signal_seq++;
+
 	kc = timr->kclock;
 	if (WARN_ON_ONCE(!kc || !kc->timer_set))
 		error = -EINVAL;
@@ -1001,7 +1003,6 @@ int common_timer_del(struct k_itimer *timer)
 {
 	const struct k_clock *kc = timer->kclock;
 
-	timer->it_interval = 0;
 	if (kc->timer_try_to_cancel(timer) < 0)
 		return TIMER_RETRY;
 	timer->it_status = POSIX_TIMER_DISARMED;
@@ -1012,6 +1013,9 @@ static inline int timer_delete_hook(struct k_itimer *timer)
 {
 	const struct k_clock *kc = timer->kclock;
 
+	/* Prevent signal delivery and rearming. */
+	timer->it_signal_seq++;
+
 	if (WARN_ON_ONCE(!kc || !kc->timer_del))
 		return -EINVAL;
 	return kc->timer_del(timer);
-- 
cgit v1.2.3


From bf635681c906ad056d1fda325de8d1c12c9f8201 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:33 +0100
Subject: posix-cpu-timers: Cleanup the firing logic

The firing flag of a posix CPU timer is tristate:

  0: when the timer is not about to deliver a signal

  1: when the timer has expired, but the signal has not been delivered yet

 -1: when the timer was queued for signal delivery and a rearm operation
     raced against it and supressed the signal delivery.

This is a pointless exercise as this can be simply expressed with a
boolean. Only if set, the signal is delivered. This makes delete and rearm
consistent with the rest of the posix timers.

Convert firing to bool and fixup the usage sites accordingly and add
comments why the timer cannot be dequeued right away.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/all/20241105064213.172848618@linutronix.de
---
 include/linux/posix-timers.h   |  2 +-
 kernel/time/posix-cpu-timers.c | 34 ++++++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 8c6d97412526..b1de21731a08 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -49,7 +49,7 @@ struct cpu_timer {
 	struct timerqueue_head		*head;
 	struct pid			*pid;
 	struct list_head		elist;
-	int				firing;
+	bool				firing;
 	struct task_struct __rcu	*handling;
 };
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 4305c003c8d4..a282a3c00605 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -493,10 +493,18 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
 		 */
 		WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
 	} else {
-		if (timer->it.cpu.firing)
+		if (timer->it.cpu.firing) {
+			/*
+			 * Prevent signal delivery. The timer cannot be dequeued
+			 * because it is on the firing list which is not protected
+			 * by sighand->lock. The delivery path is waiting for
+			 * the timer lock. So go back, unlock and retry.
+			 */
+			timer->it.cpu.firing = false;
 			ret = TIMER_RETRY;
-		else
+		} else {
 			disarm_timer(timer, p);
+		}
 		unlock_task_sighand(p, &flags);
 	}
 
@@ -668,7 +676,13 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 	old_expires = cpu_timer_getexpires(ctmr);
 
 	if (unlikely(timer->it.cpu.firing)) {
-		timer->it.cpu.firing = -1;
+		/*
+		 * Prevent signal delivery. The timer cannot be dequeued
+		 * because it is on the firing list which is not protected
+		 * by sighand->lock. The delivery path is waiting for
+		 * the timer lock. So go back, unlock and retry.
+		 */
+		timer->it.cpu.firing = false;
 		ret = TIMER_RETRY;
 	} else {
 		cpu_timer_dequeue(ctmr);
@@ -809,7 +823,7 @@ static u64 collect_timerqueue(struct timerqueue_head *head,
 		if (++i == MAX_COLLECTED || now < expires)
 			return expires;
 
-		ctmr->firing = 1;
+		ctmr->firing = true;
 		/* See posix_cpu_timer_wait_running() */
 		rcu_assign_pointer(ctmr->handling, current);
 		cpu_timer_dequeue(ctmr);
@@ -1364,7 +1378,7 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
 	 * timer call will interfere.
 	 */
 	list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
-		int cpu_firing;
+		bool cpu_firing;
 
 		/*
 		 * spin_lock() is sufficient here even independent of the
@@ -1376,13 +1390,13 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
 		spin_lock(&timer->it_lock);
 		list_del_init(&timer->it.cpu.elist);
 		cpu_firing = timer->it.cpu.firing;
-		timer->it.cpu.firing = 0;
+		timer->it.cpu.firing = false;
 		/*
-		 * The firing flag is -1 if we collided with a reset
-		 * of the timer, which already reported this
-		 * almost-firing as an overrun.  So don't generate an event.
+		 * If the firing flag is cleared then this raced with a
+		 * timer rearm/delete operation. So don't generate an
+		 * event.
 		 */
-		if (likely(cpu_firing >= 0))
+		if (likely(cpu_firing))
 			cpu_timer_fire(timer);
 		/* See posix_cpu_timer_wait_running() */
 		rcu_assign_pointer(timer->it.cpu.handling, NULL);
-- 
cgit v1.2.3


From 4cf7bf2a2f1a8ace4a49a1138c8123fdb5990093 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:35 +0100
Subject: posix-cpu-timers: Use dedicated flag for CPU timer nanosleep

POSIX CPU timer nanosleep creates a k_itimer on stack and uses the sigq
pointer to detect the nanosleep case in the expiry function.

Prepare for embedding sigqueue into struct k_itimer by using a dedicated
flag for nanosleep.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241105064213.238550394@linutronix.de
---
 include/linux/posix-timers.h   | 2 ++
 kernel/time/posix-cpu-timers.c | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index b1de21731a08..bcd01208d795 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -42,6 +42,7 @@ static inline int clockid_to_fd(const clockid_t clk)
  * @pid:	Pointer to target task PID
  * @elist:	List head for the expiry list
  * @firing:	Timer is currently firing
+ * @nanosleep:	Timer is used for nanosleep and is not a regular posix-timer
  * @handling:	Pointer to the task which handles expiry
  */
 struct cpu_timer {
@@ -50,6 +51,7 @@ struct cpu_timer {
 	struct pid			*pid;
 	struct list_head		elist;
 	bool				firing;
+	bool				nanosleep;
 	struct task_struct __rcu	*handling;
 };
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a282a3c00605..0c441d8c2604 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -596,7 +596,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
 
 	timer->it_status = POSIX_TIMER_DISARMED;
 
-	if (unlikely(timer->sigq == NULL)) {
+	if (unlikely(ctmr->nanosleep)) {
 		/*
 		 * This a special case for clock_nanosleep,
 		 * not a normal timer from sys_timer_create.
@@ -1493,6 +1493,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 	timer.it_overrun = -1;
 	error = posix_cpu_timer_create(&timer);
 	timer.it_process = current;
+	timer.it.cpu.nanosleep = true;
 
 	if (!error) {
 		static struct itimerspec64 zero_it;
-- 
cgit v1.2.3


From 5d916a0988eed5217c103932ff4887c9ae83c89c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:36 +0100
Subject: posix-timers: Add a refcount to struct k_itimer

To cure the SIG_IGN handling for posix interval timers, the preallocated
sigqueue needs to be embedded into struct k_itimer to prevent life time
races of all sorts.

To make that work correctly it needs reference counting so that timer
deletion does not free the timer prematuraly when there is a signal queued
or delivered concurrently.

Add a rcuref to the posix timer part.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241105064213.304756440@linutronix.de
---
 include/linux/posix-timers.h | 14 ++++++++++++++
 kernel/time/posix-timers.c   |  7 ++++---
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index bcd01208d795..9740fd0c2933 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -6,11 +6,13 @@
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/posix-timers_types.h>
+#include <linux/rcuref.h>
 #include <linux/spinlock.h>
 #include <linux/timerqueue.h>
 
 struct kernel_siginfo;
 struct task_struct;
+struct k_itimer;
 
 static inline clockid_t make_process_cpuclock(const unsigned int pid,
 		const clockid_t clock)
@@ -105,6 +107,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 
 void posixtimer_rearm_itimer(struct task_struct *p);
 bool posixtimer_deliver_signal(struct kernel_siginfo *info);
+void posixtimer_free_timer(struct k_itimer *timer);
 
 /* Init task static initializer */
 #define INIT_CPU_TIMERBASE(b) {						\
@@ -129,6 +132,7 @@ static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
 					      u64 cpu_limit) { }
 static inline void posixtimer_rearm_itimer(struct task_struct *p) { }
 static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info) { return false; }
+static inline void posixtimer_free_timer(struct k_itimer *timer) { }
 #endif
 
 #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
@@ -156,6 +160,7 @@ static inline void posix_cputimers_init_work(void) { }
  * @it_signal:		Pointer to the creators signal struct
  * @it_pid:		The pid of the process/task targeted by the signal
  * @it_process:		The task to wakeup on clock_nanosleep (CPU timers)
+ * @rcuref:		Reference count for life time management
  * @sigq:		Pointer to preallocated sigqueue
  * @it:			Union representing the various posix timer type
  *			internals.
@@ -180,6 +185,7 @@ struct k_itimer {
 		struct task_struct	*it_process;
 	};
 	struct sigqueue		*sigq;
+	rcuref_t		rcuref;
 	union {
 		struct {
 			struct hrtimer	timer;
@@ -200,4 +206,12 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
 
 int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
 
+#ifdef CONFIG_POSIX_TIMERS
+static inline void posixtimer_putref(struct k_itimer *tmr)
+{
+	if (rcuref_put(&tmr->rcuref))
+		posixtimer_free_timer(tmr);
+}
+#endif /* !CONFIG_POSIX_TIMERS */
+
 #endif
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 66ed49efc02f..53bd3c4de92c 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -417,10 +417,11 @@ static struct k_itimer * alloc_posix_timer(void)
 		return NULL;
 	}
 	clear_siginfo(&tmr->sigq->info);
+	rcuref_init(&tmr->rcuref, 1);
 	return tmr;
 }
 
-static void posix_timer_free(struct k_itimer *tmr)
+void posixtimer_free_timer(struct k_itimer *tmr)
 {
 	put_pid(tmr->it_pid);
 	sigqueue_free(tmr->sigq);
@@ -432,7 +433,7 @@ static void posix_timer_unhash_and_free(struct k_itimer *tmr)
 	spin_lock(&hash_lock);
 	hlist_del_rcu(&tmr->t_hash);
 	spin_unlock(&hash_lock);
-	posix_timer_free(tmr);
+	posixtimer_putref(tmr);
 }
 
 static int common_timer_create(struct k_itimer *new_timer)
@@ -467,7 +468,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
 	 */
 	new_timer_id = posix_timer_add(new_timer);
 	if (new_timer_id < 0) {
-		posix_timer_free(new_timer);
+		posixtimer_free_timer(new_timer);
 		return new_timer_id;
 	}
 
-- 
cgit v1.2.3


From 54f1dd642fd088ba969206f09e7afffad7d9db2c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:39 +0100
Subject: signal: Provide posixtimer_sigqueue_init()

To cure the SIG_IGN handling for posix interval timers, the preallocated
sigqueue needs to be embedded into struct k_itimer to prevent life time
races of all sorts.

Provide a new function to initialize the embedded sigqueue to prepare for
that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241105064213.450427515@linutronix.de
---
 include/linux/posix-timers.h |  2 ++
 kernel/signal.c              | 11 +++++++++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 9740fd0c2933..200098d27cc0 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -12,6 +12,7 @@
 
 struct kernel_siginfo;
 struct task_struct;
+struct sigqueue;
 struct k_itimer;
 
 static inline clockid_t make_process_cpuclock(const unsigned int pid,
@@ -106,6 +107,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 }
 
 void posixtimer_rearm_itimer(struct task_struct *p);
+bool posixtimer_init_sigqueue(struct sigqueue *q);
 bool posixtimer_deliver_signal(struct kernel_siginfo *info);
 void posixtimer_free_timer(struct k_itimer *timer);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index dbd42471cf03..911ed3ab479e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1905,6 +1905,17 @@ void flush_itimer_signals(void)
 	__flush_itimer_signals(&tsk->signal->shared_pending);
 }
 
+bool posixtimer_init_sigqueue(struct sigqueue *q)
+{
+	struct ucounts *ucounts = sig_get_ucounts(current, -1, 0);
+
+	if (!ucounts)
+		return false;
+	clear_siginfo(&q->info);
+	__sigqueue_init(q, ucounts, SIGQUEUE_PREALLOC);
+	return true;
+}
+
 struct sigqueue *sigqueue_alloc(void)
 {
 	return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC);
-- 
cgit v1.2.3


From ef1c5bcd6daa674392bdf89b8ae889aafd73f956 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:41 +0100
Subject: posix-timers: Store PID type in the timer

instead of re-evaluating the signal delivery mode everywhere.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241105064213.519086500@linutronix.de
---
 include/linux/posix-timers.h | 2 ++
 kernel/time/posix-timers.c   | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 200098d27cc0..947176582de9 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -5,6 +5,7 @@
 #include <linux/alarmtimer.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
+#include <linux/pid.h>
 #include <linux/posix-timers_types.h>
 #include <linux/rcuref.h>
 #include <linux/spinlock.h>
@@ -180,6 +181,7 @@ struct k_itimer {
 	s64			it_overrun_last;
 	unsigned int		it_signal_seq;
 	int			it_sigev_notify;
+	enum pid_type		it_pid_type;
 	ktime_t			it_interval;
 	struct signal_struct	*it_signal;
 	union {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 53bd3c4de92c..f18d64c7cd3b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -298,7 +298,6 @@ out:
 int posix_timer_queue_signal(struct k_itimer *timr)
 {
 	enum posix_timer_state state = POSIX_TIMER_DISARMED;
-	enum pid_type type;
 	int ret;
 
 	lockdep_assert_held(&timr->it_lock);
@@ -308,8 +307,7 @@ int posix_timer_queue_signal(struct k_itimer *timr)
 
 	timr->it_status = state;
 
-	type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID;
-	ret = send_sigqueue(timr->sigq, timr->it_pid, type, timr->it_signal_seq);
+	ret = send_sigqueue(timr->sigq, timr->it_pid, timr->it_pid_type, timr->it_signal_seq);
 	/* If we failed to send the signal the timer stops. */
 	return ret > 0;
 }
@@ -496,6 +494,11 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
 		new_timer->it_pid = get_pid(task_tgid(current));
 	}
 
+	if (new_timer->it_sigev_notify & SIGEV_THREAD_ID)
+		new_timer->it_pid_type = PIDTYPE_PID;
+	else
+		new_timer->it_pid_type = PIDTYPE_TGID;
+
 	new_timer->sigq->info.si_tid   = new_timer->it_id;
 	new_timer->sigq->info.si_code  = SI_TIMER;
 
-- 
cgit v1.2.3


From 0360ed14d9826678a50fa2b873e522a24cd3c018 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:42 +0100
Subject: signal: Refactor send_sigqueue()

To handle posix timers which have their signal ignored via SIG_IGN properly
it is required to requeue a ignored signal for delivery when SIG_IGN is
lifted so the timer gets rearmed.

Split the required code out of send_sigqueue() so it can be reused in
context of sigaction().

While at it rename send_sigqueue() to posixtimer_send_sigqueue() so its
clear what this is about.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241105064213.586453412@linutronix.de
---
 include/linux/posix-timers.h |  1 +
 include/linux/sched/signal.h |  1 -
 kernel/signal.c              | 82 ++++++++++++++++++++++++--------------------
 kernel/time/posix-timers.c   |  2 +-
 4 files changed, 47 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 947176582de9..52611ea923b2 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -109,6 +109,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 
 void posixtimer_rearm_itimer(struct task_struct *p);
 bool posixtimer_init_sigqueue(struct sigqueue *q);
+int posixtimer_send_sigqueue(struct k_itimer *tmr);
 bool posixtimer_deliver_signal(struct kernel_siginfo *info);
 void posixtimer_free_timer(struct k_itimer *timer);
 
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index bd9f569231d9..36283c1c55e9 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -340,7 +340,6 @@ extern int send_sig(int, struct task_struct *, int);
 extern int zap_other_threads(struct task_struct *p);
 extern struct sigqueue *sigqueue_alloc(void);
 extern void sigqueue_free(struct sigqueue *);
-extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type, int si_private);
 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
 
 static inline void clear_notify_signal(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index 911ed3ab479e..5b71e26abb0e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1947,40 +1947,54 @@ void sigqueue_free(struct sigqueue *q)
 		__sigqueue_free(q);
 }
 
-int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type, int si_private)
+static void posixtimer_queue_sigqueue(struct sigqueue *q, struct task_struct *t, enum pid_type type)
 {
-	int sig = q->info.si_signo;
 	struct sigpending *pending;
+	int sig = q->info.si_signo;
+
+	signalfd_notify(t, sig);
+	pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
+	list_add_tail(&q->list, &pending->list);
+	sigaddset(&pending->signal, sig);
+	complete_signal(sig, t, type);
+}
+
+/*
+ * This function is used by POSIX timers to deliver a timer signal.
+ * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
+ * set), the signal must be delivered to the specific thread (queues
+ * into t->pending).
+ *
+ * Where type is not PIDTYPE_PID, signals must be delivered to the
+ * process. In this case, prefer to deliver to current if it is in
+ * the same thread group as the target process, which avoids
+ * unnecessarily waking up a potentially idle task.
+ */
+static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr)
+{
+	struct task_struct *t = pid_task(tmr->it_pid, tmr->it_pid_type);
+
+	if (t && tmr->it_pid_type != PIDTYPE_PID && same_thread_group(t, current))
+		t = current;
+	return t;
+}
+
+int posixtimer_send_sigqueue(struct k_itimer *tmr)
+{
+	struct sigqueue *q = tmr->sigq;
+	int sig = q->info.si_signo;
 	struct task_struct *t;
 	unsigned long flags;
 	int ret, result;
 
-	if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC)))
-		return 0;
-	if (WARN_ON_ONCE(q->info.si_code != SI_TIMER))
-		return 0;
-
-	ret = -1;
-	rcu_read_lock();
+	guard(rcu)();
 
-	/*
-	 * This function is used by POSIX timers to deliver a timer signal.
-	 * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
-	 * set), the signal must be delivered to the specific thread (queues
-	 * into t->pending).
-	 *
-	 * Where type is not PIDTYPE_PID, signals must be delivered to the
-	 * process. In this case, prefer to deliver to current if it is in
-	 * the same thread group as the target process, which avoids
-	 * unnecessarily waking up a potentially idle task.
-	 */
-	t = pid_task(pid, type);
+	t = posixtimer_get_target(tmr);
 	if (!t)
-		goto ret;
-	if (type != PIDTYPE_PID && same_thread_group(t, current))
-		t = current;
+		return -1;
+
 	if (!likely(lock_task_sighand(t, &flags)))
-		goto ret;
+		return -1;
 
 	/*
 	 * Update @q::info::si_sys_private for posix timer signals with
@@ -1988,30 +2002,24 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type, int s
 	 * decides based on si_sys_private whether to invoke
 	 * posixtimer_rearm() or not.
 	 */
-	q->info.si_sys_private = si_private;
+	q->info.si_sys_private = tmr->it_signal_seq;
 
 	ret = 1; /* the signal is ignored */
-	result = TRACE_SIGNAL_IGNORED;
-	if (!prepare_signal(sig, t, false))
+	if (!prepare_signal(sig, t, false)) {
+		result = TRACE_SIGNAL_IGNORED;
 		goto out;
+	}
 
 	ret = 0;
 	if (unlikely(!list_empty(&q->list))) {
 		result = TRACE_SIGNAL_ALREADY_PENDING;
 		goto out;
 	}
-
-	signalfd_notify(t, sig);
-	pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
-	list_add_tail(&q->list, &pending->list);
-	sigaddset(&pending->signal, sig);
-	complete_signal(sig, t, type);
+	posixtimer_queue_sigqueue(q, t, tmr->it_pid_type);
 	result = TRACE_SIGNAL_DELIVERED;
 out:
-	trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result);
+	trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result);
 	unlock_task_sighand(t, &flags);
-ret:
-	rcu_read_unlock();
 	return ret;
 }
 #endif /* CONFIG_POSIX_TIMERS */
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index f18d64c7cd3b..0901ed9ca183 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -307,7 +307,7 @@ int posix_timer_queue_signal(struct k_itimer *timr)
 
 	timr->it_status = state;
 
-	ret = send_sigqueue(timr->sigq, timr->it_pid, timr->it_pid_type, timr->it_signal_seq);
+	ret = posixtimer_send_sigqueue(timr);
 	/* If we failed to send the signal the timer stops. */
 	return ret > 0;
 }
-- 
cgit v1.2.3


From 11629b9808e5900d675fd469d19932ea48060de3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:43 +0100
Subject: signal: Replace resched_timer logic

In preparation for handling ignored posix timer signals correctly and
embedding the sigqueue struct into struct k_itimer, hand down a pointer to
the sigqueue struct into posix_timer_deliver_signal() instead of just
having a boolean flag.

No functional change.

Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Link: https://lore.kernel.org/all/20241105064213.652658158@linutronix.de
---
 include/linux/posix-timers.h |  5 +++--
 kernel/signal.c              | 32 ++++++++++++++++++++------------
 kernel/time/posix-timers.c   |  2 +-
 3 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 52611ea923b2..39f1db76833a 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -110,7 +110,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 void posixtimer_rearm_itimer(struct task_struct *p);
 bool posixtimer_init_sigqueue(struct sigqueue *q);
 int posixtimer_send_sigqueue(struct k_itimer *tmr);
-bool posixtimer_deliver_signal(struct kernel_siginfo *info);
+bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq);
 void posixtimer_free_timer(struct k_itimer *timer);
 
 /* Init task static initializer */
@@ -135,7 +135,8 @@ static inline void posix_cputimers_init(struct posix_cputimers *pct) { }
 static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
 					      u64 cpu_limit) { }
 static inline void posixtimer_rearm_itimer(struct task_struct *p) { }
-static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info) { return false; }
+static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info,
+					     struct sigqueue *timer_sigq) { return false; }
 static inline void posixtimer_free_timer(struct k_itimer *timer) { }
 #endif
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 5b71e26abb0e..0ddb5dd284aa 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -545,7 +545,7 @@ bool unhandled_signal(struct task_struct *tsk, int sig)
 }
 
 static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info,
-			   bool *resched_timer)
+			   struct sigqueue **timer_sigq)
 {
 	struct sigqueue *q, *first = NULL;
 
@@ -568,10 +568,17 @@ still_pending:
 		list_del_init(&first->list);
 		copy_siginfo(info, &first->info);
 
-		*resched_timer = (first->flags & SIGQUEUE_PREALLOC) &&
-				 (info->si_code == SI_TIMER);
-
-		__sigqueue_free(first);
+		/*
+		 * posix-timer signals are preallocated and freed when the
+		 * timer goes away. Either directly or by clearing
+		 * SIGQUEUE_PREALLOC so that the next delivery will free
+		 * them. Spare the extra round through __sigqueue_free()
+		 * which is ignoring preallocated signals.
+		 */
+		if (unlikely((first->flags & SIGQUEUE_PREALLOC) && (info->si_code == SI_TIMER)))
+			*timer_sigq = first;
+		else
+			__sigqueue_free(first);
 	} else {
 		/*
 		 * Ok, it wasn't in the queue.  This must be
@@ -588,12 +595,12 @@ still_pending:
 }
 
 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
-			kernel_siginfo_t *info, bool *resched_timer)
+			    kernel_siginfo_t *info, struct sigqueue **timer_sigq)
 {
 	int sig = next_signal(pending, mask);
 
 	if (sig)
-		collect_signal(sig, pending, info, resched_timer);
+		collect_signal(sig, pending, info, timer_sigq);
 	return sig;
 }
 
@@ -605,18 +612,19 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
 {
 	struct task_struct *tsk = current;
-	bool resched_timer = false;
+	struct sigqueue *timer_sigq;
 	int signr;
 
 	lockdep_assert_held(&tsk->sighand->siglock);
 
 again:
 	*type = PIDTYPE_PID;
-	signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
+	timer_sigq = NULL;
+	signr = __dequeue_signal(&tsk->pending, mask, info, &timer_sigq);
 	if (!signr) {
 		*type = PIDTYPE_TGID;
 		signr = __dequeue_signal(&tsk->signal->shared_pending,
-					 mask, info, &resched_timer);
+					 mask, info, &timer_sigq);
 
 		if (unlikely(signr == SIGALRM))
 			posixtimer_rearm_itimer(tsk);
@@ -642,8 +650,8 @@ again:
 		current->jobctl |= JOBCTL_STOP_DEQUEUED;
 	}
 
-	if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(resched_timer)) {
-		if (!posixtimer_deliver_signal(info))
+	if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(timer_sigq)) {
+		if (!posixtimer_deliver_signal(info, timer_sigq))
 			goto again;
 	}
 
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0901ed9ca183..d6fef064b357 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -254,7 +254,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
  * This function is called from the signal delivery code. It decides
  * whether the signal should be dropped and rearms interval timers.
  */
-bool posixtimer_deliver_signal(struct kernel_siginfo *info)
+bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq)
 {
 	struct k_itimer *timr;
 	unsigned long flags;
-- 
cgit v1.2.3


From 6017a158beb13b412e55a451379798aae5876514 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:45 +0100
Subject: posix-timers: Embed sigqueue in struct k_itimer

To cure the SIG_IGN handling for posix interval timers, the preallocated
sigqueue needs to be embedded into struct k_itimer to prevent life time
races of all sorts.

Now that the prerequisites are in place, embed the sigqueue into struct
k_itimer and fixup the relevant usage sites.

Aside of preparing for proper SIG_IGN handling, this spares an extra
allocation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241105064213.719695194@linutronix.de
---
 fs/proc/base.c               |  4 +-
 include/linux/posix-timers.h | 23 +++++++++++-
 kernel/signal.c              | 19 ++++++----
 kernel/time/posix-timers.c   | 88 ++++++++++++++++++++++++++------------------
 4 files changed, 87 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index b31283d81c52..6a37a43241e4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2553,8 +2553,8 @@ static int show_timer(struct seq_file *m, void *v)
 
 	seq_printf(m, "ID: %d\n", timer->it_id);
 	seq_printf(m, "signal: %d/%px\n",
-		   timer->sigq->info.si_signo,
-		   timer->sigq->info.si_value.sival_ptr);
+		   timer->sigq.info.si_signo,
+		   timer->sigq.info.si_value.sival_ptr);
 	seq_printf(m, "notify: %s/%s.%d\n",
 		   nstr[notify & ~SIGEV_THREAD_ID],
 		   (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 39f1db76833a..28c0a30e0853 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -39,6 +39,8 @@ static inline int clockid_to_fd(const clockid_t clk)
 
 #ifdef CONFIG_POSIX_TIMERS
 
+#include <linux/signal_types.h>
+
 /**
  * cpu_timer - Posix CPU timer representation for k_itimer
  * @node:	timerqueue node to queue in the task/sig
@@ -166,7 +168,7 @@ static inline void posix_cputimers_init_work(void) { }
  * @it_pid:		The pid of the process/task targeted by the signal
  * @it_process:		The task to wakeup on clock_nanosleep (CPU timers)
  * @rcuref:		Reference count for life time management
- * @sigq:		Pointer to preallocated sigqueue
+ * @sigq:		Embedded sigqueue
  * @it:			Union representing the various posix timer type
  *			internals.
  * @rcu:		RCU head for freeing the timer.
@@ -190,7 +192,7 @@ struct k_itimer {
 		struct pid		*it_pid;
 		struct task_struct	*it_process;
 	};
-	struct sigqueue		*sigq;
+	struct sigqueue		sigq;
 	rcuref_t		rcuref;
 	union {
 		struct {
@@ -218,6 +220,23 @@ static inline void posixtimer_putref(struct k_itimer *tmr)
 	if (rcuref_put(&tmr->rcuref))
 		posixtimer_free_timer(tmr);
 }
+
+static inline void posixtimer_sigqueue_getref(struct sigqueue *q)
+{
+	struct k_itimer *tmr = container_of(q, struct k_itimer, sigq);
+
+	WARN_ON_ONCE(!rcuref_get(&tmr->rcuref));
+}
+
+static inline void posixtimer_sigqueue_putref(struct sigqueue *q)
+{
+	struct k_itimer *tmr = container_of(q, struct k_itimer, sigq);
+
+	posixtimer_putref(tmr);
+}
+#else  /* CONFIG_POSIX_TIMERS */
+static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { }
+static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { }
 #endif /* !CONFIG_POSIX_TIMERS */
 
 #endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 0ddb5dd284aa..2d74cd5841ae 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -460,8 +460,10 @@ static struct sigqueue *__sigqueue_alloc(int sig, struct task_struct *t, gfp_t g
 
 static void __sigqueue_free(struct sigqueue *q)
 {
-	if (q->flags & SIGQUEUE_PREALLOC)
+	if (q->flags & SIGQUEUE_PREALLOC) {
+		posixtimer_sigqueue_putref(q);
 		return;
+	}
 	if (q->ucounts) {
 		dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING);
 		q->ucounts = NULL;
@@ -569,11 +571,11 @@ still_pending:
 		copy_siginfo(info, &first->info);
 
 		/*
-		 * posix-timer signals are preallocated and freed when the
-		 * timer goes away. Either directly or by clearing
-		 * SIGQUEUE_PREALLOC so that the next delivery will free
-		 * them. Spare the extra round through __sigqueue_free()
-		 * which is ignoring preallocated signals.
+		 * posix-timer signals are preallocated and freed when the last
+		 * reference count is dropped in posixtimer_deliver_signal() or
+		 * immediately on timer deletion when the signal is not pending.
+		 * Spare the extra round through __sigqueue_free() which is
+		 * ignoring preallocated signals.
 		 */
 		if (unlikely((first->flags & SIGQUEUE_PREALLOC) && (info->si_code == SI_TIMER)))
 			*timer_sigq = first;
@@ -1989,7 +1991,7 @@ static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr)
 
 int posixtimer_send_sigqueue(struct k_itimer *tmr)
 {
-	struct sigqueue *q = tmr->sigq;
+	struct sigqueue *q = &tmr->sigq;
 	int sig = q->info.si_signo;
 	struct task_struct *t;
 	unsigned long flags;
@@ -2020,9 +2022,12 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr)
 
 	ret = 0;
 	if (unlikely(!list_empty(&q->list))) {
+		/* This holds a reference count already */
 		result = TRACE_SIGNAL_ALREADY_PENDING;
 		goto out;
 	}
+
+	posixtimer_sigqueue_getref(q);
 	posixtimer_queue_sigqueue(q, t, tmr->it_pid_type);
 	result = TRACE_SIGNAL_DELIVERED;
 out:
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index d6fef064b357..2e2c0edcfa97 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -250,15 +250,40 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
 	hrtimer_restart(timer);
 }
 
+static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr)
+{
+	guard(spinlock)(&timr->it_lock);
+
+	/*
+	 * Check if the timer is still alive or whether it got modified
+	 * since the signal was queued. In either case, don't rearm and
+	 * drop the signal.
+	 */
+	if (timr->it_signal_seq != info->si_sys_private || WARN_ON_ONCE(!timr->it_signal))
+		return false;
+
+	if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING))
+		return true;
+
+	timr->kclock->timer_rearm(timr);
+	timr->it_status = POSIX_TIMER_ARMED;
+	timr->it_overrun_last = timr->it_overrun;
+	timr->it_overrun = -1LL;
+	++timr->it_signal_seq;
+	info->si_overrun = timer_overrun_to_int(timr);
+	return true;
+}
+
 /*
  * This function is called from the signal delivery code. It decides
- * whether the signal should be dropped and rearms interval timers.
+ * whether the signal should be dropped and rearms interval timers.  The
+ * timer can be unconditionally accessed as there is a reference held on
+ * it.
  */
 bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq)
 {
-	struct k_itimer *timr;
-	unsigned long flags;
-	bool ret = false;
+	struct k_itimer *timr = container_of(timer_sigq, struct k_itimer, sigq);
+	bool ret;
 
 	/*
 	 * Release siglock to ensure proper locking order versus
@@ -266,28 +291,11 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *tim
 	 */
 	spin_unlock(&current->sighand->siglock);
 
-	timr = lock_timer(info->si_tid, &flags);
-	if (!timr)
-		goto out;
-
-	if (timr->it_signal_seq != info->si_sys_private)
-		goto out_unlock;
-
-	if (timr->it_interval && !WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) {
-		timr->kclock->timer_rearm(timr);
+	ret = __posixtimer_deliver_signal(info, timr);
 
-		timr->it_status = POSIX_TIMER_ARMED;
-		timr->it_overrun_last = timr->it_overrun;
-		timr->it_overrun = -1LL;
-		++timr->it_signal_seq;
-
-		info->si_overrun = timer_overrun_to_int(timr);
-	}
-	ret = true;
+	/* Drop the reference which was acquired when the signal was queued */
+	posixtimer_putref(timr);
 
-out_unlock:
-	unlock_timer(timr, flags);
-out:
 	spin_lock(&current->sighand->siglock);
 
 	/* Don't expose the si_sys_private value to userspace */
@@ -404,17 +412,17 @@ static struct pid *good_sigevent(sigevent_t * event)
 	}
 }
 
-static struct k_itimer * alloc_posix_timer(void)
+static struct k_itimer *alloc_posix_timer(void)
 {
 	struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
 
 	if (!tmr)
 		return tmr;
-	if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
+
+	if (unlikely(!posixtimer_init_sigqueue(&tmr->sigq))) {
 		kmem_cache_free(posix_timers_cache, tmr);
 		return NULL;
 	}
-	clear_siginfo(&tmr->sigq->info);
 	rcuref_init(&tmr->rcuref, 1);
 	return tmr;
 }
@@ -422,7 +430,8 @@ static struct k_itimer * alloc_posix_timer(void)
 void posixtimer_free_timer(struct k_itimer *tmr)
 {
 	put_pid(tmr->it_pid);
-	sigqueue_free(tmr->sigq);
+	if (tmr->sigq.ucounts)
+		dec_rlimit_put_ucounts(tmr->sigq.ucounts, UCOUNT_RLIMIT_SIGPENDING);
 	kfree_rcu(tmr, rcu);
 }
 
@@ -484,13 +493,13 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
 			goto out;
 		}
 		new_timer->it_sigev_notify     = event->sigev_notify;
-		new_timer->sigq->info.si_signo = event->sigev_signo;
-		new_timer->sigq->info.si_value = event->sigev_value;
+		new_timer->sigq.info.si_signo = event->sigev_signo;
+		new_timer->sigq.info.si_value = event->sigev_value;
 	} else {
 		new_timer->it_sigev_notify     = SIGEV_SIGNAL;
-		new_timer->sigq->info.si_signo = SIGALRM;
-		memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t));
-		new_timer->sigq->info.si_value.sival_int = new_timer->it_id;
+		new_timer->sigq.info.si_signo = SIGALRM;
+		memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t));
+		new_timer->sigq.info.si_value.sival_int = new_timer->it_id;
 		new_timer->it_pid = get_pid(task_tgid(current));
 	}
 
@@ -499,8 +508,8 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
 	else
 		new_timer->it_pid_type = PIDTYPE_TGID;
 
-	new_timer->sigq->info.si_tid   = new_timer->it_id;
-	new_timer->sigq->info.si_code  = SI_TIMER;
+	new_timer->sigq.info.si_tid = new_timer->it_id;
+	new_timer->sigq.info.si_code = SI_TIMER;
 
 	if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) {
 		error = -EFAULT;
@@ -584,7 +593,14 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
 	 *  1) Set timr::it_signal to NULL with timr::it_lock held
 	 *  2) Release timr::it_lock
 	 *  3) Remove from the hash under hash_lock
-	 *  4) Call RCU for removal after the grace period
+	 *  4) Put the reference count.
+	 *
+	 * The reference count might not drop to zero if timr::sigq is
+	 * queued. In that case the signal delivery or flush will put the
+	 * last reference count.
+	 *
+	 * When the reference count reaches zero, the timer is scheduled
+	 * for RCU removal after the grace period.
 	 *
 	 * Holding rcu_read_lock() accross the lookup ensures that
 	 * the timer cannot be freed.
-- 
cgit v1.2.3


From c2a4796a154bb952be1106911841aab2c8c17c4d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:46 +0100
Subject: signal: Cleanup unused posix-timer leftovers

Remove the leftovers of sigqueue preallocation as it's not longer used.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241105064213.786506636@linutronix.de
---
 include/linux/sched/signal.h |  2 --
 kernel/signal.c              | 39 ++++-----------------------------------
 2 files changed, 4 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 36283c1c55e9..02972fd41931 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -338,8 +338,6 @@ extern void force_fatal_sig(int);
 extern void force_exit_sig(int);
 extern int send_sig(int, struct task_struct *, int);
 extern int zap_other_threads(struct task_struct *p);
-extern struct sigqueue *sigqueue_alloc(void);
-extern void sigqueue_free(struct sigqueue *);
 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
 
 static inline void clear_notify_signal(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index 2d74cd5841ae..d267a2c5e977 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -439,8 +439,8 @@ static void __sigqueue_init(struct sigqueue *q, struct ucounts *ucounts,
  * - this may be called without locks if and only if t == current, otherwise an
  *   appropriate lock must be held to stop the target task from exiting
  */
-static struct sigqueue *__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
-					 int override_rlimit, const unsigned int sigqueue_flags)
+static struct sigqueue *sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
+				       int override_rlimit)
 {
 	struct ucounts *ucounts = sig_get_ucounts(t, sig, override_rlimit);
 	struct sigqueue *q;
@@ -454,7 +454,7 @@ static struct sigqueue *__sigqueue_alloc(int sig, struct task_struct *t, gfp_t g
 		return NULL;
 	}
 
-	__sigqueue_init(q, ucounts, sigqueue_flags);
+	__sigqueue_init(q, ucounts, 0);
 	return q;
 }
 
@@ -1070,7 +1070,7 @@ static int __send_signal_locked(int sig, struct kernel_siginfo *info,
 	else
 		override_rlimit = 0;
 
-	q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0);
+	q = sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
 
 	if (q) {
 		list_add_tail(&q->list, &pending->list);
@@ -1926,37 +1926,6 @@ bool posixtimer_init_sigqueue(struct sigqueue *q)
 	return true;
 }
 
-struct sigqueue *sigqueue_alloc(void)
-{
-	return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC);
-}
-
-void sigqueue_free(struct sigqueue *q)
-{
-	spinlock_t *lock = &current->sighand->siglock;
-	unsigned long flags;
-
-	if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC)))
-		return;
-	/*
-	 * We must hold ->siglock while testing q->list
-	 * to serialize with collect_signal() or with
-	 * __exit_signal()->flush_sigqueue().
-	 */
-	spin_lock_irqsave(lock, flags);
-	q->flags &= ~SIGQUEUE_PREALLOC;
-	/*
-	 * If it is queued it will be freed when dequeued,
-	 * like the "regular" sigqueue.
-	 */
-	if (!list_empty(&q->list))
-		q = NULL;
-	spin_unlock_irqrestore(lock, flags);
-
-	if (q)
-		__sigqueue_free(q);
-}
-
 static void posixtimer_queue_sigqueue(struct sigqueue *q, struct task_struct *t, enum pid_type type)
 {
 	struct sigpending *pending;
-- 
cgit v1.2.3


From 647da5f709f112319c0d51e06f330d8afecb1940 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:48 +0100
Subject: posix-timers: Move sequence logic into struct k_itimer

The posix timer signal handling uses siginfo::si_sys_private for handling
the sequence counter check. That indirection is not longer required and the
sequence count value at signal queueing time can be stored in struct
k_itimer itself.

This removes the requirement of treating siginfo::si_sys_private special as
it's now always zero as the kernel does not touch it anymore.

Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Link: https://lore.kernel.org/all/20241105064213.852619866@linutronix.de
---
 include/linux/posix-timers.h       | 2 ++
 include/uapi/asm-generic/siginfo.h | 2 +-
 kernel/signal.c                    | 8 +++-----
 kernel/time/posix-timers.c         | 5 +----
 4 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 28c0a30e0853..49a89614d900 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -162,6 +162,7 @@ static inline void posix_cputimers_init_work(void) { }
  * @it_overrun:		The overrun counter for pending signals
  * @it_overrun_last:	The overrun at the time of the last delivered signal
  * @it_signal_seq:	Sequence count to control signal delivery
+ * @it_sigqueue_seq:	The sequence count at the point where the signal was queued
  * @it_sigev_notify:	The notify word of sigevent struct for signal delivery
  * @it_interval:	The interval for periodic timers
  * @it_signal:		Pointer to the creators signal struct
@@ -184,6 +185,7 @@ struct k_itimer {
 	s64			it_overrun;
 	s64			it_overrun_last;
 	unsigned int		it_signal_seq;
+	unsigned int		it_sigqueue_seq;
 	int			it_sigev_notify;
 	enum pid_type		it_pid_type;
 	ktime_t			it_interval;
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index b7bc545ec3b2..5a1ca43b5fc6 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -46,7 +46,7 @@ union __sifields {
 		__kernel_timer_t _tid;	/* timer id */
 		int _overrun;		/* overrun count */
 		sigval_t _sigval;	/* same as below */
-		int _sys_private;       /* not to be passed to user */
+		int _sys_private;       /* Not used by the kernel. Historic leftover. Always 0. */
 	} _timer;
 
 	/* POSIX.1b signals */
diff --git a/kernel/signal.c b/kernel/signal.c
index d267a2c5e977..d2734dc4d74f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1976,12 +1976,10 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr)
 		return -1;
 
 	/*
-	 * Update @q::info::si_sys_private for posix timer signals with
-	 * sighand locked to prevent a race against dequeue_signal() which
-	 * decides based on si_sys_private whether to invoke
-	 * posixtimer_rearm() or not.
+	 * Update @tmr::sigqueue_seq for posix timer signals with sighand
+	 * locked to prevent a race against dequeue_signal().
 	 */
-	q->info.si_sys_private = tmr->it_signal_seq;
+	tmr->it_sigqueue_seq = tmr->it_signal_seq;
 
 	ret = 1; /* the signal is ignored */
 	if (!prepare_signal(sig, t, false)) {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 2e2c0edcfa97..f20c06d0cf09 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -259,7 +259,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it
 	 * since the signal was queued. In either case, don't rearm and
 	 * drop the signal.
 	 */
-	if (timr->it_signal_seq != info->si_sys_private || WARN_ON_ONCE(!timr->it_signal))
+	if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal))
 		return false;
 
 	if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING))
@@ -297,9 +297,6 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *tim
 	posixtimer_putref(timr);
 
 	spin_lock(&current->sighand->siglock);
-
-	/* Don't expose the si_sys_private value to userspace */
-	info->si_sys_private = 0;
 	return ret;
 }
 
-- 
cgit v1.2.3


From 69f032c92cf883ea74a4b69ba3d91317aa6f174e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:49 +0100
Subject: signal: Provide ignored_posix_timers list

To prepare for handling posix timer signals on sigaction(SIG_IGN) properly,
add a list to task::signal.

This list will be used to queue posix timers so their signal can be
requeued when SIG_IGN is lifted later.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241105064213.920101900@linutronix.de
---
 include/linux/sched/signal.h | 1 +
 init/init_task.c             | 5 +++--
 kernel/fork.c                | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 02972fd41931..d5d03d919df8 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -138,6 +138,7 @@ struct signal_struct {
 	/* POSIX.1b Interval Timers */
 	unsigned int		next_posix_timer_id;
 	struct hlist_head	posix_timers;
+	struct hlist_head	ignored_posix_timers;
 
 	/* ITIMER_REAL timer for the process */
 	struct hrtimer real_timer;
diff --git a/init/init_task.c b/init/init_task.c
index 136a8231355a..e557f622bd90 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -30,8 +30,9 @@ static struct signal_struct init_signals = {
 	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
 	.exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock),
 #ifdef CONFIG_POSIX_TIMERS
-	.posix_timers	= HLIST_HEAD_INIT,
-	.cputimer	= {
+	.posix_timers		= HLIST_HEAD_INIT,
+	.ignored_posix_timers	= HLIST_HEAD_INIT,
+	.cputimer		= {
 		.cputime_atomic	= INIT_CPUTIME_ATOMIC,
 	},
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 60c0b4868fd4..c2bd8367a850 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1864,6 +1864,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 
 #ifdef CONFIG_POSIX_TIMERS
 	INIT_HLIST_HEAD(&sig->posix_timers);
+	INIT_HLIST_HEAD(&sig->ignored_posix_timers);
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	sig->real_timer.function = it_real_fn;
 #endif
-- 
cgit v1.2.3


From 0e20cd33acc7a173b23900550331ee82a23e9f00 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:51 +0100
Subject: posix-timers: Handle ignored list on delete and exit

To handle posix timer signals on sigaction(SIG_IGN) properly, the timers
will be queued on a separate ignored list.

Add the necessary cleanup code for timer_delete() and exit_itimers().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241105064213.987530588@linutronix.de
---
 include/linux/posix-timers.h |  4 +++-
 kernel/time/posix-timers.c   | 28 ++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 49a89614d900..1608b52a44d5 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -152,7 +152,8 @@ static inline void posix_cputimers_init_work(void) { }
 
 /**
  * struct k_itimer - POSIX.1b interval timer structure.
- * @list:		List head for binding the timer to signals->posix_timers
+ * @list:		List node for binding the timer to tsk::signal::posix_timers
+ * @ignored_list:	List node for tracking ignored timers in tsk::signal::ignored_posix_timers
  * @t_hash:		Entry in the posix timer hash table
  * @it_lock:		Lock protecting the timer
  * @kclock:		Pointer to the k_clock struct handling this timer
@@ -176,6 +177,7 @@ static inline void posix_cputimers_init_work(void) { }
  */
 struct k_itimer {
 	struct hlist_node	list;
+	struct hlist_node	ignored_list;
 	struct hlist_node	t_hash;
 	spinlock_t		it_lock;
 	const struct k_clock	*kclock;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index f20c06d0cf09..2b88fb4e937e 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1027,6 +1027,18 @@ int common_timer_del(struct k_itimer *timer)
 	return 0;
 }
 
+/*
+ * If the deleted timer is on the ignored list, remove it and
+ * drop the associated reference.
+ */
+static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr)
+{
+	if (!hlist_unhashed(&tmr->ignored_list)) {
+		hlist_del_init(&tmr->ignored_list);
+		posixtimer_putref(tmr);
+	}
+}
+
 static inline int timer_delete_hook(struct k_itimer *timer)
 {
 	const struct k_clock *kc = timer->kclock;
@@ -1059,6 +1071,7 @@ retry_delete:
 
 	spin_lock(&current->sighand->siglock);
 	hlist_del(&timer->list);
+	posix_timer_cleanup_ignored(timer);
 	spin_unlock(&current->sighand->siglock);
 	/*
 	 * A concurrent lookup could check timer::it_signal lockless. It
@@ -1110,6 +1123,8 @@ retry_delete:
 	}
 	hlist_del(&timer->list);
 
+	posix_timer_cleanup_ignored(timer);
+
 	/*
 	 * Setting timer::it_signal to NULL is technically not required
 	 * here as nothing can access the timer anymore legitimately via
@@ -1142,6 +1157,19 @@ void exit_itimers(struct task_struct *tsk)
 	/* The timers are not longer accessible via tsk::signal */
 	while (!hlist_empty(&timers))
 		itimer_delete(hlist_entry(timers.first, struct k_itimer, list));
+
+	/*
+	 * There should be no timers on the ignored list. itimer_delete() has
+	 * mopped them up.
+	 */
+	if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers)))
+		return;
+
+	hlist_move_list(&tsk->signal->ignored_posix_timers, &timers);
+	while (!hlist_empty(&timers)) {
+		posix_timer_cleanup_ignored(hlist_entry(timers.first, struct k_itimer,
+							ignored_list));
+	}
 }
 
 SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
-- 
cgit v1.2.3


From df7a996b4dab03c889fa86d849447b716f07b069 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:54 +0100
Subject: signal: Queue ignored posixtimers on ignore list

Queue posixtimers which have their signal ignored on the ignored list:

   1) When the timer fires and the signal has SIG_IGN set

   2) When SIG_IGN is installed via sigaction() and a timer signal
      is already queued

This only happens when the signal is for a valid timer, which delivered the
signal in periodic mode. One-shot timer signals are correctly dropped.

Due to the lock order constraints (sighand::siglock nests inside
timer::lock) the signal code cannot access any of the timer fields which
are relevant to make this decision, e.g. timer::it_status.

This is addressed by establishing a protection scheme which requires to
lock both locks on the timer side for modifying decision fields in the
timer struct and therefore makes it possible for the signal delivery to
evaluate with only sighand:siglock being held:

  1) Move the NULLification of timer->it_signal into the sighand::siglock
     protected section of timer_delete() and check timer::it_signal in the
     code path which determines whether the signal is dropped or queued on
     the ignore list.

     This ensures that a deleted timer cannot be moved onto the ignore
     list, which would prevent it from being freed on exit() as it is not
     longer in the process' posix timer list.

     If the timer got moved to the ignored list before deletion then it is
     removed from the ignored list under sighand lock in timer_delete().

  2) Provide a new timer::it_sig_periodic flag, which gets set in the
     signal queue path with both timer and sighand locks held if the timer
     is actually in periodic mode at expiry time.

     The ignore list code checks this flag under sighand::siglock and drops
     the signal when it is not set.

     If it is set, then the signal is moved to the ignored list independent
     of the actual state of the timer.

     When the signal is un-ignored later then the signal is moved back to
     the signal queue. On signal delivery the posix timer side decides
     about dropping the signal if the timer was re-armed, dis-armed or
     deleted based on the signal sequence counter check.

     If the thread/process exits then not yet delivered signals are
     discarded which means the reference of the timer containing the
     sigqueue is dropped and frees the timer.

     This is way cheaper than requiring all code paths to lock
     sighand::siglock of the target thread/process on any modification of
     timer::it_status or going all the way and removing pending signals
     from the signal queues on every rearm, disarm or delete operation.

So the protection scheme here is that on the timer side both timer::lock
and sighand::siglock have to be held for modifying

   timer::it_signal
   timer::it_sig_periodic

which means that on the signal side holding sighand::siglock is enough to
evaluate these fields.

In posixtimer_deliver_signal() holding timer::lock is sufficient to do the
sequence validation against timer::it_signal_seq because a concurrent
expiry is waiting on timer::lock to be released.

This completes the SIG_IGN handling and such timers are not longer self
rearmed which avoids pointless wakeups.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241105064214.120756416@linutronix.de
---
 include/linux/posix-timers.h |  2 ++
 kernel/signal.c              | 80 +++++++++++++++++++++++++++++++++++++++++---
 kernel/time/posix-timers.c   |  7 +++-
 3 files changed, 83 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 1608b52a44d5..43ea6e784a25 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -160,6 +160,7 @@ static inline void posix_cputimers_init_work(void) { }
  * @it_clock:		The posix timer clock id
  * @it_id:		The posix timer id for identifying the timer
  * @it_status:		The status of the timer
+ * @it_sig_periodic:	The periodic status at signal delivery
  * @it_overrun:		The overrun counter for pending signals
  * @it_overrun_last:	The overrun at the time of the last delivered signal
  * @it_signal_seq:	Sequence count to control signal delivery
@@ -184,6 +185,7 @@ struct k_itimer {
 	clockid_t		it_clock;
 	timer_t			it_id;
 	int			it_status;
+	bool			it_sig_periodic;
 	s64			it_overrun;
 	s64			it_overrun_last;
 	unsigned int		it_signal_seq;
diff --git a/kernel/signal.c b/kernel/signal.c
index 908b49c594e4..9b098a7a206f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -59,6 +59,8 @@
 #include <asm/cacheflush.h>
 #include <asm/syscall.h>	/* for syscall_get_* */
 
+#include "time/posix-timers.h"
+
 /*
  * SLAB caches for signal bits.
  */
@@ -731,6 +733,16 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)
 		kick_process(t);
 }
 
+static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q);
+
+static void sigqueue_free_ignored(struct task_struct *tsk, struct sigqueue *q)
+{
+	if (likely(!(q->flags & SIGQUEUE_PREALLOC) || q->info.si_code != SI_TIMER))
+		__sigqueue_free(q);
+	else
+		posixtimer_sig_ignore(tsk, q);
+}
+
 /* Remove signals in mask from the pending set and queue. */
 static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct sigpending *s)
 {
@@ -747,7 +759,7 @@ static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct si
 	list_for_each_entry_safe(q, n, &s->list, list) {
 		if (sigismember(mask, q->info.si_signo)) {
 			list_del_init(&q->list);
-			__sigqueue_free(q);
+			sigqueue_free_ignored(p, q);
 		}
 	}
 }
@@ -1964,7 +1976,7 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr)
 	int sig = q->info.si_signo;
 	struct task_struct *t;
 	unsigned long flags;
-	int ret, result;
+	int result;
 
 	guard(rcu)();
 
@@ -1981,13 +1993,55 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr)
 	 */
 	tmr->it_sigqueue_seq = tmr->it_signal_seq;
 
-	ret = 1; /* the signal is ignored */
+	/*
+	 * Set the signal delivery status under sighand lock, so that the
+	 * ignored signal handling can distinguish between a periodic and a
+	 * non-periodic timer.
+	 */
+	tmr->it_sig_periodic = tmr->it_status == POSIX_TIMER_REQUEUE_PENDING;
+
 	if (!prepare_signal(sig, t, false)) {
 		result = TRACE_SIGNAL_IGNORED;
+
+		/* Paranoia check. Try to survive. */
+		if (WARN_ON_ONCE(!list_empty(&q->list)))
+			goto out;
+
+		/* Periodic timers with SIG_IGN are queued on the ignored list */
+		if (tmr->it_sig_periodic) {
+			/*
+			 * Already queued means the timer was rearmed after
+			 * the previous expiry got it on the ignore list.
+			 * Nothing to do for that case.
+			 */
+			if (hlist_unhashed(&tmr->ignored_list)) {
+				/*
+				 * Take a signal reference and queue it on
+				 * the ignored list.
+				 */
+				posixtimer_sigqueue_getref(q);
+				posixtimer_sig_ignore(t, q);
+			}
+		} else if (!hlist_unhashed(&tmr->ignored_list)) {
+			/*
+			 * Covers the case where a timer was periodic and
+			 * then the signal was ignored. Later it was rearmed
+			 * as oneshot timer. The previous signal is invalid
+			 * now, and this oneshot signal has to be dropped.
+			 * Remove it from the ignored list and drop the
+			 * reference count as the signal is not longer
+			 * queued.
+			 */
+			hlist_del_init(&tmr->ignored_list);
+			posixtimer_putref(tmr);
+		}
 		goto out;
 	}
 
-	ret = 0;
+	/* This should never happen and leaks a reference count */
+	if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list)))
+		hlist_del_init(&tmr->ignored_list);
+
 	if (unlikely(!list_empty(&q->list))) {
 		/* This holds a reference count already */
 		result = TRACE_SIGNAL_ALREADY_PENDING;
@@ -2000,7 +2054,22 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr)
 out:
 	trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result);
 	unlock_task_sighand(t, &flags);
-	return ret;
+	return 0;
+}
+
+static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q)
+{
+	struct k_itimer *tmr = container_of(q, struct k_itimer, sigq);
+
+	/*
+	 * If the timer is marked deleted already or the signal originates
+	 * from a non-periodic timer, then just drop the reference
+	 * count. Otherwise queue it on the ignored list.
+	 */
+	if (tmr->it_signal && tmr->it_sig_periodic)
+		hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers);
+	else
+		posixtimer_putref(tmr);
 }
 
 static void posixtimer_sig_unignore(struct task_struct *tsk, int sig)
@@ -2048,6 +2117,7 @@ static void posixtimer_sig_unignore(struct task_struct *tsk, int sig)
 	}
 }
 #else /* CONFIG_POSIX_TIMERS */
+static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { }
 static inline void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { }
 #endif /* !CONFIG_POSIX_TIMERS */
 
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 2b88fb4e937e..ea72db3c9365 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1072,12 +1072,17 @@ retry_delete:
 	spin_lock(&current->sighand->siglock);
 	hlist_del(&timer->list);
 	posix_timer_cleanup_ignored(timer);
-	spin_unlock(&current->sighand->siglock);
 	/*
 	 * A concurrent lookup could check timer::it_signal lockless. It
 	 * will reevaluate with timer::it_lock held and observe the NULL.
+	 *
+	 * It must be written with siglock held so that the signal code
+	 * observes timer->it_signal == NULL in do_sigaction(SIG_IGN),
+	 * which prevents it from moving a pending signal of a deleted
+	 * timer to the ignore list.
 	 */
 	WRITE_ONCE(timer->it_signal, NULL);
+	spin_unlock(&current->sighand->siglock);
 
 	unlock_timer(timer, flags);
 	posix_timer_unhash_and_free(timer);
-- 
cgit v1.2.3


From 7a66f72b09bb0762360274b1fb677b3433dbaa06 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:55 +0100
Subject: posix-timers: Cleanup SIG_IGN workaround leftovers

Now that ignored posix timer signals are requeued and the timers are
rearmed on signal delivery the workaround to keep such timers alive and
self rearm them is not longer required.

Remove the relevant hacks and the not longer required return values from
the related functions. The alarm timer workarounds will be cleaned up in a
separate step.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241105064214.187239060@linutronix.de
---
 include/linux/posix-timers.h   |  2 +-
 kernel/signal.c                |  7 ++--
 kernel/time/alarmtimer.c       | 47 ++++++---------------------
 kernel/time/posix-cpu-timers.c | 18 +++--------
 kernel/time/posix-timers.c     | 73 ++++--------------------------------------
 kernel/time/posix-timers.h     |  2 +-
 6 files changed, 24 insertions(+), 125 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 43ea6e784a25..f11f10c97bd9 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -111,7 +111,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 
 void posixtimer_rearm_itimer(struct task_struct *p);
 bool posixtimer_init_sigqueue(struct sigqueue *q);
-int posixtimer_send_sigqueue(struct k_itimer *tmr);
+void posixtimer_send_sigqueue(struct k_itimer *tmr);
 bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq);
 void posixtimer_free_timer(struct k_itimer *timer);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 9b098a7a206f..cbf70c808969 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1970,7 +1970,7 @@ static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr)
 	return t;
 }
 
-int posixtimer_send_sigqueue(struct k_itimer *tmr)
+void posixtimer_send_sigqueue(struct k_itimer *tmr)
 {
 	struct sigqueue *q = &tmr->sigq;
 	int sig = q->info.si_signo;
@@ -1982,10 +1982,10 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr)
 
 	t = posixtimer_get_target(tmr);
 	if (!t)
-		return -1;
+		return;
 
 	if (!likely(lock_task_sighand(t, &flags)))
-		return -1;
+		return;
 
 	/*
 	 * Update @tmr::sigqueue_seq for posix timer signals with sighand
@@ -2054,7 +2054,6 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr)
 out:
 	trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result);
 	unlock_task_sighand(t, &flags);
-	return 0;
 }
 
 static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q)
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 452d8aa2f6e0..8543d7f1cdb4 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -197,28 +197,15 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
 {
 	struct alarm *alarm = container_of(timer, struct alarm, timer);
 	struct alarm_base *base = &alarm_bases[alarm->type];
-	unsigned long flags;
-	int ret = HRTIMER_NORESTART;
-	int restart = ALARMTIMER_NORESTART;
 
-	spin_lock_irqsave(&base->lock, flags);
-	alarmtimer_dequeue(base, alarm);
-	spin_unlock_irqrestore(&base->lock, flags);
+	scoped_guard (spinlock_irqsave, &base->lock)
+		alarmtimer_dequeue(base, alarm);
 
 	if (alarm->function)
-		restart = alarm->function(alarm, base->get_ktime());
-
-	spin_lock_irqsave(&base->lock, flags);
-	if (restart != ALARMTIMER_NORESTART) {
-		hrtimer_set_expires(&alarm->timer, alarm->node.expires);
-		alarmtimer_enqueue(base, alarm);
-		ret = HRTIMER_RESTART;
-	}
-	spin_unlock_irqrestore(&base->lock, flags);
+		alarm->function(alarm, base->get_ktime());
 
 	trace_alarmtimer_fired(alarm, base->get_ktime());
-	return ret;
-
+	return HRTIMER_NORESTART;
 }
 
 ktime_t alarm_expires_remaining(const struct alarm *alarm)
@@ -567,30 +554,14 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
  *
  * Return: whether the timer is to be restarted
  */
-static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
-							ktime_t now)
+static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now)
 {
-	struct k_itimer *ptr = container_of(alarm, struct k_itimer,
-					    it.alarm.alarmtimer);
-	enum alarmtimer_restart result = ALARMTIMER_NORESTART;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ptr->it_lock, flags);
+	struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer);
 
-	if (posix_timer_queue_signal(ptr) && ptr->it_interval) {
-		/*
-		 * Handle ignored signals and rearm the timer. This will go
-		 * away once we handle ignored signals proper. Ensure that
-		 * small intervals cannot starve the system.
-		 */
-		ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true);
-		++ptr->it_signal_seq;
-		ptr->it_status = POSIX_TIMER_ARMED;
-		result = ALARMTIMER_RESTART;
-	}
-	spin_unlock_irqrestore(&ptr->it_lock, flags);
+	guard(spinlock_irqsave)(&ptr->it_lock);
+	posix_timer_queue_signal(ptr);
 
-	return result;
+	return ALARMTIMER_NORESTART;
 }
 
 /**
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0c441d8c2604..50e8d04ab661 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -603,21 +603,11 @@ static void cpu_timer_fire(struct k_itimer *timer)
 		 */
 		wake_up_process(timer->it_process);
 		cpu_timer_setexpires(ctmr, 0);
-	} else if (!timer->it_interval) {
-		/*
-		 * One-shot timer.  Clear it as soon as it's fired.
-		 */
+	} else {
 		posix_timer_queue_signal(timer);
-		cpu_timer_setexpires(ctmr, 0);
-	} else if (posix_timer_queue_signal(timer)) {
-		/*
-		 * The signal did not get queued because the signal
-		 * was ignored, so we won't get any callback to
-		 * reload the timer.  But we need to keep it
-		 * ticking in case the signal is deliverable next time.
-		 */
-		posix_cpu_timer_rearm(timer);
-		++timer->it_signal_seq;
+		/* Disable oneshot timers */
+		if (!timer->it_interval)
+			cpu_timer_setexpires(ctmr, 0);
 	}
 }
 
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index ea72db3c9365..881a9ce96af7 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -300,21 +300,12 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *tim
 	return ret;
 }
 
-int posix_timer_queue_signal(struct k_itimer *timr)
+void posix_timer_queue_signal(struct k_itimer *timr)
 {
-	enum posix_timer_state state = POSIX_TIMER_DISARMED;
-	int ret;
-
 	lockdep_assert_held(&timr->it_lock);
 
-	if (timr->it_interval)
-		state = POSIX_TIMER_REQUEUE_PENDING;
-
-	timr->it_status = state;
-
-	ret = posixtimer_send_sigqueue(timr);
-	/* If we failed to send the signal the timer stops. */
-	return ret > 0;
+	timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED;
+	posixtimer_send_sigqueue(timr);
 }
 
 /*
@@ -327,62 +318,10 @@ int posix_timer_queue_signal(struct k_itimer *timr)
 static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 {
 	struct k_itimer *timr = container_of(timer, struct k_itimer, it.real.timer);
-	enum hrtimer_restart ret = HRTIMER_NORESTART;
-	unsigned long flags;
-
-	spin_lock_irqsave(&timr->it_lock, flags);
-
-	if (posix_timer_queue_signal(timr)) {
-		/*
-		 * The signal was not queued due to SIG_IGN. As a
-		 * consequence the timer is not going to be rearmed from
-		 * the signal delivery path. But as a real signal handler
-		 * can be installed later the timer must be rearmed here.
-		 */
-		if (timr->it_interval != 0) {
-			ktime_t now = hrtimer_cb_get_time(timer);
-
-			/*
-			 * FIXME: What we really want, is to stop this
-			 * timer completely and restart it in case the
-			 * SIG_IGN is removed. This is a non trivial
-			 * change to the signal handling code.
-			 *
-			 * For now let timers with an interval less than a
-			 * jiffy expire every jiffy and recheck for a
-			 * valid signal handler.
-			 *
-			 * This avoids interrupt starvation in case of a
-			 * very small interval, which would expire the
-			 * timer immediately again.
-			 *
-			 * Moving now ahead of time by one jiffy tricks
-			 * hrtimer_forward() to expire the timer later,
-			 * while it still maintains the overrun accuracy
-			 * for the price of a slight inconsistency in the
-			 * timer_gettime() case. This is at least better
-			 * than a timer storm.
-			 *
-			 * Only required when high resolution timers are
-			 * enabled as the periodic tick based timers are
-			 * automatically aligned to the next tick.
-			 */
-			if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) {
-				ktime_t kj = TICK_NSEC;
-
-				if (timr->it_interval < kj)
-					now = ktime_add(now, kj);
-			}
-
-			timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval);
-			ret = HRTIMER_RESTART;
-			++timr->it_signal_seq;
-			timr->it_status = POSIX_TIMER_ARMED;
-		}
-	}
 
-	unlock_timer(timr, flags);
-	return ret;
+	guard(spinlock_irqsave)(&timr->it_lock);
+	posix_timer_queue_signal(timr);
+	return HRTIMER_NORESTART;
 }
 
 static struct pid *good_sigevent(sigevent_t * event)
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 4d09677e584e..61906f0688c1 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -42,7 +42,7 @@ extern const struct k_clock clock_process;
 extern const struct k_clock clock_thread;
 extern const struct k_clock alarm_clock;
 
-int posix_timer_queue_signal(struct k_itimer *timr);
+void posix_timer_queue_signal(struct k_itimer *timr);
 
 void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting);
 int common_timer_set(struct k_itimer *timr, int flags,
-- 
cgit v1.2.3


From 2634303f8773b0c602069887565cd412440be15d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 Nov 2024 09:14:58 +0100
Subject: alarmtimers: Remove return value from alarm functions

Now that the SIG_IGN problem is solved in the core code, the alarmtimer
callbacks do not require a return value anymore.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/all/20241105064214.318837272@linutronix.de
---
 drivers/power/supply/charger-manager.c |  3 +--
 fs/timerfd.c                           |  4 +---
 include/linux/alarmtimer.h             | 10 ++--------
 kernel/time/alarmtimer.c               | 16 +++++-----------
 net/netfilter/xt_IDLETIMER.c           |  4 +---
 5 files changed, 10 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/charger-manager.c b/drivers/power/supply/charger-manager.c
index 96f0a7fbf105..09ec0ecf1486 100644
--- a/drivers/power/supply/charger-manager.c
+++ b/drivers/power/supply/charger-manager.c
@@ -1412,10 +1412,9 @@ static inline struct charger_desc *cm_get_drv_data(struct platform_device *pdev)
 	return dev_get_platdata(&pdev->dev);
 }
 
-static enum alarmtimer_restart cm_timer_func(struct alarm *alarm, ktime_t now)
+static void cm_timer_func(struct alarm *alarm, ktime_t now)
 {
 	cm_timer_set = false;
-	return ALARMTIMER_NORESTART;
 }
 
 static int charger_manager_probe(struct platform_device *pdev)
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 137523e0bb21..f10c99ad5c60 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -79,13 +79,11 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 	return HRTIMER_NORESTART;
 }
 
-static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
-	ktime_t now)
+static void timerfd_alarmproc(struct alarm *alarm, ktime_t now)
 {
 	struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx,
 					       t.alarm);
 	timerfd_triggered(ctx);
-	return ALARMTIMER_NORESTART;
 }
 
 /*
diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h
index 05e758b8b894..3ffa5341dce2 100644
--- a/include/linux/alarmtimer.h
+++ b/include/linux/alarmtimer.h
@@ -20,12 +20,6 @@ enum alarmtimer_type {
 	ALARM_BOOTTIME_FREEZER,
 };
 
-enum alarmtimer_restart {
-	ALARMTIMER_NORESTART,
-	ALARMTIMER_RESTART,
-};
-
-
 #define ALARMTIMER_STATE_INACTIVE	0x00
 #define ALARMTIMER_STATE_ENQUEUED	0x01
 
@@ -42,14 +36,14 @@ enum alarmtimer_restart {
 struct alarm {
 	struct timerqueue_node	node;
 	struct hrtimer		timer;
-	enum alarmtimer_restart	(*function)(struct alarm *, ktime_t now);
+	void			(*function)(struct alarm *, ktime_t now);
 	enum alarmtimer_type	type;
 	int			state;
 	void			*data;
 };
 
 void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
-		enum alarmtimer_restart (*function)(struct alarm *, ktime_t));
+		void (*function)(struct alarm *, ktime_t));
 void alarm_start(struct alarm *alarm, ktime_t start);
 void alarm_start_relative(struct alarm *alarm, ktime_t start);
 void alarm_restart(struct alarm *alarm);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 593e7d561fa8..37d2d79daea4 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -321,7 +321,7 @@ static int alarmtimer_resume(struct device *dev)
 
 static void
 __alarm_init(struct alarm *alarm, enum alarmtimer_type type,
-	     enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+	     void (*function)(struct alarm *, ktime_t))
 {
 	timerqueue_init(&alarm->node);
 	alarm->timer.function = alarmtimer_fired;
@@ -337,7 +337,7 @@ __alarm_init(struct alarm *alarm, enum alarmtimer_type type,
  * @function: callback that is run when the alarm fires
  */
 void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
-		enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+		void (*function)(struct alarm *, ktime_t))
 {
 	hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
 		     HRTIMER_MODE_ABS);
@@ -530,14 +530,12 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
  *
  * Return: whether the timer is to be restarted
  */
-static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now)
+static void alarm_handle_timer(struct alarm *alarm, ktime_t now)
 {
 	struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer);
 
 	guard(spinlock_irqsave)(&ptr->it_lock);
 	posix_timer_queue_signal(ptr);
-
-	return ALARMTIMER_NORESTART;
 }
 
 /**
@@ -698,18 +696,14 @@ static int alarm_timer_create(struct k_itimer *new_timer)
  * @now: time at the timer expiration
  *
  * Wakes up the task that set the alarmtimer
- *
- * Return: ALARMTIMER_NORESTART
  */
-static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
-								ktime_t now)
+static void alarmtimer_nsleep_wakeup(struct alarm *alarm, ktime_t now)
 {
 	struct task_struct *task = alarm->data;
 
 	alarm->data = NULL;
 	if (task)
 		wake_up_process(task);
-	return ALARMTIMER_NORESTART;
 }
 
 /**
@@ -761,7 +755,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
 
 static void
 alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type,
-		    enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+		    void (*function)(struct alarm *, ktime_t))
 {
 	hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid,
 			      HRTIMER_MODE_ABS);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index db720efa811d..5514600586a9 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -107,14 +107,12 @@ static void idletimer_tg_expired(struct timer_list *t)
 	schedule_work(&timer->work);
 }
 
-static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm,
-							  ktime_t now)
+static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now)
 {
 	struct idletimer_tg *timer = alarm->data;
 
 	pr_debug("alarm %s expired\n", timer->attr.attr.name);
 	schedule_work(&timer->work);
-	return ALARMTIMER_NORESTART;
 }
 
 static int idletimer_check_sysfs_name(const char *name, unsigned int size)
-- 
cgit v1.2.3


From 1e4033b53db48cc77c436feac9c6d7d63db87b87 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Nov 2024 17:43:57 +0000
Subject: net: skb_reset_mac_len() must check if mac_header was set

Recent discussions show that skb_reset_mac_len() should be more careful.

We expect the MAC header being set.

If not, clear skb->mac_len and fire a warning for CONFIG_DEBUG_NET=y builds.

If after investigations we find that not having a MAC header was okay,
we can remove the warning.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/netdev/CANn89iJZGH+yEfJxfPWa3Hm7jxb-aeY2Up4HufmLMnVuQXt38A@mail.gmail.com/T/
Cc: En-Wei Wu <en-wei.wu@canonical.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://patch.msgid.link/20241105174403.850330-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 48f1e0fa2a13..5d8fefa71cac 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2909,9 +2909,19 @@ static inline void skb_reset_inner_headers(struct sk_buff *skb)
 	skb->inner_transport_header = skb->transport_header;
 }
 
+static inline int skb_mac_header_was_set(const struct sk_buff *skb)
+{
+	return skb->mac_header != (typeof(skb->mac_header))~0U;
+}
+
 static inline void skb_reset_mac_len(struct sk_buff *skb)
 {
-	skb->mac_len = skb->network_header - skb->mac_header;
+	if (!skb_mac_header_was_set(skb)) {
+		DEBUG_NET_WARN_ON_ONCE(1);
+		skb->mac_len = 0;
+	} else {
+		skb->mac_len = skb->network_header - skb->mac_header;
+	}
 }
 
 static inline unsigned char *skb_inner_transport_header(const struct sk_buff
@@ -3014,11 +3024,6 @@ static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
 	skb->network_header += offset;
 }
 
-static inline int skb_mac_header_was_set(const struct sk_buff *skb)
-{
-	return skb->mac_header != (typeof(skb->mac_header))~0U;
-}
-
 static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
 {
 	DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb));
-- 
cgit v1.2.3


From cfe8394e06f22847ecae0312bdd0b633b471b3f0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Nov 2024 17:43:58 +0000
Subject: net: add debug check in skb_reset_inner_transport_header()

Make sure (skb->data - skb->head) can fit in skb->inner_transport_header

This needs CONFIG_DEBUG_NET=y.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://patch.msgid.link/20241105174403.850330-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5d8fefa71cac..75795568803c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2937,7 +2937,10 @@ static inline int skb_inner_transport_offset(const struct sk_buff *skb)
 
 static inline void skb_reset_inner_transport_header(struct sk_buff *skb)
 {
-	skb->inner_transport_header = skb->data - skb->head;
+	long offset = skb->data - skb->head;
+
+	DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_transport_header))offset);
+	skb->inner_transport_header = offset;
 }
 
 static inline void skb_set_inner_transport_header(struct sk_buff *skb,
-- 
cgit v1.2.3


From 1732e4bedb3e29986da6ea315e8bf8caf74e1e38 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Nov 2024 17:43:59 +0000
Subject: net: add debug check in skb_reset_inner_network_header()

Make sure (skb->data - skb->head) can fit in skb->inner_network_header

This needs CONFIG_DEBUG_NET=y.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://patch.msgid.link/20241105174403.850330-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 75795568803c..8dafd1295a6e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2957,7 +2957,10 @@ static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb)
 
 static inline void skb_reset_inner_network_header(struct sk_buff *skb)
 {
-	skb->inner_network_header = skb->data - skb->head;
+	long offset = skb->data - skb->head;
+
+	DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_network_header))offset);
+	skb->inner_network_header = offset;
 }
 
 static inline void skb_set_inner_network_header(struct sk_buff *skb,
-- 
cgit v1.2.3


From 78a0cb2f45dc9327f26956c242f7f4b450efff49 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Nov 2024 17:44:00 +0000
Subject: net: add debug check in skb_reset_inner_mac_header()

Make sure (skb->data - skb->head) can fit in skb->inner_mac_header

This needs CONFIG_DEBUG_NET=y.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://patch.msgid.link/20241105174403.850330-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8dafd1295a6e..32a7d8a65b9f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2982,7 +2982,10 @@ static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb)
 
 static inline void skb_reset_inner_mac_header(struct sk_buff *skb)
 {
-	skb->inner_mac_header = skb->data - skb->head;
+	long offset = skb->data - skb->head;
+
+	DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_mac_header))offset);
+	skb->inner_mac_header = offset;
 }
 
 static inline void skb_set_inner_mac_header(struct sk_buff *skb,
-- 
cgit v1.2.3


From ae50ea52bdd77eabd8f3c4630c1b03c3373f61a5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Nov 2024 17:44:01 +0000
Subject: net: add debug check in skb_reset_transport_header()

Make sure (skb->data - skb->head) can fit in skb->transport_header

This needs CONFIG_DEBUG_NET=y.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://patch.msgid.link/20241105174403.850330-6-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 32a7d8a65b9f..f76524844e7e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3007,7 +3007,10 @@ static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
 
 static inline void skb_reset_transport_header(struct sk_buff *skb)
 {
-	skb->transport_header = skb->data - skb->head;
+	long offset = skb->data - skb->head;
+
+	DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->transport_header))offset);
+	skb->transport_header = offset;
 }
 
 static inline void skb_set_transport_header(struct sk_buff *skb,
-- 
cgit v1.2.3


From 305ae87dafc1a9f35374a783119d9e6001d1b8e0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Nov 2024 17:44:02 +0000
Subject: net: add debug check in skb_reset_network_header()

Make sure (skb->data - skb->head) can fit in skb->network_header

This needs CONFIG_DEBUG_NET=y.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://patch.msgid.link/20241105174403.850330-7-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f76524844e7e..f1e49a32880d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3027,7 +3027,10 @@ static inline unsigned char *skb_network_header(const struct sk_buff *skb)
 
 static inline void skb_reset_network_header(struct sk_buff *skb)
 {
-	skb->network_header = skb->data - skb->head;
+	long offset = skb->data - skb->head;
+
+	DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->network_header))offset);
+	skb->network_header = offset;
 }
 
 static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
-- 
cgit v1.2.3


From 3b6167e9bfc9eae4edad065c166c667f9a8e693a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Nov 2024 17:44:03 +0000
Subject: net: add debug check in skb_reset_mac_header()

Make sure (skb->data - skb->head) can fit in skb->mac_header

This needs CONFIG_DEBUG_NET=y.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://patch.msgid.link/20241105174403.850330-8-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f1e49a32880d..e5095d75abba 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3063,7 +3063,10 @@ static inline void skb_unset_mac_header(struct sk_buff *skb)
 
 static inline void skb_reset_mac_header(struct sk_buff *skb)
 {
-	skb->mac_header = skb->data - skb->head;
+	long offset = skb->data - skb->head;
+
+	DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->mac_header))offset);
+	skb->mac_header = offset;
 }
 
 static inline void skb_set_mac_header(struct sk_buff *skb, const int offset)
-- 
cgit v1.2.3


From 49a17639508c3b35f90ca829e60dddeeeb750e74 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 6 Nov 2024 15:51:39 +0100
Subject: softirq: Use a dedicated thread for timer wakeups on PREEMPT_RT.

The timer and hrtimer soft interrupts are raised in hard interrupt
context. With threaded interrupts force enabled or on PREEMPT_RT this leads
to waking the ksoftirqd for the processing of the soft interrupt.

ksoftirqd runs as SCHED_OTHER task which means it will compete with other
tasks for CPU resources.  This can introduce long delays for timer
processing on heavy loaded systems and is not desired.

Split the TIMER_SOFTIRQ and HRTIMER_SOFTIRQ processing into a dedicated
timers thread and let it run at the lowest SCHED_FIFO priority.
Wake-ups for RT tasks happen from hardirq context so only timer_list timers
and hrtimers for "regular" tasks are processed here. The higher priority
ensures that wakeups are performed before scheduling SCHED_OTHER tasks.

Using a dedicated variable to store the pending softirq bits values ensure
that the timer are not accidentally picked up by ksoftirqd and other
threaded interrupts.

It shouldn't be picked up by ksoftirqd since it runs at lower priority.
However if ksoftirqd is already running while a timer fires, then ksoftird
will be PI-boosted due to the BH-lock to ktimer's priority.

The timer thread can pick up pending softirqs from ksoftirqd but only
if the softirq load is high. It is not be desired that the picked up
softirqs are processed at SCHED_FIFO priority under high softirq load
but this can already happen by a PI-boost by a force-threaded interrupt.

[ frederic@kernel.org: rcutorture.c fixes, storm fix by introduction of
  local_timers_pending() for tick_nohz_next_event() ]

[ junxiao.chang@intel.com: Ensure ktimersd gets woken up even if a
  softirq is currently served. ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org> [rcutorture]
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/all/20241106150419.2593080-4-bigeasy@linutronix.de
---
 include/linux/interrupt.h | 47 ++++++++++++++++++++++++++++++++
 kernel/rcu/rcutorture.c   |  8 ++++++
 kernel/softirq.c          | 69 ++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/time/hrtimer.c     |  4 +--
 kernel/time/tick-sched.c  |  2 +-
 kernel/time/timer.c       |  2 +-
 6 files changed, 127 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 457151f9f263..8cd9327e4e78 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -616,6 +616,53 @@ extern void __raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
 
+/*
+ * With forced-threaded interrupts enabled a raised softirq is deferred to
+ * ksoftirqd unless it can be handled within the threaded interrupt. This
+ * affects timer_list timers and hrtimers which are explicitly marked with
+ * HRTIMER_MODE_SOFT.
+ * With PREEMPT_RT enabled more hrtimers are moved to softirq for processing
+ * which includes all timers which are not explicitly marked HRTIMER_MODE_HARD.
+ * Userspace controlled timers (like the clock_nanosleep() interface) is divided
+ * into two categories: Tasks with elevated scheduling policy including
+ * SCHED_{FIFO|RR|DL} and the remaining scheduling policy. The tasks with the
+ * elevated scheduling policy are woken up directly from the HARDIRQ while all
+ * other wake ups are delayed to softirq and so to ksoftirqd.
+ *
+ * The ksoftirqd runs at SCHED_OTHER policy at which it should remain since it
+ * handles the softirq in an overloaded situation (not handled everything
+ * within its last run).
+ * If the timers are handled at SCHED_OTHER priority then they competes with all
+ * other SCHED_OTHER tasks for CPU resources are possibly delayed.
+ * Moving timers softirqs to a low priority SCHED_FIFO thread instead ensures
+ * that timer are performed before scheduling any SCHED_OTHER thread.
+ */
+DECLARE_PER_CPU(struct task_struct *, ktimerd);
+DECLARE_PER_CPU(unsigned long, pending_timer_softirq);
+void raise_ktimers_thread(unsigned int nr);
+
+static inline unsigned int local_timers_pending_force_th(void)
+{
+	return __this_cpu_read(pending_timer_softirq);
+}
+
+static inline void raise_timer_softirq(unsigned int nr)
+{
+	lockdep_assert_in_irq();
+	if (force_irqthreads())
+		raise_ktimers_thread(nr);
+	else
+		__raise_softirq_irqoff(nr);
+}
+
+static inline unsigned int local_timers_pending(void)
+{
+	if (force_irqthreads())
+		return local_timers_pending_force_th();
+	else
+		return local_softirq_pending();
+}
+
 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
 
 static inline struct task_struct *this_cpu_ksoftirqd(void)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index bb75dbf5c800..270c31a1e857 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2440,6 +2440,14 @@ static int rcutorture_booster_init(unsigned int cpu)
 		WARN_ON_ONCE(!t);
 		sp.sched_priority = 2;
 		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#ifdef CONFIG_IRQ_FORCED_THREADING
+		if (force_irqthreads()) {
+			t = per_cpu(ktimerd, cpu);
+			WARN_ON_ONCE(!t);
+			sp.sched_priority = 2;
+			sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+		}
+#endif
 	}
 
 	/* Don't allow time recalculation while creating a new task. */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d082e7840f88..7b525c904462 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -624,6 +624,24 @@ static inline void tick_irq_exit(void)
 #endif
 }
 
+#ifdef CONFIG_IRQ_FORCED_THREADING
+DEFINE_PER_CPU(struct task_struct *, ktimerd);
+DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
+
+static void wake_timersd(void)
+{
+	struct task_struct *tsk = __this_cpu_read(ktimerd);
+
+	if (tsk)
+		wake_up_process(tsk);
+}
+
+#else
+
+static inline void wake_timersd(void) { }
+
+#endif
+
 static inline void __irq_exit_rcu(void)
 {
 #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -636,6 +654,10 @@ static inline void __irq_exit_rcu(void)
 	if (!in_interrupt() && local_softirq_pending())
 		invoke_softirq();
 
+	if (IS_ENABLED(CONFIG_IRQ_FORCED_THREADING) && force_irqthreads() &&
+	    local_timers_pending_force_th() && !(in_nmi() | in_hardirq()))
+		wake_timersd();
+
 	tick_irq_exit();
 }
 
@@ -971,12 +993,57 @@ static struct smp_hotplug_thread softirq_threads = {
 	.thread_comm		= "ksoftirqd/%u",
 };
 
+#ifdef CONFIG_IRQ_FORCED_THREADING
+static void ktimerd_setup(unsigned int cpu)
+{
+	/* Above SCHED_NORMAL to handle timers before regular tasks. */
+	sched_set_fifo_low(current);
+}
+
+static int ktimerd_should_run(unsigned int cpu)
+{
+	return local_timers_pending_force_th();
+}
+
+void raise_ktimers_thread(unsigned int nr)
+{
+	trace_softirq_raise(nr);
+	__this_cpu_or(pending_timer_softirq, BIT(nr));
+}
+
+static void run_ktimerd(unsigned int cpu)
+{
+	unsigned int timer_si;
+
+	ksoftirqd_run_begin();
+
+	timer_si = local_timers_pending_force_th();
+	__this_cpu_write(pending_timer_softirq, 0);
+	or_softirq_pending(timer_si);
+
+	__do_softirq();
+
+	ksoftirqd_run_end();
+}
+
+static struct smp_hotplug_thread timer_thread = {
+	.store			= &ktimerd,
+	.setup			= ktimerd_setup,
+	.thread_should_run	= ktimerd_should_run,
+	.thread_fn		= run_ktimerd,
+	.thread_comm		= "ktimers/%u",
+};
+#endif
+
 static __init int spawn_ksoftirqd(void)
 {
 	cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
 				  takeover_tasklets);
 	BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
-
+#ifdef CONFIG_IRQ_FORCED_THREADING
+	if (force_irqthreads())
+		BUG_ON(smpboot_register_percpu_thread(&timer_thread));
+#endif
 	return 0;
 }
 early_initcall(spawn_ksoftirqd);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5402e0f24217..d9911516e743 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1811,7 +1811,7 @@ retry:
 	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
 		cpu_base->softirq_expires_next = KTIME_MAX;
 		cpu_base->softirq_activated = 1;
-		__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+		raise_timer_softirq(HRTIMER_SOFTIRQ);
 	}
 
 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
@@ -1906,7 +1906,7 @@ void hrtimer_run_queues(void)
 	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
 		cpu_base->softirq_expires_next = KTIME_MAX;
 		cpu_base->softirq_activated = 1;
-		__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+		raise_timer_softirq(HRTIMER_SOFTIRQ);
 	}
 
 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 753a184c7090..976a212cca2e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -859,7 +859,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 
 static inline bool local_timer_softirq_pending(void)
 {
-	return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
+	return local_timers_pending() & BIT(TIMER_SOFTIRQ);
 }
 
 /*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1759de934284..06f0bc1db6d9 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2499,7 +2499,7 @@ static void run_local_timers(void)
 		 */
 		if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) ||
 		    (i == BASE_DEF && tmigr_requires_handle_remote())) {
-			__raise_softirq_irqoff(TIMER_SOFTIRQ);
+			raise_timer_softirq(TIMER_SOFTIRQ);
 			return;
 		}
 	}
-- 
cgit v1.2.3


From fbf920f255315974808ce91d934fe50198294d51 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 31 Oct 2024 16:14:15 +0100
Subject: hrtimers: Add missing hrtimer_init() trace points

hrtimer_init*_on_stack() is not covered by tracing when
CONFIG_DEBUG_OBJECTS_TIMERS=y.

Rework the functions similar to hrtimer_init() and hrtimer_init_sleeper()
so that the hrtimer_init() tracepoint is unconditionally available.

The rework makes hrtimer_init_sleeper() unused. Delete it.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/74528e8abf2bb96e8bee85ffacbf14e15cf89f0d.1730386209.git.namcao@linutronix.de
---
 include/linux/hrtimer.h | 19 +--------------
 kernel/time/hrtimer.c   | 65 +++++++++++++++++++++++++++----------------------
 2 files changed, 37 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index aa1e65ccb615..5aa9d57528c4 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -228,32 +228,15 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer)
 /* Initialize timers: */
 extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
 			 enum hrtimer_mode mode);
-extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
-				 enum hrtimer_mode mode);
-
-#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
 				  enum hrtimer_mode mode);
 extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
 					  clockid_t clock_id,
 					  enum hrtimer_mode mode);
 
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
 #else
-static inline void hrtimer_init_on_stack(struct hrtimer *timer,
-					 clockid_t which_clock,
-					 enum hrtimer_mode mode)
-{
-	hrtimer_init(timer, which_clock, mode);
-}
-
-static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
-						 clockid_t clock_id,
-						 enum hrtimer_mode mode)
-{
-	hrtimer_init_sleeper(sl, clock_id, mode);
-}
-
 static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
 #endif
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 04f7d8a392c3..4b0507cf38ea 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -417,6 +417,11 @@ static inline void debug_hrtimer_init(struct hrtimer *timer)
 	debug_object_init(timer, &hrtimer_debug_descr);
 }
 
+static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer)
+{
+	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
+}
+
 static inline void debug_hrtimer_activate(struct hrtimer *timer,
 					  enum hrtimer_mode mode)
 {
@@ -428,28 +433,6 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
 	debug_object_deactivate(timer, &hrtimer_debug_descr);
 }
 
-static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
-			   enum hrtimer_mode mode);
-
-void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
-			   enum hrtimer_mode mode)
-{
-	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
-	__hrtimer_init(timer, clock_id, mode);
-}
-EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
-
-static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
-				   clockid_t clock_id, enum hrtimer_mode mode);
-
-void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
-				   clockid_t clock_id, enum hrtimer_mode mode)
-{
-	debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
-	__hrtimer_init_sleeper(sl, clock_id, mode);
-}
-EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
-
 void destroy_hrtimer_on_stack(struct hrtimer *timer)
 {
 	debug_object_free(timer, &hrtimer_debug_descr);
@@ -459,6 +442,7 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
 #else
 
 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
+static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { }
 static inline void debug_hrtimer_activate(struct hrtimer *timer,
 					  enum hrtimer_mode mode) { }
 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
@@ -472,6 +456,13 @@ debug_init(struct hrtimer *timer, clockid_t clockid,
 	trace_hrtimer_init(timer, clockid, mode);
 }
 
+static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid,
+				       enum hrtimer_mode mode)
+{
+	debug_hrtimer_init_on_stack(timer);
+	trace_hrtimer_init(timer, clockid, mode);
+}
+
 static inline void debug_activate(struct hrtimer *timer,
 				  enum hrtimer_mode mode)
 {
@@ -1600,6 +1591,23 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
+/**
+ * hrtimer_init_on_stack - initialize a timer in stack memory
+ * @timer:	The timer to be initialized
+ * @clock_id:	The clock to be used
+ * @mode:       The timer mode
+ *
+ * Similar to hrtimer_init(), except that this one must be used if struct hrtimer is in stack
+ * memory.
+ */
+void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
+			   enum hrtimer_mode mode)
+{
+	debug_init_on_stack(timer, clock_id, mode);
+	__hrtimer_init(timer, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
+
 /*
  * A timer is active, when it is enqueued into the rbtree or the
  * callback function is running or it's in the state of being migrated
@@ -1944,7 +1952,7 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
 	 * Make the enqueue delivery mode check work on RT. If the sleeper
 	 * was initialized for hard interrupt delivery, force the mode bit.
 	 * This is a special case for hrtimer_sleepers because
-	 * hrtimer_init_sleeper() determines the delivery mode on RT so the
+	 * __hrtimer_init_sleeper() determines the delivery mode on RT so the
 	 * fiddling with this decision is avoided at the call sites.
 	 */
 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
@@ -1987,19 +1995,18 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 }
 
 /**
- * hrtimer_init_sleeper - initialize sleeper to the given clock
+ * hrtimer_init_sleeper_on_stack - initialize a sleeper in stack memory
  * @sl:		sleeper to be initialized
  * @clock_id:	the clock to be used
  * @mode:	timer mode abs/rel
  */
-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
-			  enum hrtimer_mode mode)
+void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
+				   clockid_t clock_id, enum hrtimer_mode mode)
 {
-	debug_init(&sl->timer, clock_id, mode);
+	debug_init_on_stack(&sl->timer, clock_id, mode);
 	__hrtimer_init_sleeper(sl, clock_id, mode);
-
 }
-EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
 
 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
 {
-- 
cgit v1.2.3


From 908a1d775422ba2e27a5e33d0c130b522419e121 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 31 Oct 2024 16:14:20 +0100
Subject: hrtimers: Introduce hrtimer_setup() to replace hrtimer_init()

To initialize hrtimer, hrtimer_init() needs to be called and also
hrtimer::function must be set. This is error-prone and awkward to use.

Introduce hrtimer_setup() which does both of these things, so that users of
hrtimer can be simplified.

The new setup function also has a sanity check for the provided function
pointer. If NULL, a warning is emitted and a dummy callback installed.

hrtimer_init() will be removed as soon as all of its users have been
converted to the new function.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/5057c1ddbfd4b92033cd93d37fe38e6b069d5ba6.1730386209.git.namcao@linutronix.de
---
 include/linux/hrtimer.h |  2 ++
 kernel/time/hrtimer.c   | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 5aa9d57528c4..bcc0715c59a8 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -228,6 +228,8 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer)
 /* Initialize timers: */
 extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
 			 enum hrtimer_mode mode);
+extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *),
+			  clockid_t clock_id, enum hrtimer_mode mode);
 extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
 				  enum hrtimer_mode mode);
 extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4b0507cf38ea..a5ef67edcda9 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1535,6 +1535,11 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
 	return HRTIMER_BASE_MONOTONIC;
 }
 
+static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused)
+{
+	return HRTIMER_NORESTART;
+}
+
 static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 			   enum hrtimer_mode mode)
 {
@@ -1571,6 +1576,18 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 	timerqueue_init(&timer->node);
 }
 
+static void __hrtimer_setup(struct hrtimer *timer,
+			    enum hrtimer_restart (*function)(struct hrtimer *),
+			    clockid_t clock_id, enum hrtimer_mode mode)
+{
+	__hrtimer_init(timer, clock_id, mode);
+
+	if (WARN_ON_ONCE(!function))
+		timer->function = hrtimer_dummy_timeout;
+	else
+		timer->function = function;
+}
+
 /**
  * hrtimer_init - initialize a timer to the given clock
  * @timer:	the timer to be initialized
@@ -1591,6 +1608,27 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
+/**
+ * hrtimer_setup - initialize a timer to the given clock
+ * @timer:	the timer to be initialized
+ * @function:	the callback function
+ * @clock_id:	the clock to be used
+ * @mode:       The modes which are relevant for initialization:
+ *              HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
+ *              HRTIMER_MODE_REL_SOFT
+ *
+ *              The PINNED variants of the above can be handed in,
+ *              but the PINNED bit is ignored as pinning happens
+ *              when the hrtimer is started
+ */
+void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *),
+		   clockid_t clock_id, enum hrtimer_mode mode)
+{
+	debug_init(timer, clock_id, mode);
+	__hrtimer_setup(timer, function, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_setup);
+
 /**
  * hrtimer_init_on_stack - initialize a timer in stack memory
  * @timer:	The timer to be initialized
-- 
cgit v1.2.3


From 444cb7db4c9f9b5d96be17c38b3e989df7bfabd5 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 31 Oct 2024 16:14:21 +0100
Subject: hrtimers: Introduce hrtimer_setup_on_stack()

To initialize hrtimer on stack, hrtimer_init_on_stack() needs to be called
and also hrtimer::function must be set. This is error-prone and awkward to
use.

Introduce hrtimer_setup_on_stack() which does both of these things, so that
users of hrtimer can be simplified.

The new setup function also has a sanity check for the provided function
pointer. If NULL, a warning is emitted and a dummy callback installed.

hrtimer_init_on_stack() will be removed as soon as all of its users have
been converted to the new function.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/4b05e2ab3a82c517adf67fabc0f0cd8fe118b97c.1730386209.git.namcao@linutronix.de
---
 include/linux/hrtimer.h |  3 +++
 kernel/time/hrtimer.c   | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bcc0715c59a8..2da513f8d66a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -232,6 +232,9 @@ extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function
 			  clockid_t clock_id, enum hrtimer_mode mode);
 extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
 				  enum hrtimer_mode mode);
+extern void hrtimer_setup_on_stack(struct hrtimer *timer,
+				   enum hrtimer_restart (*function)(struct hrtimer *),
+				   clockid_t clock_id, enum hrtimer_mode mode);
 extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
 					  clockid_t clock_id,
 					  enum hrtimer_mode mode);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index a5ef67edcda9..daee4e27f839 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1646,6 +1646,25 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
 
+/**
+ * hrtimer_setup_on_stack - initialize a timer on stack memory
+ * @timer:	The timer to be initialized
+ * @function:	the callback function
+ * @clock_id:	The clock to be used
+ * @mode:       The timer mode
+ *
+ * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack
+ * memory.
+ */
+void hrtimer_setup_on_stack(struct hrtimer *timer,
+			    enum hrtimer_restart (*function)(struct hrtimer *),
+			    clockid_t clock_id, enum hrtimer_mode mode)
+{
+	debug_init_on_stack(timer, clock_id, mode);
+	__hrtimer_setup(timer, function, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack);
+
 /*
  * A timer is active, when it is enqueued into the rbtree or the
  * callback function is running or it's in the state of being migrated
-- 
cgit v1.2.3


From c9bd83abfeb9a9b103e689b251ccff7a01be8366 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 31 Oct 2024 16:14:22 +0100
Subject: hrtimers: Introduce hrtimer_setup_sleeper_on_stack()

The hrtimer_init*() API is replaced by hrtimer_setup*() variants to
initialize the timer including the callback function at once.

hrtimer_init_sleeper_on_stack() does not need user to setup the callback
function separately, so a new variant would not be strictly necessary.

Nonetheless, to keep the naming convention consistent, introduce
hrtimer_setup_sleeper_on_stack(). hrtimer_init_on_stack() will be removed
once all users are converted.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/7b5e18e6dd0ace9eaa211201528cb9dc23752454.1730386209.git.namcao@linutronix.de
---
 include/linux/hrtimer.h |  2 ++
 kernel/time/hrtimer.c   | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2da513f8d66a..48872a2b4071 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -238,6 +238,8 @@ extern void hrtimer_setup_on_stack(struct hrtimer *timer,
 extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
 					  clockid_t clock_id,
 					  enum hrtimer_mode mode);
+extern void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id,
+					   enum hrtimer_mode mode);
 
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index daee4e27f839..1d1f5c03673c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2065,6 +2065,20 @@ void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
 
+/**
+ * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory
+ * @sl:		sleeper to be initialized
+ * @clock_id:	the clock to be used
+ * @mode:	timer mode abs/rel
+ */
+void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl,
+				    clockid_t clock_id, enum hrtimer_mode mode)
+{
+	debug_init_on_stack(&sl->timer, clock_id, mode);
+	__hrtimer_init_sleeper(sl, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack);
+
 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
 {
 	switch(restart->nanosleep.type) {
-- 
cgit v1.2.3


From 8f02e3563bb5824eb01c94f2c75f1dcee2d05625 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 31 Oct 2024 16:14:23 +0100
Subject: hrtimers: Introduce hrtimer_update_function()

Some users of hrtimer need to change the callback function after the
initial setup. They write to hrtimer::function directly.

That's not safe under all circumstances as the write is lockless and a
concurrent timer expiry might end up using the wrong function pointer.

Introduce hrtimer_update_function(), which also performs runtime checks
whether it is safe to modify the callback.

This allows to make hrtimer::function private once all users are converted.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20a937b0ae09ad54b5b6d86eabead7c570f1b72e.1730386209.git.namcao@linutronix.de
---
 include/linux/hrtimer.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 48872a2b4071..6e026730e803 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -327,6 +327,28 @@ static inline int hrtimer_callback_running(struct hrtimer *timer)
 	return timer->base->running == timer;
 }
 
+/**
+ * hrtimer_update_function - Update the timer's callback function
+ * @timer:	Timer to update
+ * @function:	New callback function
+ *
+ * Only safe to call if the timer is not enqueued. Can be called in the callback function if the
+ * timer is not enqueued at the same time (see the comments above HRTIMER_STATE_ENQUEUED).
+ */
+static inline void hrtimer_update_function(struct hrtimer *timer,
+					   enum hrtimer_restart (*function)(struct hrtimer *))
+{
+	guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock);
+
+	if (WARN_ON_ONCE(hrtimer_is_queued(timer)))
+		return;
+
+	if (WARN_ON_ONCE(!function))
+		return;
+
+	timer->function = function;
+}
+
 /* Forward a hrtimer so it expires after now: */
 extern u64
 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
-- 
cgit v1.2.3


From 211647e5121e0e0da974bf69a8eb7c9fe57fa3bd Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 31 Oct 2024 16:14:28 +0100
Subject: wait: Switch to use hrtimer_setup_sleeper_on_stack()

hrtimer_setup_sleeper_on_stack() replaces hrtimer_init_sleeper_on_stack()
to keep the naming convention consistent.

Convert the usage site over to it.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/fc91182375df81120a88dbe0263267e24d1bf19e.1730386209.git.namcao@linutronix.de
---
 include/linux/wait.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 8aa3372f21a0..643b7c7bf376 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -541,8 +541,8 @@ do {										\
 	int __ret = 0;								\
 	struct hrtimer_sleeper __t;						\
 										\
-	hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC,			\
-				      HRTIMER_MODE_REL);			\
+	hrtimer_setup_sleeper_on_stack(&__t, CLOCK_MONOTONIC,			\
+				       HRTIMER_MODE_REL);			\
 	if ((timeout) != KTIME_MAX) {						\
 		hrtimer_set_expires_range_ns(&__t.timer, timeout,		\
 					current->timer_slack_ns);		\
-- 
cgit v1.2.3


From f3bef7aaa6c807b78e8fc6929c3226d3038fe505 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 31 Oct 2024 16:14:29 +0100
Subject: hrtimers: Delete hrtimer_init_sleeper_on_stack()

hrtimer_init_sleeper_on_stack() is now unused. Delete it.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/52549846635c0b3a2abf82101f539efdabcd9778.1730386209.git.namcao@linutronix.de
---
 include/linux/hrtimer.h |  3 ---
 kernel/time/hrtimer.c   | 14 --------------
 2 files changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 6e026730e803..4e4f04b3c0c2 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -235,9 +235,6 @@ extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
 extern void hrtimer_setup_on_stack(struct hrtimer *timer,
 				   enum hrtimer_restart (*function)(struct hrtimer *),
 				   clockid_t clock_id, enum hrtimer_mode mode);
-extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
-					  clockid_t clock_id,
-					  enum hrtimer_mode mode);
 extern void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id,
 					   enum hrtimer_mode mode);
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 69430467a17d..376b8182b72e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2051,20 +2051,6 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 	sl->task = current;
 }
 
-/**
- * hrtimer_init_sleeper_on_stack - initialize a sleeper in stack memory
- * @sl:		sleeper to be initialized
- * @clock_id:	the clock to be used
- * @mode:	timer mode abs/rel
- */
-void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
-				   clockid_t clock_id, enum hrtimer_mode mode)
-{
-	debug_init_on_stack(&sl->timer, clock_id, mode);
-	__hrtimer_init_sleeper(sl, clock_id, mode);
-}
-EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
-
 /**
  * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory
  * @sl:		sleeper to be initialized
-- 
cgit v1.2.3


From 3c2fb0152175f9f596b40763cdc1378297da60af Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 31 Oct 2024 16:14:33 +0100
Subject: hrtimers: Delete hrtimer_init_on_stack()

hrtimer_init_on_stack() is now unused. Delete it.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/510ce0d2944c4a382ea51e51d03dcfb73ba0f4f7.1730386209.git.namcao@linutronix.de
---
 include/linux/hrtimer.h |  2 --
 kernel/time/hrtimer.c   | 17 -----------------
 2 files changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 4e4f04b3c0c2..7ef5f7ef31a9 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -230,8 +230,6 @@ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
 			 enum hrtimer_mode mode);
 extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *),
 			  clockid_t clock_id, enum hrtimer_mode mode);
-extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
-				  enum hrtimer_mode mode);
 extern void hrtimer_setup_on_stack(struct hrtimer *timer,
 				   enum hrtimer_restart (*function)(struct hrtimer *),
 				   clockid_t clock_id, enum hrtimer_mode mode);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 376b8182b72e..55e9ffbcd49a 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1629,23 +1629,6 @@ void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struc
 }
 EXPORT_SYMBOL_GPL(hrtimer_setup);
 
-/**
- * hrtimer_init_on_stack - initialize a timer in stack memory
- * @timer:	The timer to be initialized
- * @clock_id:	The clock to be used
- * @mode:       The timer mode
- *
- * Similar to hrtimer_init(), except that this one must be used if struct hrtimer is in stack
- * memory.
- */
-void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
-			   enum hrtimer_mode mode)
-{
-	debug_init_on_stack(timer, clock_id, mode);
-	__hrtimer_init(timer, clock_id, mode);
-}
-EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
-
 /**
  * hrtimer_setup_on_stack - initialize a timer on stack memory
  * @timer:	The timer to be initialized
-- 
cgit v1.2.3


From b33cc96c7020b923085046e5cf2e934f41c530ec Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 2 Oct 2024 16:25:30 +0100
Subject: mm: add PageAnonNotKsm()

Check that this anonymous page is really anonymous, not anonymous-or-KSM.
This optimises the debug check, but its real purpose is to remove the last
two users of PageKsm().

[willy@infradead.org: fix assertions]
  Link: https://lkml.kernel.org/r/ZwApWPER7caIA_N3@casper.infradead.org
Link: https://lkml.kernel.org/r/20241002152533.1350629-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alex Shi <alexs@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index cc839e4365c1..1fcef06a2d31 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -689,6 +689,13 @@ static __always_inline bool folio_test_anon(const struct folio *folio)
 	return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
 }
 
+static __always_inline bool PageAnonNotKsm(const struct page *page)
+{
+	unsigned long flags = (unsigned long)page_folio(page)->mapping;
+
+	return (flags & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON;
+}
+
 static __always_inline bool PageAnon(const struct page *page)
 {
 	return folio_test_anon(page_folio(page));
@@ -1137,14 +1144,14 @@ static __always_inline int PageAnonExclusive(const struct page *page)
 
 static __always_inline void SetPageAnonExclusive(struct page *page)
 {
-	VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page);
+	VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
 	VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
 	set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
 }
 
 static __always_inline void ClearPageAnonExclusive(struct page *page)
 {
-	VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page);
+	VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
 	VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
 	clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
 }
-- 
cgit v1.2.3


From b9a256352f3ba697396c26d2a74f4081335f8cef Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 2 Oct 2024 16:25:31 +0100
Subject: mm: remove PageKsm()

All callers have been converted to use folio_test_ksm() or
PageAnonNotKsm(), so we can remove this wrapper.

Link: https://lkml.kernel.org/r/20241002152533.1350629-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alex Shi <alexs@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 7 +------
 mm/internal.h              | 2 +-
 mm/ksm.c                   | 4 ++--
 3 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1fcef06a2d31..e80665bc51fa 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -725,13 +725,8 @@ static __always_inline bool folio_test_ksm(const struct folio *folio)
 	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
 				PAGE_MAPPING_KSM;
 }
-
-static __always_inline bool PageKsm(const struct page *page)
-{
-	return folio_test_ksm(page_folio(page));
-}
 #else
-TESTPAGEFLAG_FALSE(Ksm, ksm)
+FOLIO_TEST_FLAG_FALSE(ksm)
 #endif
 
 u64 stable_page_flags(const struct page *page);
diff --git a/mm/internal.h b/mm/internal.h
index 64c2eb0b160e..fc2f523258a3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1356,7 +1356,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
 		smp_rmb();
 
 	/*
-	 * Note that PageKsm() pages cannot be exclusive, and consequently,
+	 * Note that KSM pages cannot be exclusive, and consequently,
 	 * cannot get pinned.
 	 */
 	return !PageAnonExclusive(page);
diff --git a/mm/ksm.c b/mm/ksm.c
index b1c5c8aff41b..556b8a8f37d0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -656,7 +656,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
 	 *
 	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
 	 * backing file, which also invalidates anonymous pages: that's
-	 * okay, that truncation will have unmapped the PageKsm for us.
+	 * okay, that truncation will have unmapped the KSM page for us.
 	 *
 	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
 	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
@@ -1434,7 +1434,7 @@ out:
  * try_to_merge_one_page - take two pages and merge them into one
  * @vma: the vma that holds the pte pointing to page
  * @page: the PageAnon page that we want to replace with kpage
- * @kpage: the PageKsm page that we want to map instead of page,
+ * @kpage: the KSM page that we want to map instead of page,
  *         or NULL the first time when we want to use page as kpage.
  *
  * This function returns 0 if the pages were merged, -EFAULT otherwise.
-- 
cgit v1.2.3


From d7d65b1039019e8789119b498d97cf2531d989a8 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Thu, 3 Oct 2024 10:18:42 +0530
Subject: mm: move set_pxd_safe() helpers from generic to platform

set_pxd_safe() helpers that serve a specific purpose for both x86 and
riscv platforms, do not need to be in the common memory code.  Otherwise
they just unnecessarily make the common API more complicated.  This moves
the helpers from common code to platform instead.

Link: https://lkml.kernel.org/r/20241003044842.246016-1-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/riscv/include/asm/pgtable.h | 19 +++++++++++++++++++
 arch/x86/include/asm/pgtable.h   | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/pgtable.h          | 38 --------------------------------------
 3 files changed, 56 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index e79f15293492..5d7f3e8c2e50 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -963,6 +963,25 @@ void misc_mem_init(void);
 extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 
+/*
+ * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
+ * TLB flush will be required as a result of the "set". For example, use
+ * in scenarios where it is known ahead of time that the routine is
+ * setting non-present entries, or re-setting an existing entry to the
+ * same value. Otherwise, use the typical "set" helpers and flush the
+ * TLB.
+ */
+#define set_p4d_safe(p4dp, p4d) \
+({ \
+	WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \
+	set_p4d(p4dp, p4d); \
+})
+
+#define set_pgd_safe(pgdp, pgd) \
+({ \
+	WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
+	set_pgd(pgdp, pgd); \
+})
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_RISCV_PGTABLE_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 4c2d080d26b4..593f10aabd45 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1775,6 +1775,43 @@ bool arch_is_platform_page(u64 paddr);
 #define arch_is_platform_page arch_is_platform_page
 #endif
 
+/*
+ * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
+ * TLB flush will be required as a result of the "set". For example, use
+ * in scenarios where it is known ahead of time that the routine is
+ * setting non-present entries, or re-setting an existing entry to the
+ * same value. Otherwise, use the typical "set" helpers and flush the
+ * TLB.
+ */
+#define set_pte_safe(ptep, pte) \
+({ \
+	WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \
+	set_pte(ptep, pte); \
+})
+
+#define set_pmd_safe(pmdp, pmd) \
+({ \
+	WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \
+	set_pmd(pmdp, pmd); \
+})
+
+#define set_pud_safe(pudp, pud) \
+({ \
+	WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \
+	set_pud(pudp, pud); \
+})
+
+#define set_p4d_safe(p4dp, p4d) \
+({ \
+	WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \
+	set_p4d(p4dp, p4d); \
+})
+
+#define set_pgd_safe(pgdp, pgd) \
+({ \
+	WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
+	set_pgd(pgdp, pgd); \
+})
 #endif	/* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_PGTABLE_H */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e8b2ac6bd2ae..23aeffd89a4e 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1056,44 +1056,6 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
 }
 #endif
 
-/*
- * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
- * TLB flush will be required as a result of the "set". For example, use
- * in scenarios where it is known ahead of time that the routine is
- * setting non-present entries, or re-setting an existing entry to the
- * same value. Otherwise, use the typical "set" helpers and flush the
- * TLB.
- */
-#define set_pte_safe(ptep, pte) \
-({ \
-	WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \
-	set_pte(ptep, pte); \
-})
-
-#define set_pmd_safe(pmdp, pmd) \
-({ \
-	WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \
-	set_pmd(pmdp, pmd); \
-})
-
-#define set_pud_safe(pudp, pud) \
-({ \
-	WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \
-	set_pud(pudp, pud); \
-})
-
-#define set_p4d_safe(p4dp, p4d) \
-({ \
-	WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \
-	set_p4d(p4dp, p4d); \
-})
-
-#define set_pgd_safe(pgdp, pgd) \
-({ \
-	WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
-	set_pgd(pgdp, pgd); \
-})
-
 #ifndef __HAVE_ARCH_DO_SWAP_PAGE
 static inline void arch_do_swap_page_nr(struct mm_struct *mm,
 				     struct vm_area_struct *vma,
-- 
cgit v1.2.3


From 7f24cbc9c4d42db8a3c8484d120cf9c1da557fab Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Mon, 7 Oct 2024 09:50:29 +0200
Subject: mm/mmap: teach generic_get_unmapped_area{_topdown} to handle hugetlb
 mappings

Patch series "Unify hugetlb into arch_get_unmapped_area functions", v4.

This is an attempt to get rid of a fair amount of duplicated code wrt.
hugetlb and *get_unmapped_area* functions.

HugeTLB registers a .get_unmapped_area function which gets called from
__get_unmapped_area().
hugetlb_get_unmapped_area() is defined by a bunch of architectures and
it also has a generic definition for those that do not define it.
Short-long story is that there is a ton of duplicated code between
specific hugetlb *_get_unmapped_area_* functions and mm-core functions,
so we can do better by teaching arch_get_unmapped_area* functions how
to deal with hugetlb mappings.

Note that not a lot of things need to be taught though.
hugetlb_get_unmapped_area, that gets called for hugetlb mappings, runs
some sanity checks prior to calling mm_get_unmapped_area_vmflags(), so we
do not need to that down the road in the respective
{generic,arch}_get_unmapped_area* functions.

More information can be found in the respective patches.

LTP mmapstress hugetlb selftests were ran succesfully on:


This patch (of 9):

We want to stop special casing hugetlb mappings and make them go through
generic channels, so teach generic_get_unmapped_area{_topdown} to handle
those.  The main difference is that we set info.align_mask for huge
mappings.

Link: https://lkml.kernel.org/r/20241007075037.267650-1-osalvador@suse.de
Link: https://lkml.kernel.org/r/20241007075037.267650-2-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Donet Tom <donettom@linux.ibm.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 10 ++++++++++
 mm/mmap.c               |  4 ++++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e4697539b665..368d552e4860 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1035,9 +1035,19 @@ void hugetlb_unregister_node(struct node *node);
  */
 bool is_raw_hwpoison_page_in_hugepage(struct page *page);
 
+static inline unsigned long huge_page_mask_align(struct file *file)
+{
+	return PAGE_MASK & ~huge_page_mask(hstate_file(file));
+}
+
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
+static inline unsigned long huge_page_mask_align(struct file *file)
+{
+	return 0;
+}
+
 static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio)
 {
 	return NULL;
diff --git a/mm/mmap.c b/mm/mmap.c
index 79d541f1502b..fb91b2cb5561 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -776,6 +776,8 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr,
 	info.low_limit = mm->mmap_base;
 	info.high_limit = mmap_end;
 	info.start_gap = stack_guard_placement(vm_flags);
+	if (filp && is_file_hugepages(filp))
+		info.align_mask = huge_page_mask_align(filp);
 	return vm_unmapped_area(&info);
 }
 
@@ -826,6 +828,8 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 	info.low_limit = PAGE_SIZE;
 	info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
 	info.start_gap = stack_guard_placement(vm_flags);
+	if (filp && is_file_hugepages(filp))
+		info.align_mask = huge_page_mask_align(filp);
 	addr = vm_unmapped_area(&info);
 
 	/*
-- 
cgit v1.2.3


From 7bd3f1e1a9ae7f7508c88dd056755a0a4741ae88 Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Mon, 7 Oct 2024 09:50:34 +0200
Subject: mm: make hugetlb mappings go through mm_get_unmapped_area_vmflags

Hugetlb mappings will no longer be special cased but rather go through the
generic mm_get_unmapped_area_vmflags function.  For that to happen, let us
remove the .get_unmapped_area from hugetlbfs_file_operations struct, and
hint __get_unmapped_area that it should not send hugetlb mappings through
thp_get_unmapped_area_vmflags but through mm_get_unmapped_area_vmflags.

Create also a function called hugetlb_mmap_check_and_align() where a
couple of safety checks are being done and the addr is aligned to the huge
page size.  Otherwise we will have to do this in every single function,
which duplicates quite a lot of code.

Link: https://lkml.kernel.org/r/20241007075037.267650-7-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Donet Tom <donettom@linux.ibm.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    | 24 ++++++++++++++++--------
 include/linux/hugetlb.h |  9 ++++-----
 2 files changed, 20 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5cf327337e22..2c5f34e315d2 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -258,15 +258,23 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 			pgoff, flags);
 }
 
-#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-static unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-			  unsigned long len, unsigned long pgoff,
-			  unsigned long flags)
+unsigned long
+__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+			    unsigned long len, unsigned long flags)
 {
-	return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
+	unsigned long addr0 = 0;
+	struct hstate *h = hstate_file(file);
+
+	if (len & ~huge_page_mask(h))
+		return -EINVAL;
+	if ((flags & MAP_FIXED) && prepare_hugepage_range(file, addr, len))
+		return -EINVAL;
+	if (addr)
+		addr0 = ALIGN(addr, huge_page_size(h));
+
+	return mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff,
+					    flags, 0);
 }
-#endif
 
 /*
  * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset.
@@ -1300,7 +1308,7 @@ static const struct file_operations hugetlbfs_file_operations = {
 	.read_iter		= hugetlbfs_read_iter,
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= noop_fsync,
-	.get_unmapped_area	= hugetlb_get_unmapped_area,
+	.get_unmapped_area	= __hugetlb_get_unmapped_area,
 	.llseek			= default_llseek,
 	.fallocate		= hugetlbfs_fallocate,
 	.fop_flags		= FOP_HUGE_PAGES,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 368d552e4860..3a81b6126f62 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -546,11 +546,10 @@ static inline struct hstate *hstate_inode(struct inode *i)
 }
 #endif /* !CONFIG_HUGETLBFS */
 
-#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-					unsigned long len, unsigned long pgoff,
-					unsigned long flags);
-#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
+unsigned long
+__generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+				    unsigned long len, unsigned long pgoff,
+				    unsigned long flags);
 
 unsigned long
 generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-- 
cgit v1.2.3


From cc92882ee218d62ef017fa545b3c8a2d1e060a5a Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Mon, 7 Oct 2024 09:50:35 +0200
Subject: mm: drop hugetlb_get_unmapped_area{_*} functions

Hugetlb mappings are now handled through normal channels just like any
other mapping, so we no longer need hugetlb_get_unmapped_area* specific
functions.

Link: https://lkml.kernel.org/r/20241007075037.267650-8-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Donet Tom <donettom@linux.ibm.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/parisc/mm/hugetlbpage.c     |  21 --------
 arch/powerpc/mm/book3s64/slice.c |  10 ----
 arch/s390/mm/hugetlbpage.c       |  85 ------------------------------
 arch/sparc/mm/hugetlbpage.c      | 108 ---------------------------------------
 arch/x86/mm/hugetlbpage.c        | 101 ------------------------------------
 fs/hugetlbfs/inode.c             |  96 ++--------------------------------
 include/linux/hugetlb.h          |   7 +--
 7 files changed, 6 insertions(+), 422 deletions(-)

(limited to 'include/linux')

diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c
index aa664f7ddb63..e9d18cf25b79 100644
--- a/arch/parisc/mm/hugetlbpage.c
+++ b/arch/parisc/mm/hugetlbpage.c
@@ -21,27 +21,6 @@
 #include <asm/mmu_context.h>
 
 
-unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-		unsigned long len, unsigned long pgoff, unsigned long flags)
-{
-	struct hstate *h = hstate_file(file);
-
-	if (len & ~huge_page_mask(h))
-		return -EINVAL;
-	if (len > TASK_SIZE)
-		return -ENOMEM;
-
-	if (flags & MAP_FIXED)
-		if (prepare_hugepage_range(file, addr, len))
-			return -EINVAL;
-
-	if (addr)
-		addr = ALIGN(addr, huge_page_size(h));
-
-	/* we need to make sure the colouring is OK */
-	return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0);
-}
 
 
 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c
index 3a858f6b7270..bc9a39821d1c 100644
--- a/arch/powerpc/mm/book3s64/slice.c
+++ b/arch/powerpc/mm/book3s64/slice.c
@@ -814,14 +814,4 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 
 	return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start));
 }
-
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-					unsigned long len, unsigned long pgoff,
-					unsigned long flags)
-{
-	if (radix_enabled())
-		return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
-
-	return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1);
-}
 #endif
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index ded0eff58a19..7c79cf1bc7d7 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -242,88 +242,3 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
 	else
 		return false;
 }
-
-static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
-		unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
-{
-	struct hstate *h = hstate_file(file);
-	struct vm_unmapped_area_info info = {};
-
-	info.length = len;
-	info.low_limit = current->mm->mmap_base;
-	info.high_limit = TASK_SIZE;
-	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
-	return vm_unmapped_area(&info);
-}
-
-static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
-		unsigned long addr0, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
-{
-	struct hstate *h = hstate_file(file);
-	struct vm_unmapped_area_info info = {};
-	unsigned long addr;
-
-	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
-	info.length = len;
-	info.low_limit = PAGE_SIZE;
-	info.high_limit = current->mm->mmap_base;
-	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
-	addr = vm_unmapped_area(&info);
-
-	/*
-	 * A failed mmap() very likely causes application failure,
-	 * so fall back to the bottom-up function here. This scenario
-	 * can happen with large stack limits and large mmap()
-	 * allocations.
-	 */
-	if (addr & ~PAGE_MASK) {
-		VM_BUG_ON(addr != -ENOMEM);
-		info.flags = 0;
-		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = TASK_SIZE;
-		addr = vm_unmapped_area(&info);
-	}
-
-	return addr;
-}
-
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-		unsigned long len, unsigned long pgoff, unsigned long flags)
-{
-	struct hstate *h = hstate_file(file);
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-
-	if (len & ~huge_page_mask(h))
-		return -EINVAL;
-	if (len > TASK_SIZE - mmap_min_addr)
-		return -ENOMEM;
-
-	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(file, addr, len))
-			return -EINVAL;
-		goto check_asce_limit;
-	}
-
-	if (addr) {
-		addr = ALIGN(addr, huge_page_size(h));
-		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
-		    (!vma || addr + len <= vm_start_gap(vma)))
-			goto check_asce_limit;
-	}
-
-	if (!test_bit(MMF_TOPDOWN, &mm->flags))
-		addr = hugetlb_get_unmapped_area_bottomup(file, addr, len,
-				pgoff, flags);
-	else
-		addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
-				pgoff, flags);
-	if (offset_in_page(addr))
-		return addr;
-
-check_asce_limit:
-	return check_asce_limit(mm, addr, len);
-}
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index cc91ca7a1e18..eee601a0d2cf 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -19,114 +19,6 @@
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
 
-/* Slightly simplified from the non-hugepage variant because by
- * definition we don't have to worry about any page coloring stuff
- */
-
-static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
-							unsigned long addr,
-							unsigned long len,
-							unsigned long pgoff,
-							unsigned long flags)
-{
-	struct hstate *h = hstate_file(filp);
-	unsigned long task_size = TASK_SIZE;
-	struct vm_unmapped_area_info info = {};
-
-	if (test_thread_flag(TIF_32BIT))
-		task_size = STACK_TOP32;
-
-	info.length = len;
-	info.low_limit = TASK_UNMAPPED_BASE;
-	info.high_limit = min(task_size, VA_EXCLUDE_START);
-	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
-	addr = vm_unmapped_area(&info);
-
-	if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) {
-		VM_BUG_ON(addr != -ENOMEM);
-		info.low_limit = VA_EXCLUDE_END;
-		info.high_limit = task_size;
-		addr = vm_unmapped_area(&info);
-	}
-
-	return addr;
-}
-
-static unsigned long
-hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
-				  const unsigned long len,
-				  const unsigned long pgoff,
-				  const unsigned long flags)
-{
-	struct hstate *h = hstate_file(filp);
-	struct mm_struct *mm = current->mm;
-	unsigned long addr = addr0;
-	struct vm_unmapped_area_info info = {};
-
-	/* This should only ever run for 32-bit processes.  */
-	BUG_ON(!test_thread_flag(TIF_32BIT));
-
-	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
-	info.length = len;
-	info.low_limit = PAGE_SIZE;
-	info.high_limit = mm->mmap_base;
-	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
-	addr = vm_unmapped_area(&info);
-
-	/*
-	 * A failed mmap() very likely causes application failure,
-	 * so fall back to the bottom-up function here. This scenario
-	 * can happen with large stack limits and large mmap()
-	 * allocations.
-	 */
-	if (addr & ~PAGE_MASK) {
-		VM_BUG_ON(addr != -ENOMEM);
-		info.flags = 0;
-		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = STACK_TOP32;
-		addr = vm_unmapped_area(&info);
-	}
-
-	return addr;
-}
-
-unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-		unsigned long len, unsigned long pgoff, unsigned long flags)
-{
-	struct hstate *h = hstate_file(file);
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	unsigned long task_size = TASK_SIZE;
-
-	if (test_thread_flag(TIF_32BIT))
-		task_size = STACK_TOP32;
-
-	if (len & ~huge_page_mask(h))
-		return -EINVAL;
-	if (len > task_size)
-		return -ENOMEM;
-
-	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(file, addr, len))
-			return -EINVAL;
-		return addr;
-	}
-
-	if (addr) {
-		addr = ALIGN(addr, huge_page_size(h));
-		vma = find_vma(mm, addr);
-		if (task_size - len >= addr &&
-		    (!vma || addr + len <= vm_start_gap(vma)))
-			return addr;
-	}
-	if (!test_bit(MMF_TOPDOWN, &mm->flags))
-		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
-				pgoff, flags);
-	else
-		return hugetlb_get_unmapped_area_topdown(file, addr, len,
-				pgoff, flags);
-}
 
 static pte_t sun4u_hugepage_shift_to_tte(pte_t entry, unsigned int shift)
 {
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 807a5859a3c4..58f7f2bd535d 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -19,107 +19,6 @@
 #include <asm/tlbflush.h>
 #include <asm/elf.h>
 
-#ifdef CONFIG_HUGETLB_PAGE
-static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
-		unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
-{
-	struct hstate *h = hstate_file(file);
-	struct vm_unmapped_area_info info = {};
-
-	info.length = len;
-	info.low_limit = get_mmap_base(1);
-
-	/*
-	 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
-	 * in the full address space.
-	 */
-	info.high_limit = in_32bit_syscall() ?
-		task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
-
-	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
-	return vm_unmapped_area(&info);
-}
-
-static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
-		unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
-{
-	struct hstate *h = hstate_file(file);
-	struct vm_unmapped_area_info info = {};
-
-	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
-	info.length = len;
-	info.low_limit = PAGE_SIZE;
-	info.high_limit = get_mmap_base(0);
-
-	/*
-	 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
-	 * in the full address space.
-	 */
-	if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
-		info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
-
-	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
-	addr = vm_unmapped_area(&info);
-
-	/*
-	 * A failed mmap() very likely causes application failure,
-	 * so fall back to the bottom-up function here. This scenario
-	 * can happen with large stack limits and large mmap()
-	 * allocations.
-	 */
-	if (addr & ~PAGE_MASK) {
-		VM_BUG_ON(addr != -ENOMEM);
-		info.flags = 0;
-		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = TASK_SIZE_LOW;
-		addr = vm_unmapped_area(&info);
-	}
-
-	return addr;
-}
-
-unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-		unsigned long len, unsigned long pgoff, unsigned long flags)
-{
-	struct hstate *h = hstate_file(file);
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-
-	if (len & ~huge_page_mask(h))
-		return -EINVAL;
-
-	if (len > TASK_SIZE)
-		return -ENOMEM;
-
-	/* No address checking. See comment at mmap_address_hint_valid() */
-	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(file, addr, len))
-			return -EINVAL;
-		return addr;
-	}
-
-	if (addr) {
-		addr &= huge_page_mask(h);
-		if (!mmap_address_hint_valid(addr, len))
-			goto get_unmapped_area;
-
-		vma = find_vma(mm, addr);
-		if (!vma || addr + len <= vm_start_gap(vma))
-			return addr;
-	}
-
-get_unmapped_area:
-	if (!test_bit(MMF_TOPDOWN, &mm->flags))
-		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
-				pgoff, flags);
-	else
-		return hugetlb_get_unmapped_area_topdown(file, addr, len,
-				pgoff, flags);
-}
-#endif /* CONFIG_HUGETLB_PAGE */
 
 #ifdef CONFIG_X86_64
 bool __init arch_hugetlb_valid_size(unsigned long size)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2c5f34e315d2..935c0ed3aa1e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -171,96 +171,10 @@ out:
  * Called under mmap_write_lock(mm).
  */
 
-static unsigned long
-hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
-		unsigned long len, unsigned long pgoff, unsigned long flags)
-{
-	struct hstate *h = hstate_file(file);
-	struct vm_unmapped_area_info info = {};
-
-	info.length = len;
-	info.low_limit = current->mm->mmap_base;
-	info.high_limit = arch_get_mmap_end(addr, len, flags);
-	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
-	return vm_unmapped_area(&info);
-}
-
-static unsigned long
-hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
-		unsigned long len, unsigned long pgoff, unsigned long flags)
-{
-	struct hstate *h = hstate_file(file);
-	struct vm_unmapped_area_info info = {};
-
-	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
-	info.length = len;
-	info.low_limit = PAGE_SIZE;
-	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
-	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
-	addr = vm_unmapped_area(&info);
-
-	/*
-	 * A failed mmap() very likely causes application failure,
-	 * so fall back to the bottom-up function here. This scenario
-	 * can happen with large stack limits and large mmap()
-	 * allocations.
-	 */
-	if (unlikely(offset_in_page(addr))) {
-		VM_BUG_ON(addr != -ENOMEM);
-		info.flags = 0;
-		info.low_limit = current->mm->mmap_base;
-		info.high_limit = arch_get_mmap_end(addr, len, flags);
-		addr = vm_unmapped_area(&info);
-	}
-
-	return addr;
-}
-
-unsigned long
-generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-				  unsigned long len, unsigned long pgoff,
-				  unsigned long flags)
-{
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma, *prev;
-	struct hstate *h = hstate_file(file);
-	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
-
-	if (len & ~huge_page_mask(h))
-		return -EINVAL;
-	if (len > mmap_end - mmap_min_addr)
-		return -ENOMEM;
-
-	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(file, addr, len))
-			return -EINVAL;
-		return addr;
-	}
-
-	if (addr) {
-		addr = ALIGN(addr, huge_page_size(h));
-		vma = find_vma_prev(mm, addr, &prev);
-		if (mmap_end - len >= addr && addr >= mmap_min_addr &&
-		    (!vma || addr + len <= vm_start_gap(vma)) &&
-		    (!prev || addr >= vm_end_gap(prev)))
-			return addr;
-	}
-
-	/*
-	 * Use MMF_TOPDOWN flag as a hint to use topdown routine.
-	 * If architectures have special needs, they should define their own
-	 * version of hugetlb_get_unmapped_area.
-	 */
-	if (test_bit(MMF_TOPDOWN, &mm->flags))
-		return hugetlb_get_unmapped_area_topdown(file, addr, len,
-				pgoff, flags);
-	return hugetlb_get_unmapped_area_bottomup(file, addr, len,
-			pgoff, flags);
-}
-
 unsigned long
-__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-			    unsigned long len, unsigned long flags)
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+			    unsigned long len, unsigned long pgoff,
+			    unsigned long flags)
 {
 	unsigned long addr0 = 0;
 	struct hstate *h = hstate_file(file);
@@ -272,7 +186,7 @@ __hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (addr)
 		addr0 = ALIGN(addr, huge_page_size(h));
 
-	return mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff,
+	return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff,
 					    flags, 0);
 }
 
@@ -1308,7 +1222,7 @@ static const struct file_operations hugetlbfs_file_operations = {
 	.read_iter		= hugetlbfs_read_iter,
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= noop_fsync,
-	.get_unmapped_area	= __hugetlb_get_unmapped_area,
+	.get_unmapped_area	= hugetlb_get_unmapped_area,
 	.llseek			= default_llseek,
 	.fallocate		= hugetlbfs_fallocate,
 	.fop_flags		= FOP_HUGE_PAGES,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3a81b6126f62..ae4fe8615bb6 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -547,15 +547,10 @@ static inline struct hstate *hstate_inode(struct inode *i)
 #endif /* !CONFIG_HUGETLBFS */
 
 unsigned long
-__generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 				    unsigned long len, unsigned long pgoff,
 				    unsigned long flags);
 
-unsigned long
-generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-				  unsigned long len, unsigned long pgoff,
-				  unsigned long flags);
-
 /*
  * huegtlb page specific state flags.  These flags are located in page.private
  * of the hugetlb head page.  Functions created via the below macros should be
-- 
cgit v1.2.3


From afe789b7367ad43ba8f079981d40851f8bd319ce Mon Sep 17 00:00:00 2001
From: John Hubbard <jhubbard@nvidia.com>
Date: Tue, 8 Oct 2024 19:50:24 -0700
Subject: kaslr: rename physmem_end and PHYSMEM_END to direct_map_physmem_end

For clarity.  It's increasingly hard to reason about the code, when KASLR
is moving around the boundaries.  In this case where KASLR is randomizing
the location of the kernel image within physical memory, the maximum
number of address bits for physical memory has not changed.

What has changed is the ending address of memory that is allowed to be
directly mapped by the kernel.

Let's name the variable, and the associated macro accordingly.

Also, enhance the comment above the direct_map_physmem_end definition,
to further clarify how this all works.

Link: https://lkml.kernel.org/r/20241009025024.89813-1-jhubbard@nvidia.com
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Will Deacon <will@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Jordan Niethe <jniethe@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/memory.h         |  2 +-
 arch/x86/include/asm/page_64.h          |  2 +-
 arch/x86/include/asm/pgtable_64_types.h |  2 +-
 arch/x86/mm/init_64.c                   |  2 +-
 arch/x86/mm/kaslr.c                     | 14 +++++++++-----
 include/linux/mm.h                      |  6 +++---
 kernel/resource.c                       |  4 ++--
 mm/memory_hotplug.c                     |  2 +-
 mm/sparse.c                             |  2 +-
 9 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 0480c61dbb4f..73eaa8c2536a 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -110,7 +110,7 @@
 #define PAGE_END		(_PAGE_END(VA_BITS_MIN))
 #endif /* CONFIG_KASAN */
 
-#define PHYSMEM_END		__pa(PAGE_END - 1)
+#define DIRECT_MAP_PHYSMEM_END	__pa(PAGE_END - 1)
 
 #define MIN_THREAD_SHIFT	(14 + KASAN_THREAD_SHIFT)
 
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index f3d257c45225..d63576608ce7 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -17,7 +17,7 @@ extern unsigned long phys_base;
 extern unsigned long page_offset_base;
 extern unsigned long vmalloc_base;
 extern unsigned long vmemmap_base;
-extern unsigned long physmem_end;
+extern unsigned long direct_map_physmem_end;
 
 static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
 {
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index a98e53491a4e..ec68f8369bdc 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -141,7 +141,7 @@ extern unsigned int ptrs_per_p4d;
 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
 
 #ifdef CONFIG_RANDOMIZE_MEMORY
-# define PHYSMEM_END		physmem_end
+# define DIRECT_MAP_PHYSMEM_END	direct_map_physmem_end
 #endif
 
 /*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ff253648706f..5a564130b9d0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -961,7 +961,7 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
 	unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
 	int ret;
 
-	if (WARN_ON_ONCE(end > PHYSMEM_END))
+	if (WARN_ON_ONCE(end > DIRECT_MAP_PHYSMEM_END))
 		return -ERANGE;
 
 	ret = __add_pages(nid, start_pfn, nr_pages, params);
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 230f1dee4f09..70d3353c92fa 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -52,7 +52,7 @@ static __initdata struct kaslr_memory_region {
 } kaslr_regions[] = {
 	{
 		.base	= &page_offset_base,
-		.end	= &physmem_end,
+		.end	= &direct_map_physmem_end,
 	},
 	{
 		.base	= &vmalloc_base,
@@ -62,8 +62,12 @@ static __initdata struct kaslr_memory_region {
 	},
 };
 
-/* The end of the possible address space for physical memory */
-unsigned long physmem_end __ro_after_init;
+/*
+ * The end of the physical address space that can be mapped directly by the
+ * kernel. This starts out at (1<<MAX_PHYSMEM_BITS) - 1), but KASLR may reduce
+ * that in order to increase the available entropy for mapping other regions.
+ */
+unsigned long direct_map_physmem_end __ro_after_init;
 
 /* Get size in bytes used by the memory region */
 static inline unsigned long get_padding(struct kaslr_memory_region *region)
@@ -94,7 +98,7 @@ void __init kernel_randomize_memory(void)
 	BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
 
 	/* Preset the end of the possible address space for physical memory */
-	physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
+	direct_map_physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
 	if (!kaslr_memory_enabled())
 		return;
 
@@ -145,7 +149,7 @@ void __init kernel_randomize_memory(void)
 		vaddr += get_padding(&kaslr_regions[i]);
 		/*
 		 * KASLR trims the maximum possible size of the
-		 * direct-map. Update the physmem_end boundary.
+		 * direct-map. Update the direct_map_physmem_end boundary.
 		 * No rounding required as the region starts
 		 * PUD aligned and size is in units of TB.
 		 */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8f5394d75ce2..4570f33e2429 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -97,11 +97,11 @@ extern const int mmap_rnd_compat_bits_max;
 extern int mmap_rnd_compat_bits __read_mostly;
 #endif
 
-#ifndef PHYSMEM_END
+#ifndef DIRECT_MAP_PHYSMEM_END
 # ifdef MAX_PHYSMEM_BITS
-# define PHYSMEM_END	((1ULL << MAX_PHYSMEM_BITS) - 1)
+# define DIRECT_MAP_PHYSMEM_END	((1ULL << MAX_PHYSMEM_BITS) - 1)
 # else
-# define PHYSMEM_END	(((phys_addr_t)-1)&~(1ULL<<63))
+# define DIRECT_MAP_PHYSMEM_END	(((phys_addr_t)-1)&~(1ULL<<63))
 # endif
 #endif
 
diff --git a/kernel/resource.c b/kernel/resource.c
index 4101016e8b20..d2c8143ae4ff 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1869,7 +1869,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size,
 	if (flags & GFR_DESCENDING) {
 		resource_size_t end;
 
-		end = min_t(resource_size_t, base->end, PHYSMEM_END);
+		end = min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END);
 		return end - size + 1;
 	}
 
@@ -1886,7 +1886,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr,
 	 * @size did not wrap 0.
 	 */
 	return addr > addr - size &&
-	       addr <= min_t(resource_size_t, base->end, PHYSMEM_END);
+	       addr <= min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END);
 }
 
 static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 621ae1015106..c43b4e7fb298 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void)
 
 struct range mhp_get_pluggable_range(bool need_mapping)
 {
-	const u64 max_phys = PHYSMEM_END;
+	const u64 max_phys = DIRECT_MAP_PHYSMEM_END;
 	struct range mhp_range;
 
 	if (need_mapping) {
diff --git a/mm/sparse.c b/mm/sparse.c
index dc38539f8560..4cb9793f0b52 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section)
 static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
 						unsigned long *end_pfn)
 {
-	unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT;
+	unsigned long max_sparsemem_pfn = (DIRECT_MAP_PHYSMEM_END + 1) >> PAGE_SHIFT;
 
 	/*
 	 * Sanity checks - do not allow an architecture to pass
-- 
cgit v1.2.3


From 6359c39c9de66dede8ff5ff257c9e117483dbc7c Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 10 Oct 2024 14:15:56 +0800
Subject: mm: remove unused hugepage for vma_alloc_folio()

The hugepage parameter was deprecated since commit ddc1a5cbc05d
("mempolicy: alloc_pages_mpol() for NUMA policy without vma"), for
PMD-sized THP, it still tries only preferred node if possible in
vma_alloc_folio() by checking the order of the folio allocation.

Link: https://lkml.kernel.org/r/20241010061556.1846751-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/include/asm/page.h   |  2 +-
 arch/arm64/mm/fault.c           |  2 +-
 arch/m68k/include/asm/page_no.h |  2 +-
 arch/s390/include/asm/page.h    |  2 +-
 arch/x86/include/asm/page.h     |  2 +-
 include/linux/gfp.h             |  6 +++---
 include/linux/highmem.h         |  2 +-
 mm/huge_memory.c                |  2 +-
 mm/ksm.c                        |  2 +-
 mm/memory.c                     | 10 ++++------
 mm/mempolicy.c                  |  3 +--
 mm/userfaultfd.c                |  2 +-
 12 files changed, 17 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h
index 70419e6be1a3..3dffa2a461d7 100644
--- a/arch/alpha/include/asm/page.h
+++ b/arch/alpha/include/asm/page.h
@@ -18,7 +18,7 @@ extern void clear_page(void *page);
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 
 #define vma_alloc_zeroed_movable_folio(vma, vaddr) \
-	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)
 
 extern void copy_page(void * _to, void * _from);
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 8b281cf308b3..d95dca561f7a 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -983,7 +983,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
 	if (vma->vm_flags & VM_MTE)
 		flags |= __GFP_ZEROTAGS;
 
-	return vma_alloc_folio(flags, 0, vma, vaddr, false);
+	return vma_alloc_folio(flags, 0, vma, vaddr);
 }
 
 void tag_clear_highpage(struct page *page)
diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h
index af3a10973233..63c0e706084b 100644
--- a/arch/m68k/include/asm/page_no.h
+++ b/arch/m68k/include/asm/page_no.h
@@ -14,7 +14,7 @@ extern unsigned long memory_end;
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
 #define vma_alloc_zeroed_movable_folio(vma, vaddr) \
-	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)
 
 #define __pa(vaddr)		((unsigned long)(vaddr))
 #define __va(paddr)		((void *)((unsigned long)(paddr)))
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 73e1e03317b4..d02058f96bcf 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -74,7 +74,7 @@ static inline void copy_page(void *to, void *from)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
 #define vma_alloc_zeroed_movable_folio(vma, vaddr) \
-	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)
 
 /*
  * These are used to make use of C type-checking..
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 1b93ff80b43b..c9fe207916f4 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -35,7 +35,7 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
 }
 
 #define vma_alloc_zeroed_movable_folio(vma, vaddr) \
-	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)
 
 #ifndef __pa
 #define __pa(x)		__phys_addr((unsigned long)(x))
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index a951de920e20..b65724c3427d 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -306,7 +306,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
 		struct mempolicy *mpol, pgoff_t ilx, int nid);
 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, bool hugepage);
+		unsigned long addr);
 #else
 static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order)
 {
@@ -326,7 +326,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde
 {
 	return folio_alloc_noprof(gfp, order);
 }
-#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage)		\
+#define vma_alloc_folio_noprof(gfp, order, vma, addr)		\
 	folio_alloc_noprof(gfp, order)
 #endif
 
@@ -341,7 +341,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde
 static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
 		struct vm_area_struct *vma, unsigned long addr)
 {
-	struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false);
+	struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr);
 
 	return &folio->page;
 }
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 930a591b9b61..bec9bd715acf 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -226,7 +226,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
 {
 	struct folio *folio;
 
-	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false);
+	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr);
 	if (folio)
 		clear_user_highpage(&folio->page, vaddr);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c674afd16245..387c046a389e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1143,7 +1143,7 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
 	const int order = HPAGE_PMD_ORDER;
 	struct folio *folio;
 
-	folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true);
+	folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);
 
 	if (unlikely(!folio)) {
 		count_vm_event(THP_FAULT_FALLBACK);
diff --git a/mm/ksm.c b/mm/ksm.c
index 556b8a8f37d0..e596bc1b5fa7 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2971,7 +2971,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio,
 	if (!folio_test_uptodate(folio))
 		return folio;		/* let do_swap_page report the error */
 
-	new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
+	new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
 	if (new_folio &&
 	    mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) {
 		folio_put(new_folio);
diff --git a/mm/memory.c b/mm/memory.c
index 5e9d6a22eb08..c51bc45a7009 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1059,8 +1059,7 @@ static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
 	if (need_zero)
 		new_folio = vma_alloc_zeroed_movable_folio(vma, addr);
 	else
-		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
-					    addr, false);
+		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
 
 	if (!new_folio)
 		return NULL;
@@ -4017,8 +4016,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
 	struct folio *folio;
 	swp_entry_t entry;
 
-	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
-				vmf->address, false);
+	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
 	if (!folio)
 		return NULL;
 
@@ -4174,7 +4172,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	gfp = vma_thp_gfp_mask(vma);
 	while (orders) {
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
-		folio = vma_alloc_folio(gfp, order, vma, addr, true);
+		folio = vma_alloc_folio(gfp, order, vma, addr);
 		if (folio) {
 			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
 							    gfp, entry))
@@ -4713,7 +4711,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 	gfp = vma_thp_gfp_mask(vma);
 	while (orders) {
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
-		folio = vma_alloc_folio(gfp, order, vma, addr, true);
+		folio = vma_alloc_folio(gfp, order, vma, addr);
 		if (folio) {
 			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
 				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9e18a6fc3061..a29eff5d0585 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2290,7 +2290,6 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
  * @order: Order of the folio.
  * @vma: Pointer to VMA.
  * @addr: Virtual address of the allocation.  Must be inside @vma.
- * @hugepage: Unused (was: For hugepages try only preferred node if possible).
  *
  * Allocate a folio for a specific address in @vma, using the appropriate
  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
@@ -2301,7 +2300,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
  * Return: The folio on success or NULL if allocation fails.
  */
 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, bool hugepage)
+		unsigned long addr)
 {
 	struct mempolicy *pol;
 	pgoff_t ilx;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 48b87c62fc3d..60a0be33766f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -251,7 +251,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
 	if (!*foliop) {
 		ret = -ENOMEM;
 		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
-					dst_addr, false);
+					dst_addr);
 		if (!folio)
 			goto out;
 
-- 
cgit v1.2.3


From 5708d96da20b99b4665ad72395e3727016057f70 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Fri, 11 Oct 2024 11:03:04 -0400
Subject: mm: avoid zeroing user movable page twice with init_on_alloc=1

Commit 6471384af2a6 ("mm: security: introduce init_on_alloc=1 and
init_on_free=1 boot options") forces allocated page to be zeroed in
post_alloc_hook() when init_on_alloc=1.

For order-0 folios, if arch does not define
vma_alloc_zeroed_movable_folio(), the default implementation again zeros
the page return from the buddy allocator.  So the page is zeroed twice.
Fix it by passing __GFP_ZERO instead to avoid double page zeroing.  At the
moment, s390,arm64,x86,alpha,m68k are not impacted since they define their
own vma_alloc_zeroed_movable_folio().

For >0 order folios (mTHP and PMD THP), folio_zero_user() is called to
zero the folio again.  Fix it by calling folio_zero_user() only if
init_on_alloc is set.  All arch are impacted.

Add alloc_zeroed() helper to encapsulate the init_on_alloc check.

[ziy@nvidia.com: comment fixes, per David]
  Link: https://lkml.kernel.org/r/97DB52E1-C594-49B5-9736-89AC302FAB01@nvidia.com
Link: https://lkml.kernel.org/r/20241011150304.709590-1-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h |  8 +-------
 mm/huge_memory.c        |  8 +++++++-
 mm/internal.h           |  6 ++++++
 mm/memory.c             | 10 +++++++++-
 4 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index bec9bd715acf..6e452bd8e7e3 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -224,13 +224,7 @@ static inline
 struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
 				   unsigned long vaddr)
 {
-	struct folio *folio;
-
-	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr);
-	if (folio)
-		clear_user_highpage(&folio->page, vaddr);
-
-	return folio;
+	return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr);
 }
 #endif
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 387c046a389e..73194aa0544c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1162,7 +1162,13 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
 	}
 	folio_throttle_swaprate(folio, gfp);
 
-	folio_zero_user(folio, addr);
+       /*
+	* When a folio is not zeroed during allocation (__GFP_ZERO not used),
+	* folio_zero_user() is used to make sure that the page corresponding
+	* to the faulting address will be hot in the cache after zeroing.
+	*/
+	if (!alloc_zeroed())
+		folio_zero_user(folio, addr);
 	/*
 	 * The memory barrier inside __folio_mark_uptodate makes sure that
 	 * folio_zero_user writes become visible before the set_pmd_at()
diff --git a/mm/internal.h b/mm/internal.h
index fc2f523258a3..c743c2b21dba 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1276,6 +1276,12 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr,
 void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	       pmd_t *pmd, bool write);
 
+static inline bool alloc_zeroed(void)
+{
+	return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
+			&init_on_alloc);
+}
+
 enum {
 	/* mark page accessed */
 	FOLL_TOUCH = 1 << 16,
diff --git a/mm/memory.c b/mm/memory.c
index c51bc45a7009..68e57b33363b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4719,7 +4719,15 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 				goto next;
 			}
 			folio_throttle_swaprate(folio, gfp);
-			folio_zero_user(folio, vmf->address);
+			/*
+			 * When a folio is not zeroed during allocation
+			 * (__GFP_ZERO not used), folio_zero_user() is used
+			 * to make sure that the page corresponding to the
+			 * faulting address will be hot in the cache after
+			 * zeroing.
+			 */
+			if (!alloc_zeroed())
+				folio_zero_user(folio, vmf->address);
 			return folio;
 		}
 next:
-- 
cgit v1.2.3


From f1001f3d3b6868998cab73d10fda1a5c99ddf963 Mon Sep 17 00:00:00 2001
From: Wei Xu <weixugc@google.com>
Date: Thu, 17 Oct 2024 18:15:28 +0000
Subject: mm/mglru: reset page lru tier bits when activating

When a folio is activated, lru_gen_add_folio() moves the folio to the
youngest generation.  But unlike folio_update_gen()/folio_inc_gen(),
lru_gen_add_folio() doesn't reset the folio lru tier bits (LRU_REFS_MASK |
LRU_REFS_FLAGS).  This inconsistency can affect how pages are aged via
folio_mark_accessed() (e.g.  fd accesses), though no user visible impact
related to this has been detected yet.

Note that lru_gen_add_folio() cannot clear PG_workingset if the activation
is due to workingset refault, otherwise PSI accounting will be skipped.
So fix lru_gen_add_folio() to clear the lru tier bits other than
PG_workingset when activating a folio, and also clear all the lru tier
bits when a folio is activated via folio_activate() in
lru_gen_look_around().

Link: https://lkml.kernel.org/r/20241017181528.3358821-1-weixugc@google.com
Fixes: 018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap")
Signed-off-by: Wei Xu <weixugc@google.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: Jan Alexander Steffens <heftig@archlinux.org>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_inline.h | 15 ++++++++++++++-
 include/linux/mmzone.h    |  2 ++
 mm/vmscan.c               |  8 ++++----
 3 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 6f801c7b36e2..355cf46a01a6 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -155,6 +155,11 @@ static inline int folio_lru_refs(struct folio *folio)
 	return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
 }
 
+static inline void folio_clear_lru_refs(struct folio *folio)
+{
+	set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+}
+
 static inline int folio_lru_gen(struct folio *folio)
 {
 	unsigned long flags = READ_ONCE(folio->flags);
@@ -222,6 +227,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
 {
 	unsigned long seq;
 	unsigned long flags;
+	unsigned long mask;
 	int gen = folio_lru_gen(folio);
 	int type = folio_is_file_lru(folio);
 	int zone = folio_zonenum(folio);
@@ -257,7 +263,14 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
 	gen = lru_gen_from_seq(seq);
 	flags = (gen + 1UL) << LRU_GEN_PGOFF;
 	/* see the comment on MIN_NR_GENS about PG_active */
-	set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+	mask = LRU_GEN_MASK;
+	/*
+	 * Don't clear PG_workingset here because it can affect PSI accounting
+	 * if the activation is due to workingset refault.
+	 */
+	if (folio_test_active(folio))
+		mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active);
+	set_mask_bits(&folio->flags, mask, flags);
 
 	lru_gen_update_size(lruvec, folio, -1, gen);
 	/* for folio_rotate_reclaimable() */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5b1c984daf45..2e8c4307c728 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -403,6 +403,8 @@ enum {
 	NR_LRU_GEN_CAPS
 };
 
+#define LRU_REFS_FLAGS		(BIT(PG_referenced) | BIT(PG_workingset))
+
 #define MIN_LRU_BATCH		BITS_PER_LONG
 #define MAX_LRU_BATCH		(MIN_LRU_BATCH * 64)
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5bec29914f12..8d1301c0f22a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2603,8 +2603,6 @@ static bool should_clear_pmd_young(void)
  *                          shorthand helpers
  ******************************************************************************/
 
-#define LRU_REFS_FLAGS	(BIT(PG_referenced) | BIT(PG_workingset))
-
 #define DEFINE_MAX_SEQ(lruvec)						\
 	unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
 
@@ -4142,8 +4140,10 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 		old_gen = folio_lru_gen(folio);
 		if (old_gen < 0)
 			folio_set_referenced(folio);
-		else if (old_gen != new_gen)
+		else if (old_gen != new_gen) {
+			folio_clear_lru_refs(folio);
 			folio_activate(folio);
+		}
 	}
 
 	arch_leave_lazy_mmu_mode();
@@ -4376,7 +4376,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
 
 	/* see the comment on MAX_NR_TIERS */
 	if (!folio_test_referenced(folio))
-		set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+		folio_clear_lru_refs(folio);
 
 	/* for shrink_folio_list() */
 	folio_clear_reclaim(folio);
-- 
cgit v1.2.3


From 9884efd795cc2f71ef3b7f42df32420b0b7ce34f Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 17 Oct 2024 22:14:56 +0800
Subject: mm: huge_memory: move file_thp_enabled() into huge_memory.c

file_thp_enabled() is only used in __thp_vma_allowable_orders(), so move
it into huge_memory.c, also check READ_ONLY_THP_FOR_FS ahead to avoid
unnecessary code if config disabled.

Link: https://lkml.kernel.org/r/20241017141457.1169092-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 13 -------------
 mm/huge_memory.c        | 15 +++++++++++++++
 2 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 8afe09a2cf03..006f730545c2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -252,19 +252,6 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
 	return orders;
 }
 
-static inline bool file_thp_enabled(struct vm_area_struct *vma)
-{
-	struct inode *inode;
-
-	if (!vma->vm_file)
-		return false;
-
-	inode = vma->vm_file->f_inode;
-
-	return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) &&
-	       !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
-}
-
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 					 unsigned long vm_flags,
 					 unsigned long tva_flags,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 73194aa0544c..492c16eaf147 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -83,6 +83,21 @@ unsigned long huge_anon_orders_madvise __read_mostly;
 unsigned long huge_anon_orders_inherit __read_mostly;
 static bool anon_orders_configured __initdata;
 
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+	struct inode *inode;
+
+	if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+		return false;
+
+	if (!vma->vm_file)
+		return false;
+
+	inode = file_inode(vma->vm_file);
+
+	return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
+}
+
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 					 unsigned long vm_flags,
 					 unsigned long tva_flags,
-- 
cgit v1.2.3


From b7f058f827392022d8c689329f88c7b324d71dad Mon Sep 17 00:00:00 2001
From: Luoxi Li <kaixa@kiloview.com>
Date: Fri, 18 Oct 2024 17:22:35 +0800
Subject: mm: remove unused has_isolate_pageblock

has_isolate_pageblock() has been unused since commit 55612e80e722 ("mm:
page_alloc: close migratetype race between freeing and stealing")

Remove it.

Link: https://lkml.kernel.org/r/20241018092235.2764859-1-kaixa@kiloview.com
Signed-off-by: Luoxi Li <kaixa@kiloview.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-isolation.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index c16db0067090..73dc2c1841ec 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -3,10 +3,6 @@
 #define __LINUX_PAGEISOLATION_H
 
 #ifdef CONFIG_MEMORY_ISOLATION
-static inline bool has_isolate_pageblock(struct zone *zone)
-{
-	return zone->nr_isolate_pageblock;
-}
 static inline bool is_migrate_isolate_page(struct page *page)
 {
 	return get_pageblock_migratetype(page) == MIGRATE_ISOLATE;
@@ -16,10 +12,6 @@ static inline bool is_migrate_isolate(int migratetype)
 	return migratetype == MIGRATE_ISOLATE;
 }
 #else
-static inline bool has_isolate_pageblock(struct zone *zone)
-{
-	return false;
-}
 static inline bool is_migrate_isolate_page(struct page *page)
 {
 	return false;
-- 
cgit v1.2.3


From 78c018e3942c5dfbab7e6edb4eb784943878504b Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 7 Oct 2024 23:47:45 +0200
Subject: maple_tree: fix outdated flag name in comment

MAPLE_USE_RCU was renamed to MT_FLAGS_USE_RCU at some point, fix up the
comment.

Link: https://lkml.kernel.org/r/20241007-maple-tree-doc-fix-v1-1-6bbf89c1153d@google.com
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index c2c11004085e..61c236850ca8 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -224,7 +224,7 @@ typedef struct { /* nothing */ } lockdep_map_p;
  * (set at tree creation time) and dynamic information set under the spinlock.
  *
  * Another use of flags are to indicate global states of the tree.  This is the
- * case with the MAPLE_USE_RCU flag, which indicates the tree is currently in
+ * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in
  * RCU mode.  This mode was added to allow the tree to reuse nodes instead of
  * re-allocating and RCU freeing nodes when there is a single user.
  */
-- 
cgit v1.2.3


From ed265529d39ac408396c031a4fd7e1ef922b80d0 Mon Sep 17 00:00:00 2001
From: Sourav Panda <souravpanda@google.com>
Date: Tue, 22 Oct 2024 23:24:40 +0000
Subject: mm/codetag: fix arg in pgalloc_tag_copy alloc_tag_sub

alloc_tag_sub() takes bytes as opposed to number of pages as argument.

Currently pgalloc_tag_copy() passes the number of pages.  This fix passes
the correct unit, which is the number of bytes allocated.

Link: https://lkml.kernel.org/r/20241022232440.334820-1-souravpanda@google.com
Fixes: e0a955bf7f61 ("mm/codetag: add pgalloc_tag_copy()")
Signed-off-by: Sourav Panda <souravpanda@google.com>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4570f33e2429..eb070c14e309 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4207,7 +4207,7 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
 	/* Clear the old ref to the original allocation tag. */
 	clear_page_tag_ref(&old->page);
 	/* Decrement the counters of the tag on get_new_folio. */
-	alloc_tag_sub(ref, folio_nr_pages(new));
+	alloc_tag_sub(ref, folio_size(new));
 
 	__alloc_tag_ref_set(ref, tag);
 
-- 
cgit v1.2.3


From 628e1b8c4777941e119effc92cd395b4b02c2c5f Mon Sep 17 00:00:00 2001
From: James Houghton <jthoughton@google.com>
Date: Mon, 21 Oct 2024 16:02:12 +0000
Subject: mm: add missing mmu_notifier_clear_young for !MMU_NOTIFIER

Remove the now unnecessary ifdef in mm/damon/vaddr.c as well.

Link: https://lkml.kernel.org/r/20241021160212.9935-1-jthoughton@google.com
Signed-off-by: James Houghton <jthoughton@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmu_notifier.h | 7 +++++++
 mm/damon/vaddr.c             | 2 --
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index d39ebb10caeb..e2dd57ca368b 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -606,6 +606,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+					   unsigned long start,
+					   unsigned long end)
+{
+	return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
 					  unsigned long address)
 {
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 08cfd22b5249..821990d0141a 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -353,11 +353,9 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
 		set_huge_pte_at(mm, addr, pte, entry, psize);
 	}
 
-#ifdef CONFIG_MMU_NOTIFIER
 	if (mmu_notifier_clear_young(mm, addr,
 				     addr + huge_page_size(hstate_vma(vma))))
 		referenced = true;
-#endif /* CONFIG_MMU_NOTIFIER */
 
 	if (referenced)
 		folio_set_young(folio);
-- 
cgit v1.2.3


From 6b611388b626eaa59d202bf8f64d095ff80bcde6 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Thu, 24 Oct 2024 18:22:59 -0700
Subject: memcg-v1: remove charge move code

The memcg-v1 charge move feature has been deprecated completely and let's
remove the relevant code as well.

Link: https://lkml.kernel.org/r/20241025012304.2473312-3-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |   5 -
 mm/memcontrol-v1.c         | 887 ---------------------------------------------
 mm/memcontrol-v1.h         |   6 -
 mm/memcontrol.c            |   9 -
 4 files changed, 907 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 34d2da05f2f1..0b113267b2de 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -299,11 +299,6 @@ struct mem_cgroup {
 	/* For oom notifier event fd */
 	struct list_head oom_notify;
 
-	/*
-	 * Should we move charges of a task when a task is moved into this
-	 * mem_cgroup ? And what type of charges should we move ?
-	 */
-	unsigned long move_charge_at_immigrate;
 	/* taken only while moving_account > 0 */
 	spinlock_t move_lock;
 	unsigned long move_lock_flags;
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index ef7e2a0ec66a..9c0fba8c8a83 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -40,31 +40,6 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
 #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
-/* Stuffs for move charges at task migration. */
-/*
- * Types of charges to be moved.
- */
-#define MOVE_ANON	0x1ULL
-#define MOVE_FILE	0x2ULL
-#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)
-
-/* "mc" and its members are protected by cgroup_mutex */
-static struct move_charge_struct {
-	spinlock_t	  lock; /* for from, to */
-	struct mm_struct  *mm;
-	struct mem_cgroup *from;
-	struct mem_cgroup *to;
-	unsigned long flags;
-	unsigned long precharge;
-	unsigned long moved_charge;
-	unsigned long moved_swap;
-	struct task_struct *moving_task;	/* a task moving charges */
-	wait_queue_head_t waitq;		/* a waitq for other context */
-} mc = {
-	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
-	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
-};
-
 /* for OOM */
 struct mem_cgroup_eventfd_list {
 	struct list_head list;
@@ -426,51 +401,6 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
 	return nr_reclaimed;
 }
 
-/*
- * A routine for checking "mem" is under move_account() or not.
- *
- * Checking a cgroup is mc.from or mc.to or under hierarchy of
- * moving cgroups. This is for waiting at high-memory pressure
- * caused by "move".
- */
-static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *from;
-	struct mem_cgroup *to;
-	bool ret = false;
-	/*
-	 * Unlike task_move routines, we access mc.to, mc.from not under
-	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
-	 */
-	spin_lock(&mc.lock);
-	from = mc.from;
-	to = mc.to;
-	if (!from)
-		goto unlock;
-
-	ret = mem_cgroup_is_descendant(from, memcg) ||
-		mem_cgroup_is_descendant(to, memcg);
-unlock:
-	spin_unlock(&mc.lock);
-	return ret;
-}
-
-bool memcg1_wait_acct_move(struct mem_cgroup *memcg)
-{
-	if (mc.moving_task && current != mc.moving_task) {
-		if (mem_cgroup_under_move(memcg)) {
-			DEFINE_WAIT(wait);
-			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
-			/* moving charge context might have finished. */
-			if (mc.moving_task)
-				schedule();
-			finish_wait(&mc.waitq, &wait);
-			return true;
-		}
-	}
-	return false;
-}
-
 /**
  * folio_memcg_lock - Bind a folio to its memcg.
  * @folio: The folio.
@@ -552,44 +482,6 @@ void folio_memcg_unlock(struct folio *folio)
 	__folio_memcg_unlock(folio_memcg(folio));
 }
 
-#ifdef CONFIG_SWAP
-/**
- * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
- * @entry: swap entry to be moved
- * @from:  mem_cgroup which the entry is moved from
- * @to:  mem_cgroup which the entry is moved to
- *
- * It succeeds only when the swap_cgroup's record for this entry is the same
- * as the mem_cgroup's id of @from.
- *
- * Returns 0 on success, -EINVAL on failure.
- *
- * The caller must have charged to @to, IOW, called page_counter_charge() about
- * both res and memsw, and called css_get().
- */
-static int mem_cgroup_move_swap_account(swp_entry_t entry,
-				struct mem_cgroup *from, struct mem_cgroup *to)
-{
-	unsigned short old_id, new_id;
-
-	old_id = mem_cgroup_id(from);
-	new_id = mem_cgroup_id(to);
-
-	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
-		mod_memcg_state(from, MEMCG_SWAP, -1);
-		mod_memcg_state(to, MEMCG_SWAP, 1);
-		return 0;
-	}
-	return -EINVAL;
-}
-#else
-static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
-				struct mem_cgroup *from, struct mem_cgroup *to)
-{
-	return -EINVAL;
-}
-#endif
-
 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
 				struct cftype *cft)
 {
@@ -616,785 +508,6 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 }
 #endif
 
-#ifdef CONFIG_MMU
-/* Handlers for move charge at task migration. */
-static int mem_cgroup_do_precharge(unsigned long count)
-{
-	int ret;
-
-	/* Try a single bulk charge without reclaim first, kswapd may wake */
-	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
-	if (!ret) {
-		mc.precharge += count;
-		return ret;
-	}
-
-	/* Try charges one by one with reclaim, but do not retry */
-	while (count--) {
-		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
-		if (ret)
-			return ret;
-		mc.precharge++;
-		cond_resched();
-	}
-	return 0;
-}
-
-union mc_target {
-	struct folio	*folio;
-	swp_entry_t	ent;
-};
-
-enum mc_target_type {
-	MC_TARGET_NONE = 0,
-	MC_TARGET_PAGE,
-	MC_TARGET_SWAP,
-	MC_TARGET_DEVICE,
-};
-
-static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
-						unsigned long addr, pte_t ptent)
-{
-	struct page *page = vm_normal_page(vma, addr, ptent);
-
-	if (!page)
-		return NULL;
-	if (PageAnon(page)) {
-		if (!(mc.flags & MOVE_ANON))
-			return NULL;
-	} else {
-		if (!(mc.flags & MOVE_FILE))
-			return NULL;
-	}
-	get_page(page);
-
-	return page;
-}
-
-#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
-static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
-			pte_t ptent, swp_entry_t *entry)
-{
-	struct page *page = NULL;
-	swp_entry_t ent = pte_to_swp_entry(ptent);
-
-	if (!(mc.flags & MOVE_ANON))
-		return NULL;
-
-	/*
-	 * Handle device private pages that are not accessible by the CPU, but
-	 * stored as special swap entries in the page table.
-	 */
-	if (is_device_private_entry(ent)) {
-		page = pfn_swap_entry_to_page(ent);
-		if (!get_page_unless_zero(page))
-			return NULL;
-		return page;
-	}
-
-	if (non_swap_entry(ent))
-		return NULL;
-
-	/*
-	 * Because swap_cache_get_folio() updates some statistics counter,
-	 * we call find_get_page() with swapper_space directly.
-	 */
-	page = find_get_page(swap_address_space(ent), swap_cache_index(ent));
-	entry->val = ent.val;
-
-	return page;
-}
-#else
-static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
-			pte_t ptent, swp_entry_t *entry)
-{
-	return NULL;
-}
-#endif
-
-static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
-			unsigned long addr, pte_t ptent)
-{
-	unsigned long index;
-	struct folio *folio;
-
-	if (!vma->vm_file) /* anonymous vma */
-		return NULL;
-	if (!(mc.flags & MOVE_FILE))
-		return NULL;
-
-	/* folio is moved even if it's not RSS of this task(page-faulted). */
-	/* shmem/tmpfs may report page out on swap: account for that too. */
-	index = linear_page_index(vma, addr);
-	folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
-	if (IS_ERR(folio))
-		return NULL;
-	return folio_file_page(folio, index);
-}
-
-static void memcg1_check_events(struct mem_cgroup *memcg, int nid);
-static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages);
-
-/**
- * mem_cgroup_move_account - move account of the folio
- * @folio: The folio.
- * @compound: charge the page as compound or small page
- * @from: mem_cgroup which the folio is moved from.
- * @to:	mem_cgroup which the folio is moved to. @from != @to.
- *
- * The folio must be locked and not on the LRU.
- *
- * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
- * from old cgroup.
- */
-static int mem_cgroup_move_account(struct folio *folio,
-				   bool compound,
-				   struct mem_cgroup *from,
-				   struct mem_cgroup *to)
-{
-	struct lruvec *from_vec, *to_vec;
-	struct pglist_data *pgdat;
-	unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
-	int nid, ret;
-
-	VM_BUG_ON(from == to);
-	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
-	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
-	VM_BUG_ON(compound && !folio_test_large(folio));
-
-	ret = -EINVAL;
-	if (folio_memcg(folio) != from)
-		goto out;
-
-	pgdat = folio_pgdat(folio);
-	from_vec = mem_cgroup_lruvec(from, pgdat);
-	to_vec = mem_cgroup_lruvec(to, pgdat);
-
-	folio_memcg_lock(folio);
-
-	if (folio_test_anon(folio)) {
-		if (folio_mapped(folio)) {
-			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
-			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
-			if (folio_test_pmd_mappable(folio)) {
-				__mod_lruvec_state(from_vec, NR_ANON_THPS,
-						   -nr_pages);
-				__mod_lruvec_state(to_vec, NR_ANON_THPS,
-						   nr_pages);
-			}
-		}
-	} else {
-		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
-		__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
-
-		if (folio_test_swapbacked(folio)) {
-			__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
-			__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
-		}
-
-		if (folio_mapped(folio)) {
-			__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
-			__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
-		}
-
-		if (folio_test_dirty(folio)) {
-			struct address_space *mapping = folio_mapping(folio);
-
-			if (mapping_can_writeback(mapping)) {
-				__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
-						   -nr_pages);
-				__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
-						   nr_pages);
-			}
-		}
-	}
-
-#ifdef CONFIG_SWAP
-	if (folio_test_swapcache(folio)) {
-		__mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
-		__mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
-	}
-#endif
-	if (folio_test_writeback(folio)) {
-		__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
-		__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
-	}
-
-	/*
-	 * All state has been migrated, let's switch to the new memcg.
-	 *
-	 * It is safe to change page's memcg here because the page
-	 * is referenced, charged, isolated, and locked: we can't race
-	 * with (un)charging, migration, LRU putback, or anything else
-	 * that would rely on a stable page's memory cgroup.
-	 *
-	 * Note that folio_memcg_lock is a memcg lock, not a page lock,
-	 * to save space. As soon as we switch page's memory cgroup to a
-	 * new memcg that isn't locked, the above state can change
-	 * concurrently again. Make sure we're truly done with it.
-	 */
-	smp_mb();
-
-	css_get(&to->css);
-	css_put(&from->css);
-
-	/* Warning should never happen, so don't worry about refcount non-0 */
-	WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
-	folio->memcg_data = (unsigned long)to;
-
-	__folio_memcg_unlock(from);
-
-	ret = 0;
-	nid = folio_nid(folio);
-
-	local_irq_disable();
-	memcg1_charge_statistics(to, nr_pages);
-	memcg1_check_events(to, nid);
-	memcg1_charge_statistics(from, -nr_pages);
-	memcg1_check_events(from, nid);
-	local_irq_enable();
-out:
-	return ret;
-}
-
-/**
- * get_mctgt_type - get target type of moving charge
- * @vma: the vma the pte to be checked belongs
- * @addr: the address corresponding to the pte to be checked
- * @ptent: the pte to be checked
- * @target: the pointer the target page or swap ent will be stored(can be NULL)
- *
- * Context: Called with pte lock held.
- * Return:
- * * MC_TARGET_NONE - If the pte is not a target for move charge.
- * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
- *   move charge. If @target is not NULL, the folio is stored in target->folio
- *   with extra refcnt taken (Caller should release it).
- * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
- *   target for charge migration.  If @target is not NULL, the entry is
- *   stored in target->ent.
- * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and
- *   thus not on the lru.  For now such page is charged like a regular page
- *   would be as it is just special memory taking the place of a regular page.
- *   See Documentations/vm/hmm.txt and include/linux/hmm.h
- */
-static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
-		unsigned long addr, pte_t ptent, union mc_target *target)
-{
-	struct page *page = NULL;
-	struct folio *folio;
-	enum mc_target_type ret = MC_TARGET_NONE;
-	swp_entry_t ent = { .val = 0 };
-
-	if (pte_present(ptent))
-		page = mc_handle_present_pte(vma, addr, ptent);
-	else if (pte_none_mostly(ptent))
-		/*
-		 * PTE markers should be treated as a none pte here, separated
-		 * from other swap handling below.
-		 */
-		page = mc_handle_file_pte(vma, addr, ptent);
-	else if (is_swap_pte(ptent))
-		page = mc_handle_swap_pte(vma, ptent, &ent);
-
-	if (page)
-		folio = page_folio(page);
-	if (target && page) {
-		if (!folio_trylock(folio)) {
-			folio_put(folio);
-			return ret;
-		}
-		/*
-		 * page_mapped() must be stable during the move. This
-		 * pte is locked, so if it's present, the page cannot
-		 * become unmapped. If it isn't, we have only partial
-		 * control over the mapped state: the page lock will
-		 * prevent new faults against pagecache and swapcache,
-		 * so an unmapped page cannot become mapped. However,
-		 * if the page is already mapped elsewhere, it can
-		 * unmap, and there is nothing we can do about it.
-		 * Alas, skip moving the page in this case.
-		 */
-		if (!pte_present(ptent) && page_mapped(page)) {
-			folio_unlock(folio);
-			folio_put(folio);
-			return ret;
-		}
-	}
-
-	if (!page && !ent.val)
-		return ret;
-	if (page) {
-		/*
-		 * Do only loose check w/o serialization.
-		 * mem_cgroup_move_account() checks the page is valid or
-		 * not under LRU exclusion.
-		 */
-		if (folio_memcg(folio) == mc.from) {
-			ret = MC_TARGET_PAGE;
-			if (folio_is_device_private(folio) ||
-			    folio_is_device_coherent(folio))
-				ret = MC_TARGET_DEVICE;
-			if (target)
-				target->folio = folio;
-		}
-		if (!ret || !target) {
-			if (target)
-				folio_unlock(folio);
-			folio_put(folio);
-		}
-	}
-	/*
-	 * There is a swap entry and a page doesn't exist or isn't charged.
-	 * But we cannot move a tail-page in a THP.
-	 */
-	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
-	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
-		ret = MC_TARGET_SWAP;
-		if (target)
-			target->ent = ent;
-	}
-	return ret;
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/*
- * We don't consider PMD mapped swapping or file mapped pages because THP does
- * not support them for now.
- * Caller should make sure that pmd_trans_huge(pmd) is true.
- */
-static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
-		unsigned long addr, pmd_t pmd, union mc_target *target)
-{
-	struct page *page = NULL;
-	struct folio *folio;
-	enum mc_target_type ret = MC_TARGET_NONE;
-
-	if (unlikely(is_swap_pmd(pmd))) {
-		VM_BUG_ON(thp_migration_supported() &&
-				  !is_pmd_migration_entry(pmd));
-		return ret;
-	}
-	page = pmd_page(pmd);
-	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
-	folio = page_folio(page);
-	if (!(mc.flags & MOVE_ANON))
-		return ret;
-	if (folio_memcg(folio) == mc.from) {
-		ret = MC_TARGET_PAGE;
-		if (target) {
-			folio_get(folio);
-			if (!folio_trylock(folio)) {
-				folio_put(folio);
-				return MC_TARGET_NONE;
-			}
-			target->folio = folio;
-		}
-	}
-	return ret;
-}
-#else
-static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
-		unsigned long addr, pmd_t pmd, union mc_target *target)
-{
-	return MC_TARGET_NONE;
-}
-#endif
-
-static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
-					unsigned long addr, unsigned long end,
-					struct mm_walk *walk)
-{
-	struct vm_area_struct *vma = walk->vma;
-	pte_t *pte;
-	spinlock_t *ptl;
-
-	ptl = pmd_trans_huge_lock(pmd, vma);
-	if (ptl) {
-		/*
-		 * Note their can not be MC_TARGET_DEVICE for now as we do not
-		 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
-		 * this might change.
-		 */
-		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
-			mc.precharge += HPAGE_PMD_NR;
-		spin_unlock(ptl);
-		return 0;
-	}
-
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	if (!pte)
-		return 0;
-	for (; addr != end; pte++, addr += PAGE_SIZE)
-		if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
-			mc.precharge++;	/* increment precharge temporarily */
-	pte_unmap_unlock(pte - 1, ptl);
-	cond_resched();
-
-	return 0;
-}
-
-static const struct mm_walk_ops precharge_walk_ops = {
-	.pmd_entry	= mem_cgroup_count_precharge_pte_range,
-	.walk_lock	= PGWALK_RDLOCK,
-};
-
-static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
-{
-	unsigned long precharge;
-
-	mmap_read_lock(mm);
-	walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
-	mmap_read_unlock(mm);
-
-	precharge = mc.precharge;
-	mc.precharge = 0;
-
-	return precharge;
-}
-
-static int mem_cgroup_precharge_mc(struct mm_struct *mm)
-{
-	unsigned long precharge = mem_cgroup_count_precharge(mm);
-
-	VM_BUG_ON(mc.moving_task);
-	mc.moving_task = current;
-	return mem_cgroup_do_precharge(precharge);
-}
-
-/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
-static void __mem_cgroup_clear_mc(void)
-{
-	struct mem_cgroup *from = mc.from;
-	struct mem_cgroup *to = mc.to;
-
-	/* we must uncharge all the leftover precharges from mc.to */
-	if (mc.precharge) {
-		mem_cgroup_cancel_charge(mc.to, mc.precharge);
-		mc.precharge = 0;
-	}
-	/*
-	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
-	 * we must uncharge here.
-	 */
-	if (mc.moved_charge) {
-		mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
-		mc.moved_charge = 0;
-	}
-	/* we must fixup refcnts and charges */
-	if (mc.moved_swap) {
-		/* uncharge swap account from the old cgroup */
-		if (!mem_cgroup_is_root(mc.from))
-			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
-
-		mem_cgroup_id_put_many(mc.from, mc.moved_swap);
-
-		/*
-		 * we charged both to->memory and to->memsw, so we
-		 * should uncharge to->memory.
-		 */
-		if (!mem_cgroup_is_root(mc.to))
-			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
-
-		mc.moved_swap = 0;
-	}
-	memcg1_oom_recover(from);
-	memcg1_oom_recover(to);
-	wake_up_all(&mc.waitq);
-}
-
-static void mem_cgroup_clear_mc(void)
-{
-	struct mm_struct *mm = mc.mm;
-
-	/*
-	 * we must clear moving_task before waking up waiters at the end of
-	 * task migration.
-	 */
-	mc.moving_task = NULL;
-	__mem_cgroup_clear_mc();
-	spin_lock(&mc.lock);
-	mc.from = NULL;
-	mc.to = NULL;
-	mc.mm = NULL;
-	spin_unlock(&mc.lock);
-
-	mmput(mm);
-}
-
-int memcg1_can_attach(struct cgroup_taskset *tset)
-{
-	struct cgroup_subsys_state *css;
-	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
-	struct mem_cgroup *from;
-	struct task_struct *leader, *p;
-	struct mm_struct *mm;
-	unsigned long move_flags;
-	int ret = 0;
-
-	/* charge immigration isn't supported on the default hierarchy */
-	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		return 0;
-
-	/*
-	 * Multi-process migrations only happen on the default hierarchy
-	 * where charge immigration is not used.  Perform charge
-	 * immigration if @tset contains a leader and whine if there are
-	 * multiple.
-	 */
-	p = NULL;
-	cgroup_taskset_for_each_leader(leader, css, tset) {
-		WARN_ON_ONCE(p);
-		p = leader;
-		memcg = mem_cgroup_from_css(css);
-	}
-	if (!p)
-		return 0;
-
-	/*
-	 * We are now committed to this value whatever it is. Changes in this
-	 * tunable will only affect upcoming migrations, not the current one.
-	 * So we need to save it, and keep it going.
-	 */
-	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
-	if (!move_flags)
-		return 0;
-
-	from = mem_cgroup_from_task(p);
-
-	VM_BUG_ON(from == memcg);
-
-	mm = get_task_mm(p);
-	if (!mm)
-		return 0;
-	/* We move charges only when we move a owner of the mm */
-	if (mm->owner == p) {
-		VM_BUG_ON(mc.from);
-		VM_BUG_ON(mc.to);
-		VM_BUG_ON(mc.precharge);
-		VM_BUG_ON(mc.moved_charge);
-		VM_BUG_ON(mc.moved_swap);
-
-		spin_lock(&mc.lock);
-		mc.mm = mm;
-		mc.from = from;
-		mc.to = memcg;
-		mc.flags = move_flags;
-		spin_unlock(&mc.lock);
-		/* We set mc.moving_task later */
-
-		ret = mem_cgroup_precharge_mc(mm);
-		if (ret)
-			mem_cgroup_clear_mc();
-	} else {
-		mmput(mm);
-	}
-	return ret;
-}
-
-void memcg1_cancel_attach(struct cgroup_taskset *tset)
-{
-	if (mc.to)
-		mem_cgroup_clear_mc();
-}
-
-static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
-				unsigned long addr, unsigned long end,
-				struct mm_walk *walk)
-{
-	int ret = 0;
-	struct vm_area_struct *vma = walk->vma;
-	pte_t *pte;
-	spinlock_t *ptl;
-	enum mc_target_type target_type;
-	union mc_target target;
-	struct folio *folio;
-	bool tried_split_before = false;
-
-retry_pmd:
-	ptl = pmd_trans_huge_lock(pmd, vma);
-	if (ptl) {
-		if (mc.precharge < HPAGE_PMD_NR) {
-			spin_unlock(ptl);
-			return 0;
-		}
-		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
-		if (target_type == MC_TARGET_PAGE) {
-			folio = target.folio;
-			/*
-			 * Deferred split queue locking depends on memcg,
-			 * and unqueue is unsafe unless folio refcount is 0:
-			 * split or skip if on the queue? first try to split.
-			 */
-			if (!list_empty(&folio->_deferred_list)) {
-				spin_unlock(ptl);
-				if (!tried_split_before)
-					split_folio(folio);
-				folio_unlock(folio);
-				folio_put(folio);
-				if (tried_split_before)
-					return 0;
-				tried_split_before = true;
-				goto retry_pmd;
-			}
-			/*
-			 * So long as that pmd lock is held, the folio cannot
-			 * be racily added to the _deferred_list, because
-			 * __folio_remove_rmap() will find !partially_mapped.
-			 */
-			if (folio_isolate_lru(folio)) {
-				if (!mem_cgroup_move_account(folio, true,
-							     mc.from, mc.to)) {
-					mc.precharge -= HPAGE_PMD_NR;
-					mc.moved_charge += HPAGE_PMD_NR;
-				}
-				folio_putback_lru(folio);
-			}
-			folio_unlock(folio);
-			folio_put(folio);
-		} else if (target_type == MC_TARGET_DEVICE) {
-			folio = target.folio;
-			if (!mem_cgroup_move_account(folio, true,
-						     mc.from, mc.to)) {
-				mc.precharge -= HPAGE_PMD_NR;
-				mc.moved_charge += HPAGE_PMD_NR;
-			}
-			folio_unlock(folio);
-			folio_put(folio);
-		}
-		spin_unlock(ptl);
-		return 0;
-	}
-
-retry:
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	if (!pte)
-		return 0;
-	for (; addr != end; addr += PAGE_SIZE) {
-		pte_t ptent = ptep_get(pte++);
-		bool device = false;
-		swp_entry_t ent;
-
-		if (!mc.precharge)
-			break;
-
-		switch (get_mctgt_type(vma, addr, ptent, &target)) {
-		case MC_TARGET_DEVICE:
-			device = true;
-			fallthrough;
-		case MC_TARGET_PAGE:
-			folio = target.folio;
-			/*
-			 * We can have a part of the split pmd here. Moving it
-			 * can be done but it would be too convoluted so simply
-			 * ignore such a partial THP and keep it in original
-			 * memcg. There should be somebody mapping the head.
-			 */
-			if (folio_test_large(folio))
-				goto put;
-			if (!device && !folio_isolate_lru(folio))
-				goto put;
-			if (!mem_cgroup_move_account(folio, false,
-						mc.from, mc.to)) {
-				mc.precharge--;
-				/* we uncharge from mc.from later. */
-				mc.moved_charge++;
-			}
-			if (!device)
-				folio_putback_lru(folio);
-put:			/* get_mctgt_type() gets & locks the page */
-			folio_unlock(folio);
-			folio_put(folio);
-			break;
-		case MC_TARGET_SWAP:
-			ent = target.ent;
-			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
-				mc.precharge--;
-				mem_cgroup_id_get_many(mc.to, 1);
-				/* we fixup other refcnts and charges later. */
-				mc.moved_swap++;
-			}
-			break;
-		default:
-			break;
-		}
-	}
-	pte_unmap_unlock(pte - 1, ptl);
-	cond_resched();
-
-	if (addr != end) {
-		/*
-		 * We have consumed all precharges we got in can_attach().
-		 * We try charge one by one, but don't do any additional
-		 * charges to mc.to if we have failed in charge once in attach()
-		 * phase.
-		 */
-		ret = mem_cgroup_do_precharge(1);
-		if (!ret)
-			goto retry;
-	}
-
-	return ret;
-}
-
-static const struct mm_walk_ops charge_walk_ops = {
-	.pmd_entry	= mem_cgroup_move_charge_pte_range,
-	.walk_lock	= PGWALK_RDLOCK,
-};
-
-static void mem_cgroup_move_charge(void)
-{
-	lru_add_drain_all();
-	/*
-	 * Signal folio_memcg_lock() to take the memcg's move_lock
-	 * while we're moving its pages to another memcg. Then wait
-	 * for already started RCU-only updates to finish.
-	 */
-	atomic_inc(&mc.from->moving_account);
-	synchronize_rcu();
-retry:
-	if (unlikely(!mmap_read_trylock(mc.mm))) {
-		/*
-		 * Someone who are holding the mmap_lock might be waiting in
-		 * waitq. So we cancel all extra charges, wake up all waiters,
-		 * and retry. Because we cancel precharges, we might not be able
-		 * to move enough charges, but moving charge is a best-effort
-		 * feature anyway, so it wouldn't be a big problem.
-		 */
-		__mem_cgroup_clear_mc();
-		cond_resched();
-		goto retry;
-	}
-	/*
-	 * When we have consumed all precharges and failed in doing
-	 * additional charge, the page walk just aborts.
-	 */
-	walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
-	mmap_read_unlock(mc.mm);
-	atomic_dec(&mc.from->moving_account);
-}
-
-void memcg1_move_task(void)
-{
-	if (mc.to) {
-		mem_cgroup_move_charge();
-		mem_cgroup_clear_mc();
-	}
-}
-
-#else	/* !CONFIG_MMU */
-int memcg1_can_attach(struct cgroup_taskset *tset)
-{
-	return 0;
-}
-void memcg1_cancel_attach(struct cgroup_taskset *tset)
-{
-}
-void memcg1_move_task(void)
-{
-}
-#endif
-
 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 {
 	struct mem_cgroup_threshold_ary *t;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index c0672e25bcdb..0e3b82951d91 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -80,12 +80,7 @@ static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg)
 	WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
 }
 
-bool memcg1_wait_acct_move(struct mem_cgroup *memcg);
-
 struct cgroup_taskset;
-int memcg1_can_attach(struct cgroup_taskset *tset);
-void memcg1_cancel_attach(struct cgroup_taskset *tset);
-void memcg1_move_task(void);
 void memcg1_css_offline(struct mem_cgroup *memcg);
 
 /* for encoding cft->private value on file */
@@ -130,7 +125,6 @@ static inline void memcg1_free_events(struct mem_cgroup *memcg) {}
 static inline void memcg1_memcg_init(struct mem_cgroup *memcg) {}
 static inline void memcg1_remove_from_trees(struct mem_cgroup *memcg) {}
 static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg) {}
-static inline bool memcg1_wait_acct_move(struct mem_cgroup *memcg) { return false; }
 static inline void memcg1_css_offline(struct mem_cgroup *memcg) {}
 
 static inline bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) { return true; }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5fcdd25fc134..ec4ac6eb650b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2242,12 +2242,6 @@ retry:
 	 */
 	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
 		goto retry;
-	/*
-	 * At task move, charge accounts can be doubly counted. So, it's
-	 * better to wait until the end of task_move if something is going on.
-	 */
-	if (memcg1_wait_acct_move(mem_over_limit))
-		goto retry;
 
 	if (nr_retries--)
 		goto retry;
@@ -4441,9 +4435,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
 	.exit = mem_cgroup_exit,
 	.dfl_cftypes = memory_files,
 #ifdef CONFIG_MEMCG_V1
-	.can_attach = memcg1_can_attach,
-	.cancel_attach = memcg1_cancel_attach,
-	.post_attach = memcg1_move_task,
 	.legacy_cftypes = mem_cgroup_legacy_files,
 #endif
 	.early_init = 0,
-- 
cgit v1.2.3


From a29c0e4b2e867f4e362a6740c430bfdc2efdd1d9 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Thu, 24 Oct 2024 18:23:03 -0700
Subject: memcg-v1: remove memcg move locking code

The memcg v1's charge move feature has been deprecated.  All the places
using the memcg move lock, have stopped using it as they don't need the
protection any more.  Let's proceed to remove all the locking code related
to charge moving.

Link: https://lkml.kernel.org/r/20241025012304.2473312-7-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 54 ------------------------------
 mm/filemap.c               |  1 -
 mm/memcontrol-v1.c         | 82 ----------------------------------------------
 mm/memcontrol.c            |  5 ---
 mm/rmap.c                  |  1 -
 5 files changed, 143 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0b113267b2de..bb49e0d4b377 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -299,20 +299,10 @@ struct mem_cgroup {
 	/* For oom notifier event fd */
 	struct list_head oom_notify;
 
-	/* taken only while moving_account > 0 */
-	spinlock_t move_lock;
-	unsigned long move_lock_flags;
-
 	/* Legacy tcp memory accounting */
 	bool tcpmem_active;
 	int tcpmem_pressure;
 
-	/*
-	 * set > 0 if pages under this cgroup are moving to other cgroup.
-	 */
-	atomic_t moving_account;
-	struct task_struct *move_lock_task;
-
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
 	spinlock_t event_list_lock;
@@ -428,9 +418,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
  *
  * - the folio lock
  * - LRU isolation
- * - folio_memcg_lock()
  * - exclusive reference
- * - mem_cgroup_trylock_pages()
  *
  * For a kmem folio a caller should hold an rcu read lock to protect memcg
  * associated with a kmem folio from being released.
@@ -499,9 +487,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
  *
  * - the folio lock
  * - LRU isolation
- * - lock_folio_memcg()
  * - exclusive reference
- * - mem_cgroup_trylock_pages()
  *
  * For a kmem folio a caller should hold an rcu read lock to protect memcg
  * associated with a kmem folio from being released.
@@ -1867,26 +1853,6 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 	return p->memcg_in_oom;
 }
 
-void folio_memcg_lock(struct folio *folio);
-void folio_memcg_unlock(struct folio *folio);
-
-/* try to stablize folio_memcg() for all the pages in a memcg */
-static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
-{
-	rcu_read_lock();
-
-	if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
-		return true;
-
-	rcu_read_unlock();
-	return false;
-}
-
-static inline void mem_cgroup_unlock_pages(void)
-{
-	rcu_read_unlock();
-}
-
 static inline void mem_cgroup_enter_user_fault(void)
 {
 	WARN_ON(current->in_user_fault);
@@ -1908,26 +1874,6 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
 	return 0;
 }
 
-static inline void folio_memcg_lock(struct folio *folio)
-{
-}
-
-static inline void folio_memcg_unlock(struct folio *folio)
-{
-}
-
-static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
-{
-	/* to match folio_memcg_rcu() */
-	rcu_read_lock();
-	return true;
-}
-
-static inline void mem_cgroup_unlock_pages(void)
-{
-	rcu_read_unlock();
-}
-
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
 	return false;
diff --git a/mm/filemap.c b/mm/filemap.c
index 630a1c431ea1..e582a1545d2a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -119,7 +119,6 @@
  *    ->i_pages lock		(folio_remove_rmap_pte->set_page_dirty)
  *    bdi.wb->list_lock		(folio_remove_rmap_pte->set_page_dirty)
  *    ->inode->i_lock		(folio_remove_rmap_pte->set_page_dirty)
- *    ->memcg->move_lock	(folio_remove_rmap_pte->folio_memcg_lock)
  *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->block_dirty_folio)
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 9c0fba8c8a83..539ceefa9d2d 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -401,87 +401,6 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
 	return nr_reclaimed;
 }
 
-/**
- * folio_memcg_lock - Bind a folio to its memcg.
- * @folio: The folio.
- *
- * This function prevents unlocked LRU folios from being moved to
- * another cgroup.
- *
- * It ensures lifetime of the bound memcg.  The caller is responsible
- * for the lifetime of the folio.
- */
-void folio_memcg_lock(struct folio *folio)
-{
-	struct mem_cgroup *memcg;
-	unsigned long flags;
-
-	/*
-	 * The RCU lock is held throughout the transaction.  The fast
-	 * path can get away without acquiring the memcg->move_lock
-	 * because page moving starts with an RCU grace period.
-         */
-	rcu_read_lock();
-
-	if (mem_cgroup_disabled())
-		return;
-again:
-	memcg = folio_memcg(folio);
-	if (unlikely(!memcg))
-		return;
-
-#ifdef CONFIG_PROVE_LOCKING
-	local_irq_save(flags);
-	might_lock(&memcg->move_lock);
-	local_irq_restore(flags);
-#endif
-
-	if (atomic_read(&memcg->moving_account) <= 0)
-		return;
-
-	spin_lock_irqsave(&memcg->move_lock, flags);
-	if (memcg != folio_memcg(folio)) {
-		spin_unlock_irqrestore(&memcg->move_lock, flags);
-		goto again;
-	}
-
-	/*
-	 * When charge migration first begins, we can have multiple
-	 * critical sections holding the fast-path RCU lock and one
-	 * holding the slowpath move_lock. Track the task who has the
-	 * move_lock for folio_memcg_unlock().
-	 */
-	memcg->move_lock_task = current;
-	memcg->move_lock_flags = flags;
-}
-
-static void __folio_memcg_unlock(struct mem_cgroup *memcg)
-{
-	if (memcg && memcg->move_lock_task == current) {
-		unsigned long flags = memcg->move_lock_flags;
-
-		memcg->move_lock_task = NULL;
-		memcg->move_lock_flags = 0;
-
-		spin_unlock_irqrestore(&memcg->move_lock, flags);
-	}
-
-	rcu_read_unlock();
-}
-
-/**
- * folio_memcg_unlock - Release the binding between a folio and its memcg.
- * @folio: The folio.
- *
- * This releases the binding created by folio_memcg_lock().  This does
- * not change the accounting of this folio to its memcg, but it does
- * permit others to change it.
- */
-void folio_memcg_unlock(struct folio *folio)
-{
-	__folio_memcg_unlock(folio_memcg(folio));
-}
-
 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
 				struct cftype *cft)
 {
@@ -1189,7 +1108,6 @@ void memcg1_memcg_init(struct mem_cgroup *memcg)
 {
 	INIT_LIST_HEAD(&memcg->oom_notify);
 	mutex_init(&memcg->thresholds_lock);
-	spin_lock_init(&memcg->move_lock);
 	INIT_LIST_HEAD(&memcg->event_list);
 	spin_lock_init(&memcg->event_list_lock);
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ec4ac6eb650b..39e902c1dd9f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1189,7 +1189,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
  * These functions are safe to use under any of the following conditions:
  * - folio locked
  * - folio_test_lru false
- * - folio_memcg_lock()
  * - folio frozen (refcount of 0)
  *
  * Return: The lruvec this folio is on with its lock held.
@@ -1211,7 +1210,6 @@ struct lruvec *folio_lruvec_lock(struct folio *folio)
  * These functions are safe to use under any of the following conditions:
  * - folio locked
  * - folio_test_lru false
- * - folio_memcg_lock()
  * - folio frozen (refcount of 0)
  *
  * Return: The lruvec this folio is on with its lock held and interrupts
@@ -1235,7 +1233,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
  * These functions are safe to use under any of the following conditions:
  * - folio locked
  * - folio_test_lru false
- * - folio_memcg_lock()
  * - folio frozen (refcount of 0)
  *
  * Return: The lruvec this folio is on with its lock held and interrupts
@@ -2375,9 +2372,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 	 *
 	 * - the page lock
 	 * - LRU isolation
-	 * - folio_memcg_lock()
 	 * - exclusive reference
-	 * - mem_cgroup_trylock_pages()
 	 */
 	folio->memcg_data = (unsigned long)memcg;
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 73d5998677d4..4d75433330f9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,7 +32,6 @@
  *                   swap_lock (in swap_duplicate, swap_info_get)
  *                     mmlist_lock (in mmput, drain_mmlist and others)
  *                     mapping->private_lock (in block_dirty_folio)
- *                       folio_lock_memcg move_lock (in block_dirty_folio)
  *                         i_pages lock (widely used)
  *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
  *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
-- 
cgit v1.2.3


From 906c38ff52e95575ddf3281bee531eded3dba150 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Sat, 26 Oct 2024 09:37:07 -0700
Subject: memcg: workingset: remove folio_memcg_rcu usage

The function workingset_activation() is called from folio_mark_accessed()
with the guarantee that the given folio can not be freed under us in
workingset_activation().  In addition, the association of the folio and
its memcg can not be broken here because charge migration is no more.
There is no need to use folio_memcg_rcu.  Simply use folio_memcg_charged()
because that is what this function cares about.

[akpm@linux-foundation.org: provide folio_memcg_charged stub for CONFIG_MEMCG=n]
Link: https://lkml.kernel.org/r/20241026163707.2479526-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Suggested-by: Yu Zhao <yuzhao@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 34 ++--------------------------------
 mm/workingset.c            | 14 ++------------
 2 files changed, 4 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bb49e0d4b377..8e4608be811d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -443,35 +443,6 @@ static inline bool folio_memcg_charged(struct folio *folio)
 	return __folio_memcg(folio) != NULL;
 }
 
-/**
- * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
- * @folio: Pointer to the folio.
- *
- * This function assumes that the folio is known to have a
- * proper memory cgroup pointer. It's not safe to call this function
- * against some type of folios, e.g. slab folios or ex-slab folios.
- *
- * Return: A pointer to the memory cgroup associated with the folio,
- * or NULL.
- */
-static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
-{
-	unsigned long memcg_data = READ_ONCE(folio->memcg_data);
-
-	VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
-
-	if (memcg_data & MEMCG_DATA_KMEM) {
-		struct obj_cgroup *objcg;
-
-		objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
-		return obj_cgroup_memcg(objcg);
-	}
-
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
-}
-
 /*
  * folio_memcg_check - Get the memory cgroup associated with a folio.
  * @folio: Pointer to the folio.
@@ -1084,10 +1055,9 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio)
 	return NULL;
 }
 
-static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
+static inline bool folio_memcg_charged(struct folio *folio)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held());
-	return NULL;
+	return false;
 }
 
 static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
diff --git a/mm/workingset.c b/mm/workingset.c
index a2b28e356e68..0e38bec261a4 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -591,22 +591,12 @@ void workingset_refault(struct folio *folio, void *shadow)
  */
 void workingset_activation(struct folio *folio)
 {
-	struct mem_cgroup *memcg;
-
-	rcu_read_lock();
 	/*
 	 * Filter non-memcg pages here, e.g. unmap can call
 	 * mark_page_accessed() on VDSO pages.
-	 *
-	 * XXX: See workingset_refault() - this should return
-	 * root_mem_cgroup even for !CONFIG_MEMCG.
 	 */
-	memcg = folio_memcg_rcu(folio);
-	if (!mem_cgroup_disabled() && !memcg)
-		goto out;
-	workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
-out:
-	rcu_read_unlock();
+	if (mem_cgroup_disabled() || folio_memcg_charged(folio))
+		workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
 }
 
 /*
-- 
cgit v1.2.3


From 8c462d56487e3abdbf8a61cedfe7c795a54f4a78 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 6 Nov 2024 16:04:48 +0000
Subject: arm64: smccc: Remove broken support for SMCCCv1.3 SVE discard hint

SMCCCv1.3 added a hint bit which callers can set in an SMCCC function ID
(AKA "FID") to indicate that it is acceptable for the SMCCC
implementation to discard SVE and/or SME state over a specific SMCCC
call. The kernel support for using this hint is broken and SMCCC calls
may clobber the SVE and/or SME state of arbitrary tasks, though FPSIMD
state is unaffected.

The kernel support is intended to use the hint when there is no SVE or
SME state to save, and to do this it checks whether TIF_FOREIGN_FPSTATE
is set or TIF_SVE is clear in assembly code:

|        ldr     <flags>, [<current_task>, #TSK_TI_FLAGS]
|        tbnz    <flags>, #TIF_FOREIGN_FPSTATE, 1f   // Any live FP state?
|        tbnz    <flags>, #TIF_SVE, 2f               // Does that state include SVE?
|
| 1:     orr     <fid>, <fid>, ARM_SMCCC_1_3_SVE_HINT
| 2:
|        << SMCCC call using FID >>

This is not safe as-is:

(1) SMCCC calls can be made in a preemptible context and preemption can
    result in TIF_FOREIGN_FPSTATE being set or cleared at arbitrary
    points in time. Thus checking for TIF_FOREIGN_FPSTATE provides no
    guarantee.

(2) TIF_FOREIGN_FPSTATE only indicates that the live FP/SVE/SME state in
    the CPU does not belong to the current task, and does not indicate
    that clobbering this state is acceptable.

    When the live CPU state is clobbered it is necessary to update
    fpsimd_last_state.st to ensure that a subsequent context switch will
    reload FP/SVE/SME state from memory rather than consuming the
    clobbered state. This and the SMCCC call itself must happen in a
    critical section with preemption disabled to avoid races.

(3) Live SVE/SME state can exist with TIF_SVE clear (e.g. with only
    TIF_SME set), and checking TIF_SVE alone is insufficient.

Remove the broken support for the SMCCCv1.3 SVE saving hint. This is
effectively a revert of commits:

* cfa7ff959a78 ("arm64: smccc: Support SMCCC v1.3 SVE register saving hint")
* a7c3acca5380 ("arm64: smccc: Save lr before calling __arm_smccc_sve_check()")

... leaving behind the ARM_SMCCC_VERSION_1_3 and ARM_SMCCC_1_3_SVE_HINT
definitions, since these are simply definitions from the SMCCC
specification, and the latter is used in KVM via ARM_SMCCC_CALL_HINTS.

If we want to bring this back in future, we'll probably want to handle
this logic in C where we can use all the usual FPSIMD/SVE/SME helper
functions, and that'll likely require some rework of the SMCCC code
and/or its callers.

Fixes: cfa7ff959a78 ("arm64: smccc: Support SMCCC v1.3 SVE register saving hint")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: stable@vger.kernel.org
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241106160448.2712997-1-mark.rutland@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/smccc-call.S | 35 +++--------------------------------
 drivers/firmware/smccc/smccc.c |  4 ----
 include/linux/arm-smccc.h      | 32 +++-----------------------------
 3 files changed, 6 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S
index 487381164ff6..2def9d0dd3dd 100644
--- a/arch/arm64/kernel/smccc-call.S
+++ b/arch/arm64/kernel/smccc-call.S
@@ -7,48 +7,19 @@
 
 #include <asm/asm-offsets.h>
 #include <asm/assembler.h>
-#include <asm/thread_info.h>
-
-/*
- * If we have SMCCC v1.3 and (as is likely) no SVE state in
- * the registers then set the SMCCC hint bit to say there's no
- * need to preserve it.  Do this by directly adjusting the SMCCC
- * function value which is already stored in x0 ready to be called.
- */
-SYM_FUNC_START(__arm_smccc_sve_check)
-
-	ldr_l	x16, smccc_has_sve_hint
-	cbz	x16, 2f
-
-	get_current_task x16
-	ldr	x16, [x16, #TSK_TI_FLAGS]
-	tbnz	x16, #TIF_FOREIGN_FPSTATE, 1f	// Any live FP state?
-	tbnz	x16, #TIF_SVE, 2f		// Does that state include SVE?
-
-1:	orr	x0, x0, ARM_SMCCC_1_3_SVE_HINT
-
-2:	ret
-SYM_FUNC_END(__arm_smccc_sve_check)
-EXPORT_SYMBOL(__arm_smccc_sve_check)
 
 	.macro SMCCC instr
-	stp     x29, x30, [sp, #-16]!
-	mov	x29, sp
-alternative_if ARM64_SVE
-	bl	__arm_smccc_sve_check
-alternative_else_nop_endif
 	\instr	#0
-	ldr	x4, [sp, #16]
+	ldr	x4, [sp]
 	stp	x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS]
 	stp	x2, x3, [x4, #ARM_SMCCC_RES_X2_OFFS]
-	ldr	x4, [sp, #24]
+	ldr	x4, [sp, #8]
 	cbz	x4, 1f /* no quirk structure */
 	ldr	x9, [x4, #ARM_SMCCC_QUIRK_ID_OFFS]
 	cmp	x9, #ARM_SMCCC_QUIRK_QCOM_A6
 	b.ne	1f
 	str	x6, [x4, ARM_SMCCC_QUIRK_STATE_OFFS]
-1:	ldp     x29, x30, [sp], #16
-	ret
+1:	ret
 	.endm
 
 /*
diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c
index d670635914ec..a74600d9f2d7 100644
--- a/drivers/firmware/smccc/smccc.c
+++ b/drivers/firmware/smccc/smccc.c
@@ -16,7 +16,6 @@ static u32 smccc_version = ARM_SMCCC_VERSION_1_0;
 static enum arm_smccc_conduit smccc_conduit = SMCCC_CONDUIT_NONE;
 
 bool __ro_after_init smccc_trng_available = false;
-u64 __ro_after_init smccc_has_sve_hint = false;
 s32 __ro_after_init smccc_soc_id_version = SMCCC_RET_NOT_SUPPORTED;
 s32 __ro_after_init smccc_soc_id_revision = SMCCC_RET_NOT_SUPPORTED;
 
@@ -28,9 +27,6 @@ void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
 	smccc_conduit = conduit;
 
 	smccc_trng_available = smccc_probe_trng();
-	if (IS_ENABLED(CONFIG_ARM64_SVE) &&
-	    smccc_version >= ARM_SMCCC_VERSION_1_3)
-		smccc_has_sve_hint = true;
 
 	if ((smccc_version >= ARM_SMCCC_VERSION_1_2) &&
 	    (smccc_conduit != SMCCC_CONDUIT_NONE)) {
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index f59099a213d0..67f6fdf2e7cd 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -315,8 +315,6 @@ u32 arm_smccc_get_version(void);
 
 void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit);
 
-extern u64 smccc_has_sve_hint;
-
 /**
  * arm_smccc_get_soc_id_version()
  *
@@ -414,15 +412,6 @@ struct arm_smccc_quirk {
 	} state;
 };
 
-/**
- * __arm_smccc_sve_check() - Set the SVE hint bit when doing SMC calls
- *
- * Sets the SMCCC hint bit to indicate if there is live state in the SVE
- * registers, this modifies x0 in place and should never be called from C
- * code.
- */
-asmlinkage unsigned long __arm_smccc_sve_check(unsigned long x0);
-
 /**
  * __arm_smccc_smc() - make SMC calls
  * @a0-a7: arguments passed in registers 0 to 7
@@ -490,20 +479,6 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 
 #endif
 
-/* nVHE hypervisor doesn't have a current thread so needs separate checks */
-#if defined(CONFIG_ARM64_SVE) && !defined(__KVM_NVHE_HYPERVISOR__)
-
-#define SMCCC_SVE_CHECK ALTERNATIVE("nop \n",  "bl __arm_smccc_sve_check \n", \
-				    ARM64_SVE)
-#define smccc_sve_clobbers "x16", "x30", "cc",
-
-#else
-
-#define SMCCC_SVE_CHECK
-#define smccc_sve_clobbers
-
-#endif
-
 #define __constraint_read_2	"r" (arg0)
 #define __constraint_read_3	__constraint_read_2, "r" (arg1)
 #define __constraint_read_4	__constraint_read_3, "r" (arg2)
@@ -574,12 +549,11 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 		register unsigned long r3 asm("r3"); 			\
 		CONCATENATE(__declare_arg_,				\
 			    COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__);	\
-		asm volatile(SMCCC_SVE_CHECK				\
-			     inst "\n" :				\
+		asm volatile(inst "\n" :				\
 			     "=r" (r0), "=r" (r1), "=r" (r2), "=r" (r3)	\
 			     : CONCATENATE(__constraint_read_,		\
 					   COUNT_ARGS(__VA_ARGS__))	\
-			     : smccc_sve_clobbers "memory");		\
+			     : "memory");				\
 		if (___res)						\
 			*___res = (typeof(*___res)){r0, r1, r2, r3};	\
 	} while (0)
@@ -628,7 +602,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 		asm ("" :						\
 		     : CONCATENATE(__constraint_read_,			\
 				   COUNT_ARGS(__VA_ARGS__))		\
-		     : smccc_sve_clobbers "memory");			\
+		     : "memory");					\
 		if (___res)						\
 			___res->a0 = SMCCC_RET_NOT_SUPPORTED;		\
 	} while (0)
-- 
cgit v1.2.3


From 495e7c8e9601245738a6ab53c992cc5aa0b13cb4 Mon Sep 17 00:00:00 2001
From: Jinjian Song <jinjian.song@fibocom.com>
Date: Mon, 4 Nov 2024 17:44:34 +0800
Subject: wwan: core: Add WWAN ADB and MIPC port type

Add new WWAN ports that connect to the device's ADB protocol interface
and MTK MIPC diagnostic interface.

Signed-off-by: Jinjian Song <jinjian.song@fibocom.com>
Reviewed-by: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/wwan/wwan_core.c | 8 ++++++++
 include/linux/wwan.h         | 4 ++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
index 8dc88cc65781..a51e2755991a 100644
--- a/drivers/net/wwan/wwan_core.c
+++ b/drivers/net/wwan/wwan_core.c
@@ -334,6 +334,14 @@ static const struct {
 		.name = "FASTBOOT",
 		.devsuf = "fastboot",
 	},
+	[WWAN_PORT_ADB] = {
+		.name = "ADB",
+		.devsuf = "adb",
+	},
+	[WWAN_PORT_MIPC] = {
+		.name = "MIPC",
+		.devsuf = "mipc",
+	},
 };
 
 static ssize_t type_show(struct device *dev, struct device_attribute *attr,
diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index 170fdee6339c..79c781875c09 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -17,6 +17,8 @@
  * @WWAN_PORT_FIREHOSE: XML based command protocol
  * @WWAN_PORT_XMMRPC: Control protocol for Intel XMM modems
  * @WWAN_PORT_FASTBOOT: Fastboot protocol control
+ * @WWAN_PORT_ADB: ADB protocol control
+ * @WWAN_PORT_MIPC: MTK MIPC diagnostic interface
  *
  * @WWAN_PORT_MAX: Highest supported port types
  * @WWAN_PORT_UNKNOWN: Special value to indicate an unknown port type
@@ -30,6 +32,8 @@ enum wwan_port_type {
 	WWAN_PORT_FIREHOSE,
 	WWAN_PORT_XMMRPC,
 	WWAN_PORT_FASTBOOT,
+	WWAN_PORT_ADB,
+	WWAN_PORT_MIPC,
 
 	/* Add new port types above this line */
 
-- 
cgit v1.2.3


From ab9bc81c1cf0efc7fc5a3aa4e562aa88d09ada57 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 7 Nov 2024 05:45:34 -0700
Subject: Revert "block: pre-calculate max_zone_append_sectors"

This causes issue on, at least, nvme-mpath where my boot fails with:

WARNING: CPU: 354 PID: 2729 at block/blk-settings.c:75 blk_validate_limits+0x356/0x380
Modules linked in: tg3(+) nvme usbcore scsi_mod ptp i2c_piix4 libphy nvme_core crc32c_intel scsi_common usb_common pps_core i2c_smbus
CPU: 354 UID: 0 PID: 2729 Comm: kworker/u2061:1 Not tainted 6.12.0-rc6+ #181
Hardware name: Dell Inc. PowerEdge R7625/06444F, BIOS 1.8.3 04/02/2024
Workqueue: async async_run_entry_fn
RIP: 0010:blk_validate_limits+0x356/0x380
Code: f6 47 01 04 75 28 83 bf 94 00 00 00 00 75 39 83 bf 98 00 00 00 00 75 34 83 7f 68 00 75 32 31 c0 83 7f 5c 00 0f 84 9b fd ff ff <0f> 0b eb 13 0f 0b eb 0f 48 c7 c0 74 12 58 92 48 89 c7 e8 13 76 46
RSP: 0018:ffffa8a1dfb93b30 EFLAGS: 00010286
RAX: 0000000000000000 RBX: ffff9232829c8388 RCX: 0000000000000088
RDX: 0000000000000080 RSI: 0000000000000200 RDI: ffffa8a1dfb93c38
RBP: 000000000000000c R08: 00000000ffffffff R09: 000000000000ffff
R10: 0000000000000000 R11: 0000000000000000 R12: ffff9232829b9000
R13: ffff9232829b9010 R14: ffffa8a1dfb93c38 R15: ffffa8a1dfb93c38
FS:  0000000000000000(0000) GS:ffff923867c80000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055c1b92480a8 CR3: 0000002484ff0002 CR4: 0000000000370ef0
Call Trace:
 <TASK>
 ? __warn+0xca/0x1a0
 ? blk_validate_limits+0x356/0x380
 ? report_bug+0x11a/0x1a0
 ? handle_bug+0x5e/0x90
 ? exc_invalid_op+0x16/0x40
 ? asm_exc_invalid_op+0x16/0x20
 ? blk_validate_limits+0x356/0x380
 blk_alloc_queue+0x7a/0x250
 __blk_alloc_disk+0x39/0x80
 nvme_mpath_alloc_disk+0x13d/0x1b0 [nvme_core]
 nvme_scan_ns+0xcc7/0x1010 [nvme_core]
 async_run_entry_fn+0x27/0x120
 process_scheduled_works+0x1a0/0x360
 worker_thread+0x2bc/0x350
 ? pr_cont_work+0x1b0/0x1b0
 kthread+0x111/0x120
 ? kthread_unuse_mm+0x90/0x90
 ret_from_fork+0x30/0x40
 ? kthread_unuse_mm+0x90/0x90
 ret_from_fork_asm+0x11/0x20
 </TASK>
---[ end trace 0000000000000000 ]---

presumably due to max_zone_append_sectors not being cleared to zero,
resulting in blk_validate_zoned_limits() complaining and failing.

This reverts commit 2a8f6153e1c2db06a537a5c9d61102eb591776f1.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c               |  2 +-
 block/blk-merge.c              |  3 ++-
 block/blk-settings.c           | 25 +++++++++++++------------
 block/blk-sysfs.c              | 17 ++++++++++++++---
 drivers/block/null_blk/zoned.c |  2 +-
 drivers/block/ublk_drv.c       |  2 +-
 drivers/block/virtio_blk.c     |  2 +-
 drivers/md/dm-zone.c           |  4 ++--
 drivers/nvme/host/multipath.c  |  2 +-
 drivers/nvme/host/zns.c        |  2 +-
 drivers/scsi/sd_zbc.c          |  2 ++
 include/linux/blkdev.h         | 21 ++++++++++++++++++---
 12 files changed, 57 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 5df4607321ca..09d10bb95fda 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -607,7 +607,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
 		return BLK_STS_IOERR;
 
 	/* Make sure the BIO is small enough and will not get split */
-	if (nr_sectors > q->limits.max_zone_append_sectors)
+	if (nr_sectors > queue_max_zone_append_sectors(q))
 		return BLK_STS_IOERR;
 
 	bio->bi_opf |= REQ_NOMERGE;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 7c1375a080ad..d813d799cee7 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -388,10 +388,11 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 struct bio *bio_split_zone_append(struct bio *bio,
 		const struct queue_limits *lim, unsigned *nr_segs)
 {
+	unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim);
 	int split_sectors;
 
 	split_sectors = bio_split_rw_at(bio, lim, nr_segs,
-			lim->max_zone_append_sectors << SECTOR_SHIFT);
+			max_sectors << SECTOR_SHIFT);
 	if (WARN_ON_ONCE(split_sectors > 0))
 		split_sectors = -EINVAL;
 	return bio_submit_split(bio, split_sectors);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5cb69d85af0e..5ee3d6d1448d 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -91,16 +91,17 @@ static int blk_validate_zoned_limits(struct queue_limits *lim)
 	if (lim->zone_write_granularity < lim->logical_block_size)
 		lim->zone_write_granularity = lim->logical_block_size;
 
-	/*
-	 * The Zone Append size is limited by the maximum I/O size and the zone
-	 * size given that it can't span zones.
-	 *
-	 * If no max_hw_zone_append_sectors limit is provided, the block layer
-	 * will emulated it, else we're also bound by the hardware limit.
-	 */
-	lim->max_zone_append_sectors =
-		min_not_zero(lim->max_hw_zone_append_sectors,
-			min(lim->chunk_sectors, lim->max_hw_sectors));
+	if (lim->max_zone_append_sectors) {
+		/*
+		 * The Zone Append size is limited by the maximum I/O size
+		 * and the zone size given that it can't span zones.
+		 */
+		lim->max_zone_append_sectors =
+			min3(lim->max_hw_sectors,
+			     lim->max_zone_append_sectors,
+			     lim->chunk_sectors);
+	}
+
 	return 0;
 }
 
@@ -526,8 +527,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
 	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
 					b->max_write_zeroes_sectors);
-	t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
-					b->max_hw_zone_append_sectors);
+	t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
+					 queue_limits_max_zone_append_sectors(b));
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
 					    b->seg_boundary_mask);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d9f22122ae2f..741b95dfdbf6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -131,7 +131,6 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
-QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
 
 #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field)			\
 static ssize_t queue_##_field##_show(struct gendisk *disk, char *page)	\
@@ -179,6 +178,18 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
 	return ret;
 }
 
+/*
+ * For zone append queue_max_zone_append_sectors does not just return the
+ * underlying queue limits, but actually contains a calculation.  Because of
+ * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here.
+ */
+static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page)
+{
+	return sprintf(page, "%llu\n",
+		(u64)queue_max_zone_append_sectors(disk->queue) <<
+			SECTOR_SHIFT);
+}
+
 static ssize_t
 queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count)
 {
@@ -468,7 +479,7 @@ QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
 
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
-QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
 
 QUEUE_RO_ENTRY(queue_zoned, "zoned");
@@ -596,7 +607,7 @@ static struct attribute *queue_attrs[] = {
 	&queue_atomic_write_unit_max_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_max_write_zeroes_sectors_entry.attr,
-	&queue_max_zone_append_sectors_entry.attr,
+	&queue_zone_append_max_entry.attr,
 	&queue_zone_write_granularity_entry.attr,
 	&queue_rotational_entry.attr,
 	&queue_zoned_entry.attr,
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 0d5f9bf95229..9bc768b2ca56 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -166,7 +166,7 @@ int null_init_zoned_dev(struct nullb_device *dev,
 
 	lim->features |= BLK_FEAT_ZONED;
 	lim->chunk_sectors = dev->zone_size_sects;
-	lim->max_hw_zone_append_sectors = dev->zone_append_max_sectors;
+	lim->max_zone_append_sectors = dev->zone_append_max_sectors;
 	lim->max_open_zones = dev->zone_max_open;
 	lim->max_active_zones = dev->zone_max_active;
 	return 0;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 8d938b2b41ee..59951e7c2593 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2270,7 +2270,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 		lim.features |= BLK_FEAT_ZONED;
 		lim.max_active_zones = p->max_active_zones;
 		lim.max_open_zones =  p->max_open_zones;
-		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
+		lim.max_zone_append_sectors = p->max_zone_append_sectors;
 	}
 
 	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 0e99a4714928..194417abc105 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -784,7 +784,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk *vblk,
 			wg, v);
 		return -ENODEV;
 	}
-	lim->max_hw_zone_append_sectors = v;
+	lim->max_zone_append_sectors = v;
 	dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
 
 	return 0;
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 20edd3fabbab..c0d41c36e06e 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -344,7 +344,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
 		clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
 	} else {
 		set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
-		lim->max_hw_zone_append_sectors = 0;
+		lim->max_zone_append_sectors = 0;
 	}
 
 	/*
@@ -379,7 +379,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
 	if (!zlim.mapped_nr_seq_zones) {
 		lim->max_open_zones = 0;
 		lim->max_active_zones = 0;
-		lim->max_hw_zone_append_sectors = 0;
+		lim->max_zone_append_sectors = 0;
 		lim->zone_write_granularity = 0;
 		lim->chunk_sectors = 0;
 		lim->features &= ~BLK_FEAT_ZONED;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index c26cb7d3a2e5..6a15873055b9 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -636,7 +636,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	if (head->ids.csi == NVME_CSI_ZNS)
 		lim.features |= BLK_FEAT_ZONED;
 	else
-		lim.max_hw_zone_append_sectors = 0;
+		lim.max_zone_append_sectors = 0;
 
 	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
 	if (IS_ERR(head->disk))
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 382949e18c6a..9a06f9d98cd6 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -111,7 +111,7 @@ void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
 	lim->features |= BLK_FEAT_ZONED;
 	lim->max_open_zones = zi->max_open_zones;
 	lim->max_active_zones = zi->max_active_zones;
-	lim->max_hw_zone_append_sectors = ns->ctrl->max_zone_append;
+	lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
 	lim->chunk_sectors = ns->head->zsze =
 		nvme_lba_to_sect(ns->head, zi->zone_size);
 }
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index de5c54c057ec..ee2b74238758 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -634,6 +634,8 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim,
 		lim->max_open_zones = sdkp->zones_max_open;
 	lim->max_active_zones = 0;
 	lim->chunk_sectors = logical_to_sectors(sdkp->device, zone_blocks);
+	/* Enable block layer zone append emulation */
+	lim->max_zone_append_sectors = 0;
 
 	return 0;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6d1413bd69a5..7bfc877e159e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -375,7 +375,6 @@ struct queue_limits {
 	unsigned int		max_user_discard_sectors;
 	unsigned int		max_secure_erase_sectors;
 	unsigned int		max_write_zeroes_sectors;
-	unsigned int		max_hw_zone_append_sectors;
 	unsigned int		max_zone_append_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
@@ -1205,9 +1204,25 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
 	return q->limits.max_segment_size;
 }
 
+static inline unsigned int
+queue_limits_max_zone_append_sectors(const struct queue_limits *l)
+{
+	unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
+
+	return min_not_zero(l->max_zone_append_sectors, max_sectors);
+}
+
+static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
+{
+	if (!blk_queue_is_zoned(q))
+		return 0;
+
+	return queue_limits_max_zone_append_sectors(&q->limits);
+}
+
 static inline bool queue_emulates_zone_append(struct request_queue *q)
 {
-	return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors;
+	return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
 }
 
 static inline bool bdev_emulates_zone_append(struct block_device *bdev)
@@ -1218,7 +1233,7 @@ static inline bool bdev_emulates_zone_append(struct block_device *bdev)
 static inline unsigned int
 bdev_max_zone_append_sectors(struct block_device *bdev)
 {
-	return bdev_limits(bdev)->max_zone_append_sectors;
+	return queue_max_zone_append_sectors(bdev_get_queue(bdev));
 }
 
 static inline unsigned int bdev_max_segments(struct block_device *bdev)
-- 
cgit v1.2.3


From 1900e1a4495b7376cb9b4e58f1d846660f4c9c4b Mon Sep 17 00:00:00 2001
From: Guixin Liu <kanie@linux.alibaba.com>
Date: Wed, 6 Nov 2024 15:34:45 +0800
Subject: nvme: add reservation command's defines

This is a preparation patch for NVMeOF target reservation
commands implantation.
Add the defines of reservation command, such as reservation log
and sub operations.

Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
Tested-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b58d9405d65e..44d048d68503 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -2037,4 +2037,72 @@ struct nvme_completion {
 #define NVME_MINOR(ver)		(((ver) >> 8) & 0xff)
 #define NVME_TERTIARY(ver)	((ver) & 0xff)
 
+enum {
+	NVME_AEN_RESV_LOG_PAGE_AVALIABLE	= 0x00,
+};
+
+enum {
+	NVME_PR_LOG_EMPTY_LOG_PAGE			= 0x00,
+	NVME_PR_LOG_REGISTRATION_PREEMPTED		= 0x01,
+	NVME_PR_LOG_RESERVATION_RELEASED		= 0x02,
+	NVME_PR_LOG_RESERVATOPM_PREEMPTED		= 0x03,
+};
+
+enum {
+	NVME_PR_NOTIFY_BIT_REG_PREEMPTED		= 1,
+	NVME_PR_NOTIFY_BIT_RESV_RELEASED		= 2,
+	NVME_PR_NOTIFY_BIT_RESV_PREEMPTED		= 3,
+};
+
+struct nvme_pr_log {
+	__le64			count;
+	__u8			type;
+	__u8			nr_pages;
+	__u8			rsvd1[2];
+	__le32			nsid;
+	__u8			rsvd2[48];
+};
+
+struct nvmet_pr_register_data {
+	__le64	crkey;
+	__le64	nrkey;
+};
+
+struct nvmet_pr_acquire_data {
+	__le64	crkey;
+	__le64	prkey;
+};
+
+struct nvmet_pr_release_data {
+	__le64	crkey;
+};
+
+enum nvme_pr_capabilities {
+	NVME_PR_SUPPORT_PTPL				= 1,
+	NVME_PR_SUPPORT_WRITE_EXCLUSIVE			= 1 << 1,
+	NVME_PR_SUPPORT_EXCLUSIVE_ACCESS		= 1 << 2,
+	NVME_PR_SUPPORT_WRITE_EXCLUSIVE_REG_ONLY	= 1 << 3,
+	NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_REG_ONLY	= 1 << 4,
+	NVME_PR_SUPPORT_WRITE_EXCLUSIVE_ALL_REGS	= 1 << 5,
+	NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS	= 1 << 6,
+	NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF		= 1 << 7,
+};
+
+enum nvme_pr_register_action {
+	NVME_PR_REGISTER_ACT_REG		= 0,
+	NVME_PR_REGISTER_ACT_UNREG		= 1,
+	NVME_PR_REGISTER_ACT_REPLACE		= 1 << 1,
+};
+
+enum nvme_pr_acquire_action {
+	NVME_PR_ACQUIRE_ACT_ACQUIRE		= 0,
+	NVME_PR_ACQUIRE_ACT_PREEMPT		= 1,
+	NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT	= 1 << 1,
+};
+
+enum nvme_pr_release_action {
+	NVME_PR_RELEASE_ACT_RELEASE		= 0,
+	NVME_PR_RELEASE_ACT_CLEAR		= 1,
+};
+
 #endif /* _LINUX_NVME_H */
-- 
cgit v1.2.3


From c928807f6f6b6d595a7e199591ae297c81de3aeb Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 28 Oct 2024 12:26:53 -0600
Subject: mm/page_alloc: keep track of free highatomic

OOM kills due to vastly overestimated free highatomic reserves were
observed:

  ... invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0 ...
  Node 0 Normal free:1482936kB boost:0kB min:410416kB low:739404kB high:1068392kB reserved_highatomic:1073152KB ...
  Node 0 Normal: 1292*4kB (ME) 1920*8kB (E) 383*16kB (UE) 220*32kB (ME) 340*64kB (E) 2155*128kB (UE) 3243*256kB (UE) 615*512kB (U) 1*1024kB (M) 0*2048kB 0*4096kB = 1477408kB

The second line above shows that the OOM kill was due to the following
condition:

  free (1482936kB) - reserved_highatomic (1073152kB) = 409784KB < min (410416kB)

And the third line shows there were no free pages in any
MIGRATE_HIGHATOMIC pageblocks, which otherwise would show up as type 'H'.
Therefore __zone_watermark_unusable_free() underestimated the usable free
memory by over 1GB, which resulted in the unnecessary OOM kill above.

The comments in __zone_watermark_unusable_free() warns about the potential
risk, i.e.,

  If the caller does not have rights to reserves below the min
  watermark then subtract the high-atomic reserves. This will
  over-estimate the size of the atomic reserve but it avoids a search.

However, it is possible to keep track of free pages in reserved highatomic
pageblocks with a new per-zone counter nr_free_highatomic protected by the
zone lock, to avoid a search when calculating the usable free memory.  And
the cost would be minimal, i.e., simple arithmetics in the highatomic
alloc/free/move paths.

Note that since nr_free_highatomic can be relatively small, using a
per-cpu counter might cause too much drift and defeat its purpose, in
addition to the extra memory overhead.

Dependson e0932b6c1f94 ("mm: page_alloc: consolidate free page accounting") - see [1]

[akpm@linux-foundation.org: s/if/else if/, per Johannes, stealth whitespace tweak]
Link: https://lkml.kernel.org/r/20241028182653.3420139-1-yuzhao@google.com
Link: https://lkml.kernel.org/r/0d0ddb33-fcdc-43e2-801f-0c1df2031afb@suse.cz [1]
Fixes: 0aaa29a56e4f ("mm, page_alloc: reserve pageblocks for high-order atomic allocations on demand")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: Link Lin <linkl@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |  1 +
 mm/page_alloc.c        | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5b1c984daf45..80bc5640bb60 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -823,6 +823,7 @@ struct zone {
 	unsigned long watermark_boost;
 
 	unsigned long nr_reserved_highatomic;
+	unsigned long nr_free_highatomic;
 
 	/*
 	 * We don't know if the memory that we're going to allocate will be
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8ad38cd5e574..c6c7bb3ea71b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -635,6 +635,8 @@ compaction_capture(struct capture_control *capc, struct page *page,
 static inline void account_freepages(struct zone *zone, int nr_pages,
 				     int migratetype)
 {
+	lockdep_assert_held(&zone->lock);
+
 	if (is_migrate_isolate(migratetype))
 		return;
 
@@ -642,6 +644,9 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
 
 	if (is_migrate_cma(migratetype))
 		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
+	else if (is_migrate_highatomic(migratetype))
+		WRITE_ONCE(zone->nr_free_highatomic,
+			   zone->nr_free_highatomic + nr_pages);
 }
 
 /* Used for pages not on another list */
@@ -3079,11 +3084,10 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
 
 	/*
 	 * If the caller does not have rights to reserves below the min
-	 * watermark then subtract the high-atomic reserves. This will
-	 * over-estimate the size of the atomic reserve but it avoids a search.
+	 * watermark then subtract the free pages reserved for highatomic.
 	 */
 	if (likely(!(alloc_flags & ALLOC_RESERVES)))
-		unusable_free += z->nr_reserved_highatomic;
+		unusable_free += READ_ONCE(z->nr_free_highatomic);
 
 #ifdef CONFIG_CMA
 	/* If allocation can't use CMA areas don't use free CMA pages */
-- 
cgit v1.2.3


From 9e05e5c7ee8758141d2db7e8fea2cab34500c6ed Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Mon, 4 Nov 2024 19:54:19 +0000
Subject: signal: restore the override_rlimit logic

Prior to commit d64696905554 ("Reimplement RLIMIT_SIGPENDING on top of
ucounts") UCOUNT_RLIMIT_SIGPENDING rlimit was not enforced for a class of
signals.  However now it's enforced unconditionally, even if
override_rlimit is set.  This behavior change caused production issues.

For example, if the limit is reached and a process receives a SIGSEGV
signal, sigqueue_alloc fails to allocate the necessary resources for the
signal delivery, preventing the signal from being delivered with siginfo.
This prevents the process from correctly identifying the fault address and
handling the error.  From the user-space perspective, applications are
unaware that the limit has been reached and that the siginfo is
effectively 'corrupted'.  This can lead to unpredictable behavior and
crashes, as we observed with java applications.

Fix this by passing override_rlimit into inc_rlimit_get_ucounts() and skip
the comparison to max there if override_rlimit is set.  This effectively
restores the old behavior.

Link: https://lkml.kernel.org/r/20241104195419.3962584-1-roman.gushchin@linux.dev
Fixes: d64696905554 ("Reimplement RLIMIT_SIGPENDING on top of ucounts")
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Co-developed-by: Andrei Vagin <avagin@google.com>
Signed-off-by: Andrei Vagin <avagin@google.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Alexey Gladkov <legion@kernel.org>
Cc: Kees Cook <kees@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/user_namespace.h | 3 ++-
 kernel/signal.c                | 3 ++-
 kernel/ucount.c                | 6 ++++--
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 3625096d5f85..7183e5aca282 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -141,7 +141,8 @@ static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type ty
 
 long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
 bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
-long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type);
+long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
+			    bool override_rlimit);
 void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
 bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 4344860ffcac..cbabb2d05e0a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -419,7 +419,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 	 */
 	rcu_read_lock();
 	ucounts = task_ucounts(t);
-	sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
+	sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING,
+					    override_rlimit);
 	rcu_read_unlock();
 	if (!sigpending)
 		return NULL;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 9469102c5ac0..696406939be5 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -307,7 +307,8 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
 	do_dec_rlimit_put_ucounts(ucounts, NULL, type);
 }
 
-long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
+long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
+			    bool override_rlimit)
 {
 	/* Caller must hold a reference to ucounts */
 	struct ucounts *iter;
@@ -320,7 +321,8 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
 			goto dec_unwind;
 		if (iter == ucounts)
 			ret = new;
-		max = get_userns_rlimit_max(iter->ns, type);
+		if (!override_rlimit)
+			max = get_userns_rlimit_max(iter->ns, type);
 		/*
 		 * Grab an extra ucount reference for the caller when
 		 * the rlimit count was previously 0.
-- 
cgit v1.2.3


From beeb9220c7307fbb61a2cd6575907db52bde722f Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 23 Oct 2024 19:27:04 +0300
Subject: mm: vmalloc: group declarations depending on CONFIG_MMU together

Patch series "x86/module: use large ROX pages for text allocations", v7.

These patches add support for using large ROX pages for allocations of
executable memory on x86.

They address Andy's comments [1] about having executable mappings for code
that was not completely formed.

The approach taken is to allocate ROX memory along with writable but not
executable memory and use the writable copy to perform relocations and
alternatives patching.  After the module text gets into its final shape,
the contents of the writable memory is copied into the actual ROX location
using text poking.

The allocations of the ROX memory use vmalloc(VMAP_ALLOW_HUGE_MAP) to
allocate PMD aligned memory, fill that memory with invalid instructions
and in the end remap it as ROX.  Portions of these large pages are handed
out to execmem_alloc() callers without any changes to the permissions.
When the memory is freed with execmem_free() it is invalidated again so
that it won't contain stale instructions.

The module memory allocation, x86 code dealing with relocations and
alternatives patching take into account the existence of the two copies,
the writable memory and the ROX memory at the actual allocated virtual
address.

[1] https://lore.kernel.org/all/a17c65c6-863f-4026-9c6f-a04b659e9ab4@app.fastmail.com


This patch (of 8):

There are a couple of declarations that depend on CONFIG_MMU in
include/linux/vmalloc.h spread all over the file.

Group them all together to improve code readability.

No functional changes.

Link: https://lkml.kernel.org/r/20241023162711.2579610-1-rppt@kernel.org
Link: https://lkml.kernel.org/r/20241023162711.2579610-2-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Tested-by: kdevops <kdevops@lists.linux.dev>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Song Liu <song@kernel.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vmalloc.h | 60 ++++++++++++++++++++-----------------------------
 1 file changed, 24 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index ad2ce7a6ab7a..27408f21e501 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -134,12 +134,6 @@ extern void vm_unmap_ram(const void *mem, unsigned int count);
 extern void *vm_map_ram(struct page **pages, unsigned int count, int node);
 extern void vm_unmap_aliases(void);
 
-#ifdef CONFIG_MMU
-extern unsigned long vmalloc_nr_pages(void);
-#else
-static inline unsigned long vmalloc_nr_pages(void) { return 0; }
-#endif
-
 extern void *vmalloc_noprof(unsigned long size) __alloc_size(1);
 #define vmalloc(...)		alloc_hooks(vmalloc_noprof(__VA_ARGS__))
 
@@ -266,12 +260,29 @@ static inline bool is_vm_area_hugepages(const void *addr)
 #endif
 }
 
+/* for /proc/kcore */
+long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
+
+/*
+ *	Internals.  Don't use..
+ */
+__init void vm_area_add_early(struct vm_struct *vm);
+__init void vm_area_register_early(struct vm_struct *vm, size_t align);
+
+int register_vmap_purge_notifier(struct notifier_block *nb);
+int unregister_vmap_purge_notifier(struct notifier_block *nb);
+
 #ifdef CONFIG_MMU
+#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
+
+unsigned long vmalloc_nr_pages(void);
+
 int vm_area_map_pages(struct vm_struct *area, unsigned long start,
 		      unsigned long end, struct page **pages);
 void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
 			 unsigned long end);
 void vunmap_range(unsigned long addr, unsigned long end);
+
 static inline void set_vm_flush_reset_perms(void *addr)
 {
 	struct vm_struct *vm = find_vm_area(addr);
@@ -279,24 +290,14 @@ static inline void set_vm_flush_reset_perms(void *addr)
 	if (vm)
 		vm->flags |= VM_FLUSH_RESET_PERMS;
 }
+#else  /* !CONFIG_MMU */
+#define VMALLOC_TOTAL 0UL
 
-#else
-static inline void set_vm_flush_reset_perms(void *addr)
-{
-}
-#endif
-
-/* for /proc/kcore */
-extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
-
-/*
- *	Internals.  Don't use..
- */
-extern __init void vm_area_add_early(struct vm_struct *vm);
-extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
+static inline unsigned long vmalloc_nr_pages(void) { return 0; }
+static inline void set_vm_flush_reset_perms(void *addr) {}
+#endif /* CONFIG_MMU */
 
-#ifdef CONFIG_SMP
-# ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) && defined(CONFIG_SMP)
 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				     const size_t *sizes, int nr_vms,
 				     size_t align);
@@ -311,22 +312,9 @@ pcpu_get_vm_areas(const unsigned long *offsets,
 	return NULL;
 }
 
-static inline void
-pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
-{
-}
-# endif
-#endif
-
-#ifdef CONFIG_MMU
-#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
-#else
-#define VMALLOC_TOTAL 0UL
+static inline void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) {}
 #endif
 
-int register_vmap_purge_notifier(struct notifier_block *nb);
-int unregister_vmap_purge_notifier(struct notifier_block *nb);
-
 #if defined(CONFIG_MMU) && defined(CONFIG_PRINTK)
 bool vmalloc_dump_obj(void *object);
 #else
-- 
cgit v1.2.3


From 0c3beacf681ec897e0b36685a9b49d01f5cb2dfb Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 23 Oct 2024 19:27:06 +0300
Subject: asm-generic: introduce text-patching.h

Several architectures support text patching, but they name the header
files that declare patching functions differently.

Make all such headers consistently named text-patching.h and add an empty
header in asm-generic for architectures that do not support text patching.

Link: https://lkml.kernel.org/r/20241023162711.2579610-4-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # m68k
Acked-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Tested-by: kdevops <kdevops@lists.linux.dev>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Song Liu <song@kernel.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/include/asm/Kbuild             |   1 +
 arch/arc/include/asm/Kbuild               |   1 +
 arch/arm/include/asm/patch.h              |  18 --
 arch/arm/include/asm/text-patching.h      |  18 ++
 arch/arm/kernel/ftrace.c                  |   2 +-
 arch/arm/kernel/jump_label.c              |   2 +-
 arch/arm/kernel/kgdb.c                    |   2 +-
 arch/arm/kernel/patch.c                   |   2 +-
 arch/arm/probes/kprobes/core.c            |   2 +-
 arch/arm/probes/kprobes/opt-arm.c         |   2 +-
 arch/arm64/include/asm/patching.h         |  17 --
 arch/arm64/include/asm/text-patching.h    |  17 ++
 arch/arm64/kernel/ftrace.c                |   2 +-
 arch/arm64/kernel/jump_label.c            |   2 +-
 arch/arm64/kernel/kgdb.c                  |   2 +-
 arch/arm64/kernel/patching.c              |   2 +-
 arch/arm64/kernel/probes/kprobes.c        |   2 +-
 arch/arm64/kernel/traps.c                 |   2 +-
 arch/arm64/net/bpf_jit_comp.c             |   2 +-
 arch/csky/include/asm/Kbuild              |   1 +
 arch/hexagon/include/asm/Kbuild           |   1 +
 arch/loongarch/include/asm/Kbuild         |   1 +
 arch/m68k/include/asm/Kbuild              |   1 +
 arch/microblaze/include/asm/Kbuild        |   1 +
 arch/mips/include/asm/Kbuild              |   1 +
 arch/nios2/include/asm/Kbuild             |   1 +
 arch/openrisc/include/asm/Kbuild          |   1 +
 arch/parisc/include/asm/patch.h           |  13 --
 arch/parisc/include/asm/text-patching.h   |  13 ++
 arch/parisc/kernel/ftrace.c               |   2 +-
 arch/parisc/kernel/jump_label.c           |   2 +-
 arch/parisc/kernel/kgdb.c                 |   2 +-
 arch/parisc/kernel/kprobes.c              |   2 +-
 arch/parisc/kernel/patch.c                |   2 +-
 arch/powerpc/include/asm/code-patching.h  | 275 ------------------------------
 arch/powerpc/include/asm/kprobes.h        |   2 +-
 arch/powerpc/include/asm/text-patching.h  | 275 ++++++++++++++++++++++++++++++
 arch/powerpc/kernel/crash_dump.c          |   2 +-
 arch/powerpc/kernel/epapr_paravirt.c      |   2 +-
 arch/powerpc/kernel/jump_label.c          |   2 +-
 arch/powerpc/kernel/kgdb.c                |   2 +-
 arch/powerpc/kernel/kprobes.c             |   2 +-
 arch/powerpc/kernel/module_32.c           |   2 +-
 arch/powerpc/kernel/module_64.c           |   2 +-
 arch/powerpc/kernel/optprobes.c           |   2 +-
 arch/powerpc/kernel/process.c             |   2 +-
 arch/powerpc/kernel/security.c            |   2 +-
 arch/powerpc/kernel/setup_32.c            |   2 +-
 arch/powerpc/kernel/setup_64.c            |   2 +-
 arch/powerpc/kernel/static_call.c         |   2 +-
 arch/powerpc/kernel/trace/ftrace.c        |   2 +-
 arch/powerpc/kernel/trace/ftrace_64_pg.c  |   2 +-
 arch/powerpc/lib/code-patching.c          |   2 +-
 arch/powerpc/lib/feature-fixups.c         |   2 +-
 arch/powerpc/lib/test-code-patching.c     |   2 +-
 arch/powerpc/lib/test_emulate_step.c      |   2 +-
 arch/powerpc/mm/book3s32/mmu.c            |   2 +-
 arch/powerpc/mm/book3s64/hash_utils.c     |   2 +-
 arch/powerpc/mm/book3s64/slb.c            |   2 +-
 arch/powerpc/mm/kasan/init_32.c           |   2 +-
 arch/powerpc/mm/mem.c                     |   2 +-
 arch/powerpc/mm/nohash/44x.c              |   2 +-
 arch/powerpc/mm/nohash/book3e_pgtable.c   |   2 +-
 arch/powerpc/mm/nohash/tlb.c              |   2 +-
 arch/powerpc/mm/nohash/tlb_64e.c          |   2 +-
 arch/powerpc/net/bpf_jit_comp.c           |   2 +-
 arch/powerpc/perf/8xx-pmu.c               |   2 +-
 arch/powerpc/perf/core-book3s.c           |   2 +-
 arch/powerpc/platforms/85xx/smp.c         |   2 +-
 arch/powerpc/platforms/86xx/mpc86xx_smp.c |   2 +-
 arch/powerpc/platforms/cell/smp.c         |   2 +-
 arch/powerpc/platforms/powermac/smp.c     |   2 +-
 arch/powerpc/platforms/powernv/idle.c     |   2 +-
 arch/powerpc/platforms/powernv/smp.c      |   2 +-
 arch/powerpc/platforms/pseries/smp.c      |   2 +-
 arch/powerpc/xmon/xmon.c                  |   2 +-
 arch/riscv/errata/andes/errata.c          |   2 +-
 arch/riscv/errata/sifive/errata.c         |   2 +-
 arch/riscv/errata/thead/errata.c          |   2 +-
 arch/riscv/include/asm/patch.h            |  16 --
 arch/riscv/include/asm/text-patching.h    |  16 ++
 arch/riscv/include/asm/uprobes.h          |   2 +-
 arch/riscv/kernel/alternative.c           |   2 +-
 arch/riscv/kernel/cpufeature.c            |   3 +-
 arch/riscv/kernel/ftrace.c                |   2 +-
 arch/riscv/kernel/jump_label.c            |   2 +-
 arch/riscv/kernel/patch.c                 |   2 +-
 arch/riscv/kernel/probes/kprobes.c        |   2 +-
 arch/riscv/net/bpf_jit_comp64.c           |   2 +-
 arch/riscv/net/bpf_jit_core.c             |   2 +-
 arch/sh/include/asm/Kbuild                |   1 +
 arch/sparc/include/asm/Kbuild             |   1 +
 arch/um/kernel/um_arch.c                  |   5 +
 arch/x86/include/asm/text-patching.h      |   1 +
 arch/xtensa/include/asm/Kbuild            |   1 +
 include/asm-generic/text-patching.h       |   5 +
 include/linux/text-patching.h             |  15 ++
 97 files changed, 449 insertions(+), 409 deletions(-)
 delete mode 100644 arch/arm/include/asm/patch.h
 create mode 100644 arch/arm/include/asm/text-patching.h
 delete mode 100644 arch/arm64/include/asm/patching.h
 create mode 100644 arch/arm64/include/asm/text-patching.h
 delete mode 100644 arch/parisc/include/asm/patch.h
 create mode 100644 arch/parisc/include/asm/text-patching.h
 delete mode 100644 arch/powerpc/include/asm/code-patching.h
 create mode 100644 arch/powerpc/include/asm/text-patching.h
 delete mode 100644 arch/riscv/include/asm/patch.h
 create mode 100644 arch/riscv/include/asm/text-patching.h
 create mode 100644 include/asm-generic/text-patching.h
 create mode 100644 include/linux/text-patching.h

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 396caece6d6d..483965c5a4de 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -5,3 +5,4 @@ generic-y += agp.h
 generic-y += asm-offsets.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += text-patching.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index 49285a3ce239..4c69522e0328 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -6,3 +6,4 @@ generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += parport.h
 generic-y += user.h
+generic-y += text-patching.h
diff --git a/arch/arm/include/asm/patch.h b/arch/arm/include/asm/patch.h
deleted file mode 100644
index 0b48247c4600..000000000000
--- a/arch/arm/include/asm/patch.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ARM_KERNEL_PATCH_H
-#define _ARM_KERNEL_PATCH_H
-
-void patch_text(void *addr, unsigned int insn);
-void __patch_text_real(void *addr, unsigned int insn, bool remap);
-
-static inline void __patch_text(void *addr, unsigned int insn)
-{
-	__patch_text_real(addr, insn, true);
-}
-
-static inline void __patch_text_early(void *addr, unsigned int insn)
-{
-	__patch_text_real(addr, insn, false);
-}
-
-#endif
diff --git a/arch/arm/include/asm/text-patching.h b/arch/arm/include/asm/text-patching.h
new file mode 100644
index 000000000000..0b48247c4600
--- /dev/null
+++ b/arch/arm/include/asm/text-patching.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARM_KERNEL_PATCH_H
+#define _ARM_KERNEL_PATCH_H
+
+void patch_text(void *addr, unsigned int insn);
+void __patch_text_real(void *addr, unsigned int insn, bool remap);
+
+static inline void __patch_text(void *addr, unsigned int insn)
+{
+	__patch_text_real(addr, insn, true);
+}
+
+static inline void __patch_text_early(void *addr, unsigned int insn)
+{
+	__patch_text_real(addr, insn, false);
+}
+
+#endif
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index e61591f33a6c..845acf9ce21e 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -23,7 +23,7 @@
 #include <asm/insn.h>
 #include <asm/set_memory.h>
 #include <asm/stacktrace.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 
 /*
  * The compiler emitted profiling hook consists of
diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c
index eb9c24b6e8e2..a06a92d0f550 100644
--- a/arch/arm/kernel/jump_label.c
+++ b/arch/arm/kernel/jump_label.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/kernel.h>
 #include <linux/jump_label.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 #include <asm/insn.h>
 
 static void __arch_jump_label_transform(struct jump_entry *entry,
diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index 22f937e6f3ff..ab76c55fd610 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -15,7 +15,7 @@
 #include <linux/kgdb.h>
 #include <linux/uaccess.h>
 
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 #include <asm/traps.h>
 
 struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
index e9e828b6bb30..4d45e60cd46d 100644
--- a/arch/arm/kernel/patch.c
+++ b/arch/arm/kernel/patch.c
@@ -9,7 +9,7 @@
 #include <asm/fixmap.h>
 #include <asm/smp_plat.h>
 #include <asm/opcodes.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 
 struct patch {
 	void *addr;
diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c
index d8238da095df..9fd877c87a38 100644
--- a/arch/arm/probes/kprobes/core.c
+++ b/arch/arm/probes/kprobes/core.c
@@ -25,7 +25,7 @@
 #include <asm/cacheflush.h>
 #include <linux/percpu.h>
 #include <linux/bug.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 #include <asm/sections.h>
 
 #include "../decode-arm.h"
diff --git a/arch/arm/probes/kprobes/opt-arm.c b/arch/arm/probes/kprobes/opt-arm.c
index 7f65048380ca..966c6042c5ad 100644
--- a/arch/arm/probes/kprobes/opt-arm.c
+++ b/arch/arm/probes/kprobes/opt-arm.c
@@ -14,7 +14,7 @@
 /* for arm_gen_branch */
 #include <asm/insn.h>
 /* for patch_text */
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 
 #include "core.h"
 
diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/patching.h
deleted file mode 100644
index 587bdb91ab7a..000000000000
--- a/arch/arm64/include/asm/patching.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef	__ASM_PATCHING_H
-#define	__ASM_PATCHING_H
-
-#include <linux/types.h>
-
-int aarch64_insn_read(void *addr, u32 *insnp);
-int aarch64_insn_write(void *addr, u32 insn);
-
-int aarch64_insn_write_literal_u64(void *addr, u64 val);
-void *aarch64_insn_set(void *dst, u32 insn, size_t len);
-void *aarch64_insn_copy(void *dst, void *src, size_t len);
-
-int aarch64_insn_patch_text_nosync(void *addr, u32 insn);
-int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt);
-
-#endif	/* __ASM_PATCHING_H */
diff --git a/arch/arm64/include/asm/text-patching.h b/arch/arm64/include/asm/text-patching.h
new file mode 100644
index 000000000000..587bdb91ab7a
--- /dev/null
+++ b/arch/arm64/include/asm/text-patching.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef	__ASM_PATCHING_H
+#define	__ASM_PATCHING_H
+
+#include <linux/types.h>
+
+int aarch64_insn_read(void *addr, u32 *insnp);
+int aarch64_insn_write(void *addr, u32 insn);
+
+int aarch64_insn_write_literal_u64(void *addr, u64 val);
+void *aarch64_insn_set(void *dst, u32 insn, size_t len);
+void *aarch64_insn_copy(void *dst, void *src, size_t len);
+
+int aarch64_insn_patch_text_nosync(void *addr, u32 insn);
+int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt);
+
+#endif	/* __ASM_PATCHING_H */
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index a650f5e11fc5..3575d03d60af 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -15,7 +15,7 @@
 #include <asm/debug-monitors.h>
 #include <asm/ftrace.h>
 #include <asm/insn.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
 struct fregs_offset {
diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c
index f63ea915d6ad..b345425193d2 100644
--- a/arch/arm64/kernel/jump_label.c
+++ b/arch/arm64/kernel/jump_label.c
@@ -9,7 +9,7 @@
 #include <linux/jump_label.h>
 #include <linux/smp.h>
 #include <asm/insn.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
 
 bool arch_jump_label_transform_queue(struct jump_entry *entry,
 				     enum jump_label_type type)
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index 4e1f983df3d1..f3c4d3a8a20f 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -17,7 +17,7 @@
 
 #include <asm/debug-monitors.h>
 #include <asm/insn.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
 #include <asm/traps.h>
 
 struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c
index 945df74005c7..7f99723fbb8c 100644
--- a/arch/arm64/kernel/patching.c
+++ b/arch/arm64/kernel/patching.c
@@ -10,7 +10,7 @@
 #include <asm/fixmap.h>
 #include <asm/insn.h>
 #include <asm/kprobes.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
 #include <asm/sections.h>
 
 static DEFINE_RAW_SPINLOCK(patch_lock);
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index 4268678d0e86..01dbe9a56956 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -27,7 +27,7 @@
 #include <asm/debug-monitors.h>
 #include <asm/insn.h>
 #include <asm/irq.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
 #include <asm/ptrace.h>
 #include <asm/sections.h>
 #include <asm/system_misc.h>
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 563cbce11126..7d8199804086 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -41,7 +41,7 @@
 #include <asm/extable.h>
 #include <asm/insn.h>
 #include <asm/kprobes.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
 #include <asm/traps.h>
 #include <asm/smp.h>
 #include <asm/stack_pointer.h>
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 5db82bfc9dc1..1bcae29ff181 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -19,7 +19,7 @@
 #include <asm/cacheflush.h>
 #include <asm/debug-monitors.h>
 #include <asm/insn.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
 #include <asm/set_memory.h>
 
 #include "bpf_jit.h"
diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild
index 9a9bc65b57a9..3a5c7f6e5aac 100644
--- a/arch/csky/include/asm/Kbuild
+++ b/arch/csky/include/asm/Kbuild
@@ -11,3 +11,4 @@ generic-y += qspinlock.h
 generic-y += parport.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
+generic-y += text-patching.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 8c1a78c8f527..1efa1e993d4b 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -5,3 +5,4 @@ generic-y += extable.h
 generic-y += iomap.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += text-patching.h
diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 5b5a6c90e6e2..80ddb5edb845 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -11,3 +11,4 @@ generic-y += ioctl.h
 generic-y += mmzone.h
 generic-y += statfs.h
 generic-y += param.h
+generic-y += text-patching.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 0dbf9c5c6fae..b282e0dd8dc1 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -4,3 +4,4 @@ generic-y += extable.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += spinlock.h
+generic-y += text-patching.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index a055f5dbe00a..7178f990e8b3 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -8,3 +8,4 @@ generic-y += parport.h
 generic-y += syscalls.h
 generic-y += tlb.h
 generic-y += user.h
+generic-y += text-patching.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 7ba67a0d6c97..684569b2ecd6 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -13,3 +13,4 @@ generic-y += parport.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
 generic-y += user.h
+generic-y += text-patching.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 0d09829ed144..28004301c236 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -7,3 +7,4 @@ generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += spinlock.h
 generic-y += user.h
+generic-y += text-patching.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index cef49d60d74c..2b1a6b00cdac 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -9,3 +9,4 @@ generic-y += spinlock.h
 generic-y += qrwlock_types.h
 generic-y += qrwlock.h
 generic-y += user.h
+generic-y += text-patching.h
diff --git a/arch/parisc/include/asm/patch.h b/arch/parisc/include/asm/patch.h
deleted file mode 100644
index 400d84c6e504..000000000000
--- a/arch/parisc/include/asm/patch.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _PARISC_KERNEL_PATCH_H
-#define _PARISC_KERNEL_PATCH_H
-
-/* stop machine and patch kernel text */
-void patch_text(void *addr, unsigned int insn);
-void patch_text_multiple(void *addr, u32 *insn, unsigned int len);
-
-/* patch kernel text with machine already stopped (e.g. in kgdb) */
-void __patch_text(void *addr, u32 insn);
-void __patch_text_multiple(void *addr, u32 *insn, unsigned int len);
-
-#endif
diff --git a/arch/parisc/include/asm/text-patching.h b/arch/parisc/include/asm/text-patching.h
new file mode 100644
index 000000000000..400d84c6e504
--- /dev/null
+++ b/arch/parisc/include/asm/text-patching.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PARISC_KERNEL_PATCH_H
+#define _PARISC_KERNEL_PATCH_H
+
+/* stop machine and patch kernel text */
+void patch_text(void *addr, unsigned int insn);
+void patch_text_multiple(void *addr, u32 *insn, unsigned int len);
+
+/* patch kernel text with machine already stopped (e.g. in kgdb) */
+void __patch_text(void *addr, u32 insn);
+void __patch_text_multiple(void *addr, u32 *insn, unsigned int len);
+
+#endif
diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c
index c91f9c2e61ed..3e34b4473d3a 100644
--- a/arch/parisc/kernel/ftrace.c
+++ b/arch/parisc/kernel/ftrace.c
@@ -20,7 +20,7 @@
 #include <asm/assembly.h>
 #include <asm/sections.h>
 #include <asm/ftrace.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 
 #define __hot __section(".text.hot")
 
diff --git a/arch/parisc/kernel/jump_label.c b/arch/parisc/kernel/jump_label.c
index e253b134500d..ea51f15bf0e6 100644
--- a/arch/parisc/kernel/jump_label.c
+++ b/arch/parisc/kernel/jump_label.c
@@ -8,7 +8,7 @@
 #include <linux/jump_label.h>
 #include <linux/bug.h>
 #include <asm/alternative.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 
 static inline int reassemble_17(int as17)
 {
diff --git a/arch/parisc/kernel/kgdb.c b/arch/parisc/kernel/kgdb.c
index b16fa9bac5f4..fee81f877525 100644
--- a/arch/parisc/kernel/kgdb.c
+++ b/arch/parisc/kernel/kgdb.c
@@ -16,7 +16,7 @@
 #include <asm/ptrace.h>
 #include <asm/traps.h>
 #include <asm/processor.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 #include <asm/cacheflush.h>
 
 const struct kgdb_arch arch_kgdb_ops = {
diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c
index 6e0b86652f30..9255adba67a3 100644
--- a/arch/parisc/kernel/kprobes.c
+++ b/arch/parisc/kernel/kprobes.c
@@ -12,7 +12,7 @@
 #include <linux/kprobes.h>
 #include <linux/slab.h>
 #include <asm/cacheflush.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c
index e59574f65e64..35dd764b871e 100644
--- a/arch/parisc/kernel/patch.c
+++ b/arch/parisc/kernel/patch.c
@@ -13,7 +13,7 @@
 
 #include <asm/cacheflush.h>
 #include <asm/fixmap.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 
 struct patch {
 	void *addr;
diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
deleted file mode 100644
index e7f14720f630..000000000000
--- a/arch/powerpc/include/asm/code-patching.h
+++ /dev/null
@@ -1,275 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ASM_POWERPC_CODE_PATCHING_H
-#define _ASM_POWERPC_CODE_PATCHING_H
-
-/*
- * Copyright 2008, Michael Ellerman, IBM Corporation.
- */
-
-#include <asm/types.h>
-#include <asm/ppc-opcode.h>
-#include <linux/string.h>
-#include <linux/kallsyms.h>
-#include <asm/asm-compat.h>
-#include <asm/inst.h>
-
-/* Flags for create_branch:
- * "b"   == create_branch(addr, target, 0);
- * "ba"  == create_branch(addr, target, BRANCH_ABSOLUTE);
- * "bl"  == create_branch(addr, target, BRANCH_SET_LINK);
- * "bla" == create_branch(addr, target, BRANCH_ABSOLUTE | BRANCH_SET_LINK);
- */
-#define BRANCH_SET_LINK	0x1
-#define BRANCH_ABSOLUTE	0x2
-
-/*
- * Powerpc branch instruction is :
- *
- *  0         6                 30   31
- *  +---------+----------------+---+---+
- *  | opcode  |     LI         |AA |LK |
- *  +---------+----------------+---+---+
- *  Where AA = 0 and LK = 0
- *
- * LI is a signed 24 bits integer. The real branch offset is computed
- * by: imm32 = SignExtend(LI:'0b00', 32);
- *
- * So the maximum forward branch should be:
- *   (0x007fffff << 2) = 0x01fffffc =  0x1fffffc
- * The maximum backward branch should be:
- *   (0xff800000 << 2) = 0xfe000000 = -0x2000000
- */
-static inline bool is_offset_in_branch_range(long offset)
-{
-	return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3));
-}
-
-static inline bool is_offset_in_cond_branch_range(long offset)
-{
-	return offset >= -0x8000 && offset <= 0x7fff && !(offset & 0x3);
-}
-
-static inline int create_branch(ppc_inst_t *instr, const u32 *addr,
-				unsigned long target, int flags)
-{
-	long offset;
-
-	*instr = ppc_inst(0);
-	offset = target;
-	if (! (flags & BRANCH_ABSOLUTE))
-		offset = offset - (unsigned long)addr;
-
-	/* Check we can represent the target in the instruction format */
-	if (!is_offset_in_branch_range(offset))
-		return 1;
-
-	/* Mask out the flags and target, so they don't step on each other. */
-	*instr = ppc_inst(0x48000000 | (flags & 0x3) | (offset & 0x03FFFFFC));
-
-	return 0;
-}
-
-int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
-		       unsigned long target, int flags);
-int patch_branch(u32 *addr, unsigned long target, int flags);
-int patch_instruction(u32 *addr, ppc_inst_t instr);
-int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
-int patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr);
-
-/*
- * The data patching functions patch_uint() and patch_ulong(), etc., must be
- * called on aligned addresses.
- *
- * The instruction patching functions patch_instruction() and similar must be
- * called on addresses satisfying instruction alignment requirements.
- */
-
-#ifdef CONFIG_PPC64
-
-int patch_uint(void *addr, unsigned int val);
-int patch_ulong(void *addr, unsigned long val);
-
-#define patch_u64 patch_ulong
-
-#else
-
-static inline int patch_uint(void *addr, unsigned int val)
-{
-	if (!IS_ALIGNED((unsigned long)addr, sizeof(unsigned int)))
-		return -EINVAL;
-
-	return patch_instruction(addr, ppc_inst(val));
-}
-
-static inline int patch_ulong(void *addr, unsigned long val)
-{
-	if (!IS_ALIGNED((unsigned long)addr, sizeof(unsigned long)))
-		return -EINVAL;
-
-	return patch_instruction(addr, ppc_inst(val));
-}
-
-#endif
-
-#define patch_u32 patch_uint
-
-static inline unsigned long patch_site_addr(s32 *site)
-{
-	return (unsigned long)site + *site;
-}
-
-static inline int patch_instruction_site(s32 *site, ppc_inst_t instr)
-{
-	return patch_instruction((u32 *)patch_site_addr(site), instr);
-}
-
-static inline int patch_branch_site(s32 *site, unsigned long target, int flags)
-{
-	return patch_branch((u32 *)patch_site_addr(site), target, flags);
-}
-
-static inline int modify_instruction(unsigned int *addr, unsigned int clr,
-				     unsigned int set)
-{
-	return patch_instruction(addr, ppc_inst((*addr & ~clr) | set));
-}
-
-static inline int modify_instruction_site(s32 *site, unsigned int clr, unsigned int set)
-{
-	return modify_instruction((unsigned int *)patch_site_addr(site), clr, set);
-}
-
-static inline unsigned int branch_opcode(ppc_inst_t instr)
-{
-	return ppc_inst_primary_opcode(instr) & 0x3F;
-}
-
-static inline int instr_is_branch_iform(ppc_inst_t instr)
-{
-	return branch_opcode(instr) == 18;
-}
-
-static inline int instr_is_branch_bform(ppc_inst_t instr)
-{
-	return branch_opcode(instr) == 16;
-}
-
-int instr_is_relative_branch(ppc_inst_t instr);
-int instr_is_relative_link_branch(ppc_inst_t instr);
-unsigned long branch_target(const u32 *instr);
-int translate_branch(ppc_inst_t *instr, const u32 *dest, const u32 *src);
-bool is_conditional_branch(ppc_inst_t instr);
-
-#define OP_RT_RA_MASK	0xffff0000UL
-#define LIS_R2		(PPC_RAW_LIS(_R2, 0))
-#define ADDIS_R2_R12	(PPC_RAW_ADDIS(_R2, _R12, 0))
-#define ADDI_R2_R2	(PPC_RAW_ADDI(_R2, _R2, 0))
-
-
-static inline unsigned long ppc_function_entry(void *func)
-{
-#ifdef CONFIG_PPC64_ELF_ABI_V2
-	u32 *insn = func;
-
-	/*
-	 * A PPC64 ABIv2 function may have a local and a global entry
-	 * point. We need to use the local entry point when patching
-	 * functions, so identify and step over the global entry point
-	 * sequence.
-	 *
-	 * The global entry point sequence is always of the form:
-	 *
-	 * addis r2,r12,XXXX
-	 * addi  r2,r2,XXXX
-	 *
-	 * A linker optimisation may convert the addis to lis:
-	 *
-	 * lis   r2,XXXX
-	 * addi  r2,r2,XXXX
-	 */
-	if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
-	     ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
-	    ((*(insn+1) & OP_RT_RA_MASK) == ADDI_R2_R2))
-		return (unsigned long)(insn + 2);
-	else
-		return (unsigned long)func;
-#elif defined(CONFIG_PPC64_ELF_ABI_V1)
-	/*
-	 * On PPC64 ABIv1 the function pointer actually points to the
-	 * function's descriptor. The first entry in the descriptor is the
-	 * address of the function text.
-	 */
-	return ((struct func_desc *)func)->addr;
-#else
-	return (unsigned long)func;
-#endif
-}
-
-static inline unsigned long ppc_global_function_entry(void *func)
-{
-#ifdef CONFIG_PPC64_ELF_ABI_V2
-	/* PPC64 ABIv2 the global entry point is at the address */
-	return (unsigned long)func;
-#else
-	/* All other cases there is no change vs ppc_function_entry() */
-	return ppc_function_entry(func);
-#endif
-}
-
-/*
- * Wrapper around kallsyms_lookup() to return function entry address:
- * - For ABIv1, we lookup the dot variant.
- * - For ABIv2, we return the local entry point.
- */
-static inline unsigned long ppc_kallsyms_lookup_name(const char *name)
-{
-	unsigned long addr;
-#ifdef CONFIG_PPC64_ELF_ABI_V1
-	/* check for dot variant */
-	char dot_name[1 + KSYM_NAME_LEN];
-	bool dot_appended = false;
-
-	if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
-		return 0;
-
-	if (name[0] != '.') {
-		dot_name[0] = '.';
-		dot_name[1] = '\0';
-		strlcat(dot_name, name, sizeof(dot_name));
-		dot_appended = true;
-	} else {
-		dot_name[0] = '\0';
-		strlcat(dot_name, name, sizeof(dot_name));
-	}
-	addr = kallsyms_lookup_name(dot_name);
-	if (!addr && dot_appended)
-		/* Let's try the original non-dot symbol lookup	*/
-		addr = kallsyms_lookup_name(name);
-#elif defined(CONFIG_PPC64_ELF_ABI_V2)
-	addr = kallsyms_lookup_name(name);
-	if (addr)
-		addr = ppc_function_entry((void *)addr);
-#else
-	addr = kallsyms_lookup_name(name);
-#endif
-	return addr;
-}
-
-/*
- * Some instruction encodings commonly used in dynamic ftracing
- * and function live patching.
- */
-
-/* This must match the definition of STK_GOT in <asm/ppc_asm.h> */
-#ifdef CONFIG_PPC64_ELF_ABI_V2
-#define R2_STACK_OFFSET         24
-#else
-#define R2_STACK_OFFSET         40
-#endif
-
-#define PPC_INST_LD_TOC		PPC_RAW_LD(_R2, _R1, R2_STACK_OFFSET)
-
-/* usually preceded by a mflr r0 */
-#define PPC_INST_STD_LR		PPC_RAW_STD(_R0, _R1, PPC_LR_STKOFF)
-
-#endif /* _ASM_POWERPC_CODE_PATCHING_H */
diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index 4525a9c68260..dfe2e5ad3b21 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -21,7 +21,7 @@
 #include <linux/percpu.h>
 #include <linux/module.h>
 #include <asm/probes.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 
 #ifdef CONFIG_KPROBES
 #define  __ARCH_WANT_KPROBES_INSN_SLOT
diff --git a/arch/powerpc/include/asm/text-patching.h b/arch/powerpc/include/asm/text-patching.h
new file mode 100644
index 000000000000..e7f14720f630
--- /dev/null
+++ b/arch/powerpc/include/asm/text-patching.h
@@ -0,0 +1,275 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ASM_POWERPC_CODE_PATCHING_H
+#define _ASM_POWERPC_CODE_PATCHING_H
+
+/*
+ * Copyright 2008, Michael Ellerman, IBM Corporation.
+ */
+
+#include <asm/types.h>
+#include <asm/ppc-opcode.h>
+#include <linux/string.h>
+#include <linux/kallsyms.h>
+#include <asm/asm-compat.h>
+#include <asm/inst.h>
+
+/* Flags for create_branch:
+ * "b"   == create_branch(addr, target, 0);
+ * "ba"  == create_branch(addr, target, BRANCH_ABSOLUTE);
+ * "bl"  == create_branch(addr, target, BRANCH_SET_LINK);
+ * "bla" == create_branch(addr, target, BRANCH_ABSOLUTE | BRANCH_SET_LINK);
+ */
+#define BRANCH_SET_LINK	0x1
+#define BRANCH_ABSOLUTE	0x2
+
+/*
+ * Powerpc branch instruction is :
+ *
+ *  0         6                 30   31
+ *  +---------+----------------+---+---+
+ *  | opcode  |     LI         |AA |LK |
+ *  +---------+----------------+---+---+
+ *  Where AA = 0 and LK = 0
+ *
+ * LI is a signed 24 bits integer. The real branch offset is computed
+ * by: imm32 = SignExtend(LI:'0b00', 32);
+ *
+ * So the maximum forward branch should be:
+ *   (0x007fffff << 2) = 0x01fffffc =  0x1fffffc
+ * The maximum backward branch should be:
+ *   (0xff800000 << 2) = 0xfe000000 = -0x2000000
+ */
+static inline bool is_offset_in_branch_range(long offset)
+{
+	return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3));
+}
+
+static inline bool is_offset_in_cond_branch_range(long offset)
+{
+	return offset >= -0x8000 && offset <= 0x7fff && !(offset & 0x3);
+}
+
+static inline int create_branch(ppc_inst_t *instr, const u32 *addr,
+				unsigned long target, int flags)
+{
+	long offset;
+
+	*instr = ppc_inst(0);
+	offset = target;
+	if (! (flags & BRANCH_ABSOLUTE))
+		offset = offset - (unsigned long)addr;
+
+	/* Check we can represent the target in the instruction format */
+	if (!is_offset_in_branch_range(offset))
+		return 1;
+
+	/* Mask out the flags and target, so they don't step on each other. */
+	*instr = ppc_inst(0x48000000 | (flags & 0x3) | (offset & 0x03FFFFFC));
+
+	return 0;
+}
+
+int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
+		       unsigned long target, int flags);
+int patch_branch(u32 *addr, unsigned long target, int flags);
+int patch_instruction(u32 *addr, ppc_inst_t instr);
+int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
+int patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr);
+
+/*
+ * The data patching functions patch_uint() and patch_ulong(), etc., must be
+ * called on aligned addresses.
+ *
+ * The instruction patching functions patch_instruction() and similar must be
+ * called on addresses satisfying instruction alignment requirements.
+ */
+
+#ifdef CONFIG_PPC64
+
+int patch_uint(void *addr, unsigned int val);
+int patch_ulong(void *addr, unsigned long val);
+
+#define patch_u64 patch_ulong
+
+#else
+
+static inline int patch_uint(void *addr, unsigned int val)
+{
+	if (!IS_ALIGNED((unsigned long)addr, sizeof(unsigned int)))
+		return -EINVAL;
+
+	return patch_instruction(addr, ppc_inst(val));
+}
+
+static inline int patch_ulong(void *addr, unsigned long val)
+{
+	if (!IS_ALIGNED((unsigned long)addr, sizeof(unsigned long)))
+		return -EINVAL;
+
+	return patch_instruction(addr, ppc_inst(val));
+}
+
+#endif
+
+#define patch_u32 patch_uint
+
+static inline unsigned long patch_site_addr(s32 *site)
+{
+	return (unsigned long)site + *site;
+}
+
+static inline int patch_instruction_site(s32 *site, ppc_inst_t instr)
+{
+	return patch_instruction((u32 *)patch_site_addr(site), instr);
+}
+
+static inline int patch_branch_site(s32 *site, unsigned long target, int flags)
+{
+	return patch_branch((u32 *)patch_site_addr(site), target, flags);
+}
+
+static inline int modify_instruction(unsigned int *addr, unsigned int clr,
+				     unsigned int set)
+{
+	return patch_instruction(addr, ppc_inst((*addr & ~clr) | set));
+}
+
+static inline int modify_instruction_site(s32 *site, unsigned int clr, unsigned int set)
+{
+	return modify_instruction((unsigned int *)patch_site_addr(site), clr, set);
+}
+
+static inline unsigned int branch_opcode(ppc_inst_t instr)
+{
+	return ppc_inst_primary_opcode(instr) & 0x3F;
+}
+
+static inline int instr_is_branch_iform(ppc_inst_t instr)
+{
+	return branch_opcode(instr) == 18;
+}
+
+static inline int instr_is_branch_bform(ppc_inst_t instr)
+{
+	return branch_opcode(instr) == 16;
+}
+
+int instr_is_relative_branch(ppc_inst_t instr);
+int instr_is_relative_link_branch(ppc_inst_t instr);
+unsigned long branch_target(const u32 *instr);
+int translate_branch(ppc_inst_t *instr, const u32 *dest, const u32 *src);
+bool is_conditional_branch(ppc_inst_t instr);
+
+#define OP_RT_RA_MASK	0xffff0000UL
+#define LIS_R2		(PPC_RAW_LIS(_R2, 0))
+#define ADDIS_R2_R12	(PPC_RAW_ADDIS(_R2, _R12, 0))
+#define ADDI_R2_R2	(PPC_RAW_ADDI(_R2, _R2, 0))
+
+
+static inline unsigned long ppc_function_entry(void *func)
+{
+#ifdef CONFIG_PPC64_ELF_ABI_V2
+	u32 *insn = func;
+
+	/*
+	 * A PPC64 ABIv2 function may have a local and a global entry
+	 * point. We need to use the local entry point when patching
+	 * functions, so identify and step over the global entry point
+	 * sequence.
+	 *
+	 * The global entry point sequence is always of the form:
+	 *
+	 * addis r2,r12,XXXX
+	 * addi  r2,r2,XXXX
+	 *
+	 * A linker optimisation may convert the addis to lis:
+	 *
+	 * lis   r2,XXXX
+	 * addi  r2,r2,XXXX
+	 */
+	if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
+	     ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
+	    ((*(insn+1) & OP_RT_RA_MASK) == ADDI_R2_R2))
+		return (unsigned long)(insn + 2);
+	else
+		return (unsigned long)func;
+#elif defined(CONFIG_PPC64_ELF_ABI_V1)
+	/*
+	 * On PPC64 ABIv1 the function pointer actually points to the
+	 * function's descriptor. The first entry in the descriptor is the
+	 * address of the function text.
+	 */
+	return ((struct func_desc *)func)->addr;
+#else
+	return (unsigned long)func;
+#endif
+}
+
+static inline unsigned long ppc_global_function_entry(void *func)
+{
+#ifdef CONFIG_PPC64_ELF_ABI_V2
+	/* PPC64 ABIv2 the global entry point is at the address */
+	return (unsigned long)func;
+#else
+	/* All other cases there is no change vs ppc_function_entry() */
+	return ppc_function_entry(func);
+#endif
+}
+
+/*
+ * Wrapper around kallsyms_lookup() to return function entry address:
+ * - For ABIv1, we lookup the dot variant.
+ * - For ABIv2, we return the local entry point.
+ */
+static inline unsigned long ppc_kallsyms_lookup_name(const char *name)
+{
+	unsigned long addr;
+#ifdef CONFIG_PPC64_ELF_ABI_V1
+	/* check for dot variant */
+	char dot_name[1 + KSYM_NAME_LEN];
+	bool dot_appended = false;
+
+	if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
+		return 0;
+
+	if (name[0] != '.') {
+		dot_name[0] = '.';
+		dot_name[1] = '\0';
+		strlcat(dot_name, name, sizeof(dot_name));
+		dot_appended = true;
+	} else {
+		dot_name[0] = '\0';
+		strlcat(dot_name, name, sizeof(dot_name));
+	}
+	addr = kallsyms_lookup_name(dot_name);
+	if (!addr && dot_appended)
+		/* Let's try the original non-dot symbol lookup	*/
+		addr = kallsyms_lookup_name(name);
+#elif defined(CONFIG_PPC64_ELF_ABI_V2)
+	addr = kallsyms_lookup_name(name);
+	if (addr)
+		addr = ppc_function_entry((void *)addr);
+#else
+	addr = kallsyms_lookup_name(name);
+#endif
+	return addr;
+}
+
+/*
+ * Some instruction encodings commonly used in dynamic ftracing
+ * and function live patching.
+ */
+
+/* This must match the definition of STK_GOT in <asm/ppc_asm.h> */
+#ifdef CONFIG_PPC64_ELF_ABI_V2
+#define R2_STACK_OFFSET         24
+#else
+#define R2_STACK_OFFSET         40
+#endif
+
+#define PPC_INST_LD_TOC		PPC_RAW_LD(_R2, _R1, R2_STACK_OFFSET)
+
+/* usually preceded by a mflr r0 */
+#define PPC_INST_STD_LR		PPC_RAW_STD(_R0, _R1, PPC_LR_STKOFF)
+
+#endif /* _ASM_POWERPC_CODE_PATCHING_H */
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
index 2086fa6cdc25..103b6605dd68 100644
--- a/arch/powerpc/kernel/crash_dump.c
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -13,7 +13,7 @@
 #include <linux/io.h>
 #include <linux/memblock.h>
 #include <linux/of.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/kdump.h>
 #include <asm/firmware.h>
 #include <linux/uio.h>
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
index d4b8aff20815..247ab2acaccc 100644
--- a/arch/powerpc/kernel/epapr_paravirt.c
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -9,7 +9,7 @@
 #include <linux/of_fdt.h>
 #include <asm/epapr_hcalls.h>
 #include <asm/cacheflush.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/machdep.h>
 #include <asm/inst.h>
 
diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c
index 5277cf582c16..2659e1ac8604 100644
--- a/arch/powerpc/kernel/jump_label.c
+++ b/arch/powerpc/kernel/jump_label.c
@@ -5,7 +5,7 @@
 
 #include <linux/kernel.h>
 #include <linux/jump_label.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/inst.h>
 
 void arch_jump_label_transform(struct jump_entry *entry,
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 7a8bc03a00af..5081334b7bd2 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -21,7 +21,7 @@
 #include <asm/processor.h>
 #include <asm/machdep.h>
 #include <asm/debug.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <linux/slab.h>
 #include <asm/inst.h>
 
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index f8aa91bc3b17..9c85bbcc5201 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -21,7 +21,7 @@
 #include <linux/slab.h>
 #include <linux/set_memory.h>
 #include <linux/execmem.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/cacheflush.h>
 #include <asm/sstep.h>
 #include <asm/sections.h>
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index 816a63fd71fb..f930e3395a7f 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -18,7 +18,7 @@
 #include <linux/bug.h>
 #include <linux/sort.h>
 #include <asm/setup.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 
 /* Count how many different relocations (different symbol, different
    addend) */
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index e9bab599d0c2..135960918d14 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -17,7 +17,7 @@
 #include <linux/kernel.h>
 #include <asm/module.h>
 #include <asm/firmware.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <linux/sort.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index c0b351d61058..2e83702bf9ba 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -13,7 +13,7 @@
 #include <asm/kprobes.h>
 #include <asm/ptrace.h>
 #include <asm/cacheflush.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/sstep.h>
 #include <asm/ppc-opcode.h>
 #include <asm/inst.h>
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ff61a3e7984c..7b739b9a91ab 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -54,7 +54,7 @@
 #include <asm/firmware.h>
 #include <asm/hw_irq.h>
 #endif
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/exec.h>
 #include <asm/livepatch.h>
 #include <asm/cpu_has_feature.h>
diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index 4856e1a5161c..fbb7ebd8aa08 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -14,7 +14,7 @@
 #include <linux/debugfs.h>
 
 #include <asm/asm-prototypes.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/security_features.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index e515c1f7d8d3..75dbf3e0d9c4 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -40,7 +40,7 @@
 #include <asm/time.h>
 #include <asm/serial.h>
 #include <asm/udbg.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/cpu_has_feature.h>
 #include <asm/asm-prototypes.h>
 #include <asm/kdump.h>
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 22f83fbbc762..3ebf5b9fbe98 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -60,7 +60,7 @@
 #include <asm/xmon.h>
 #include <asm/udbg.h>
 #include <asm/kexec.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/ftrace.h>
 #include <asm/opal.h>
 #include <asm/cputhreads.h>
diff --git a/arch/powerpc/kernel/static_call.c b/arch/powerpc/kernel/static_call.c
index 1502b7e439ca..7cfd0710e757 100644
--- a/arch/powerpc/kernel/static_call.c
+++ b/arch/powerpc/kernel/static_call.c
@@ -2,7 +2,7 @@
 #include <linux/memory.h>
 #include <linux/static_call.h>
 
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 
 void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
 {
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index d8d6b4fd9a14..be1a245241b3 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -23,7 +23,7 @@
 #include <linux/list.h>
 
 #include <asm/cacheflush.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/ftrace.h>
 #include <asm/syscall.h>
 #include <asm/inst.h>
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c
index 12fab1803bcf..9e862ba55263 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.c
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c
@@ -23,7 +23,7 @@
 #include <linux/list.h>
 
 #include <asm/cacheflush.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/ftrace.h>
 #include <asm/syscall.h>
 #include <asm/inst.h>
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index acdab294b340..af97fbb3c257 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -17,7 +17,7 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/page.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/inst.h>
 
 static int __patch_mem(void *exec_addr, unsigned long val, void *patch_addr, bool is_dword)
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index b7201ba50b2e..587c8cf1230f 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -16,7 +16,7 @@
 #include <linux/sched/mm.h>
 #include <linux/stop_machine.h>
 #include <asm/cputable.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/interrupt.h>
 #include <asm/page.h>
 #include <asm/sections.h>
diff --git a/arch/powerpc/lib/test-code-patching.c b/arch/powerpc/lib/test-code-patching.c
index 8cd3b32f805b..1440d99630b3 100644
--- a/arch/powerpc/lib/test-code-patching.c
+++ b/arch/powerpc/lib/test-code-patching.c
@@ -6,7 +6,7 @@
 #include <linux/vmalloc.h>
 #include <linux/init.h>
 
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 
 static int __init instr_is_branch_to_addr(const u32 *instr, unsigned long addr)
 {
diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index 23c7805fb7b3..66b5b4fa1686 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -11,7 +11,7 @@
 #include <asm/cpu_has_feature.h>
 #include <asm/sstep.h>
 #include <asm/ppc-opcode.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/inst.h>
 
 #define MAX_SUBTESTS	16
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 2db167f4233f..6978344edcb4 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -25,7 +25,7 @@
 
 #include <asm/mmu.h>
 #include <asm/machdep.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/sections.h>
 
 #include <mm/mmu_decl.h>
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index e1eadd03f133..47b22282269c 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -57,7 +57,7 @@
 #include <asm/sections.h>
 #include <asm/copro.h>
 #include <asm/udbg.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/fadump.h>
 #include <asm/firmware.h>
 #include <asm/tm.h>
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index f2708c8629a5..6b783552403c 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -24,7 +24,7 @@
 #include <linux/pgtable.h>
 
 #include <asm/udbg.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 
 #include "internal.h"
 
diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c
index aa9aa11927b2..03666d790a53 100644
--- a/arch/powerpc/mm/kasan/init_32.c
+++ b/arch/powerpc/mm/kasan/init_32.c
@@ -7,7 +7,7 @@
 #include <linux/memblock.h>
 #include <linux/sched/task.h>
 #include <asm/pgalloc.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <mm/mmu_decl.h>
 
 static pgprot_t __init kasan_prot_ro(void)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 1221c561b43a..c7708c8fad29 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -26,7 +26,7 @@
 #include <asm/svm.h>
 #include <asm/mmzone.h>
 #include <asm/ftrace.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/setup.h>
 #include <asm/fixmap.h>
 
diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c
index 1beae802bb1c..6d10c6d8be71 100644
--- a/arch/powerpc/mm/nohash/44x.c
+++ b/arch/powerpc/mm/nohash/44x.c
@@ -24,7 +24,7 @@
 #include <asm/mmu.h>
 #include <asm/page.h>
 #include <asm/cacheflush.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/smp.h>
 
 #include <mm/mmu_decl.h>
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
index ad2a7c26f2a0..062e8785c1bb 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -10,7 +10,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/dma.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 
 #include <mm/mmu_decl.h>
 
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index b653a7be4cb1..0a650742f3a0 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -37,7 +37,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/cputhreads.h>
 #include <asm/hugetlb.h>
 #include <asm/paca.h>
diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c
index d26656b07b72..4f925adf2695 100644
--- a/arch/powerpc/mm/nohash/tlb_64e.c
+++ b/arch/powerpc/mm/nohash/tlb_64e.c
@@ -24,7 +24,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/cputhreads.h>
 
 #include <mm/mmu_decl.h>
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 2a36cc2e7e9e..68c6a13e6acb 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -18,7 +18,7 @@
 #include <linux/bpf.h>
 
 #include <asm/kprobes.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 
 #include "bpf_jit.h"
 
diff --git a/arch/powerpc/perf/8xx-pmu.c b/arch/powerpc/perf/8xx-pmu.c
index 308a2e40d7be..1d2972229e3a 100644
--- a/arch/powerpc/perf/8xx-pmu.c
+++ b/arch/powerpc/perf/8xx-pmu.c
@@ -14,7 +14,7 @@
 #include <asm/machdep.h>
 #include <asm/firmware.h>
 #include <asm/ptrace.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/inst.h>
 
 #define PERF_8xx_ID_CPU_CYCLES		1
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 42867469752d..a727cd111cac 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -16,7 +16,7 @@
 #include <asm/machdep.h>
 #include <asm/firmware.h>
 #include <asm/ptrace.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/hw_irq.h>
 #include <asm/interrupt.h>
 
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index e52b848b64b7..32fa5fb557c0 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -23,7 +23,7 @@
 #include <asm/mpic.h>
 #include <asm/cacheflush.h>
 #include <asm/dbell.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/cputhreads.h>
 #include <asm/fsl_pm.h>
 
diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
index 8a7e55acf090..9be33e41af6d 100644
--- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c
+++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
@@ -12,7 +12,7 @@
 #include <linux/delay.h>
 #include <linux/pgtable.h>
 
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/page.h>
 #include <asm/pci-bridge.h>
 #include <asm/mpic.h>
diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c
index fee638fd8970..0e8f20ecca08 100644
--- a/arch/powerpc/platforms/cell/smp.c
+++ b/arch/powerpc/platforms/cell/smp.c
@@ -35,7 +35,7 @@
 #include <asm/firmware.h>
 #include <asm/rtas.h>
 #include <asm/cputhreads.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 
 #include "interrupt.h"
 #include <asm/udbg.h>
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index d21b681f52fb..09e7fe24fac1 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -35,7 +35,7 @@
 
 #include <asm/ptrace.h>
 #include <linux/atomic.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/irq.h>
 #include <asm/page.h>
 #include <asm/sections.h>
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index ad41dffe4d92..d98b933e4984 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -18,7 +18,7 @@
 #include <asm/opal.h>
 #include <asm/cputhreads.h>
 #include <asm/cpuidle.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/smp.h>
 #include <asm/runlatch.h>
 #include <asm/dbell.h>
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 8f14f0581a21..6b746feeabe4 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -28,7 +28,7 @@
 #include <asm/xive.h>
 #include <asm/opal.h>
 #include <asm/runlatch.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/dbell.h>
 #include <asm/kvm_ppc.h>
 #include <asm/ppc-opcode.h>
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index c597711ef20a..db99725e752b 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -39,7 +39,7 @@
 #include <asm/xive.h>
 #include <asm/dbell.h>
 #include <asm/plpar_wrappers.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/svm.h>
 #include <asm/kvm_guest.h>
 
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index e6cddbb2305f..e76e1d5d0611 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -50,7 +50,7 @@
 #include <asm/xive.h>
 #include <asm/opal.h>
 #include <asm/firmware.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
 #include <asm/sections.h>
 #include <asm/inst.h>
 #include <asm/interrupt.h>
diff --git a/arch/riscv/errata/andes/errata.c b/arch/riscv/errata/andes/errata.c
index fc1a34faa5f3..dcc9d1ee5ffd 100644
--- a/arch/riscv/errata/andes/errata.c
+++ b/arch/riscv/errata/andes/errata.c
@@ -13,7 +13,7 @@
 #include <asm/alternative.h>
 #include <asm/cacheflush.h>
 #include <asm/errata_list.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 #include <asm/processor.h>
 #include <asm/sbi.h>
 #include <asm/vendorid_list.h>
diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c
index cea3b96ade11..38aac2c47845 100644
--- a/arch/riscv/errata/sifive/errata.c
+++ b/arch/riscv/errata/sifive/errata.c
@@ -8,7 +8,7 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/bug.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/vendorid_list.h>
 #include <asm/errata_list.h>
diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c
index f5120e07c318..e24770a77932 100644
--- a/arch/riscv/errata/thead/errata.c
+++ b/arch/riscv/errata/thead/errata.c
@@ -16,7 +16,7 @@
 #include <asm/errata_list.h>
 #include <asm/hwprobe.h>
 #include <asm/io.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 #include <asm/vendorid_list.h>
 #include <asm/vendor_extensions.h>
 
diff --git a/arch/riscv/include/asm/patch.h b/arch/riscv/include/asm/patch.h
deleted file mode 100644
index 7228e266b9a1..000000000000
--- a/arch/riscv/include/asm/patch.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2020 SiFive
- */
-
-#ifndef _ASM_RISCV_PATCH_H
-#define _ASM_RISCV_PATCH_H
-
-int patch_insn_write(void *addr, const void *insn, size_t len);
-int patch_text_nosync(void *addr, const void *insns, size_t len);
-int patch_text_set_nosync(void *addr, u8 c, size_t len);
-int patch_text(void *addr, u32 *insns, size_t len);
-
-extern int riscv_patch_in_stop_machine;
-
-#endif /* _ASM_RISCV_PATCH_H */
diff --git a/arch/riscv/include/asm/text-patching.h b/arch/riscv/include/asm/text-patching.h
new file mode 100644
index 000000000000..7228e266b9a1
--- /dev/null
+++ b/arch/riscv/include/asm/text-patching.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 SiFive
+ */
+
+#ifndef _ASM_RISCV_PATCH_H
+#define _ASM_RISCV_PATCH_H
+
+int patch_insn_write(void *addr, const void *insn, size_t len);
+int patch_text_nosync(void *addr, const void *insns, size_t len);
+int patch_text_set_nosync(void *addr, u8 c, size_t len);
+int patch_text(void *addr, u32 *insns, size_t len);
+
+extern int riscv_patch_in_stop_machine;
+
+#endif /* _ASM_RISCV_PATCH_H */
diff --git a/arch/riscv/include/asm/uprobes.h b/arch/riscv/include/asm/uprobes.h
index 3fc7deda9190..5008f76cdc27 100644
--- a/arch/riscv/include/asm/uprobes.h
+++ b/arch/riscv/include/asm/uprobes.h
@@ -4,7 +4,7 @@
 #define _ASM_RISCV_UPROBES_H
 
 #include <asm/probes.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 #include <asm/bug.h>
 
 #define MAX_UINSN_BYTES		8
diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c
index 0128b161bfda..7eb3cb1215c6 100644
--- a/arch/riscv/kernel/alternative.c
+++ b/arch/riscv/kernel/alternative.c
@@ -18,7 +18,7 @@
 #include <asm/sbi.h>
 #include <asm/csr.h>
 #include <asm/insn.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 
 struct cpu_manufacturer_info_t {
 	unsigned long vendor_id;
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 3a8eeaa9310c..826f46b21f2e 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -20,7 +20,8 @@
 #include <asm/cacheflush.h>
 #include <asm/cpufeature.h>
 #include <asm/hwcap.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
+#include <asm/hwprobe.h>
 #include <asm/processor.h>
 #include <asm/sbi.h>
 #include <asm/vector.h>
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 4b95c574fd04..a7620ef93b6c 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -10,7 +10,7 @@
 #include <linux/memory.h>
 #include <linux/stop_machine.h>
 #include <asm/cacheflush.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex)
diff --git a/arch/riscv/kernel/jump_label.c b/arch/riscv/kernel/jump_label.c
index 11ad789c60c6..6eee6f736f68 100644
--- a/arch/riscv/kernel/jump_label.c
+++ b/arch/riscv/kernel/jump_label.c
@@ -10,7 +10,7 @@
 #include <linux/mutex.h>
 #include <asm/bug.h>
 #include <asm/cacheflush.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 
 #define RISCV_INSN_NOP 0x00000013U
 #define RISCV_INSN_JAL 0x0000006fU
diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c
index 34ef522f07a8..db13c9ddf9e3 100644
--- a/arch/riscv/kernel/patch.c
+++ b/arch/riscv/kernel/patch.c
@@ -13,7 +13,7 @@
 #include <asm/cacheflush.h>
 #include <asm/fixmap.h>
 #include <asm/ftrace.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 #include <asm/sections.h>
 
 struct patch_insn {
diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c
index 474a65213657..380a0e8cecc0 100644
--- a/arch/riscv/kernel/probes/kprobes.c
+++ b/arch/riscv/kernel/probes/kprobes.c
@@ -12,7 +12,7 @@
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/bug.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 
 #include "decode-insn.h"
 
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 4cc631fa7039..ca60db75199d 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -10,7 +10,7 @@
 #include <linux/filter.h>
 #include <linux/memory.h>
 #include <linux/stop_machine.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 #include <asm/cfi.h>
 #include <asm/percpu.h>
 #include "bpf_jit.h"
diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
index 6de753c667f4..f8cd2f70a7fb 100644
--- a/arch/riscv/net/bpf_jit_core.c
+++ b/arch/riscv/net/bpf_jit_core.c
@@ -9,7 +9,7 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/memory.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
 #include <asm/cfi.h>
 #include "bpf_jit.h"
 
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index fc44d9c88b41..4d3f10ed8275 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -3,3 +3,4 @@ generated-y += syscall_table.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += parport.h
+generic-y += text-patching.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 43b0ae4c2c21..17ee8a273aa6 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -4,3 +4,4 @@ generated-y += syscall_table_64.h
 generic-y += agp.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
+generic-y += text-patching.h
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 8e594cda6d77..f8de31a0c5d1 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -468,6 +468,11 @@ void *text_poke(void *addr, const void *opcode, size_t len)
 	return memcpy(addr, opcode, len);
 }
 
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+	return text_poke(addr, opcode, len);
+}
+
 void text_poke_sync(void)
 {
 }
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 6259f1937fe7..ab9e143ec9fe 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -35,6 +35,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void text_poke_sync(void);
 extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
 extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
+#define text_poke_copy text_poke_copy
 extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok);
 extern void *text_poke_set(void *addr, int c, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index fa07c686cbcc..cc5dba738389 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -8,3 +8,4 @@ generic-y += parport.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
 generic-y += user.h
+generic-y += text-patching.h
diff --git a/include/asm-generic/text-patching.h b/include/asm-generic/text-patching.h
new file mode 100644
index 000000000000..2245c641b741
--- /dev/null
+++ b/include/asm-generic/text-patching.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_GENERIC_TEXT_PATCHING_H
+#define _ASM_GENERIC_TEXT_PATCHING_H
+
+#endif /* _ASM_GENERIC_TEXT_PATCHING_H */
diff --git a/include/linux/text-patching.h b/include/linux/text-patching.h
new file mode 100644
index 000000000000..ad5877ab0855
--- /dev/null
+++ b/include/linux/text-patching.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TEXT_PATCHING_H
+#define _LINUX_TEXT_PATCHING_H
+
+#include <asm/text-patching.h>
+
+#ifndef text_poke_copy
+static inline void *text_poke_copy(void *dst, const void *src, size_t len)
+{
+	return memcpy(dst, src, len);
+}
+#define text_poke_copy text_poke_copy
+#endif
+
+#endif /* _LINUX_TEXT_PATCHING_H */
-- 
cgit v1.2.3


From 0c133b1e78cd34dd9d18da707dc6f46170e9129e Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 23 Oct 2024 19:27:07 +0300
Subject: module: prepare to handle ROX allocations for text

In order to support ROX allocations for module text, it is necessary to
handle modifications to the code, such as relocations and alternatives
patching, without write access to that memory.

One option is to use text patching, but this would make module loading
extremely slow and will expose executable code that is not finally formed.

A better way is to have memory allocated with ROX permissions contain
invalid instructions and keep a writable, but not executable copy of the
module text.  The relocations and alternative patches would be done on the
writable copy using the addresses of the ROX memory.  Once the module is
completely ready, the updated text will be copied to ROX memory using text
patching in one go and the writable copy will be freed.

Add support for that to module initialization code and provide necessary
interfaces in execmem.

Link: https://lkml.kernel.org/r/20241023162711.2579610-5-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewd-by: Luis Chamberlain <mcgrof@kernel.org>
Tested-by: kdevops <kdevops@lists.linux.dev>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Song Liu <song@kernel.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/execmem.h        | 23 +++++++++++++
 include/linux/module.h         | 16 +++++++++
 include/linux/moduleloader.h   |  4 +++
 kernel/module/debug_kmemleak.c |  3 +-
 kernel/module/main.c           | 74 ++++++++++++++++++++++++++++++++++++++----
 kernel/module/strict_rwx.c     |  3 ++
 mm/execmem.c                   | 11 +++++++
 7 files changed, 126 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 32cef1144117..dfdf19f8a5e8 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -46,9 +46,11 @@ enum execmem_type {
 /**
  * enum execmem_range_flags - options for executable memory allocations
  * @EXECMEM_KASAN_SHADOW:	allocate kasan shadow
+ * @EXECMEM_ROX_CACHE:		allocations should use ROX cache of huge pages
  */
 enum execmem_range_flags {
 	EXECMEM_KASAN_SHADOW	= (1 << 0),
+	EXECMEM_ROX_CACHE	= (1 << 1),
 };
 
 /**
@@ -123,6 +125,27 @@ void *execmem_alloc(enum execmem_type type, size_t size);
  */
 void execmem_free(void *ptr);
 
+/**
+ * execmem_update_copy - copy an update to executable memory
+ * @dst:  destination address to update
+ * @src:  source address containing the data
+ * @size: how many bytes of memory shold be copied
+ *
+ * Copy @size bytes from @src to @dst using text poking if the memory at
+ * @dst is read-only.
+ *
+ * Return: a pointer to @dst or NULL on error
+ */
+void *execmem_update_copy(void *dst, const void *src, size_t size);
+
+/**
+ * execmem_is_rox - check if execmem is read-only
+ * @type - the execmem type to check
+ *
+ * Return: %true if the @type is read-only, %false if it's writable
+ */
+bool execmem_is_rox(enum execmem_type type);
+
 #if defined(CONFIG_EXECMEM) && !defined(CONFIG_ARCH_WANTS_EXECMEM_LATE)
 void execmem_init(void);
 #else
diff --git a/include/linux/module.h b/include/linux/module.h
index 88ecc5e9f523..2a9386cbdf85 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -367,6 +367,8 @@ enum mod_mem_type {
 
 struct module_memory {
 	void *base;
+	void *rw_copy;
+	bool is_rox;
 	unsigned int size;
 
 #ifdef CONFIG_MODULES_TREE_LOOKUP
@@ -767,6 +769,15 @@ static inline bool is_livepatch_module(struct module *mod)
 
 void set_module_sig_enforced(void);
 
+void *__module_writable_address(struct module *mod, void *loc);
+
+static inline void *module_writable_address(struct module *mod, void *loc)
+{
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod)
+		return loc;
+	return __module_writable_address(mod, loc);
+}
+
 #else /* !CONFIG_MODULES... */
 
 static inline struct module *__module_address(unsigned long addr)
@@ -874,6 +885,11 @@ static inline bool module_is_coming(struct module *mod)
 {
 	return false;
 }
+
+static inline void *module_writable_address(struct module *mod, void *loc)
+{
+	return loc;
+}
 #endif /* CONFIG_MODULES */
 
 #ifdef CONFIG_SYSFS
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index e395461d59e5..1f5507ba5a12 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -108,6 +108,10 @@ int module_finalize(const Elf_Ehdr *hdr,
 		    const Elf_Shdr *sechdrs,
 		    struct module *mod);
 
+int module_post_finalize(const Elf_Ehdr *hdr,
+			 const Elf_Shdr *sechdrs,
+			 struct module *mod);
+
 #ifdef CONFIG_MODULES
 void flush_module_init_free_work(void);
 #else
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c
index b4cc03842d70..df873dad049d 100644
--- a/kernel/module/debug_kmemleak.c
+++ b/kernel/module/debug_kmemleak.c
@@ -14,7 +14,8 @@ void kmemleak_load_module(const struct module *mod,
 {
 	/* only scan writable, non-executable sections */
 	for_each_mod_mem_type(type) {
-		if (type != MOD_DATA && type != MOD_INIT_DATA)
+		if (type != MOD_DATA && type != MOD_INIT_DATA &&
+		    !mod->mem[type].is_rox)
 			kmemleak_no_scan(mod->mem[type].base);
 	}
 }
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 49b9bca9de12..73b588fe98d4 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1189,6 +1189,18 @@ void __weak module_arch_freeing_init(struct module *mod)
 {
 }
 
+void *__module_writable_address(struct module *mod, void *loc)
+{
+	for_class_mod_mem_type(type, text) {
+		struct module_memory *mem = &mod->mem[type];
+
+		if (loc >= mem->base && loc < mem->base + mem->size)
+			return loc + (mem->rw_copy - mem->base);
+	}
+
+	return loc;
+}
+
 static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 {
 	unsigned int size = PAGE_ALIGN(mod->mem[type].size);
@@ -1206,6 +1218,23 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 	if (!ptr)
 		return -ENOMEM;
 
+	mod->mem[type].base = ptr;
+
+	if (execmem_is_rox(execmem_type)) {
+		ptr = vzalloc(size);
+
+		if (!ptr) {
+			execmem_free(mod->mem[type].base);
+			return -ENOMEM;
+		}
+
+		mod->mem[type].rw_copy = ptr;
+		mod->mem[type].is_rox = true;
+	} else {
+		mod->mem[type].rw_copy = mod->mem[type].base;
+		memset(mod->mem[type].base, 0, size);
+	}
+
 	/*
 	 * The pointer to these blocks of memory are stored on the module
 	 * structure and we keep that around so long as the module is
@@ -1219,16 +1248,17 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 	 */
 	kmemleak_not_leak(ptr);
 
-	memset(ptr, 0, size);
-	mod->mem[type].base = ptr;
-
 	return 0;
 }
 
 static void module_memory_free(struct module *mod, enum mod_mem_type type,
 			       bool unload_codetags)
 {
-	void *ptr = mod->mem[type].base;
+	struct module_memory *mem = &mod->mem[type];
+	void *ptr = mem->base;
+
+	if (mem->is_rox)
+		vfree(mem->rw_copy);
 
 	if (!unload_codetags && mod_mem_type_is_core_data(type))
 		return;
@@ -2251,6 +2281,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	for_each_mod_mem_type(type) {
 		if (!mod->mem[type].size) {
 			mod->mem[type].base = NULL;
+			mod->mem[type].rw_copy = NULL;
 			continue;
 		}
 
@@ -2267,11 +2298,14 @@ static int move_module(struct module *mod, struct load_info *info)
 		void *dest;
 		Elf_Shdr *shdr = &info->sechdrs[i];
 		enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+		unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK;
+		unsigned long addr;
 
 		if (!(shdr->sh_flags & SHF_ALLOC))
 			continue;
 
-		dest = mod->mem[type].base + (shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK);
+		addr = (unsigned long)mod->mem[type].base + offset;
+		dest = mod->mem[type].rw_copy + offset;
 
 		if (shdr->sh_type != SHT_NOBITS) {
 			/*
@@ -2293,7 +2327,7 @@ static int move_module(struct module *mod, struct load_info *info)
 		 * users of info can keep taking advantage and using the newly
 		 * minted official memory area.
 		 */
-		shdr->sh_addr = (unsigned long)dest;
+		shdr->sh_addr = addr;
 		pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
 			 (long)shdr->sh_size, info->secstrings + shdr->sh_name);
 	}
@@ -2441,8 +2475,17 @@ int __weak module_finalize(const Elf_Ehdr *hdr,
 	return 0;
 }
 
+int __weak module_post_finalize(const Elf_Ehdr *hdr,
+				const Elf_Shdr *sechdrs,
+				struct module *me)
+{
+	return 0;
+}
+
 static int post_relocation(struct module *mod, const struct load_info *info)
 {
+	int ret;
+
 	/* Sort exception table now relocations are done. */
 	sort_extable(mod->extable, mod->extable + mod->num_exentries);
 
@@ -2454,7 +2497,24 @@ static int post_relocation(struct module *mod, const struct load_info *info)
 	add_kallsyms(mod, info);
 
 	/* Arch-specific module finalizing. */
-	return module_finalize(info->hdr, info->sechdrs, mod);
+	ret = module_finalize(info->hdr, info->sechdrs, mod);
+	if (ret)
+		return ret;
+
+	for_each_mod_mem_type(type) {
+		struct module_memory *mem = &mod->mem[type];
+
+		if (mem->is_rox) {
+			if (!execmem_update_copy(mem->base, mem->rw_copy,
+						 mem->size))
+				return -ENOMEM;
+
+			vfree(mem->rw_copy);
+			mem->rw_copy = NULL;
+		}
+	}
+
+	return module_post_finalize(info->hdr, info->sechdrs, mod);
 }
 
 /* Call module constructors. */
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index c45caa4690e5..239e5013359d 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -34,6 +34,9 @@ int module_enable_text_rox(const struct module *mod)
 	for_class_mod_mem_type(type, text) {
 		int ret;
 
+		if (mod->mem[type].is_rox)
+			continue;
+
 		if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
 			ret = module_set_memory(mod, type, set_memory_rox);
 		else
diff --git a/mm/execmem.c b/mm/execmem.c
index 0c4b36bc6d10..0f6691e9ffe6 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -10,6 +10,7 @@
 #include <linux/vmalloc.h>
 #include <linux/execmem.h>
 #include <linux/moduleloader.h>
+#include <linux/text-patching.h>
 
 static struct execmem_info *execmem_info __ro_after_init;
 static struct execmem_info default_execmem_info __ro_after_init;
@@ -69,6 +70,16 @@ void execmem_free(void *ptr)
 	vfree(ptr);
 }
 
+void *execmem_update_copy(void *dst, const void *src, size_t size)
+{
+	return text_poke_copy(dst, src, size);
+}
+
+bool execmem_is_rox(enum execmem_type type)
+{
+	return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
+}
+
 static bool execmem_validate(struct execmem_info *info)
 {
 	struct execmem_range *r = &info->ranges[EXECMEM_DEFAULT];
-- 
cgit v1.2.3


From 0c6378a71574daa6cd1534ad42a956e3262756c7 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 23 Oct 2024 19:27:08 +0300
Subject: arch: introduce set_direct_map_valid_noflush()

Add an API that will allow updates of the direct/linear map for a set of
physically contiguous pages.

It will be used in the following patches.

Link: https://lkml.kernel.org/r/20241023162711.2579610-6-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Tested-by: kdevops <kdevops@lists.linux.dev>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Song Liu <song@kernel.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/set_memory.h     |  1 +
 arch/arm64/mm/pageattr.c                | 10 ++++++++++
 arch/loongarch/include/asm/set_memory.h |  1 +
 arch/loongarch/mm/pageattr.c            | 19 +++++++++++++++++++
 arch/riscv/include/asm/set_memory.h     |  1 +
 arch/riscv/mm/pageattr.c                | 15 +++++++++++++++
 arch/s390/include/asm/set_memory.h      |  1 +
 arch/s390/mm/pageattr.c                 | 11 +++++++++++
 arch/x86/include/asm/set_memory.h       |  1 +
 arch/x86/mm/pat/set_memory.c            |  8 ++++++++
 include/linux/set_memory.h              |  6 ++++++
 11 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h
index 917761feeffd..98088c043606 100644
--- a/arch/arm64/include/asm/set_memory.h
+++ b/arch/arm64/include/asm/set_memory.h
@@ -13,6 +13,7 @@ int set_memory_valid(unsigned long addr, int numpages, int enable);
 
 int set_direct_map_invalid_noflush(struct page *page);
 int set_direct_map_default_noflush(struct page *page);
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
 bool kernel_page_present(struct page *page);
 
 #endif /* _ASM_ARM64_SET_MEMORY_H */
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 0e270a1c51e6..01225900293a 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -192,6 +192,16 @@ int set_direct_map_default_noflush(struct page *page)
 				   PAGE_SIZE, change_page_range, &data);
 }
 
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	if (!can_set_direct_map())
+		return 0;
+
+	return set_memory_valid(addr, nr, valid);
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 void __kernel_map_pages(struct page *page, int numpages, int enable)
 {
diff --git a/arch/loongarch/include/asm/set_memory.h b/arch/loongarch/include/asm/set_memory.h
index d70505b6676c..55dfaefd02c8 100644
--- a/arch/loongarch/include/asm/set_memory.h
+++ b/arch/loongarch/include/asm/set_memory.h
@@ -17,5 +17,6 @@ int set_memory_rw(unsigned long addr, int numpages);
 bool kernel_page_present(struct page *page);
 int set_direct_map_default_noflush(struct page *page);
 int set_direct_map_invalid_noflush(struct page *page);
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
 
 #endif /* _ASM_LOONGARCH_SET_MEMORY_H */
diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c
index ffd8d76021d4..bf8678248444 100644
--- a/arch/loongarch/mm/pageattr.c
+++ b/arch/loongarch/mm/pageattr.c
@@ -216,3 +216,22 @@ int set_direct_map_invalid_noflush(struct page *page)
 
 	return __set_memory(addr, 1, __pgprot(0), __pgprot(_PAGE_PRESENT | _PAGE_VALID));
 }
+
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+	pgprot_t set, clear;
+
+	if (addr < vm_map_base)
+		return 0;
+
+	if (valid) {
+		set = PAGE_KERNEL;
+		clear = __pgprot(0);
+	} else {
+		set = __pgprot(0);
+		clear = __pgprot(_PAGE_PRESENT | _PAGE_VALID);
+	}
+
+	return __set_memory(addr, 1, set, clear);
+}
diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
index ab92fc84e1fc..ea263d3683ef 100644
--- a/arch/riscv/include/asm/set_memory.h
+++ b/arch/riscv/include/asm/set_memory.h
@@ -42,6 +42,7 @@ static inline int set_kernel_memory(char *startp, char *endp,
 
 int set_direct_map_invalid_noflush(struct page *page);
 int set_direct_map_default_noflush(struct page *page);
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
 bool kernel_page_present(struct page *page);
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index 271d01a5ba4d..d815448758a1 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -386,6 +386,21 @@ int set_direct_map_default_noflush(struct page *page)
 			    PAGE_KERNEL, __pgprot(_PAGE_EXEC));
 }
 
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+	pgprot_t set, clear;
+
+	if (valid) {
+		set = PAGE_KERNEL;
+		clear = __pgprot(_PAGE_EXEC);
+	} else {
+		set = __pgprot(0);
+		clear = __pgprot(_PAGE_PRESENT);
+	}
+
+	return __set_memory((unsigned long)page_address(page), nr, set, clear);
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static int debug_pagealloc_set_page(pte_t *pte, unsigned long addr, void *data)
 {
diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h
index 06fbabe2f66c..240bcfbdcdce 100644
--- a/arch/s390/include/asm/set_memory.h
+++ b/arch/s390/include/asm/set_memory.h
@@ -62,5 +62,6 @@ __SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K)
 
 int set_direct_map_invalid_noflush(struct page *page);
 int set_direct_map_default_noflush(struct page *page);
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
 
 #endif
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index 5f805ad42d4c..4c7ee74aa130 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -406,6 +406,17 @@ int set_direct_map_default_noflush(struct page *page)
 	return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF);
 }
 
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+	unsigned long flags;
+
+	if (valid)
+		flags = SET_MEMORY_DEF;
+	else
+		flags = SET_MEMORY_INV;
+
+	return __set_memory((unsigned long)page_to_virt(page), nr, flags);
+}
 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
 
 static void ipte_range(pte_t *pte, unsigned long address, int nr)
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 4b2abce2e3e7..cc62ef70ccc0 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -89,6 +89,7 @@ int set_pages_rw(struct page *page, int numpages);
 
 int set_direct_map_invalid_noflush(struct page *page);
 int set_direct_map_default_noflush(struct page *page);
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
 bool kernel_page_present(struct page *page);
 
 extern int kernel_set_to_readonly;
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 44f7b2ea6a07..069e421c2247 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2444,6 +2444,14 @@ int set_direct_map_default_noflush(struct page *page)
 	return __set_pages_p(page, 1);
 }
 
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+	if (valid)
+		return __set_pages_p(page, nr);
+
+	return __set_pages_np(page, nr);
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 void __kernel_map_pages(struct page *page, int numpages, int enable)
 {
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index e7aec20fb44f..3030d9245f5a 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -34,6 +34,12 @@ static inline int set_direct_map_default_noflush(struct page *page)
 	return 0;
 }
 
+static inline int set_direct_map_valid_noflush(struct page *page,
+					       unsigned nr, bool valid)
+{
+	return 0;
+}
+
 static inline bool kernel_page_present(struct page *page)
 {
 	return true;
-- 
cgit v1.2.3


From 2e45474ab14f0f17c1091c503a13ff2fe2a84486 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 23 Oct 2024 19:27:10 +0300
Subject: execmem: add support for cache of large ROX pages

Using large pages to map text areas reduces iTLB pressure and improves
performance.

Extend execmem_alloc() with an ability to use huge pages with ROX
permissions as a cache for smaller allocations.

To populate the cache, a writable large page is allocated from vmalloc
with VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then
remapped as ROX.

The direct map alias of that large page is exculded from the direct map.

Portions of that large page are handed out to execmem_alloc() callers
without any changes to the permissions.

When the memory is freed with execmem_free() it is invalidated again so
that it won't contain stale instructions.

An architecture has to implement execmem_fill_trapping_insns() callback
and select ARCH_HAS_EXECMEM_ROX configuration option to be able to use the
ROX cache.

The cache is enabled on per-range basis when an architecture sets
EXECMEM_ROX_CACHE flag in definition of an execmem_range.

Link: https://lkml.kernel.org/r/20241023162711.2579610-8-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Tested-by: kdevops <kdevops@lists.linux.dev>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Song Liu <song@kernel.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/Kconfig            |   8 ++
 include/linux/execmem.h |  14 +++
 mm/execmem.c            | 325 ++++++++++++++++++++++++++++++++++++++++++++++--
 mm/internal.h           |   1 +
 mm/vmalloc.c            |   5 +
 5 files changed, 345 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index bd9f095d69fa..89b14e4edc61 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1024,6 +1024,14 @@ config ARCH_WANTS_EXECMEM_LATE
 	  enough entropy for module space randomization, for instance
 	  arm64.
 
+config ARCH_HAS_EXECMEM_ROX
+	bool
+	depends on MMU && !HIGHMEM
+	help
+	  For architectures that support allocations of executable memory
+	  with read-only execute permissions. Architecture must implement
+	  execmem_fill_trapping_insns() callback to enable this.
+
 config HAVE_IRQ_EXIT_ON_IRQ_STACK
 	bool
 	help
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index dfdf19f8a5e8..1517fa196bf7 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -53,6 +53,20 @@ enum execmem_range_flags {
 	EXECMEM_ROX_CACHE	= (1 << 1),
 };
 
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
+/**
+ * execmem_fill_trapping_insns - set memory to contain instructions that
+ *				 will trap
+ * @ptr:	pointer to memory to fill
+ * @size:	size of the range to fill
+ * @writable:	is the memory poited by @ptr is writable or ROX
+ *
+ * A hook for architecures to fill execmem ranges with invalid instructions.
+ * Architectures that use EXECMEM_ROX_CACHE must implement this.
+ */
+void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable);
+#endif
+
 /**
  * struct execmem_range - definition of an address space suitable for code and
  *			  related data allocations
diff --git a/mm/execmem.c b/mm/execmem.c
index 0f6691e9ffe6..576a57e2161f 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -6,29 +6,41 @@
  * Copyright (C) 2024 Mike Rapoport IBM.
  */
 
+#define pr_fmt(fmt) "execmem: " fmt
+
 #include <linux/mm.h>
+#include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/execmem.h>
+#include <linux/maple_tree.h>
+#include <linux/set_memory.h>
 #include <linux/moduleloader.h>
 #include <linux/text-patching.h>
 
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
 static struct execmem_info *execmem_info __ro_after_init;
 static struct execmem_info default_execmem_info __ro_after_init;
 
-static void *__execmem_alloc(struct execmem_range *range, size_t size)
+#ifdef CONFIG_MMU
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+			     pgprot_t pgprot, unsigned long vm_flags)
 {
 	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
-	unsigned long vm_flags  = VM_FLUSH_RESET_PERMS;
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
+	unsigned int align = range->alignment;
 	unsigned long start = range->start;
 	unsigned long end = range->end;
-	unsigned int align = range->alignment;
-	pgprot_t pgprot = range->pgprot;
 	void *p;
 
 	if (kasan)
 		vm_flags |= VM_DEFER_KMEMLEAK;
 
+	if (vm_flags & VM_ALLOW_HUGE_VMAP)
+		align = PMD_SIZE;
+
 	p = __vmalloc_node_range(size, align, start, end, gfp_flags,
 				 pgprot, vm_flags, NUMA_NO_NODE,
 				 __builtin_return_address(0));
@@ -41,7 +53,7 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size)
 	}
 
 	if (!p) {
-		pr_warn_ratelimited("execmem: unable to allocate memory\n");
+		pr_warn_ratelimited("unable to allocate memory\n");
 		return NULL;
 	}
 
@@ -50,14 +62,298 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size)
 		return NULL;
 	}
 
-	return kasan_reset_tag(p);
+	return p;
 }
+#else
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+			     pgprot_t pgprot, unsigned long vm_flags)
+{
+	return vmalloc(size);
+}
+#endif /* CONFIG_MMU */
+
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
+struct execmem_cache {
+	struct mutex mutex;
+	struct maple_tree busy_areas;
+	struct maple_tree free_areas;
+};
+
+static struct execmem_cache execmem_cache = {
+	.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
+	.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
+				     execmem_cache.mutex),
+	.free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
+				     execmem_cache.mutex),
+};
+
+static inline unsigned long mas_range_len(struct ma_state *mas)
+{
+	return mas->last - mas->index + 1;
+}
+
+static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid)
+{
+	unsigned int nr = (1 << get_vm_area_page_order(vm));
+	unsigned int updated = 0;
+	int err = 0;
+
+	for (int i = 0; i < vm->nr_pages; i += nr) {
+		err = set_direct_map_valid_noflush(vm->pages[i], nr, valid);
+		if (err)
+			goto err_restore;
+		updated += nr;
+	}
+
+	return 0;
+
+err_restore:
+	for (int i = 0; i < updated; i += nr)
+		set_direct_map_valid_noflush(vm->pages[i], nr, !valid);
+
+	return err;
+}
+
+static void execmem_cache_clean(struct work_struct *work)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	MA_STATE(mas, free_areas, 0, ULONG_MAX);
+	void *area;
+
+	mutex_lock(mutex);
+	mas_for_each(&mas, area, ULONG_MAX) {
+		size_t size = mas_range_len(&mas);
+
+		if (IS_ALIGNED(size, PMD_SIZE) &&
+		    IS_ALIGNED(mas.index, PMD_SIZE)) {
+			struct vm_struct *vm = find_vm_area(area);
+
+			execmem_set_direct_map_valid(vm, true);
+			mas_store_gfp(&mas, NULL, GFP_KERNEL);
+			vfree(area);
+		}
+	}
+	mutex_unlock(mutex);
+}
+
+static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
+
+static int execmem_cache_add(void *ptr, size_t size)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr = (unsigned long)ptr;
+	MA_STATE(mas, free_areas, addr - 1, addr + 1);
+	unsigned long lower, upper;
+	void *area = NULL;
+	int err;
+
+	lower = addr;
+	upper = addr + size - 1;
+
+	mutex_lock(mutex);
+	area = mas_walk(&mas);
+	if (area && mas.last == addr - 1)
+		lower = mas.index;
+
+	area = mas_next(&mas, ULONG_MAX);
+	if (area && mas.index == addr + size)
+		upper = mas.last;
+
+	mas_set_range(&mas, lower, upper);
+	err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
+	mutex_unlock(mutex);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static bool within_range(struct execmem_range *range, struct ma_state *mas,
+			 size_t size)
+{
+	unsigned long addr = mas->index;
+
+	if (addr >= range->start && addr + size < range->end)
+		return true;
+
+	if (range->fallback_start &&
+	    addr >= range->fallback_start && addr + size < range->fallback_end)
+		return true;
+
+	return false;
+}
+
+static void *__execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+	MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
+	MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr, last, area_size = 0;
+	void *area, *ptr = NULL;
+	int err;
+
+	mutex_lock(mutex);
+	mas_for_each(&mas_free, area, ULONG_MAX) {
+		area_size = mas_range_len(&mas_free);
+
+		if (area_size >= size && within_range(range, &mas_free, size))
+			break;
+	}
+
+	if (area_size < size)
+		goto out_unlock;
+
+	addr = mas_free.index;
+	last = mas_free.last;
+
+	/* insert allocated size to busy_areas at range [addr, addr + size) */
+	mas_set_range(&mas_busy, addr, addr + size - 1);
+	err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL);
+	if (err)
+		goto out_unlock;
+
+	mas_store_gfp(&mas_free, NULL, GFP_KERNEL);
+	if (area_size > size) {
+		void *ptr = (void *)(addr + size);
+
+		/*
+		 * re-insert remaining free size to free_areas at range
+		 * [addr + size, last]
+		 */
+		mas_set_range(&mas_free, addr + size, last);
+		err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL);
+		if (err) {
+			mas_store_gfp(&mas_busy, NULL, GFP_KERNEL);
+			goto out_unlock;
+		}
+	}
+	ptr = (void *)addr;
+
+out_unlock:
+	mutex_unlock(mutex);
+	return ptr;
+}
+
+static int execmem_cache_populate(struct execmem_range *range, size_t size)
+{
+	unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
+	unsigned long start, end;
+	struct vm_struct *vm;
+	size_t alloc_size;
+	int err = -ENOMEM;
+	void *p;
+
+	alloc_size = round_up(size, PMD_SIZE);
+	p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+	if (!p)
+		return err;
+
+	vm = find_vm_area(p);
+	if (!vm)
+		goto err_free_mem;
+
+	/* fill memory with instructions that will trap */
+	execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
+
+	start = (unsigned long)p;
+	end = start + alloc_size;
+
+	vunmap_range(start, end);
+
+	err = execmem_set_direct_map_valid(vm, false);
+	if (err)
+		goto err_free_mem;
+
+	err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
+				       PMD_SHIFT);
+	if (err)
+		goto err_free_mem;
+
+	err = execmem_cache_add(p, alloc_size);
+	if (err)
+		goto err_free_mem;
+
+	return 0;
+
+err_free_mem:
+	vfree(p);
+	return err;
+}
+
+static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+	void *p;
+	int err;
+
+	p = __execmem_cache_alloc(range, size);
+	if (p)
+		return p;
+
+	err = execmem_cache_populate(range, size);
+	if (err)
+		return NULL;
+
+	return __execmem_cache_alloc(range, size);
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr = (unsigned long)ptr;
+	MA_STATE(mas, busy_areas, addr, addr);
+	size_t size;
+	void *area;
+
+	mutex_lock(mutex);
+	area = mas_walk(&mas);
+	if (!area) {
+		mutex_unlock(mutex);
+		return false;
+	}
+	size = mas_range_len(&mas);
+
+	mas_store_gfp(&mas, NULL, GFP_KERNEL);
+	mutex_unlock(mutex);
+
+	execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
+
+	execmem_cache_add(ptr, size);
+
+	schedule_work(&execmem_cache_clean_work);
+
+	return true;
+}
+#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
+static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+	return NULL;
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+	return false;
+}
+#endif /* CONFIG_ARCH_HAS_EXECMEM_ROX */
 
 void *execmem_alloc(enum execmem_type type, size_t size)
 {
 	struct execmem_range *range = &execmem_info->ranges[type];
+	bool use_cache = range->flags & EXECMEM_ROX_CACHE;
+	unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
+	pgprot_t pgprot = range->pgprot;
+	void *p;
 
-	return __execmem_alloc(range, size);
+	if (use_cache)
+		p = execmem_cache_alloc(range, size);
+	else
+		p = execmem_vmalloc(range, size, pgprot, vm_flags);
+
+	return kasan_reset_tag(p);
 }
 
 void execmem_free(void *ptr)
@@ -67,7 +363,9 @@ void execmem_free(void *ptr)
 	 * supported by vmalloc.
 	 */
 	WARN_ON(in_interrupt());
-	vfree(ptr);
+
+	if (!execmem_cache_free(ptr))
+		vfree(ptr);
 }
 
 void *execmem_update_copy(void *dst, const void *src, size_t size)
@@ -89,6 +387,17 @@ static bool execmem_validate(struct execmem_info *info)
 		return false;
 	}
 
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) {
+		for (int i = EXECMEM_DEFAULT; i < EXECMEM_TYPE_MAX; i++) {
+			r = &info->ranges[i];
+
+			if (r->flags & EXECMEM_ROX_CACHE) {
+				pr_warn_once("ROX cache is not supported\n");
+				r->flags &= ~EXECMEM_ROX_CACHE;
+			}
+		}
+	}
+
 	return true;
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index c743c2b21dba..3dc745ba76dd 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1234,6 +1234,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
 void __init vmalloc_init(void);
 int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                 pgprot_t prot, struct page **pages, unsigned int page_shift);
+unsigned int get_vm_area_page_order(struct vm_struct *vm);
 #else
 static inline void vmalloc_init(void)
 {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5c0ea4e2b17d..74c0a5eae210 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3023,6 +3023,11 @@ static inline unsigned int vm_area_page_order(struct vm_struct *vm)
 #endif
 }
 
+unsigned int get_vm_area_page_order(struct vm_struct *vm)
+{
+	return vm_area_page_order(vm);
+}
+
 static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
 {
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
-- 
cgit v1.2.3


From 7c8c76e446ca0079692fad44a3993cb1d7666c21 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 23 Oct 2024 10:07:54 -0700
Subject: maple_tree: add mas_for_each_rev() helper

Patch series "page allocation tag compression", v4.

This patchset implements several improvements:

1. Gracefully handles module unloading while there are used
   allocations allocated from that module;

2. Provides an option to store page allocation tag references in the
   page flags, removing dependency on page extensions and eliminating the
   memory overhead from storing page allocation references (~0.2% of total
   system memory).  This also improves page allocation performance when
   CONFIG_MEM_ALLOC_PROFILING is enabled by eliminating page extension
   lookup.  Page allocation performance overhead is reduced from 41% to
   5.5%.

Patch #1 introduces mas_for_each_rev() helper function.

Patch #2 introduces shutdown_mem_profiling() helper function to be used
when disabling memory allocation profiling.

Patch #3 copies module tags into virtually contiguous memory which
serves two purposes:

- Lets us deal with the situation when module is unloaded while there
  are still live allocations from that module.  Since we are using a copy
  version of the tags we can safely unload the module.  Space and gaps in
  this contiguous memory are managed using a maple tree.

- Enables simple indexing of the tags in the later patches.

Patch #4 changes the way we allocate virtually contiguous memory for
module tags to reserve only vitrual area and populate physical pages only
as needed at module load time.

Patch #5 abstracts page allocation tag reference to simplify later
changes.

Patch #6 adds compression option to the sysctl.vm.mem_profiling boot
parameter for storing page allocation tag references inside page flags if
they fit.  If the number of available page flag bits is insufficient to
address all kernel allocations, memory allocation profiling gets disabled
with an appropriate warning.


This patch (of 6):

Add mas_for_each_rev() function to iterate maple tree nodes in reverse
order.

Link: https://lkml.kernel.org/r/20241023170759.999909-1-surenb@google.com
Link: https://lkml.kernel.org/r/20241023170759.999909-2-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Suggested-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Minchan Kim <minchan@google.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Sourav Panda <souravpanda@google.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Huth <thuth@redhat.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xiongwei Song <xiongwei.song@windriver.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 61c236850ca8..cbbcd18d4186 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -592,6 +592,20 @@ static __always_inline void mas_reset(struct ma_state *mas)
 #define mas_for_each(__mas, __entry, __max) \
 	while (((__entry) = mas_find((__mas), (__max))) != NULL)
 
+/**
+ * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order.
+ * @__mas: Maple Tree operation state (maple_state)
+ * @__entry: Entry retrieved from the tree
+ * @__min: minimum index to retrieve from the tree
+ *
+ * When returned, mas->index and mas->last will hold the entire range for the
+ * entry.
+ *
+ * Note: may return the zero entry.
+ */
+#define mas_for_each_rev(__mas, __entry, __min) \
+	while (((__entry) = mas_find_rev((__mas), (__min))) != NULL)
+
 #ifdef CONFIG_DEBUG_MAPLE_TREE
 enum mt_dump_format {
 	mt_dump_dec,
-- 
cgit v1.2.3


From 0db6f8d7820a4b788565dac8eed52bfc2c3216da Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 23 Oct 2024 10:07:56 -0700
Subject: alloc_tag: load module tags into separate contiguous memory

When a module gets unloaded there is a possibility that some of the
allocations it made are still used and therefore the allocation tags
corresponding to these allocations are still referenced.  As such, the
memory for these tags can't be freed.  This is currently handled as an
abnormal situation and module's data section is not being unloaded.  To
handle this situation without keeping module's data in memory, allow
codetags with longer lifespan than the module to be loaded into their own
separate memory.  The in-use memory areas and gaps after module unloading
in this separate memory are tracked using maple trees.  Allocation tags
arrange their separate memory so that it is virtually contiguous and that
will allow simple allocation tag indexing later on in this patchset.  The
size of this virtually contiguous memory is set to store up to 100000
allocation tags.

[surenb@google.com: fix empty codetag module section handling]
  Link: https://lkml.kernel.org/r/20241101000017.3856204-1-surenb@google.com
[akpm@linux-foundation.org: update comment, per Dan]
Link: https://lkml.kernel.org/r/20241023170759.999909-4-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Minchan Kim <minchan@google.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Sourav Panda <souravpanda@google.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Huth <thuth@redhat.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xiongwei Song <xiongwei.song@windriver.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/asm-generic/codetag.lds.h |  19 +++
 include/linux/alloc_tag.h         |  13 +-
 include/linux/codetag.h           |  37 +++++-
 kernel/module/main.c              |  84 +++++++++----
 lib/alloc_tag.c                   | 249 +++++++++++++++++++++++++++++++++++---
 lib/codetag.c                     | 100 +++++++++++++--
 scripts/module.lds.S              |   5 +-
 7 files changed, 445 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
index 64f536b80380..372c320c5043 100644
--- a/include/asm-generic/codetag.lds.h
+++ b/include/asm-generic/codetag.lds.h
@@ -11,4 +11,23 @@
 #define CODETAG_SECTIONS()		\
 	SECTION_WITH_BOUNDARIES(alloc_tags)
 
+/*
+ * Module codetags which aren't used after module unload, therefore have the
+ * same lifespan as the module and can be safely unloaded with the module.
+ */
+#define MOD_CODETAG_SECTIONS()
+
+#define MOD_SEPARATE_CODETAG_SECTION(_name)	\
+	.codetag.##_name : {			\
+		SECTION_WITH_BOUNDARIES(_name)	\
+	}
+
+/*
+ * For codetags which might be used after module unload, therefore might stay
+ * longer in memory. Each such codetag type has its own section so that we can
+ * unload them individually once unused.
+ */
+#define MOD_SEPARATE_CODETAG_SECTIONS()		\
+	MOD_SEPARATE_CODETAG_SECTION(alloc_tags)
+
 #endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 941deffc590d..55d30543c4c7 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -30,6 +30,13 @@ struct alloc_tag {
 	struct alloc_tag_counters __percpu	*counters;
 } __aligned(8);
 
+struct alloc_tag_module_section {
+	unsigned long start_addr;
+	unsigned long end_addr;
+	/* used size */
+	unsigned long size;
+};
+
 #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
 
 #define CODETAG_EMPTY	((void *)1)
@@ -54,6 +61,8 @@ static inline void set_codetag_empty(union codetag_ref *ref) {}
 
 #ifdef CONFIG_MEM_ALLOC_PROFILING
 
+#define ALLOC_TAG_SECTION_NAME	"alloc_tags"
+
 struct codetag_bytes {
 	struct codetag *ct;
 	s64 bytes;
@@ -76,7 +85,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
 
 #define DEFINE_ALLOC_TAG(_alloc_tag)						\
 	static struct alloc_tag _alloc_tag __used __aligned(8)			\
-	__section("alloc_tags") = {						\
+	__section(ALLOC_TAG_SECTION_NAME) = {					\
 		.ct = CODE_TAG_INIT,						\
 		.counters = &_shared_alloc_tag };
 
@@ -85,7 +94,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
 #define DEFINE_ALLOC_TAG(_alloc_tag)						\
 	static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr);	\
 	static struct alloc_tag _alloc_tag __used __aligned(8)			\
-	__section("alloc_tags") = {						\
+	__section(ALLOC_TAG_SECTION_NAME) = {					\
 		.ct = CODE_TAG_INIT,						\
 		.counters = &_alloc_tag_cntr };
 
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index c2a579ccd455..d10bd9810d32 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -35,8 +35,15 @@ struct codetag_type_desc {
 	size_t tag_size;
 	void (*module_load)(struct codetag_type *cttype,
 			    struct codetag_module *cmod);
-	bool (*module_unload)(struct codetag_type *cttype,
+	void (*module_unload)(struct codetag_type *cttype,
 			      struct codetag_module *cmod);
+#ifdef CONFIG_MODULES
+	void (*module_replaced)(struct module *mod, struct module *new_mod);
+	bool (*needs_section_mem)(struct module *mod, unsigned long size);
+	void *(*alloc_section_mem)(struct module *mod, unsigned long size,
+				   unsigned int prepend, unsigned long align);
+	void (*free_section_mem)(struct module *mod, bool used);
+#endif
 };
 
 struct codetag_iterator {
@@ -71,11 +78,31 @@ struct codetag_type *
 codetag_register_type(const struct codetag_type_desc *desc);
 
 #if defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES)
+
+bool codetag_needs_module_section(struct module *mod, const char *name,
+				  unsigned long size);
+void *codetag_alloc_module_section(struct module *mod, const char *name,
+				   unsigned long size, unsigned int prepend,
+				   unsigned long align);
+void codetag_free_module_sections(struct module *mod);
+void codetag_module_replaced(struct module *mod, struct module *new_mod);
 void codetag_load_module(struct module *mod);
-bool codetag_unload_module(struct module *mod);
-#else
+void codetag_unload_module(struct module *mod);
+
+#else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
+
+static inline bool
+codetag_needs_module_section(struct module *mod, const char *name,
+			     unsigned long size) { return false; }
+static inline void *
+codetag_alloc_module_section(struct module *mod, const char *name,
+			     unsigned long size, unsigned int prepend,
+			     unsigned long align) { return NULL; }
+static inline void codetag_free_module_sections(struct module *mod) {}
+static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {}
 static inline void codetag_load_module(struct module *mod) {}
-static inline bool codetag_unload_module(struct module *mod) { return true; }
-#endif
+static inline void codetag_unload_module(struct module *mod) {}
+
+#endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
 
 #endif /* _LINUX_CODETAG_H */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 73b588fe98d4..00c16f5c5568 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1251,22 +1251,17 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 	return 0;
 }
 
-static void module_memory_free(struct module *mod, enum mod_mem_type type,
-			       bool unload_codetags)
+static void module_memory_free(struct module *mod, enum mod_mem_type type)
 {
 	struct module_memory *mem = &mod->mem[type];
-	void *ptr = mem->base;
 
 	if (mem->is_rox)
 		vfree(mem->rw_copy);
 
-	if (!unload_codetags && mod_mem_type_is_core_data(type))
-		return;
-
-	execmem_free(ptr);
+	execmem_free(mem->base);
 }
 
-static void free_mod_mem(struct module *mod, bool unload_codetags)
+static void free_mod_mem(struct module *mod)
 {
 	for_each_mod_mem_type(type) {
 		struct module_memory *mod_mem = &mod->mem[type];
@@ -1277,25 +1272,20 @@ static void free_mod_mem(struct module *mod, bool unload_codetags)
 		/* Free lock-classes; relies on the preceding sync_rcu(). */
 		lockdep_free_key_range(mod_mem->base, mod_mem->size);
 		if (mod_mem->size)
-			module_memory_free(mod, type, unload_codetags);
+			module_memory_free(mod, type);
 	}
 
 	/* MOD_DATA hosts mod, so free it at last */
 	lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
-	module_memory_free(mod, MOD_DATA, unload_codetags);
+	module_memory_free(mod, MOD_DATA);
 }
 
 /* Free a module, remove from lists, etc. */
 static void free_module(struct module *mod)
 {
-	bool unload_codetags;
-
 	trace_module_free(mod);
 
-	unload_codetags = codetag_unload_module(mod);
-	if (!unload_codetags)
-		pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n",
-			mod->name);
+	codetag_unload_module(mod);
 
 	mod_sysfs_teardown(mod);
 
@@ -1338,7 +1328,7 @@ static void free_module(struct module *mod)
 	kfree(mod->args);
 	percpu_modfree(mod);
 
-	free_mod_mem(mod, unload_codetags);
+	free_mod_mem(mod);
 }
 
 void *__symbol_get(const char *symbol)
@@ -1603,6 +1593,20 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i
 			if (WARN_ON_ONCE(type == MOD_INVALID))
 				continue;
 
+			/*
+			 * Do not allocate codetag memory as we load it into
+			 * preallocated contiguous memory.
+			 */
+			if (codetag_needs_module_section(mod, sname, s->sh_size)) {
+				/*
+				 * s->sh_entsize won't be used but populate the
+				 * type field to avoid confusion.
+				 */
+				s->sh_entsize = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK)
+						<< SH_ENTSIZE_TYPE_SHIFT;
+				continue;
+			}
+
 			s->sh_entsize = module_get_offset_and_type(mod, type, s, i);
 			pr_debug("\t%s\n", sname);
 		}
@@ -2277,6 +2281,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	int i;
 	enum mod_mem_type t = 0;
 	int ret = -ENOMEM;
+	bool codetag_section_found = false;
 
 	for_each_mod_mem_type(type) {
 		if (!mod->mem[type].size) {
@@ -2288,7 +2293,7 @@ static int move_module(struct module *mod, struct load_info *info)
 		ret = module_memory_alloc(mod, type);
 		if (ret) {
 			t = type;
-			goto out_enomem;
+			goto out_err;
 		}
 	}
 
@@ -2297,15 +2302,37 @@ static int move_module(struct module *mod, struct load_info *info)
 	for (i = 0; i < info->hdr->e_shnum; i++) {
 		void *dest;
 		Elf_Shdr *shdr = &info->sechdrs[i];
-		enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
-		unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK;
+		const char *sname;
 		unsigned long addr;
 
 		if (!(shdr->sh_flags & SHF_ALLOC))
 			continue;
 
-		addr = (unsigned long)mod->mem[type].base + offset;
-		dest = mod->mem[type].rw_copy + offset;
+		sname = info->secstrings + shdr->sh_name;
+		/*
+		 * Load codetag sections separately as they might still be used
+		 * after module unload.
+		 */
+		if (codetag_needs_module_section(mod, sname, shdr->sh_size)) {
+			dest = codetag_alloc_module_section(mod, sname, shdr->sh_size,
+					arch_mod_section_prepend(mod, i), shdr->sh_addralign);
+			if (WARN_ON(!dest)) {
+				ret = -EINVAL;
+				goto out_err;
+			}
+			if (IS_ERR(dest)) {
+				ret = PTR_ERR(dest);
+				goto out_err;
+			}
+			addr = (unsigned long)dest;
+			codetag_section_found = true;
+		} else {
+			enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+			unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK;
+
+			addr = (unsigned long)mod->mem[type].base + offset;
+			dest = mod->mem[type].rw_copy + offset;
+		}
 
 		if (shdr->sh_type != SHT_NOBITS) {
 			/*
@@ -2317,7 +2344,7 @@ static int move_module(struct module *mod, struct load_info *info)
 			if (i == info->index.mod &&
 			   (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) {
 				ret = -ENOEXEC;
-				goto out_enomem;
+				goto out_err;
 			}
 			memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
 		}
@@ -2333,9 +2360,12 @@ static int move_module(struct module *mod, struct load_info *info)
 	}
 
 	return 0;
-out_enomem:
+out_err:
 	for (t--; t >= 0; t--)
-		module_memory_free(mod, t, true);
+		module_memory_free(mod, t);
+	if (codetag_section_found)
+		codetag_free_module_sections(mod);
+
 	return ret;
 }
 
@@ -2456,6 +2486,8 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
 	/* Module has been copied to its final place now: return it. */
 	mod = (void *)info->sechdrs[info->index.mod].sh_addr;
 	kmemleak_load_module(mod, info);
+	codetag_module_replaced(info->mod, mod);
+
 	return mod;
 }
 
@@ -2465,7 +2497,7 @@ static void module_deallocate(struct module *mod, struct load_info *info)
 	percpu_modfree(mod);
 	module_arch_freeing_init(mod);
 
-	free_mod_mem(mod, true);
+	free_mod_mem(mod);
 }
 
 int __weak module_finalize(const Elf_Ehdr *hdr,
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 435aa837e550..5f9cd1642d58 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/alloc_tag.h>
+#include <linux/execmem.h>
 #include <linux/fs.h>
 #include <linux/gfp.h>
 #include <linux/module.h>
@@ -9,6 +10,7 @@
 #include <linux/seq_file.h>
 
 #define ALLOCINFO_FILE_NAME		"allocinfo"
+#define MODULE_ALLOC_TAG_VMAP_SIZE	(100000UL * sizeof(struct alloc_tag))
 
 #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
 static bool mem_profiling_support __meminitdata = true;
@@ -174,31 +176,226 @@ static void __init procfs_init(void)
 	}
 }
 
-static bool alloc_tag_module_unload(struct codetag_type *cttype,
-				    struct codetag_module *cmod)
+#ifdef CONFIG_MODULES
+
+static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE);
+/* A dummy object used to indicate an unloaded module */
+static struct module unloaded_mod;
+/* A dummy object used to indicate a module prepended area */
+static struct module prepend_mod;
+
+static struct alloc_tag_module_section module_tags;
+
+static bool needs_section_mem(struct module *mod, unsigned long size)
 {
-	struct codetag_iterator iter = codetag_get_ct_iter(cttype);
-	struct alloc_tag_counters counter;
-	bool module_unused = true;
-	struct alloc_tag *tag;
-	struct codetag *ct;
+	return size >= sizeof(struct alloc_tag);
+}
+
+static struct alloc_tag *find_used_tag(struct alloc_tag *from, struct alloc_tag *to)
+{
+	while (from <= to) {
+		struct alloc_tag_counters counter;
 
-	for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
-		if (iter.cmod != cmod)
+		counter = alloc_tag_read(from);
+		if (counter.bytes)
+			return from;
+		from++;
+	}
+
+	return NULL;
+}
+
+/* Called with mod_area_mt locked */
+static void clean_unused_module_areas_locked(void)
+{
+	MA_STATE(mas, &mod_area_mt, 0, module_tags.size);
+	struct module *val;
+
+	mas_for_each(&mas, val, module_tags.size) {
+		if (val != &unloaded_mod)
 			continue;
 
-		tag = ct_to_alloc_tag(ct);
-		counter = alloc_tag_read(tag);
+		/* Release area if all tags are unused */
+		if (!find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index),
+				   (struct alloc_tag *)(module_tags.start_addr + mas.last)))
+			mas_erase(&mas);
+	}
+}
+
+/* Called with mod_area_mt locked */
+static bool find_aligned_area(struct ma_state *mas, unsigned long section_size,
+			      unsigned long size, unsigned int prepend, unsigned long align)
+{
+	bool cleanup_done = false;
+
+repeat:
+	/* Try finding exact size and hope the start is aligned */
+	if (!mas_empty_area(mas, 0, section_size - 1, prepend + size)) {
+		if (IS_ALIGNED(mas->index + prepend, align))
+			return true;
+
+		/* Try finding larger area to align later */
+		mas_reset(mas);
+		if (!mas_empty_area(mas, 0, section_size - 1,
+				    size + prepend + align - 1))
+			return true;
+	}
+
+	/* No free area, try cleanup stale data and repeat the search once */
+	if (!cleanup_done) {
+		clean_unused_module_areas_locked();
+		cleanup_done = true;
+		mas_reset(mas);
+		goto repeat;
+	}
+
+	return false;
+}
+
+static void *reserve_module_tags(struct module *mod, unsigned long size,
+				 unsigned int prepend, unsigned long align)
+{
+	unsigned long section_size = module_tags.end_addr - module_tags.start_addr;
+	MA_STATE(mas, &mod_area_mt, 0, section_size - 1);
+	unsigned long offset;
+	void *ret = NULL;
+
+	/* If no tags return error */
+	if (size < sizeof(struct alloc_tag))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * align is always power of 2, so we can use IS_ALIGNED and ALIGN.
+	 * align 0 or 1 means no alignment, to simplify set to 1.
+	 */
+	if (!align)
+		align = 1;
+
+	mas_lock(&mas);
+	if (!find_aligned_area(&mas, section_size, size, prepend, align)) {
+		ret = ERR_PTR(-ENOMEM);
+		goto unlock;
+	}
+
+	/* Mark found area as reserved */
+	offset = mas.index;
+	offset += prepend;
+	offset = ALIGN(offset, align);
+	if (offset != mas.index) {
+		unsigned long pad_start = mas.index;
+
+		mas.last = offset - 1;
+		mas_store(&mas, &prepend_mod);
+		if (mas_is_err(&mas)) {
+			ret = ERR_PTR(xa_err(mas.node));
+			goto unlock;
+		}
+		mas.index = offset;
+		mas.last = offset + size - 1;
+		mas_store(&mas, mod);
+		if (mas_is_err(&mas)) {
+			mas.index = pad_start;
+			mas_erase(&mas);
+			ret = ERR_PTR(xa_err(mas.node));
+		}
+	} else {
+		mas.last = offset + size - 1;
+		mas_store(&mas, mod);
+		if (mas_is_err(&mas))
+			ret = ERR_PTR(xa_err(mas.node));
+	}
+unlock:
+	mas_unlock(&mas);
+
+	if (IS_ERR(ret))
+		return ret;
 
-		if (WARN(counter.bytes,
-			 "%s:%u module %s func:%s has %llu allocated at module unload",
-			 ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
-			module_unused = false;
+	if (module_tags.size < offset + size)
+		module_tags.size = offset + size;
+
+	return (struct alloc_tag *)(module_tags.start_addr + offset);
+}
+
+static void release_module_tags(struct module *mod, bool used)
+{
+	MA_STATE(mas, &mod_area_mt, module_tags.size, module_tags.size);
+	struct alloc_tag *tag;
+	struct module *val;
+
+	mas_lock(&mas);
+	mas_for_each_rev(&mas, val, 0)
+		if (val == mod)
+			break;
+
+	if (!val) /* module not found */
+		goto out;
+
+	if (!used)
+		goto release_area;
+
+	/* Find out if the area is used */
+	tag = find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index),
+			    (struct alloc_tag *)(module_tags.start_addr + mas.last));
+	if (tag) {
+		struct alloc_tag_counters counter = alloc_tag_read(tag);
+
+		pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n",
+			tag->ct.filename, tag->ct.lineno, tag->ct.modname,
+			tag->ct.function, counter.bytes);
+	} else {
+		used = false;
+	}
+release_area:
+	mas_store(&mas, used ? &unloaded_mod : NULL);
+	val = mas_prev_range(&mas, 0);
+	if (val == &prepend_mod)
+		mas_store(&mas, NULL);
+out:
+	mas_unlock(&mas);
+}
+
+static void replace_module(struct module *mod, struct module *new_mod)
+{
+	MA_STATE(mas, &mod_area_mt, 0, module_tags.size);
+	struct module *val;
+
+	mas_lock(&mas);
+	mas_for_each(&mas, val, module_tags.size) {
+		if (val != mod)
+			continue;
+
+		mas_store_gfp(&mas, new_mod, GFP_KERNEL);
+		break;
 	}
+	mas_unlock(&mas);
+}
+
+static int __init alloc_mod_tags_mem(void)
+{
+	/* Allocate space to copy allocation tags */
+	module_tags.start_addr = (unsigned long)execmem_alloc(EXECMEM_MODULE_DATA,
+							      MODULE_ALLOC_TAG_VMAP_SIZE);
+	if (!module_tags.start_addr)
+		return -ENOMEM;
+
+	module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE;
+
+	return 0;
+}
 
-	return module_unused;
+static void __init free_mod_tags_mem(void)
+{
+	execmem_free((void *)module_tags.start_addr);
+	module_tags.start_addr = 0;
 }
 
+#else /* CONFIG_MODULES */
+
+static inline int alloc_mod_tags_mem(void) { return 0; }
+static inline void free_mod_tags_mem(void) {}
+
+#endif /* CONFIG_MODULES */
+
 static int __init setup_early_mem_profiling(char *str)
 {
 	bool enable;
@@ -274,14 +471,26 @@ static inline void sysctl_init(void) {}
 static int __init alloc_tag_init(void)
 {
 	const struct codetag_type_desc desc = {
-		.section	= "alloc_tags",
-		.tag_size	= sizeof(struct alloc_tag),
-		.module_unload	= alloc_tag_module_unload,
+		.section		= ALLOC_TAG_SECTION_NAME,
+		.tag_size		= sizeof(struct alloc_tag),
+#ifdef CONFIG_MODULES
+		.needs_section_mem	= needs_section_mem,
+		.alloc_section_mem	= reserve_module_tags,
+		.free_section_mem	= release_module_tags,
+		.module_replaced	= replace_module,
+#endif
 	};
+	int res;
+
+	res = alloc_mod_tags_mem();
+	if (res)
+		return res;
 
 	alloc_tag_cttype = codetag_register_type(&desc);
-	if (IS_ERR(alloc_tag_cttype))
+	if (IS_ERR(alloc_tag_cttype)) {
+		free_mod_tags_mem();
 		return PTR_ERR(alloc_tag_cttype);
+	}
 
 	sysctl_init();
 	procfs_init();
diff --git a/lib/codetag.c b/lib/codetag.c
index d1fbbb7c2ec3..7455b966cae4 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -207,6 +207,94 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
 }
 
 #ifdef CONFIG_MODULES
+#define CODETAG_SECTION_PREFIX	".codetag."
+
+/* Some codetag types need a separate module section */
+bool codetag_needs_module_section(struct module *mod, const char *name,
+				  unsigned long size)
+{
+	const char *type_name;
+	struct codetag_type *cttype;
+	bool ret = false;
+
+	if (strncmp(name, CODETAG_SECTION_PREFIX, strlen(CODETAG_SECTION_PREFIX)))
+		return false;
+
+	type_name = name + strlen(CODETAG_SECTION_PREFIX);
+	mutex_lock(&codetag_lock);
+	list_for_each_entry(cttype, &codetag_types, link) {
+		if (strcmp(type_name, cttype->desc.section) == 0) {
+			if (!cttype->desc.needs_section_mem)
+				break;
+
+			down_write(&cttype->mod_lock);
+			ret = cttype->desc.needs_section_mem(mod, size);
+			up_write(&cttype->mod_lock);
+			break;
+		}
+	}
+	mutex_unlock(&codetag_lock);
+
+	return ret;
+}
+
+void *codetag_alloc_module_section(struct module *mod, const char *name,
+				   unsigned long size, unsigned int prepend,
+				   unsigned long align)
+{
+	const char *type_name = name + strlen(CODETAG_SECTION_PREFIX);
+	struct codetag_type *cttype;
+	void *ret = ERR_PTR(-EINVAL);
+
+	mutex_lock(&codetag_lock);
+	list_for_each_entry(cttype, &codetag_types, link) {
+		if (strcmp(type_name, cttype->desc.section) == 0) {
+			if (WARN_ON(!cttype->desc.alloc_section_mem))
+				break;
+
+			down_write(&cttype->mod_lock);
+			ret = cttype->desc.alloc_section_mem(mod, size, prepend, align);
+			up_write(&cttype->mod_lock);
+			break;
+		}
+	}
+	mutex_unlock(&codetag_lock);
+
+	return ret;
+}
+
+void codetag_free_module_sections(struct module *mod)
+{
+	struct codetag_type *cttype;
+
+	mutex_lock(&codetag_lock);
+	list_for_each_entry(cttype, &codetag_types, link) {
+		if (!cttype->desc.free_section_mem)
+			continue;
+
+		down_write(&cttype->mod_lock);
+		cttype->desc.free_section_mem(mod, false);
+		up_write(&cttype->mod_lock);
+	}
+	mutex_unlock(&codetag_lock);
+}
+
+void codetag_module_replaced(struct module *mod, struct module *new_mod)
+{
+	struct codetag_type *cttype;
+
+	mutex_lock(&codetag_lock);
+	list_for_each_entry(cttype, &codetag_types, link) {
+		if (!cttype->desc.module_replaced)
+			continue;
+
+		down_write(&cttype->mod_lock);
+		cttype->desc.module_replaced(mod, new_mod);
+		up_write(&cttype->mod_lock);
+	}
+	mutex_unlock(&codetag_lock);
+}
+
 void codetag_load_module(struct module *mod)
 {
 	struct codetag_type *cttype;
@@ -220,13 +308,12 @@ void codetag_load_module(struct module *mod)
 	mutex_unlock(&codetag_lock);
 }
 
-bool codetag_unload_module(struct module *mod)
+void codetag_unload_module(struct module *mod)
 {
 	struct codetag_type *cttype;
-	bool unload_ok = true;
 
 	if (!mod)
-		return true;
+		return;
 
 	/* await any module's kfree_rcu() operations to complete */
 	kvfree_rcu_barrier();
@@ -246,18 +333,17 @@ bool codetag_unload_module(struct module *mod)
 		}
 		if (found) {
 			if (cttype->desc.module_unload)
-				if (!cttype->desc.module_unload(cttype, cmod))
-					unload_ok = false;
+				cttype->desc.module_unload(cttype, cmod);
 
 			cttype->count -= range_size(cttype, &cmod->range);
 			idr_remove(&cttype->mod_idr, mod_id);
 			kfree(cmod);
 		}
 		up_write(&cttype->mod_lock);
+		if (found && cttype->desc.free_section_mem)
+			cttype->desc.free_section_mem(mod, true);
 	}
 	mutex_unlock(&codetag_lock);
-
-	return unload_ok;
 }
 #endif /* CONFIG_MODULES */
 
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index 3f43edef813c..711c6e029936 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -50,7 +50,7 @@ SECTIONS {
 	.data : {
 		*(.data .data.[0-9a-zA-Z_]*)
 		*(.data..L*)
-		CODETAG_SECTIONS()
+		MOD_CODETAG_SECTIONS()
 	}
 
 	.rodata : {
@@ -59,9 +59,10 @@ SECTIONS {
 	}
 #else
 	.data : {
-		CODETAG_SECTIONS()
+		MOD_CODETAG_SECTIONS()
 	}
 #endif
+	MOD_SEPARATE_CODETAG_SECTIONS()
 }
 
 /* bring in arch-specific sections */
-- 
cgit v1.2.3


From 0f9b685626daa2f8e19a9788625c9b624c223e45 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 23 Oct 2024 10:07:57 -0700
Subject: alloc_tag: populate memory for module tags as needed

The memory reserved for module tags does not need to be backed by physical
pages until there are tags to store there.  Change the way we reserve this
memory to allocate only virtual area for the tags and populate it with
physical pages as needed when we load a module.

[surenb@google.com: avoid execmem_vmap() when !MMU]
  Link: https://lkml.kernel.org/r/20241031233611.3833002-1-surenb@google.com
Link: https://lkml.kernel.org/r/20241023170759.999909-5-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Minchan Kim <minchan@google.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Sourav Panda <souravpanda@google.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Huth <thuth@redhat.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xiongwei Song <xiongwei.song@windriver.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/execmem.h | 12 ++++++++
 include/linux/vmalloc.h |  3 ++
 lib/Kconfig.debug       |  1 +
 lib/alloc_tag.c         | 73 +++++++++++++++++++++++++++++++++++++++++++------
 mm/execmem.c            | 16 +++++++++++
 mm/internal.h           |  6 ++++
 mm/vmalloc.c            |  4 +--
 7 files changed, 104 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 1517fa196bf7..64130ae19690 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -139,6 +139,18 @@ void *execmem_alloc(enum execmem_type type, size_t size);
  */
 void execmem_free(void *ptr);
 
+#ifdef CONFIG_MMU
+/**
+ * execmem_vmap - create virtual mapping for EXECMEM_MODULE_DATA memory
+ * @size: size of the virtual mapping in bytes
+ *
+ * Maps virtually contiguous area in the range suitable for EXECMEM_MODULE_DATA.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
+ */
+struct vm_struct *execmem_vmap(size_t size);
+#endif
+
 /**
  * execmem_update_copy - copy an update to executable memory
  * @dst:  destination address to update
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 27408f21e501..31e9ffd936e3 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -202,6 +202,9 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
 extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 							unsigned long pgoff);
 
+int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
+		     struct page **pages, unsigned int page_shift);
+
 /*
  * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
  * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7312ae7c3cc5..6798bbbcbd32 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -993,6 +993,7 @@ config CODE_TAGGING
 config MEM_ALLOC_PROFILING
 	bool "Enable memory allocation profiling"
 	default n
+	depends on MMU
 	depends on PROC_FS
 	depends on !DEBUG_FORCE_WEAK_PER_CPU
 	select CODE_TAGGING
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 5f9cd1642d58..4a7fc081b789 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -8,14 +8,15 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_buf.h>
 #include <linux/seq_file.h>
+#include <linux/vmalloc.h>
 
 #define ALLOCINFO_FILE_NAME		"allocinfo"
 #define MODULE_ALLOC_TAG_VMAP_SIZE	(100000UL * sizeof(struct alloc_tag))
 
 #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
-static bool mem_profiling_support __meminitdata = true;
+static bool mem_profiling_support = true;
 #else
-static bool mem_profiling_support __meminitdata;
+static bool mem_profiling_support;
 #endif
 
 static struct codetag_type *alloc_tag_cttype;
@@ -154,7 +155,7 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl
 	return nr;
 }
 
-static void __init shutdown_mem_profiling(void)
+static void shutdown_mem_profiling(void)
 {
 	if (mem_alloc_profiling_enabled())
 		static_branch_disable(&mem_alloc_profiling_key);
@@ -179,6 +180,7 @@ static void __init procfs_init(void)
 #ifdef CONFIG_MODULES
 
 static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE);
+static struct vm_struct *vm_module_tags;
 /* A dummy object used to indicate an unloaded module */
 static struct module unloaded_mod;
 /* A dummy object used to indicate a module prepended area */
@@ -252,6 +254,33 @@ repeat:
 	return false;
 }
 
+static int vm_module_tags_populate(void)
+{
+	unsigned long phys_size = vm_module_tags->nr_pages << PAGE_SHIFT;
+
+	if (phys_size < module_tags.size) {
+		struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages;
+		unsigned long addr = module_tags.start_addr + phys_size;
+		unsigned long more_pages;
+		unsigned long nr;
+
+		more_pages = ALIGN(module_tags.size - phys_size, PAGE_SIZE) >> PAGE_SHIFT;
+		nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN,
+						 NUMA_NO_NODE, more_pages, next_page);
+		if (nr < more_pages ||
+		    vmap_pages_range(addr, addr + (nr << PAGE_SHIFT), PAGE_KERNEL,
+				     next_page, PAGE_SHIFT) < 0) {
+			/* Clean up and error out */
+			for (int i = 0; i < nr; i++)
+				__free_page(next_page[i]);
+			return -ENOMEM;
+		}
+		vm_module_tags->nr_pages += nr;
+	}
+
+	return 0;
+}
+
 static void *reserve_module_tags(struct module *mod, unsigned long size,
 				 unsigned int prepend, unsigned long align)
 {
@@ -310,8 +339,18 @@ unlock:
 	if (IS_ERR(ret))
 		return ret;
 
-	if (module_tags.size < offset + size)
+	if (module_tags.size < offset + size) {
+		int grow_res;
+
 		module_tags.size = offset + size;
+		grow_res = vm_module_tags_populate();
+		if (grow_res) {
+			shutdown_mem_profiling();
+			pr_err("Failed to allocate memory for allocation tags in the module %s. Memory allocation profiling is disabled!\n",
+			       mod->name);
+			return ERR_PTR(grow_res);
+		}
+	}
 
 	return (struct alloc_tag *)(module_tags.start_addr + offset);
 }
@@ -372,12 +411,23 @@ static void replace_module(struct module *mod, struct module *new_mod)
 
 static int __init alloc_mod_tags_mem(void)
 {
-	/* Allocate space to copy allocation tags */
-	module_tags.start_addr = (unsigned long)execmem_alloc(EXECMEM_MODULE_DATA,
-							      MODULE_ALLOC_TAG_VMAP_SIZE);
-	if (!module_tags.start_addr)
+	/* Map space to copy allocation tags */
+	vm_module_tags = execmem_vmap(MODULE_ALLOC_TAG_VMAP_SIZE);
+	if (!vm_module_tags) {
+		pr_err("Failed to map %lu bytes for module allocation tags\n",
+			MODULE_ALLOC_TAG_VMAP_SIZE);
+		module_tags.start_addr = 0;
 		return -ENOMEM;
+	}
 
+	vm_module_tags->pages = kmalloc_array(get_vm_area_size(vm_module_tags) >> PAGE_SHIFT,
+					sizeof(struct page *), GFP_KERNEL | __GFP_ZERO);
+	if (!vm_module_tags->pages) {
+		free_vm_area(vm_module_tags);
+		return -ENOMEM;
+	}
+
+	module_tags.start_addr = (unsigned long)vm_module_tags->addr;
 	module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE;
 
 	return 0;
@@ -385,8 +435,13 @@ static int __init alloc_mod_tags_mem(void)
 
 static void __init free_mod_tags_mem(void)
 {
-	execmem_free((void *)module_tags.start_addr);
+	int i;
+
 	module_tags.start_addr = 0;
+	for (i = 0; i < vm_module_tags->nr_pages; i++)
+		__free_page(vm_module_tags->pages[i]);
+	kfree(vm_module_tags->pages);
+	free_vm_area(vm_module_tags);
 }
 
 #else /* CONFIG_MODULES */
diff --git a/mm/execmem.c b/mm/execmem.c
index 576a57e2161f..317b6a8d35be 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -64,6 +64,22 @@ static void *execmem_vmalloc(struct execmem_range *range, size_t size,
 
 	return p;
 }
+
+struct vm_struct *execmem_vmap(size_t size)
+{
+	struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA];
+	struct vm_struct *area;
+
+	area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
+				  range->start, range->end, NUMA_NO_NODE,
+				  GFP_KERNEL, __builtin_return_address(0));
+	if (!area && range->fallback_start)
+		area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
+					  range->fallback_start, range->fallback_end,
+					  NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0));
+
+	return area;
+}
 #else
 static void *execmem_vmalloc(struct execmem_range *range, size_t size,
 			     pgprot_t pgprot, unsigned long vm_flags)
diff --git a/mm/internal.h b/mm/internal.h
index 3dc745ba76dd..cd96848be245 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1263,6 +1263,12 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
 void free_zone_device_folio(struct folio *folio);
 int migrate_device_coherent_folio(struct folio *folio);
 
+struct vm_struct *__get_vm_area_node(unsigned long size,
+				     unsigned long align, unsigned long shift,
+				     unsigned long flags, unsigned long start,
+				     unsigned long end, int node, gfp_t gfp_mask,
+				     const void *caller);
+
 /*
  * mm/gup.c
  */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 74c0a5eae210..7ed39d104201 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -653,7 +653,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-static int vmap_pages_range(unsigned long addr, unsigned long end,
+int vmap_pages_range(unsigned long addr, unsigned long end,
 		pgprot_t prot, struct page **pages, unsigned int page_shift)
 {
 	int err;
@@ -3106,7 +3106,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
 	vm->flags &= ~VM_UNINITIALIZED;
 }
 
-static struct vm_struct *__get_vm_area_node(unsigned long size,
+struct vm_struct *__get_vm_area_node(unsigned long size,
 		unsigned long align, unsigned long shift, unsigned long flags,
 		unsigned long start, unsigned long end, int node,
 		gfp_t gfp_mask, const void *caller)
-- 
cgit v1.2.3


From 42895a86124418d8dd29a93812bc282e569ccfee Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 23 Oct 2024 10:07:58 -0700
Subject: alloc_tag: introduce pgtag_ref_handle to abstract page tag references

To simplify later changes to page tag references, introduce new
pgtag_ref_handle type.  This allows easy replacement of page_ext as a
storage of page allocation tags.

Link: https://lkml.kernel.org/r/20241023170759.999909-6-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Minchan Kim <minchan@google.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Sourav Panda <souravpanda@google.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Huth <thuth@redhat.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xiongwei Song <xiongwei.song@windriver.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h          | 25 ++++++------
 include/linux/pgalloc_tag.h | 92 ++++++++++++++++++++++++++-------------------
 2 files changed, 67 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index eb070c14e309..f9120ac6d901 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4181,37 +4181,38 @@ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new
 		return;
 
 	for (i = nr_pages; i < (1 << old_order); i += nr_pages) {
-		union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i));
+		union pgtag_ref_handle handle;
+		union codetag_ref ref;
 
-		if (ref) {
+		if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) {
 			/* Set new reference to point to the original tag */
-			alloc_tag_ref_set(ref, tag);
-			put_page_tag_ref(ref);
+			alloc_tag_ref_set(&ref, tag);
+			update_page_tag_ref(handle, &ref);
+			put_page_tag_ref(handle);
 		}
 	}
 }
 
 static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
 {
+	union pgtag_ref_handle handle;
+	union codetag_ref ref;
 	struct alloc_tag *tag;
-	union codetag_ref *ref;
 
 	tag = pgalloc_tag_get(&old->page);
 	if (!tag)
 		return;
 
-	ref = get_page_tag_ref(&new->page);
-	if (!ref)
+	if (!get_page_tag_ref(&new->page, &ref, &handle))
 		return;
 
 	/* Clear the old ref to the original allocation tag. */
 	clear_page_tag_ref(&old->page);
 	/* Decrement the counters of the tag on get_new_folio. */
-	alloc_tag_sub(ref, folio_size(new));
-
-	__alloc_tag_ref_set(ref, tag);
-
-	put_page_tag_ref(ref);
+	alloc_tag_sub(&ref, folio_size(new));
+	__alloc_tag_ref_set(&ref, tag);
+	update_page_tag_ref(handle, &ref);
+	put_page_tag_ref(handle);
 }
 #else /* !CONFIG_MEM_ALLOC_PROFILING */
 static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 59a3deb792a8..b13cd3313a88 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -11,46 +11,59 @@
 
 #include <linux/page_ext.h>
 
+union pgtag_ref_handle {
+	union codetag_ref *ref;	/* reference in page extension */
+};
+
 extern struct page_ext_operations page_alloc_tagging_ops;
 
-static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext)
+/* Should be called only if mem_alloc_profiling_enabled() */
+static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref,
+				    union pgtag_ref_handle *handle)
 {
-	return (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops);
-}
+	struct page_ext *page_ext;
+	union codetag_ref *tmp;
 
-static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref)
-{
-	return (void *)ref - page_alloc_tagging_ops.offset;
+	if (!page)
+		return false;
+
+	page_ext = page_ext_get(page);
+	if (!page_ext)
+		return false;
+
+	tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops);
+	ref->ct = tmp->ct;
+	handle->ref = tmp;
+	return true;
 }
 
-/* Should be called only if mem_alloc_profiling_enabled() */
-static inline union codetag_ref *get_page_tag_ref(struct page *page)
+static inline void put_page_tag_ref(union pgtag_ref_handle handle)
 {
-	if (page) {
-		struct page_ext *page_ext = page_ext_get(page);
+	if (WARN_ON(!handle.ref))
+		return;
 
-		if (page_ext)
-			return codetag_ref_from_page_ext(page_ext);
-	}
-	return NULL;
+	page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset);
 }
 
-static inline void put_page_tag_ref(union codetag_ref *ref)
+static inline void update_page_tag_ref(union pgtag_ref_handle handle,
+				       union codetag_ref *ref)
 {
-	if (WARN_ON(!ref))
+	if (WARN_ON(!handle.ref || !ref))
 		return;
 
-	page_ext_put(page_ext_from_codetag_ref(ref));
+	handle.ref->ct = ref->ct;
 }
 
 static inline void clear_page_tag_ref(struct page *page)
 {
 	if (mem_alloc_profiling_enabled()) {
-		union codetag_ref *ref = get_page_tag_ref(page);
+		union pgtag_ref_handle handle;
+		union codetag_ref ref;
 
-		if (ref) {
-			set_codetag_empty(ref);
-			put_page_tag_ref(ref);
+		if (get_page_tag_ref(page, &ref, &handle)) {
+			set_codetag_empty(&ref);
+			update_page_tag_ref(handle, &ref);
+			put_page_tag_ref(handle);
 		}
 	}
 }
@@ -59,11 +72,13 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
 				   unsigned int nr)
 {
 	if (mem_alloc_profiling_enabled()) {
-		union codetag_ref *ref = get_page_tag_ref(page);
+		union pgtag_ref_handle handle;
+		union codetag_ref ref;
 
-		if (ref) {
-			alloc_tag_add(ref, task->alloc_tag, PAGE_SIZE * nr);
-			put_page_tag_ref(ref);
+		if (get_page_tag_ref(page, &ref, &handle)) {
+			alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
+			update_page_tag_ref(handle, &ref);
+			put_page_tag_ref(handle);
 		}
 	}
 }
@@ -71,11 +86,13 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
 static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
 {
 	if (mem_alloc_profiling_enabled()) {
-		union codetag_ref *ref = get_page_tag_ref(page);
+		union pgtag_ref_handle handle;
+		union codetag_ref ref;
 
-		if (ref) {
-			alloc_tag_sub(ref, PAGE_SIZE * nr);
-			put_page_tag_ref(ref);
+		if (get_page_tag_ref(page, &ref, &handle)) {
+			alloc_tag_sub(&ref, PAGE_SIZE * nr);
+			update_page_tag_ref(handle, &ref);
+			put_page_tag_ref(handle);
 		}
 	}
 }
@@ -85,13 +102,14 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page)
 	struct alloc_tag *tag = NULL;
 
 	if (mem_alloc_profiling_enabled()) {
-		union codetag_ref *ref = get_page_tag_ref(page);
-
-		alloc_tag_sub_check(ref);
-		if (ref) {
-			if (ref->ct)
-				tag = ct_to_alloc_tag(ref->ct);
-			put_page_tag_ref(ref);
+		union pgtag_ref_handle handle;
+		union codetag_ref ref;
+
+		if (get_page_tag_ref(page, &ref, &handle)) {
+			alloc_tag_sub_check(&ref);
+			if (ref.ct)
+				tag = ct_to_alloc_tag(ref.ct);
+			put_page_tag_ref(handle);
 		}
 	}
 
@@ -106,8 +124,6 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
 
 #else /* CONFIG_MEM_ALLOC_PROFILING */
 
-static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; }
-static inline void put_page_tag_ref(union codetag_ref *ref) {}
 static inline void clear_page_tag_ref(struct page *page) {}
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
 				   unsigned int nr) {}
-- 
cgit v1.2.3


From 4835f747d3ed181bf2c67930fe06b2c01a5d2323 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 23 Oct 2024 10:07:59 -0700
Subject: alloc_tag: support for page allocation tag compression

Implement support for storing page allocation tag references directly in
the page flags instead of page extensions.  sysctl.vm.mem_profiling boot
parameter it extended to provide a way for a user to request this mode.
Enabling compression eliminates memory overhead caused by page_ext and
results in better performance for page allocations.  However this mode
will not work if the number of available page flag bits is insufficient to
address all kernel allocations.  Such condition can happen during boot or
when loading a module.  If this condition is detected, memory allocation
profiling gets disabled with an appropriate warning.  By default
compression mode is disabled.

Link: https://lkml.kernel.org/r/20241023170759.999909-7-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Minchan Kim <minchan@google.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Sourav Panda <souravpanda@google.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Huth <thuth@redhat.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xiongwei Song <xiongwei.song@windriver.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/allocation-profiling.rst |   7 +-
 include/linux/alloc_tag.h                 |  10 ++-
 include/linux/codetag.h                   |   3 +
 include/linux/page-flags-layout.h         |   7 ++
 include/linux/pgalloc_tag.h               | 145 ++++++++++++++++++++++++++----
 lib/alloc_tag.c                           | 142 +++++++++++++++++++++++++++--
 lib/codetag.c                             |   4 +-
 mm/mm_init.c                              |   5 +-
 8 files changed, 290 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/mm/allocation-profiling.rst b/Documentation/mm/allocation-profiling.rst
index ffd6655b7be2..316311240e6a 100644
--- a/Documentation/mm/allocation-profiling.rst
+++ b/Documentation/mm/allocation-profiling.rst
@@ -18,12 +18,17 @@ kconfig options:
   missing annotation
 
 Boot parameter:
-  sysctl.vm.mem_profiling=0|1|never
+  sysctl.vm.mem_profiling={0|1|never}[,compressed]
 
   When set to "never", memory allocation profiling overhead is minimized and it
   cannot be enabled at runtime (sysctl becomes read-only).
   When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y, default value is "1".
   When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, default value is "never".
+  "compressed" optional parameter will try to store page tag references in a
+  compact format, avoiding page extensions. This results in improved performance
+  and memory consumption, however it might fail depending on system configuration.
+  If compression fails, a warning is issued and memory allocation profiling gets
+  disabled.
 
 sysctl:
   /proc/sys/vm/mem_profiling
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 55d30543c4c7..7c0786bdf9af 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -30,8 +30,16 @@ struct alloc_tag {
 	struct alloc_tag_counters __percpu	*counters;
 } __aligned(8);
 
+struct alloc_tag_kernel_section {
+	struct alloc_tag *first_tag;
+	unsigned long count;
+};
+
 struct alloc_tag_module_section {
-	unsigned long start_addr;
+	union {
+		unsigned long start_addr;
+		struct alloc_tag *first_tag;
+	};
 	unsigned long end_addr;
 	/* used size */
 	unsigned long size;
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index d10bd9810d32..d14dbd26b370 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -13,6 +13,9 @@ struct codetag_module;
 struct seq_buf;
 struct module;
 
+#define CODETAG_SECTION_START_PREFIX	"__start_"
+#define CODETAG_SECTION_STOP_PREFIX	"__stop_"
+
 /*
  * An instance of this structure is created in a special ELF section at every
  * code location being tagged.  At runtime, the special section is treated as
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 7d79818dc065..4f5c9e979bb9 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -111,5 +111,12 @@
 			    ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \
 			    NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH)
 
+#define NR_NON_PAGEFLAG_BITS	(SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH + \
+				LAST_CPUPID_SHIFT + KASAN_TAG_WIDTH + \
+				LRU_GEN_WIDTH + LRU_REFS_WIDTH)
+
+#define NR_UNUSED_PAGEFLAG_BITS	(BITS_PER_LONG - \
+				(NR_NON_PAGEFLAG_BITS + NR_PAGEFLAGS))
+
 #endif
 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index b13cd3313a88..1fe63b52e5e5 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -11,29 +11,118 @@
 
 #include <linux/page_ext.h>
 
+extern struct page_ext_operations page_alloc_tagging_ops;
+extern unsigned long alloc_tag_ref_mask;
+extern int alloc_tag_ref_offs;
+extern struct alloc_tag_kernel_section kernel_tags;
+
+DECLARE_STATIC_KEY_FALSE(mem_profiling_compressed);
+
+typedef u16	pgalloc_tag_idx;
+
 union pgtag_ref_handle {
 	union codetag_ref *ref;	/* reference in page extension */
+	struct page *page;	/* reference in page flags */
 };
 
-extern struct page_ext_operations page_alloc_tagging_ops;
+/* Reserved indexes */
+#define CODETAG_ID_NULL		0
+#define CODETAG_ID_EMPTY	1
+#define CODETAG_ID_FIRST	2
+
+#ifdef CONFIG_MODULES
+
+extern struct alloc_tag_module_section module_tags;
+
+static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx)
+{
+	return &module_tags.first_tag[idx - kernel_tags.count];
+}
+
+static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag)
+{
+	return CODETAG_ID_FIRST + kernel_tags.count + (tag - module_tags.first_tag);
+}
+
+#else /* CONFIG_MODULES */
+
+static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx)
+{
+	pr_warn("invalid page tag reference %lu\n", (unsigned long)idx);
+	return NULL;
+}
+
+static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag)
+{
+	pr_warn("invalid page tag 0x%lx\n", (unsigned long)tag);
+	return CODETAG_ID_NULL;
+}
+
+#endif /* CONFIG_MODULES */
+
+static inline void idx_to_ref(pgalloc_tag_idx idx, union codetag_ref *ref)
+{
+	switch (idx) {
+	case (CODETAG_ID_NULL):
+		ref->ct = NULL;
+		break;
+	case (CODETAG_ID_EMPTY):
+		set_codetag_empty(ref);
+		break;
+	default:
+		idx -= CODETAG_ID_FIRST;
+		ref->ct = idx < kernel_tags.count ?
+			&kernel_tags.first_tag[idx].ct :
+			&module_idx_to_tag(idx)->ct;
+		break;
+	}
+}
+
+static inline pgalloc_tag_idx ref_to_idx(union codetag_ref *ref)
+{
+	struct alloc_tag *tag;
+
+	if (!ref->ct)
+		return CODETAG_ID_NULL;
+
+	if (is_codetag_empty(ref))
+		return CODETAG_ID_EMPTY;
+
+	tag = ct_to_alloc_tag(ref->ct);
+	if (tag >= kernel_tags.first_tag && tag < kernel_tags.first_tag + kernel_tags.count)
+		return CODETAG_ID_FIRST + (tag - kernel_tags.first_tag);
+
+	return module_tag_to_idx(tag);
+}
+
+
 
 /* Should be called only if mem_alloc_profiling_enabled() */
 static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref,
 				    union pgtag_ref_handle *handle)
 {
-	struct page_ext *page_ext;
-	union codetag_ref *tmp;
-
 	if (!page)
 		return false;
 
-	page_ext = page_ext_get(page);
-	if (!page_ext)
-		return false;
+	if (static_key_enabled(&mem_profiling_compressed)) {
+		pgalloc_tag_idx idx;
+
+		idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask;
+		idx_to_ref(idx, ref);
+		handle->page = page;
+	} else {
+		struct page_ext *page_ext;
+		union codetag_ref *tmp;
+
+		page_ext = page_ext_get(page);
+		if (!page_ext)
+			return false;
+
+		tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops);
+		ref->ct = tmp->ct;
+		handle->ref = tmp;
+	}
 
-	tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops);
-	ref->ct = tmp->ct;
-	handle->ref = tmp;
 	return true;
 }
 
@@ -42,16 +131,35 @@ static inline void put_page_tag_ref(union pgtag_ref_handle handle)
 	if (WARN_ON(!handle.ref))
 		return;
 
-	page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset);
+	if (!static_key_enabled(&mem_profiling_compressed))
+		page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset);
 }
 
-static inline void update_page_tag_ref(union pgtag_ref_handle handle,
-				       union codetag_ref *ref)
+static inline void update_page_tag_ref(union pgtag_ref_handle handle, union codetag_ref *ref)
 {
-	if (WARN_ON(!handle.ref || !ref))
-		return;
-
-	handle.ref->ct = ref->ct;
+	if (static_key_enabled(&mem_profiling_compressed)) {
+		struct page *page = handle.page;
+		unsigned long old_flags;
+		unsigned long flags;
+		unsigned long idx;
+
+		if (WARN_ON(!page || !ref))
+			return;
+
+		idx = (unsigned long)ref_to_idx(ref);
+		idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs;
+		do {
+			old_flags = READ_ONCE(page->flags);
+			flags = old_flags;
+			flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs);
+			flags |= idx;
+		} while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
+	} else {
+		if (WARN_ON(!handle.ref || !ref))
+			return;
+
+		handle.ref->ct = ref->ct;
+	}
 }
 
 static inline void clear_page_tag_ref(struct page *page)
@@ -122,6 +230,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
 		this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
 }
 
+void __init alloc_tag_sec_init(void);
+
 #else /* CONFIG_MEM_ALLOC_PROFILING */
 
 static inline void clear_page_tag_ref(struct page *page) {}
@@ -130,6 +240,7 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
 static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
 static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; }
 static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
+static inline void alloc_tag_sec_init(void) {}
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 4a7fc081b789..d38a4b2a551d 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -3,6 +3,7 @@
 #include <linux/execmem.h>
 #include <linux/fs.h>
 #include <linux/gfp.h>
+#include <linux/kallsyms.h>
 #include <linux/module.h>
 #include <linux/page_ext.h>
 #include <linux/proc_fs.h>
@@ -12,6 +13,8 @@
 
 #define ALLOCINFO_FILE_NAME		"allocinfo"
 #define MODULE_ALLOC_TAG_VMAP_SIZE	(100000UL * sizeof(struct alloc_tag))
+#define SECTION_START(NAME)		(CODETAG_SECTION_START_PREFIX NAME)
+#define SECTION_STOP(NAME)		(CODETAG_SECTION_STOP_PREFIX NAME)
 
 #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
 static bool mem_profiling_support = true;
@@ -26,6 +29,11 @@ EXPORT_SYMBOL(_shared_alloc_tag);
 
 DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
 			mem_alloc_profiling_key);
+DEFINE_STATIC_KEY_FALSE(mem_profiling_compressed);
+
+struct alloc_tag_kernel_section kernel_tags = { NULL, 0 };
+unsigned long alloc_tag_ref_mask;
+int alloc_tag_ref_offs;
 
 struct allocinfo_private {
 	struct codetag_iterator iter;
@@ -155,7 +163,7 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl
 	return nr;
 }
 
-static void shutdown_mem_profiling(void)
+static void shutdown_mem_profiling(bool remove_file)
 {
 	if (mem_alloc_profiling_enabled())
 		static_branch_disable(&mem_alloc_profiling_key);
@@ -163,6 +171,8 @@ static void shutdown_mem_profiling(void)
 	if (!mem_profiling_support)
 		return;
 
+	if (remove_file)
+		remove_proc_entry(ALLOCINFO_FILE_NAME, NULL);
 	mem_profiling_support = false;
 }
 
@@ -173,10 +183,40 @@ static void __init procfs_init(void)
 
 	if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) {
 		pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME);
-		shutdown_mem_profiling();
+		shutdown_mem_profiling(false);
 	}
 }
 
+void __init alloc_tag_sec_init(void)
+{
+	struct alloc_tag *last_codetag;
+
+	if (!mem_profiling_support)
+		return;
+
+	if (!static_key_enabled(&mem_profiling_compressed))
+		return;
+
+	kernel_tags.first_tag = (struct alloc_tag *)kallsyms_lookup_name(
+					SECTION_START(ALLOC_TAG_SECTION_NAME));
+	last_codetag = (struct alloc_tag *)kallsyms_lookup_name(
+					SECTION_STOP(ALLOC_TAG_SECTION_NAME));
+	kernel_tags.count = last_codetag - kernel_tags.first_tag;
+
+	/* Check if kernel tags fit into page flags */
+	if (kernel_tags.count > (1UL << NR_UNUSED_PAGEFLAG_BITS)) {
+		shutdown_mem_profiling(false); /* allocinfo file does not exist yet */
+		pr_err("%lu allocation tags cannot be references using %d available page flag bits. Memory allocation profiling is disabled!\n",
+			kernel_tags.count, NR_UNUSED_PAGEFLAG_BITS);
+		return;
+	}
+
+	alloc_tag_ref_offs = (LRU_REFS_PGOFF - NR_UNUSED_PAGEFLAG_BITS);
+	alloc_tag_ref_mask = ((1UL << NR_UNUSED_PAGEFLAG_BITS) - 1);
+	pr_debug("Memory allocation profiling compression is using %d page flag bits!\n",
+		 NR_UNUSED_PAGEFLAG_BITS);
+}
+
 #ifdef CONFIG_MODULES
 
 static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE);
@@ -186,10 +226,59 @@ static struct module unloaded_mod;
 /* A dummy object used to indicate a module prepended area */
 static struct module prepend_mod;
 
-static struct alloc_tag_module_section module_tags;
+struct alloc_tag_module_section module_tags;
+
+static inline unsigned long alloc_tag_align(unsigned long val)
+{
+	if (!static_key_enabled(&mem_profiling_compressed)) {
+		/* No alignment requirements when we are not indexing the tags */
+		return val;
+	}
+
+	if (val % sizeof(struct alloc_tag) == 0)
+		return val;
+	return ((val / sizeof(struct alloc_tag)) + 1) * sizeof(struct alloc_tag);
+}
+
+static bool ensure_alignment(unsigned long align, unsigned int *prepend)
+{
+	if (!static_key_enabled(&mem_profiling_compressed)) {
+		/* No alignment requirements when we are not indexing the tags */
+		return true;
+	}
+
+	/*
+	 * If alloc_tag size is not a multiple of required alignment, tag
+	 * indexing does not work.
+	 */
+	if (!IS_ALIGNED(sizeof(struct alloc_tag), align))
+		return false;
+
+	/* Ensure prepend consumes multiple of alloc_tag-sized blocks */
+	if (*prepend)
+		*prepend = alloc_tag_align(*prepend);
+
+	return true;
+}
+
+static inline bool tags_addressable(void)
+{
+	unsigned long tag_idx_count;
+
+	if (!static_key_enabled(&mem_profiling_compressed))
+		return true; /* with page_ext tags are always addressable */
+
+	tag_idx_count = CODETAG_ID_FIRST + kernel_tags.count +
+			module_tags.size / sizeof(struct alloc_tag);
+
+	return tag_idx_count < (1UL << NR_UNUSED_PAGEFLAG_BITS);
+}
 
 static bool needs_section_mem(struct module *mod, unsigned long size)
 {
+	if (!mem_profiling_support)
+		return false;
+
 	return size >= sizeof(struct alloc_tag);
 }
 
@@ -300,6 +389,13 @@ static void *reserve_module_tags(struct module *mod, unsigned long size,
 	if (!align)
 		align = 1;
 
+	if (!ensure_alignment(align, &prepend)) {
+		shutdown_mem_profiling(true);
+		pr_err("%s: alignment %lu is incompatible with allocation tag indexing. Memory allocation profiling is disabled!\n",
+			mod->name, align);
+		return ERR_PTR(-EINVAL);
+	}
+
 	mas_lock(&mas);
 	if (!find_aligned_area(&mas, section_size, size, prepend, align)) {
 		ret = ERR_PTR(-ENOMEM);
@@ -343,9 +439,15 @@ unlock:
 		int grow_res;
 
 		module_tags.size = offset + size;
+		if (mem_alloc_profiling_enabled() && !tags_addressable()) {
+			shutdown_mem_profiling(true);
+			pr_warn("With module %s there are too many tags to fit in %d page flag bits. Memory allocation profiling is disabled!\n",
+				mod->name, NR_UNUSED_PAGEFLAG_BITS);
+		}
+
 		grow_res = vm_module_tags_populate();
 		if (grow_res) {
-			shutdown_mem_profiling();
+			shutdown_mem_profiling(true);
 			pr_err("Failed to allocate memory for allocation tags in the module %s. Memory allocation profiling is disabled!\n",
 			       mod->name);
 			return ERR_PTR(grow_res);
@@ -429,6 +531,8 @@ static int __init alloc_mod_tags_mem(void)
 
 	module_tags.start_addr = (unsigned long)vm_module_tags->addr;
 	module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE;
+	/* Ensure the base is alloc_tag aligned when required for indexing */
+	module_tags.start_addr = alloc_tag_align(module_tags.start_addr);
 
 	return 0;
 }
@@ -451,8 +555,10 @@ static inline void free_mod_tags_mem(void) {}
 
 #endif /* CONFIG_MODULES */
 
+/* See: Documentation/mm/allocation-profiling.rst */
 static int __init setup_early_mem_profiling(char *str)
 {
+	bool compressed = false;
 	bool enable;
 
 	if (!str || !str[0])
@@ -461,22 +567,37 @@ static int __init setup_early_mem_profiling(char *str)
 	if (!strncmp(str, "never", 5)) {
 		enable = false;
 		mem_profiling_support = false;
+		pr_info("Memory allocation profiling is disabled!\n");
 	} else {
-		int res;
+		char *token = strsep(&str, ",");
+
+		if (kstrtobool(token, &enable))
+			return -EINVAL;
 
-		res = kstrtobool(str, &enable);
-		if (res)
-			return res;
+		if (str) {
 
+			if (strcmp(str, "compressed"))
+				return -EINVAL;
+
+			compressed = true;
+		}
 		mem_profiling_support = true;
+		pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n",
+			compressed ? "with" : "without", enable ? "on" : "off");
 	}
 
-	if (enable != static_key_enabled(&mem_alloc_profiling_key)) {
+	if (enable != mem_alloc_profiling_enabled()) {
 		if (enable)
 			static_branch_enable(&mem_alloc_profiling_key);
 		else
 			static_branch_disable(&mem_alloc_profiling_key);
 	}
+	if (compressed != static_key_enabled(&mem_profiling_compressed)) {
+		if (compressed)
+			static_branch_enable(&mem_profiling_compressed);
+		else
+			static_branch_disable(&mem_profiling_compressed);
+	}
 
 	return 0;
 }
@@ -484,6 +605,9 @@ early_param("sysctl.vm.mem_profiling", setup_early_mem_profiling);
 
 static __init bool need_page_alloc_tagging(void)
 {
+	if (static_key_enabled(&mem_profiling_compressed))
+		return false;
+
 	return mem_profiling_support;
 }
 
diff --git a/lib/codetag.c b/lib/codetag.c
index 7455b966cae4..42aadd6c1454 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -149,8 +149,8 @@ static struct codetag_range get_section_range(struct module *mod,
 					      const char *section)
 {
 	return (struct codetag_range) {
-		get_symbol(mod, "__start_", section),
-		get_symbol(mod, "__stop_", section),
+		get_symbol(mod, CODETAG_SECTION_START_PREFIX, section),
+		get_symbol(mod, CODETAG_SECTION_STOP_PREFIX, section),
 	};
 }
 
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4ba5607aaf19..1c205b0a86ed 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -83,8 +83,7 @@ void __init mminit_verify_pageflags_layout(void)
 	unsigned long or_mask, add_mask;
 
 	shift = BITS_PER_LONG;
-	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
-		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
+	width = shift - NR_NON_PAGEFLAG_BITS;
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
 		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
 		SECTIONS_WIDTH,
@@ -2639,7 +2638,7 @@ void __init mm_core_init(void)
 	BUILD_BUG_ON(MAX_ZONELISTS > 2);
 	build_all_zonelists(NULL);
 	page_alloc_init_cpuhp();
-
+	alloc_tag_sec_init();
 	/*
 	 * page_ext requires contiguous pages,
 	 * bigger than MAX_PAGE_ORDER unless SPARSEMEM.
-- 
cgit v1.2.3


From b7fc16a16b0850e41b88eae0edfa4c085c012347 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 24 Oct 2024 09:23:18 -0700
Subject: mm/codetag: uninline and move pgalloc_tag_copy and pgalloc_tag_split

pgalloc_tag_copy() and pgalloc_tag_split() are sizable and outside of any
performance-critical paths, so it should be fine to uninline them.  Also
move their declarations into pgalloc_tag.h which seems like a more
appropriate place for them.  No functional changes other than uninlining.

Link: https://lkml.kernel.org/r/20241024162318.1640781-1-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Sourav Panda <souravpanda@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h          | 58 ---------------------------------------------
 include/linux/pgalloc_tag.h |  5 ++++
 lib/alloc_tag.c             | 48 +++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f9120ac6d901..08e487e5850a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4166,62 +4166,4 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla
 }
 #endif
 
-#ifdef CONFIG_MEM_ALLOC_PROFILING
-static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
-{
-	int i;
-	struct alloc_tag *tag;
-	unsigned int nr_pages = 1 << new_order;
-
-	if (!mem_alloc_profiling_enabled())
-		return;
-
-	tag = pgalloc_tag_get(&folio->page);
-	if (!tag)
-		return;
-
-	for (i = nr_pages; i < (1 << old_order); i += nr_pages) {
-		union pgtag_ref_handle handle;
-		union codetag_ref ref;
-
-		if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) {
-			/* Set new reference to point to the original tag */
-			alloc_tag_ref_set(&ref, tag);
-			update_page_tag_ref(handle, &ref);
-			put_page_tag_ref(handle);
-		}
-	}
-}
-
-static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
-{
-	union pgtag_ref_handle handle;
-	union codetag_ref ref;
-	struct alloc_tag *tag;
-
-	tag = pgalloc_tag_get(&old->page);
-	if (!tag)
-		return;
-
-	if (!get_page_tag_ref(&new->page, &ref, &handle))
-		return;
-
-	/* Clear the old ref to the original allocation tag. */
-	clear_page_tag_ref(&old->page);
-	/* Decrement the counters of the tag on get_new_folio. */
-	alloc_tag_sub(&ref, folio_size(new));
-	__alloc_tag_ref_set(&ref, tag);
-	update_page_tag_ref(handle, &ref);
-	put_page_tag_ref(handle);
-}
-#else /* !CONFIG_MEM_ALLOC_PROFILING */
-static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
-{
-}
-
-static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
-{
-}
-#endif /* CONFIG_MEM_ALLOC_PROFILING */
-
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 1fe63b52e5e5..0e43ab653ab6 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -230,6 +230,9 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
 		this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
 }
 
+void pgalloc_tag_split(struct folio *folio, int old_order, int new_order);
+void pgalloc_tag_copy(struct folio *new, struct folio *old);
+
 void __init alloc_tag_sec_init(void);
 
 #else /* CONFIG_MEM_ALLOC_PROFILING */
@@ -241,6 +244,8 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
 static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; }
 static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
 static inline void alloc_tag_sec_init(void) {}
+static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {}
+static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) {}
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index d38a4b2a551d..2414a7ee7ec7 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -163,6 +163,54 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl
 	return nr;
 }
 
+void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
+{
+	int i;
+	struct alloc_tag *tag;
+	unsigned int nr_pages = 1 << new_order;
+
+	if (!mem_alloc_profiling_enabled())
+		return;
+
+	tag = pgalloc_tag_get(&folio->page);
+	if (!tag)
+		return;
+
+	for (i = nr_pages; i < (1 << old_order); i += nr_pages) {
+		union pgtag_ref_handle handle;
+		union codetag_ref ref;
+
+		if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) {
+			/* Set new reference to point to the original tag */
+			alloc_tag_ref_set(&ref, tag);
+			update_page_tag_ref(handle, &ref);
+			put_page_tag_ref(handle);
+		}
+	}
+}
+
+void pgalloc_tag_copy(struct folio *new, struct folio *old)
+{
+	union pgtag_ref_handle handle;
+	union codetag_ref ref;
+	struct alloc_tag *tag;
+
+	tag = pgalloc_tag_get(&old->page);
+	if (!tag)
+		return;
+
+	if (!get_page_tag_ref(&new->page, &ref, &handle))
+		return;
+
+	/* Clear the old ref to the original allocation tag. */
+	clear_page_tag_ref(&old->page);
+	/* Decrement the counters of the tag on get_new_folio. */
+	alloc_tag_sub(&ref, folio_size(new));
+	__alloc_tag_ref_set(&ref, tag);
+	update_page_tag_ref(handle, &ref);
+	put_page_tag_ref(handle);
+}
+
 static void shutdown_mem_profiling(bool remove_file)
 {
 	if (mem_alloc_profiling_enabled())
-- 
cgit v1.2.3


From d7cb6d7414ea1b33536fa6d11805cb8dceec1f97 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Thu, 7 Nov 2024 15:42:59 +0900
Subject: block: RCU protect disk->conv_zones_bitmap

Ensure that a disk revalidation changing the conventional zones bitmap
of a disk does not cause invalid memory references when using the
disk_zone_is_conv() helper by RCU protecting the disk->conv_zones_bitmap
pointer.

disk_zone_is_conv() is modified to operate under the RCU read lock and
the function disk_set_conv_zones_bitmap() is added to update a disk
conv_zones_bitmap pointer using rcu_replace_pointer() with the disk
zone_wplugs_lock spinlock held.

disk_free_zone_resources() is modified to call
disk_update_zone_resources() with a NULL bitmap pointer to free the disk
conv_zones_bitmap. disk_set_conv_zones_bitmap() is also used in
disk_update_zone_resources() to set the new (revalidated) bitmap and
free the old one.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20241107064300.227731-2-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 43 +++++++++++++++++++++++++++++++------------
 include/linux/blkdev.h |  2 +-
 2 files changed, 32 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index f7e151f665c7..da710067e527 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -350,9 +350,15 @@ fail:
 
 static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
 {
-	if (!disk->conv_zones_bitmap)
-		return false;
-	return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
+	unsigned long *bitmap;
+	bool is_conv;
+
+	rcu_read_lock();
+	bitmap = rcu_dereference(disk->conv_zones_bitmap);
+	is_conv = bitmap && test_bit(disk_zone_no(disk, sector), bitmap);
+	rcu_read_unlock();
+
+	return is_conv;
 }
 
 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
@@ -1455,6 +1461,24 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
 	disk->zone_wplugs_hash_bits = 0;
 }
 
+static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
+					       unsigned long *bitmap)
+{
+	unsigned int nr_conv_zones = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+	if (bitmap)
+		nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
+	bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
+				     lockdep_is_held(&disk->zone_wplugs_lock));
+	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+	kfree_rcu_mightsleep(bitmap);
+
+	return nr_conv_zones;
+}
+
 void disk_free_zone_resources(struct gendisk *disk)
 {
 	if (!disk->zone_wplugs_pool)
@@ -1478,8 +1502,7 @@ void disk_free_zone_resources(struct gendisk *disk)
 	mempool_destroy(disk->zone_wplugs_pool);
 	disk->zone_wplugs_pool = NULL;
 
-	bitmap_free(disk->conv_zones_bitmap);
-	disk->conv_zones_bitmap = NULL;
+	disk_set_conv_zones_bitmap(disk, NULL);
 	disk->zone_capacity = 0;
 	disk->last_zone_capacity = 0;
 	disk->nr_zones = 0;
@@ -1538,17 +1561,15 @@ static int disk_update_zone_resources(struct gendisk *disk,
 				      struct blk_revalidate_zone_args *args)
 {
 	struct request_queue *q = disk->queue;
-	unsigned int nr_seq_zones, nr_conv_zones = 0;
+	unsigned int nr_seq_zones, nr_conv_zones;
 	unsigned int pool_size;
 	struct queue_limits lim;
 
 	disk->nr_zones = args->nr_zones;
 	disk->zone_capacity = args->zone_capacity;
 	disk->last_zone_capacity = args->last_zone_capacity;
-	swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
-	if (disk->conv_zones_bitmap)
-		nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap,
-					      disk->nr_zones);
+	nr_conv_zones =
+		disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
 	if (nr_conv_zones >= disk->nr_zones) {
 		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
 			disk->disk_name, nr_conv_zones, disk->nr_zones);
@@ -1817,8 +1838,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 		disk_free_zone_resources(disk);
 	blk_mq_unfreeze_queue(q);
 
-	kfree(args.conv_zones_bitmap);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7bfc877e159e..6d379803c777 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -195,7 +195,7 @@ struct gendisk {
 	unsigned int		nr_zones;
 	unsigned int		zone_capacity;
 	unsigned int		last_zone_capacity;
-	unsigned long		*conv_zones_bitmap;
+	unsigned long __rcu	*conv_zones_bitmap;
 	unsigned int            zone_wplugs_hash_bits;
 	spinlock_t              zone_wplugs_lock;
 	struct mempool_s	*zone_wplugs_pool;
-- 
cgit v1.2.3


From f3d9bf05140dd242cdc33c431489a853f2bc1b67 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Thu, 7 Nov 2024 15:43:00 +0900
Subject: block: Add a public bdev_zone_is_seq() helper

Turn the private disk_zone_is_conv() function in blk-zoned.c into a
public and documented bdev_zone_is_seq() helper with the inverse
polarity of the original function, also adding a check for non-zoned
devices so that all file systems can use the helper, even with a regular
block device.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20241107064300.227731-3-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 17 ++---------------
 include/linux/blkdev.h | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index da710067e527..70211751df16 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -348,19 +348,6 @@ fail:
 	return ret;
 }
 
-static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
-{
-	unsigned long *bitmap;
-	bool is_conv;
-
-	rcu_read_lock();
-	bitmap = rcu_dereference(disk->conv_zones_bitmap);
-	is_conv = bitmap && test_bit(disk_zone_no(disk, sector), bitmap);
-	rcu_read_unlock();
-
-	return is_conv;
-}
-
 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
 {
 	return zone->start + zone->len >= get_capacity(disk);
@@ -715,7 +702,7 @@ static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
 	struct blk_zone_wplug *zwplug;
 
 	/* Conventional zones cannot be reset nor finished. */
-	if (disk_zone_is_conv(disk, sector)) {
+	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
 		bio_io_error(bio);
 		return true;
 	}
@@ -969,7 +956,7 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
 	}
 
 	/* Conventional zones do not need write plugging. */
-	if (disk_zone_is_conv(disk, sector)) {
+	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
 		/* Zone append to conventional zones is not allowed. */
 		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
 			bio_io_error(bio);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6d379803c777..93551772c1d6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1395,6 +1395,33 @@ static inline bool bdev_is_zone_start(struct block_device *bdev,
 	return bdev_offset_from_zone_start(bdev, sector) == 0;
 }
 
+/**
+ * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
+ * @bdev:	block device to check
+ * @sector:	sector number
+ *
+ * Check if @sector on @bdev is contained in a sequential write required zone.
+ */
+static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
+{
+	bool is_seq = false;
+
+#if IS_ENABLED(CONFIG_BLK_DEV_ZONED)
+	if (bdev_is_zoned(bdev)) {
+		struct gendisk *disk = bdev->bd_disk;
+		unsigned long *bitmap;
+
+		rcu_read_lock();
+		bitmap = rcu_dereference(disk->conv_zones_bitmap);
+		is_seq = !bitmap ||
+			!test_bit(disk_zone_no(disk, sector), bitmap);
+		rcu_read_unlock();
+	}
+#endif
+
+	return is_seq;
+}
+
 static inline int queue_dma_alignment(const struct request_queue *q)
 {
 	return q->limits.dma_alignment;
-- 
cgit v1.2.3


From f7470591f8db1a72ce9f7ab49cb13c2b21b92002 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 5 Oct 2024 21:01:12 +0100
Subject: mm: convert page_to_pgoff() to page_pgoff()

Patch series "page->index removals in mm", v2.

As part of shrinking struct page, we need to stop using page->index.  This
patchset gets rid of most of the remaining references to page->index in
mm, as well as increasing the number of functions which take a const
folio/page pointer.  It shrinks the text segment of mm by a few hundred
bytes in my test config, probably mostly from removing calls to
compound_head() in page_to_pgoff().


This patch (of 7):

Change the function signature to pass in the folio as all three callers
have it.  This removes a reference to page->index, which we're trying to
get rid of.  And add kernel-doc.

Link: https://lkml.kernel.org/r/20241005200121.3231142-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20241005200121.3231142-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h      |  2 +-
 include/linux/pagemap.h | 31 +++++++++++++++++--------------
 mm/memory-failure.c     |  4 ++--
 mm/rmap.c               |  2 +-
 4 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 08e487e5850a..78848fbefe94 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1895,7 +1895,7 @@ static inline unsigned long page_to_section(const struct page *page)
  *
  * Return: The Page Frame Number of the first page in the folio.
  */
-static inline unsigned long folio_pfn(struct folio *folio)
+static inline unsigned long folio_pfn(const struct folio *folio)
 {
 	return page_to_pfn(&folio->page);
 }
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 68a5f1ff3301..bcf0865a38ae 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1011,22 +1011,25 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping,
 	return read_cache_folio(mapping, index, NULL, file);
 }
 
-/*
- * Get the offset in PAGE_SIZE (even for hugetlb pages).
+/**
+ * page_pgoff - Calculate the logical page offset of this page.
+ * @folio: The folio containing this page.
+ * @page: The page which we need the offset of.
+ *
+ * For file pages, this is the offset from the beginning of the file
+ * in units of PAGE_SIZE.  For anonymous pages, this is the offset from
+ * the beginning of the anon_vma in units of PAGE_SIZE.  This will
+ * return nonsense for KSM pages.
+ *
+ * Context: Caller must have a reference on the folio or otherwise
+ * prevent it from being split or freed.
+ *
+ * Return: The offset in units of PAGE_SIZE.
  */
-static inline pgoff_t page_to_pgoff(struct page *page)
+static inline pgoff_t page_pgoff(const struct folio *folio,
+		const struct page *page)
 {
-	struct page *head;
-
-	if (likely(!PageTransTail(page)))
-		return page->index;
-
-	head = compound_head(page);
-	/*
-	 *  We don't initialize ->index for tail pages: calculate based on
-	 *  head page
-	 */
-	return head->index + page - head;
+	return folio->index + folio_page_idx(folio, page);
 }
 
 /*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 96ce31e5a203..58a3d80961a4 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -617,7 +617,7 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
 	if (av == NULL)	/* Not actually mapped anymore */
 		return;
 
-	pgoff = page_to_pgoff(page);
+	pgoff = page_pgoff(folio, page);
 	rcu_read_lock();
 	for_each_process(tsk) {
 		struct vm_area_struct *vma;
@@ -653,7 +653,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
 
 	i_mmap_lock_read(mapping);
 	rcu_read_lock();
-	pgoff = page_to_pgoff(page);
+	pgoff = page_pgoff(folio, page);
 	for_each_process(tsk) {
 		struct task_struct *t = task_early_kill(tsk, force_early);
 		unsigned long addr;
diff --git a/mm/rmap.c b/mm/rmap.c
index 4d75433330f9..77c6b27cb0ee 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1276,7 +1276,7 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page,
 	 */
 	VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
 			folio);
-	VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
+	VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address),
 		       page);
 }
 
-- 
cgit v1.2.3


From 713da0b33b3e9d16272b57f4c44dee5c052be9b7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 5 Oct 2024 21:01:14 +0100
Subject: mm: renovate page_address_in_vma()

This function doesn't modify any of its arguments, so if we make a few
other functions take const pointers, we can make page_address_in_vma()
take const pointers too.  All of its callers have the containing folio
already, so pass that in as an argument instead of recalculating it.  Also
add kernel-doc

Link: https://lkml.kernel.org/r/20241005200121.3231142-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h |  7 ++-----
 mm/internal.h        |  4 ++--
 mm/ksm.c             |  7 +++----
 mm/memory-failure.c  |  2 +-
 mm/mempolicy.c       |  2 +-
 mm/rmap.c            | 27 ++++++++++++++++++++-------
 mm/util.c            |  2 +-
 7 files changed, 30 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d5e93e44322e..78923015a2e8 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -728,11 +728,8 @@ page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
 }
 
 bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
-
-/*
- * Used by swapoff to help locate where page is expected in vma.
- */
-unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
+unsigned long page_address_in_vma(const struct folio *folio,
+		const struct page *, const struct vm_area_struct *);
 
 /*
  * Cleans the PTEs of shared mappings.
diff --git a/mm/internal.h b/mm/internal.h
index cd96848be245..8674f677304a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -841,7 +841,7 @@ static inline bool free_area_empty(struct free_area *area, int migratetype)
 }
 
 /* mm/util.c */
-struct anon_vma *folio_anon_vma(struct folio *folio);
+struct anon_vma *folio_anon_vma(const struct folio *folio);
 
 #ifdef CONFIG_MMU
 void unmap_mapping_folio(struct folio *folio);
@@ -959,7 +959,7 @@ extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
  * If any page in this range is mapped by this VMA, return the first address
  * where any of these pages appear.  Otherwise, return -EFAULT.
  */
-static inline unsigned long vma_address(struct vm_area_struct *vma,
+static inline unsigned long vma_address(const struct vm_area_struct *vma,
 		pgoff_t pgoff, unsigned long nr_pages)
 {
 	unsigned long address;
diff --git a/mm/ksm.c b/mm/ksm.c
index e596bc1b5fa7..b813225a806d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1256,7 +1256,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
 	if (WARN_ON_ONCE(folio_test_large(folio)))
 		return err;
 
-	pvmw.address = page_address_in_vma(&folio->page, vma);
+	pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma);
 	if (pvmw.address == -EFAULT)
 		goto out;
 
@@ -1340,7 +1340,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 {
 	struct folio *kfolio = page_folio(kpage);
 	struct mm_struct *mm = vma->vm_mm;
-	struct folio *folio;
+	struct folio *folio = page_folio(page);
 	pmd_t *pmd;
 	pmd_t pmde;
 	pte_t *ptep;
@@ -1350,7 +1350,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	int err = -EFAULT;
 	struct mmu_notifier_range range;
 
-	addr = page_address_in_vma(page, vma);
+	addr = page_address_in_vma(folio, page, vma);
 	if (addr == -EFAULT)
 		goto out;
 
@@ -1416,7 +1416,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	ptep_clear_flush(vma, addr, ptep);
 	set_pte_at(mm, addr, ptep, newpte);
 
-	folio = page_folio(page);
 	folio_remove_rmap_pte(folio, page, vma);
 	if (!folio_mapped(folio))
 		folio_free_swap(folio);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 58a3d80961a4..ea9d883c01c1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -671,7 +671,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
 			 */
 			if (vma->vm_mm != t->mm)
 				continue;
-			addr = page_address_in_vma(page, vma);
+			addr = page_address_in_vma(folio, page, vma);
 			add_to_kill_anon_file(t, page, vma, to_kill, addr);
 		}
 	}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a29eff5d0585..bb37cd1a51d8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1367,7 +1367,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 			if (!list_entry_is_head(folio, &pagelist, lru)) {
 				vma_iter_init(&vmi, mm, start);
 				for_each_vma_range(vmi, vma, end) {
-					addr = page_address_in_vma(
+					addr = page_address_in_vma(folio,
 						folio_page(folio, 0), vma);
 					if (addr != -EFAULT)
 						break;
diff --git a/mm/rmap.c b/mm/rmap.c
index e5ec8304a193..d4e5fe94fa92 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -767,14 +767,27 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 }
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 
-/*
- * At what user virtual address is page expected in vma?
- * Caller should check the page is actually part of the vma.
+/**
+ * page_address_in_vma - The virtual address of a page in this VMA.
+ * @folio: The folio containing the page.
+ * @page: The page within the folio.
+ * @vma: The VMA we need to know the address in.
+ *
+ * Calculates the user virtual address of this page in the specified VMA.
+ * It is the caller's responsibililty to check the page is actually
+ * within the VMA.  There may not currently be a PTE pointing at this
+ * page, but if a page fault occurs at this address, this is the page
+ * which will be accessed.
+ *
+ * Context: Caller should hold a reference to the folio.  Caller should
+ * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the
+ * VMA from being altered.
+ *
+ * Return: The virtual address corresponding to this page in the VMA.
  */
-unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
+unsigned long page_address_in_vma(const struct folio *folio,
+		const struct page *page, const struct vm_area_struct *vma)
 {
-	struct folio *folio = page_folio(page);
-
 	if (folio_test_anon(folio)) {
 		struct anon_vma *page__anon_vma = folio_anon_vma(folio);
 		/*
@@ -790,7 +803,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 		return -EFAULT;
 	}
 
-	/* The !page__anon_vma above handles KSM folios */
+	/* KSM folios don't reach here because of the !page__anon_vma check */
 	return vma_address(vma, page_pgoff(folio, page), 1);
 }
 
diff --git a/mm/util.c b/mm/util.c
index 4f1275023eb7..60017d2a9e48 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -820,7 +820,7 @@ void *vcalloc_noprof(size_t n, size_t size)
 }
 EXPORT_SYMBOL(vcalloc_noprof);
 
-struct anon_vma *folio_anon_vma(struct folio *folio)
+struct anon_vma *folio_anon_vma(const struct folio *folio)
 {
 	unsigned long mapping = (unsigned long)folio->mapping;
 
-- 
cgit v1.2.3


From 68158bfa3dbd4af8461ef75a91ffc03be942c8fe Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 5 Oct 2024 21:01:15 +0100
Subject: mm: mass constification of folio/page pointers

Now that page_pgoff() takes const pointers, we can constify the pointers
to a lot of functions.

Link: https://lkml.kernel.org/r/20241005200121.3231142-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ksm.h  |  7 ++++---
 include/linux/rmap.h | 10 +++++-----
 mm/internal.h        |  5 +++--
 mm/ksm.c             |  5 +++--
 mm/memory-failure.c  | 24 +++++++++++++-----------
 mm/page_vma_mapped.c |  5 +++--
 mm/rmap.c            | 11 ++++++-----
 7 files changed, 37 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 29022e71a074..6a53ac4885bb 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -90,7 +90,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio,
 
 void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
 void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
-void collect_procs_ksm(struct folio *folio, struct page *page,
+void collect_procs_ksm(const struct folio *folio, const struct page *page,
 		struct list_head *to_kill, int force_early);
 long ksm_process_profit(struct mm_struct *);
 
@@ -122,8 +122,9 @@ static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
 {
 }
 
-static inline void collect_procs_ksm(struct folio *folio, struct page *page,
-				     struct list_head *to_kill, int force_early)
+static inline void collect_procs_ksm(const struct folio *folio,
+		const struct page *page, struct list_head *to_kill,
+		int force_early)
 {
 }
 
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 78923015a2e8..683a04088f3f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -171,7 +171,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
 	unlink_anon_vmas(next);
 }
 
-struct anon_vma *folio_get_anon_vma(struct folio *folio);
+struct anon_vma *folio_get_anon_vma(const struct folio *folio);
 
 /* RMAP flags, currently only relevant for some anon rmap operations. */
 typedef int __bitwise rmap_t;
@@ -194,8 +194,8 @@ enum rmap_level {
 	RMAP_LEVEL_PMD,
 };
 
-static inline void __folio_rmap_sanity_checks(struct folio *folio,
-		struct page *page, int nr_pages, enum rmap_level level)
+static inline void __folio_rmap_sanity_checks(const struct folio *folio,
+		const struct page *page, int nr_pages, enum rmap_level level)
 {
 	/* hugetlb folios are handled separately. */
 	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
@@ -771,14 +771,14 @@ struct rmap_walk_control {
 	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
 					unsigned long addr, void *arg);
 	int (*done)(struct folio *folio);
-	struct anon_vma *(*anon_lock)(struct folio *folio,
+	struct anon_vma *(*anon_lock)(const struct folio *folio,
 				      struct rmap_walk_control *rwc);
 	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
 };
 
 void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
 void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
-struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
+struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
 					  struct rmap_walk_control *rwc);
 
 #else	/* !CONFIG_MMU */
diff --git a/mm/internal.h b/mm/internal.h
index 8674f677304a..fd6373cb1c66 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1117,10 +1117,11 @@ void ClearPageHWPoisonTakenOff(struct page *page);
 bool take_page_off_buddy(struct page *page);
 bool put_page_back_buddy(struct page *page);
 struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
-void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
 		     struct vm_area_struct *vma, struct list_head *to_kill,
 		     unsigned long ksm_addr);
-unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
+unsigned long page_mapped_in_vma(const struct page *page,
+		struct vm_area_struct *vma);
 
 #else
 static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
diff --git a/mm/ksm.c b/mm/ksm.c
index b813225a806d..7ac59cde626c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1051,7 +1051,8 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
 	return err;
 }
 
-static inline struct ksm_stable_node *folio_stable_node(struct folio *folio)
+static inline
+struct ksm_stable_node *folio_stable_node(const struct folio *folio)
 {
 	return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
 }
@@ -3067,7 +3068,7 @@ again:
 /*
  * Collect processes when the error hit an ksm page.
  */
-void collect_procs_ksm(struct folio *folio, struct page *page,
+void collect_procs_ksm(const struct folio *folio, const struct page *page,
 		struct list_head *to_kill, int force_early)
 {
 	struct ksm_stable_node *stable_node;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ea9d883c01c1..7ce7ba8586f5 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -445,7 +445,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
  * Schedule a process for later kill.
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  */
-static void __add_to_kill(struct task_struct *tsk, struct page *p,
+static void __add_to_kill(struct task_struct *tsk, const struct page *p,
 			  struct vm_area_struct *vma, struct list_head *to_kill,
 			  unsigned long addr)
 {
@@ -461,7 +461,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
 	if (is_zone_device_page(p))
 		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
 	else
-		tk->size_shift = page_shift(compound_head(p));
+		tk->size_shift = folio_shift(page_folio(p));
 
 	/*
 	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -486,7 +486,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
 	list_add_tail(&tk->nd, to_kill);
 }
 
-static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
+static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p,
 		struct vm_area_struct *vma, struct list_head *to_kill,
 		unsigned long addr)
 {
@@ -509,7 +509,7 @@ static bool task_in_to_kill_list(struct list_head *to_kill,
 	return false;
 }
 
-void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
 		     struct vm_area_struct *vma, struct list_head *to_kill,
 		     unsigned long addr)
 {
@@ -606,8 +606,9 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
 /*
  * Collect processes when the error hit an anonymous page.
  */
-static void collect_procs_anon(struct folio *folio, struct page *page,
-		struct list_head *to_kill, int force_early)
+static void collect_procs_anon(const struct folio *folio,
+		const struct page *page, struct list_head *to_kill,
+		int force_early)
 {
 	struct task_struct *tsk;
 	struct anon_vma *av;
@@ -643,8 +644,9 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
 /*
  * Collect processes when the error hit a file mapped page.
  */
-static void collect_procs_file(struct folio *folio, struct page *page,
-		struct list_head *to_kill, int force_early)
+static void collect_procs_file(const struct folio *folio,
+		const struct page *page, struct list_head *to_kill,
+		int force_early)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
@@ -680,7 +682,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
 }
 
 #ifdef CONFIG_FS_DAX
-static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
+static void add_to_kill_fsdax(struct task_struct *tsk, const struct page *p,
 			      struct vm_area_struct *vma,
 			      struct list_head *to_kill, pgoff_t pgoff)
 {
@@ -691,7 +693,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
 /*
  * Collect processes when the error hit a fsdax page.
  */
-static void collect_procs_fsdax(struct page *page,
+static void collect_procs_fsdax(const struct page *page,
 		struct address_space *mapping, pgoff_t pgoff,
 		struct list_head *to_kill, bool pre_remove)
 {
@@ -725,7 +727,7 @@ static void collect_procs_fsdax(struct page *page,
 /*
  * Collect the processes who have the corrupted page mapped to kill.
  */
-static void collect_procs(struct folio *folio, struct page *page,
+static void collect_procs(const struct folio *folio, const struct page *page,
 		struct list_head *tokill, int force_early)
 {
 	if (!folio->mapping)
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 6b356853c04e..81839a9e74f1 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -337,9 +337,10 @@ next_pte:
  * outside the VMA or not present, returns -EFAULT.
  * Only valid for normal file or anonymous VMAs.
  */
-unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+unsigned long page_mapped_in_vma(const struct page *page,
+		struct vm_area_struct *vma)
 {
-	struct folio *folio = page_folio(page);
+	const struct folio *folio = page_folio(page);
 	struct page_vma_mapped_walk pvmw = {
 		.pfn = page_to_pfn(page),
 		.nr_pages = 1,
diff --git a/mm/rmap.c b/mm/rmap.c
index d4e5fe94fa92..c6c4d4ea29a7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -496,7 +496,7 @@ void __init anon_vma_init(void)
  * concurrently without folio lock protection). See folio_lock_anon_vma_read()
  * which has already covered that, and comment above remap_pages().
  */
-struct anon_vma *folio_get_anon_vma(struct folio *folio)
+struct anon_vma *folio_get_anon_vma(const struct folio *folio)
 {
 	struct anon_vma *anon_vma = NULL;
 	unsigned long anon_mapping;
@@ -540,7 +540,7 @@ out:
  * reference like with folio_get_anon_vma() and then block on the mutex
  * on !rwc->try_lock case.
  */
-struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
+struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
 					  struct rmap_walk_control *rwc)
 {
 	struct anon_vma *anon_vma = NULL;
@@ -1271,8 +1271,9 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
  * @vma:	the vm area in which the mapping is added
  * @address:	the user virtual address mapped
  */
-static void __page_check_anon_rmap(struct folio *folio, struct page *page,
-	struct vm_area_struct *vma, unsigned long address)
+static void __page_check_anon_rmap(const struct folio *folio,
+		const struct page *page, struct vm_area_struct *vma,
+		unsigned long address)
 {
 	/*
 	 * The page's anon-rmap details (mapping and index) are guaranteed to
@@ -2569,7 +2570,7 @@ void __put_anon_vma(struct anon_vma *anon_vma)
 		anon_vma_free(root);
 }
 
-static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
+static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
 					    struct rmap_walk_control *rwc)
 {
 	struct anon_vma *anon_vma;
-- 
cgit v1.2.3


From 0386aaa6e9c826bc494169a914e01a86befe6edf Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 5 Oct 2024 21:01:16 +0100
Subject: bootmem: stop using page->index

Encode the type into the bottom four bits of page->private and the info
into the remaining bits.  Also turn the bootmem type into a named enum.

[arnd@arndb.de: bootmem: add bootmem_type stub function]
  Link: https://lkml.kernel.org/r/20241015143802.577613-1-arnd@kernel.org
[akpm@linux-foundation.org: fix build with !CONFIG_HAVE_BOOTMEM_INFO_NODE]
 Link: https://lore.kernel.org/oe-kbuild-all/202410090311.eaqcL7IZ-lkp@intel.com/
Link: https://lkml.kernel.org/r/20241005200121.3231142-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/mm/init_64.c        | 28 +++++++++++++++++++---------
 include/linux/bootmem_info.h | 35 +++++++++++++++++++++++++++--------
 mm/bootmem_info.c            | 11 ++++++-----
 mm/sparse.c                  |  8 ++++----
 4 files changed, 56 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a564130b9d0..01ea7c6df303 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -985,22 +985,32 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return add_pages(nid, start_pfn, nr_pages, params);
 }
 
-static void __meminit free_pagetable(struct page *page, int order)
+static void free_reserved_pages(struct page *page, unsigned long nr_pages)
 {
-	unsigned long magic;
-	unsigned int nr_pages = 1 << order;
+	while (nr_pages--)
+		free_reserved_page(page++);
+}
 
+static void __meminit free_pagetable(struct page *page, int order)
+{
 	/* bootmem page has reserved flag */
 	if (PageReserved(page)) {
-		magic = page->index;
-		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+		unsigned long nr_pages = 1 << order;
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+		enum bootmem_type type = bootmem_type(page);
+
+		if (type == SECTION_INFO || type == MIX_SECTION_INFO) {
 			while (nr_pages--)
 				put_page_bootmem(page++);
-		} else
-			while (nr_pages--)
-				free_reserved_page(page++);
-	} else
+		} else {
+			free_reserved_pages(page, nr_pages);
+		}
+#else
+		free_reserved_pages(page, nr_pages);
+#endif
+	} else {
 		free_pages((unsigned long)page_address(page), order);
+	}
 }
 
 static void __meminit free_hugepage_table(struct page *page,
diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index cffa38a73618..d8a8d245824a 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -6,11 +6,10 @@
 #include <linux/kmemleak.h>
 
 /*
- * Types for free bootmem stored in page->lru.next. These have to be in
- * some random range in unsigned long space for debugging purposes.
+ * Types for free bootmem stored in the low bits of page->private.
  */
-enum {
-	MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
+enum bootmem_type {
+	MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 1,
 	SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
 	MIX_SECTION_INFO,
 	NODE_INFO,
@@ -21,9 +20,19 @@ enum {
 void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
 
 void get_page_bootmem(unsigned long info, struct page *page,
-		      unsigned long type);
+		enum bootmem_type type);
 void put_page_bootmem(struct page *page);
 
+static inline enum bootmem_type bootmem_type(const struct page *page)
+{
+	return (unsigned long)page->private & 0xf;
+}
+
+static inline unsigned long bootmem_info(const struct page *page)
+{
+	return (unsigned long)page->private >> 4;
+}
+
 /*
  * Any memory allocated via the memblock allocator and not via the
  * buddy will be marked reserved already in the memmap. For those
@@ -31,7 +40,7 @@ void put_page_bootmem(struct page *page);
  */
 static inline void free_bootmem_page(struct page *page)
 {
-	unsigned long magic = page->index;
+	enum bootmem_type type = bootmem_type(page);
 
 	/*
 	 * The reserve_bootmem_region sets the reserved flag on bootmem
@@ -39,7 +48,7 @@ static inline void free_bootmem_page(struct page *page)
 	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) != 2, page);
 
-	if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
+	if (type == SECTION_INFO || type == MIX_SECTION_INFO)
 		put_page_bootmem(page);
 	else
 		VM_BUG_ON_PAGE(1, page);
@@ -53,8 +62,18 @@ static inline void put_page_bootmem(struct page *page)
 {
 }
 
+static inline enum bootmem_type bootmem_type(const struct page *page)
+{
+	return SECTION_INFO;
+}
+
+static inline unsigned long bootmem_info(const struct page *page)
+{
+	return 0;
+}
+
 static inline void get_page_bootmem(unsigned long info, struct page *page,
-				    unsigned long type)
+				    enum bootmem_type type)
 {
 }
 
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
index fa7cb0c87c03..95f288169a38 100644
--- a/mm/bootmem_info.c
+++ b/mm/bootmem_info.c
@@ -14,23 +14,24 @@
 #include <linux/memory_hotplug.h>
 #include <linux/kmemleak.h>
 
-void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
+void get_page_bootmem(unsigned long info, struct page *page,
+		enum bootmem_type type)
 {
-	page->index = type;
+	BUG_ON(type > 0xf);
+	BUG_ON(info > (ULONG_MAX >> 4));
 	SetPagePrivate(page);
-	set_page_private(page, info);
+	set_page_private(page, info << 4 | type);
 	page_ref_inc(page);
 }
 
 void put_page_bootmem(struct page *page)
 {
-	unsigned long type = page->index;
+	enum bootmem_type type = bootmem_type(page);
 
 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
 	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
 
 	if (page_ref_dec_return(page) == 1) {
-		page->index = 0;
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		INIT_LIST_HEAD(&page->lru);
diff --git a/mm/sparse.c b/mm/sparse.c
index 4cb9793f0b52..13b6624d3562 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -720,19 +720,19 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
 static void free_map_bootmem(struct page *memmap)
 {
 	unsigned long maps_section_nr, removing_section_nr, i;
-	unsigned long magic, nr_pages;
+	unsigned long type, nr_pages;
 	struct page *page = virt_to_page(memmap);
 
 	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
 		>> PAGE_SHIFT;
 
 	for (i = 0; i < nr_pages; i++, page++) {
-		magic = page->index;
+		type = bootmem_type(page);
 
-		BUG_ON(magic == NODE_INFO);
+		BUG_ON(type == NODE_INFO);
 
 		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
-		removing_section_nr = page_private(page);
+		removing_section_nr = bootmem_info(page);
 
 		/*
 		 * When this function is called, the removing section is
-- 
cgit v1.2.3


From 6a78699838a0ddeed3620ddf50c1521f1fe1e811 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Thu, 31 Oct 2024 21:37:19 +0800
Subject: block: always verify unfreeze lock on the owner task

commit f1be1788a32e ("block: model freeze & enter queue as lock for
supporting lockdep") tries to apply lockdep for verifying freeze &
unfreeze. However, the verification is only done the outmost freeze and
unfreeze. This way is actually not correct because q->mq_freeze_depth
still may drop to zero on other task instead of the freeze owner task.

Fix this issue by always verifying the last unfreeze lock on the owner
task context, and make sure both the outmost freeze & unfreeze are
verified in the current task.

Fixes: f1be1788a32e ("block: model freeze & enter queue as lock for supporting lockdep")
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20241031133723.303835-4-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  2 +-
 block/blk-mq.c         | 62 +++++++++++++++++++++++++++++++++++++++++++-------
 block/blk.h            |  3 ++-
 include/linux/blkdev.h |  4 ++++
 4 files changed, 61 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 09d10bb95fda..4f791a3114a1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -287,7 +287,7 @@ bool blk_queue_start_drain(struct request_queue *q)
 	 * entering queue, so we call blk_freeze_queue_start() to
 	 * prevent I/O from crossing blk_queue_enter().
 	 */
-	bool freeze = __blk_freeze_queue_start(q);
+	bool freeze = __blk_freeze_queue_start(q, current);
 	if (queue_is_mq(q))
 		blk_mq_wake_waiters(q);
 	/* Make blk_queue_enter() reexamine the DYING flag. */
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5f4496220432..5e240a4b6be0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -120,20 +120,66 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
 	inflight[1] = mi.inflight[1];
 }
 
-bool __blk_freeze_queue_start(struct request_queue *q)
+#ifdef CONFIG_LOCKDEP
+static bool blk_freeze_set_owner(struct request_queue *q,
+				 struct task_struct *owner)
 {
-	int freeze;
+	if (!owner)
+		return false;
+
+	if (!q->mq_freeze_depth) {
+		q->mq_freeze_owner = owner;
+		q->mq_freeze_owner_depth = 1;
+		return true;
+	}
+
+	if (owner == q->mq_freeze_owner)
+		q->mq_freeze_owner_depth += 1;
+	return false;
+}
+
+/* verify the last unfreeze in owner context */
+static bool blk_unfreeze_check_owner(struct request_queue *q)
+{
+	if (!q->mq_freeze_owner)
+		return false;
+	if (q->mq_freeze_owner != current)
+		return false;
+	if (--q->mq_freeze_owner_depth == 0) {
+		q->mq_freeze_owner = NULL;
+		return true;
+	}
+	return false;
+}
+
+#else
+
+static bool blk_freeze_set_owner(struct request_queue *q,
+				 struct task_struct *owner)
+{
+	return false;
+}
+
+static bool blk_unfreeze_check_owner(struct request_queue *q)
+{
+	return false;
+}
+#endif
+
+bool __blk_freeze_queue_start(struct request_queue *q,
+			      struct task_struct *owner)
+{
+	bool freeze;
 
 	mutex_lock(&q->mq_freeze_lock);
+	freeze = blk_freeze_set_owner(q, owner);
 	if (++q->mq_freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
 		mutex_unlock(&q->mq_freeze_lock);
 		if (queue_is_mq(q))
 			blk_mq_run_hw_queues(q, false);
-		freeze = true;
 	} else {
 		mutex_unlock(&q->mq_freeze_lock);
-		freeze = false;
 	}
 
 	return freeze;
@@ -141,7 +187,7 @@ bool __blk_freeze_queue_start(struct request_queue *q)
 
 void blk_freeze_queue_start(struct request_queue *q)
 {
-	if (__blk_freeze_queue_start(q))
+	if (__blk_freeze_queue_start(q, current))
 		blk_freeze_acquire_lock(q, false, false);
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -170,7 +216,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
 {
-	int unfreeze = false;
+	bool unfreeze;
 
 	mutex_lock(&q->mq_freeze_lock);
 	if (force_atomic)
@@ -180,8 +226,8 @@ bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
 	if (!q->mq_freeze_depth) {
 		percpu_ref_resurrect(&q->q_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
-		unfreeze = true;
 	}
+	unfreeze = blk_unfreeze_check_owner(q);
 	mutex_unlock(&q->mq_freeze_lock);
 
 	return unfreeze;
@@ -203,7 +249,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
  */
 void blk_freeze_queue_start_non_owner(struct request_queue *q)
 {
-	__blk_freeze_queue_start(q);
+	__blk_freeze_queue_start(q, NULL);
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
 
diff --git a/block/blk.h b/block/blk.h
index ac48b79cbf80..57fc035620d6 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -37,7 +37,8 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
 
 bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
 bool blk_queue_start_drain(struct request_queue *q);
-bool __blk_freeze_queue_start(struct request_queue *q);
+bool __blk_freeze_queue_start(struct request_queue *q,
+			      struct task_struct *owner);
 int __bio_queue_enter(struct request_queue *q, struct bio *bio);
 void submit_bio_noacct_nocheck(struct bio *bio);
 void bio_await_chain(struct bio *bio);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 93551772c1d6..1b51a7c92e9b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -575,6 +575,10 @@ struct request_queue {
 	struct throtl_data *td;
 #endif
 	struct rcu_head		rcu_head;
+#ifdef CONFIG_LOCKDEP
+	struct task_struct	*mq_freeze_owner;
+	int			mq_freeze_owner_depth;
+#endif
 	wait_queue_head_t	mq_freeze_wq;
 	/*
 	 * Protect concurrent access to q_usage_counter by
-- 
cgit v1.2.3


From 9c477088b60dd23f3120b2b01b14d85047a033da Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 6 Nov 2024 21:19:59 +0100
Subject: net: phy: make genphy_c45_write_eee_adv() static

genphy_c45_write_eee_adv() isn't used outside phy-c45.c,
so make it static.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/d23bd784-44e6-4a15-af3a-b37379156521@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy-c45.c | 3 ++-
 include/linux/phy.h       | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 8c4dd94c7495..66be5832ba51 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -680,7 +680,8 @@ EXPORT_SYMBOL_GPL(genphy_c45_read_mdix);
  * @phydev: target phy_device struct
  * @adv: the linkmode advertisement settings
  */
-int genphy_c45_write_eee_adv(struct phy_device *phydev, unsigned long *adv)
+static int genphy_c45_write_eee_adv(struct phy_device *phydev,
+				    unsigned long *adv)
 {
 	int val, changed = 0;
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index bf0eb4e5d35c..bb09a5a128e1 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1953,7 +1953,6 @@ int genphy_c45_ethtool_get_eee(struct phy_device *phydev,
 			       struct ethtool_keee *data);
 int genphy_c45_ethtool_set_eee(struct phy_device *phydev,
 			       struct ethtool_keee *data);
-int genphy_c45_write_eee_adv(struct phy_device *phydev, unsigned long *adv);
 int genphy_c45_an_config_eee_aneg(struct phy_device *phydev);
 int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv);
 
-- 
cgit v1.2.3


From db73835f54fc57bc267d43836905588f24ddac85 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 6 Nov 2024 21:25:12 +0100
Subject: net: phy: remove genphy_config_eee_advert

bcm_config_lre_aneg() doesn't use genphy_config_eee_advert() any longer.
As this was the only user, we can remove genphy_config_eee_advert() now.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/37da7f3e-b883-4c07-9881-b8c0516822b7@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy_device.c | 23 -----------------------
 include/linux/phy.h          |  1 -
 2 files changed, 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 563497a3274c..bc24c9f2786b 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -2239,29 +2239,6 @@ static int genphy_c37_config_advert(struct phy_device *phydev)
 				  adv);
 }
 
-/**
- * genphy_config_eee_advert - disable unwanted eee mode advertisement
- * @phydev: target phy_device struct
- *
- * Description: Writes MDIO_AN_EEE_ADV after disabling unsupported energy
- *   efficent ethernet modes. Returns 0 if the PHY's advertisement hasn't
- *   changed, and 1 if it has changed.
- */
-int genphy_config_eee_advert(struct phy_device *phydev)
-{
-	int err;
-
-	/* Nothing to disable */
-	if (!phydev->eee_broken_modes)
-		return 0;
-
-	err = phy_modify_mmd_changed(phydev, MDIO_MMD_AN, MDIO_AN_EEE_ADV,
-				     phydev->eee_broken_modes, 0);
-	/* If the call failed, we assume that EEE is not supported */
-	return err < 0 ? 0 : err;
-}
-EXPORT_SYMBOL(genphy_config_eee_advert);
-
 /**
  * genphy_setup_forced - configures/forces speed/duplex from @phydev
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index bb09a5a128e1..1e4127c495c0 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1885,7 +1885,6 @@ int genphy_read_abilities(struct phy_device *phydev);
 int genphy_setup_forced(struct phy_device *phydev);
 int genphy_restart_aneg(struct phy_device *phydev);
 int genphy_check_and_restart_aneg(struct phy_device *phydev, bool restart);
-int genphy_config_eee_advert(struct phy_device *phydev);
 int __genphy_config_aneg(struct phy_device *phydev, bool changed);
 int genphy_aneg_done(struct phy_device *phydev);
 int genphy_update_link(struct phy_device *phydev);
-- 
cgit v1.2.3


From b45a3777ceabbe08ab7a6e97f258191c07cbab8d Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Fri, 8 Nov 2024 10:13:52 +0800
Subject: iommu: Pass old domain to set_dev_pasid op

To support domain replacement for pasid, the underlying iommu driver needs
to know the old domain hence be able to clean up the existing attachment.
It would be much convenient for iommu layer to pass down the old domain.
Otherwise, iommu drivers would need to track domain for pasids by
themselves, this would duplicate code among the iommu drivers. Or iommu
drivers would rely group->pasid_array to get domain, which may not always
the correct one.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Link: https://lore.kernel.org/r/20241107122234.7424-2-yi.l.liu@intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/amd_iommu.h                   | 3 ++-
 drivers/iommu/amd/pasid.c                       | 3 ++-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 3 ++-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c     | 3 ++-
 drivers/iommu/intel/iommu.c                     | 6 ++++--
 drivers/iommu/intel/svm.c                       | 3 ++-
 drivers/iommu/iommu.c                           | 3 ++-
 include/linux/iommu.h                           | 2 +-
 8 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 6386fa4556d9..b11b014fa82d 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -52,7 +52,8 @@ struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev,
 						struct mm_struct *mm);
 void amd_iommu_domain_free(struct iommu_domain *dom);
 int iommu_sva_set_dev_pasid(struct iommu_domain *domain,
-			    struct device *dev, ioasid_t pasid);
+			    struct device *dev, ioasid_t pasid,
+			    struct iommu_domain *old);
 void amd_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
 				struct iommu_domain *domain);
 
diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c
index 0657b9373be5..d1dfc745f55e 100644
--- a/drivers/iommu/amd/pasid.c
+++ b/drivers/iommu/amd/pasid.c
@@ -100,7 +100,8 @@ static const struct mmu_notifier_ops sva_mn = {
 };
 
 int iommu_sva_set_dev_pasid(struct iommu_domain *domain,
-			    struct device *dev, ioasid_t pasid)
+			    struct device *dev, ioasid_t pasid,
+			    struct iommu_domain *old)
 {
 	struct pdom_dev_data *pdom_dev_data;
 	struct protection_domain *sva_pdom = to_pdomain(domain);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index a7c36654dee5..645da7b69bed 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -332,7 +332,8 @@ void arm_smmu_sva_notifier_synchronize(void)
 }
 
 static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain,
-				      struct device *dev, ioasid_t id)
+				      struct device *dev, ioasid_t id,
+				      struct iommu_domain *old)
 {
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 737c5b882355..1517fe2c356e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2856,7 +2856,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 }
 
 static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain,
-				      struct device *dev, ioasid_t id)
+				     struct device *dev, ioasid_t id,
+				     struct iommu_domain *old)
 {
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index e2b3aa601191..2d1d208d5125 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4070,7 +4070,8 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
 }
 
 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
-				     struct device *dev, ioasid_t pasid)
+				     struct device *dev, ioasid_t pasid,
+				     struct iommu_domain *old)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
@@ -4356,7 +4357,8 @@ static int identity_domain_attach_dev(struct iommu_domain *domain, struct device
 }
 
 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
-					 struct device *dev, ioasid_t pasid)
+					 struct device *dev, ioasid_t pasid,
+					 struct iommu_domain *old)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	struct intel_iommu *iommu = info->iommu;
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 3cc43a958b4d..4a2bd65614ad 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -111,7 +111,8 @@ static const struct mmu_notifier_ops intel_mmuops = {
 };
 
 static int intel_svm_set_dev_pasid(struct iommu_domain *domain,
-				   struct device *dev, ioasid_t pasid)
+				   struct device *dev, ioasid_t pasid,
+				   struct iommu_domain *old)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 83c8e617a2c5..f3f81c04b8fb 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -3331,7 +3331,8 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain,
 	int ret;
 
 	for_each_group_device(group, device) {
-		ret = domain->ops->set_dev_pasid(domain, device->dev, pasid);
+		ret = domain->ops->set_dev_pasid(domain, device->dev,
+						 pasid, NULL);
 		if (ret)
 			goto err_revert;
 	}
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bd722f473635..32dce80aa7fd 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -642,7 +642,7 @@ struct iommu_ops {
 struct iommu_domain_ops {
 	int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
 	int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev,
-			     ioasid_t pasid);
+			     ioasid_t pasid, struct iommu_domain *old);
 
 	int (*map_pages)(struct iommu_domain *domain, unsigned long iova,
 			 phys_addr_t paddr, size_t pgsize, size_t pgcount,
-- 
cgit v1.2.3


From 980e3016ebcc00021bf8190c64fd65ba23e90498 Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Fri, 8 Nov 2024 10:14:04 +0800
Subject: iommu: Make set_dev_pasid op support domain replacement

The iommu core is going to support domain replacement for pasid, it needs
to make the set_dev_pasid op support replacing domain and keep the old
domain config in the failure case.

AMD iommu driver does not support domain replacement for pasid yet, so it
would fail the set_dev_pasid op to keep the old config if the input @old
is non-NULL. Till now, all the set_dev_pasid callbacks can handle the old
parameter and can keep the old config when failed, so update the kdoc of
set_dev_pasid op.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Link: https://lore.kernel.org/r/20241107122234.7424-14-yi.l.liu@intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/pasid.c | 3 +++
 include/linux/iommu.h     | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c
index d1dfc745f55e..8c73a30c2800 100644
--- a/drivers/iommu/amd/pasid.c
+++ b/drivers/iommu/amd/pasid.c
@@ -109,6 +109,9 @@ int iommu_sva_set_dev_pasid(struct iommu_domain *domain,
 	unsigned long flags;
 	int ret = -EINVAL;
 
+	if (old)
+		return -EOPNOTSUPP;
+
 	/* PASID zero is used for requests from the I/O device without PASID */
 	if (!is_pasid_valid(dev_data, pasid))
 		return ret;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 32dce80aa7fd..27f923450a7c 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -616,7 +616,8 @@ struct iommu_ops {
  * * EBUSY	- device is attached to a domain and cannot be changed
  * * ENODEV	- device specific errors, not able to be attached
  * * <others>	- treated as ENODEV by the caller. Use is discouraged
- * @set_dev_pasid: set an iommu domain to a pasid of device
+ * @set_dev_pasid: set or replace an iommu domain to a pasid of device. The pasid of
+ *                 the device should be left in the old config in error case.
  * @map_pages: map a physically contiguous set of pages of the same size to
  *             an iommu domain.
  * @unmap_pages: unmap a number of pages of the same size from an iommu domain
-- 
cgit v1.2.3


From d1f4390dd28ba110f232615dc4610ac1bb2f39f2 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Fri, 8 Nov 2024 16:07:37 +0200
Subject: regmap: provide regmap_assign_bits()

Add another bits helper to regmap API: this one sets given bits if value
is true and clears them if it's false.

Suggested-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Link: https://patch.msgid.link/20241108-assign-bits-v1-1-382790562d99@ideasonboard.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regmap.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index da07584284ab..0d5b74f9bd32 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1335,6 +1335,15 @@ static inline int regmap_clear_bits(struct regmap *map,
 	return regmap_update_bits_base(map, reg, bits, 0, NULL, false, false);
 }
 
+static inline int regmap_assign_bits(struct regmap *map, unsigned int reg,
+				     unsigned int bits, bool value)
+{
+	if (value)
+		return regmap_set_bits(map, reg, bits);
+	else
+		return regmap_clear_bits(map, reg, bits);
+}
+
 int regmap_test_bits(struct regmap *map, unsigned int reg, unsigned int bits);
 
 /**
@@ -1803,6 +1812,13 @@ static inline int regmap_clear_bits(struct regmap *map,
 	return -EINVAL;
 }
 
+static inline int regmap_assign_bits(struct regmap *map, unsigned int reg,
+				     unsigned int bits, bool value)
+{
+	WARN_ONCE(1, "regmap API is disabled");
+	return -EINVAL;
+}
+
 static inline int regmap_test_bits(struct regmap *map,
 				   unsigned int reg, unsigned int bits)
 {
-- 
cgit v1.2.3


From 06cf321aadef17c7b1578369e314193c0e1c7d8e Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 7 Nov 2024 14:58:19 -0600
Subject: range: Add range_overlaps()

Code to support CXL Dynamic Capacity devices will have extent ranges
which need to be compared for intersection not a subset as is being
checked in range_contains().

range_overlaps() is defined in btrfs with a different meaning from what
is required in the standard range code.  Dan Williams pointed this out
in [1].  Adjust the btrfs call according to his suggestion there.

Then add a generic range_overlaps().

Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: David Sterba <dsterba@suse.com>
Cc: linux-btrfs@vger.kernel.org
Link: https://lore.kernel.org/all/65949f79ef908_8dc68294f2@dwillia2-xfh.jf.intel.com.notmuch/ [1]
Acked-by: David Sterba <dsterba@suse.com>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Fan Ni <fan.ni@samsung.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://patch.msgid.link/20241107-dcd-type2-upstream-v7-1-56a84e66bc36@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 fs/btrfs/ordered-data.c | 10 +++++-----
 include/linux/range.h   |  8 ++++++++
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2104d60c2161..744c3375ee6a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -111,8 +111,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 	return NULL;
 }
 
-static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
-			  u64 len)
+static int btrfs_range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
+				u64 len)
 {
 	if (file_offset + len <= entry->file_offset ||
 	    entry->file_offset + entry->num_bytes <= file_offset)
@@ -985,7 +985,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
 
 	while (1) {
 		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-		if (range_overlaps(entry, file_offset, len))
+		if (btrfs_range_overlaps(entry, file_offset, len))
 			break;
 
 		if (entry->file_offset >= file_offset + len) {
@@ -1114,12 +1114,12 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
 	}
 	if (prev) {
 		entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
-		if (range_overlaps(entry, file_offset, len))
+		if (btrfs_range_overlaps(entry, file_offset, len))
 			goto out;
 	}
 	if (next) {
 		entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
-		if (range_overlaps(entry, file_offset, len))
+		if (btrfs_range_overlaps(entry, file_offset, len))
 			goto out;
 	}
 	/* No ordered extent in the range */
diff --git a/include/linux/range.h b/include/linux/range.h
index 6ad0b73cb7ad..876cd5355158 100644
--- a/include/linux/range.h
+++ b/include/linux/range.h
@@ -13,11 +13,19 @@ static inline u64 range_len(const struct range *range)
 	return range->end - range->start + 1;
 }
 
+/* True if r1 completely contains r2 */
 static inline bool range_contains(struct range *r1, struct range *r2)
 {
 	return r1->start <= r2->start && r1->end >= r2->end;
 }
 
+/* True if any part of r1 overlaps r2 */
+static inline bool range_overlaps(const struct range *r1,
+				  const struct range *r2)
+{
+	return r1->start <= r2->end && r1->end >= r2->start;
+}
+
 int add_range(struct range *range, int az, int nr_range,
 		u64 start, u64 end);
 
-- 
cgit v1.2.3


From d1b3dad9de7962f7bea861f27e352bac17b68bed Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:04:17 -0800
Subject: iommufd: Move struct iommufd_object to public iommufd header

Prepare for an embedded structure design for driver-level iommufd_viommu
objects:
    // include/linux/iommufd.h
    struct iommufd_viommu {
        struct iommufd_object obj;
        ....
    };

    // Some IOMMU driver
    struct iommu_driver_viommu {
        struct iommufd_viommu core;
        ....
    };

It has to expose struct iommufd_object and enum iommufd_object_type from
the core-level private header to the public iommufd header.

Link: https://patch.msgid.link/r/54a43b0768089d690104530754f499ca05ce0074.1730836219.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h | 25 +------------------------
 include/linux/iommufd.h                 | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 8f3c21a664bd..94cfcab7e9de 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -5,8 +5,8 @@
 #define __IOMMUFD_PRIVATE_H
 
 #include <linux/iommu.h>
+#include <linux/iommufd.h>
 #include <linux/iova_bitmap.h>
-#include <linux/refcount.h>
 #include <linux/rwsem.h>
 #include <linux/uaccess.h>
 #include <linux/xarray.h>
@@ -126,29 +126,6 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
 	return 0;
 }
 
-enum iommufd_object_type {
-	IOMMUFD_OBJ_NONE,
-	IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
-	IOMMUFD_OBJ_DEVICE,
-	IOMMUFD_OBJ_HWPT_PAGING,
-	IOMMUFD_OBJ_HWPT_NESTED,
-	IOMMUFD_OBJ_IOAS,
-	IOMMUFD_OBJ_ACCESS,
-	IOMMUFD_OBJ_FAULT,
-#ifdef CONFIG_IOMMUFD_TEST
-	IOMMUFD_OBJ_SELFTEST,
-#endif
-	IOMMUFD_OBJ_MAX,
-};
-
-/* Base struct for all objects with a userspace ID handle. */
-struct iommufd_object {
-	refcount_t shortterm_users;
-	refcount_t users;
-	enum iommufd_object_type type;
-	unsigned int id;
-};
-
 static inline bool iommufd_lock_obj(struct iommufd_object *obj)
 {
 	if (!refcount_inc_not_zero(&obj->users))
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 30f832a60ccb..22948dd03d67 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -8,6 +8,7 @@
 
 #include <linux/err.h>
 #include <linux/errno.h>
+#include <linux/refcount.h>
 #include <linux/types.h>
 
 struct device;
@@ -18,6 +19,29 @@ struct iommufd_ctx;
 struct iommufd_device;
 struct page;
 
+enum iommufd_object_type {
+	IOMMUFD_OBJ_NONE,
+	IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
+	IOMMUFD_OBJ_DEVICE,
+	IOMMUFD_OBJ_HWPT_PAGING,
+	IOMMUFD_OBJ_HWPT_NESTED,
+	IOMMUFD_OBJ_IOAS,
+	IOMMUFD_OBJ_ACCESS,
+	IOMMUFD_OBJ_FAULT,
+#ifdef CONFIG_IOMMUFD_TEST
+	IOMMUFD_OBJ_SELFTEST,
+#endif
+	IOMMUFD_OBJ_MAX,
+};
+
+/* Base struct for all objects with a userspace ID handle. */
+struct iommufd_object {
+	refcount_t shortterm_users;
+	refcount_t users;
+	enum iommufd_object_type type;
+	unsigned int id;
+};
+
 struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 					   struct device *dev, u32 *id);
 void iommufd_device_unbind(struct iommufd_device *idev);
-- 
cgit v1.2.3


From e8e26a0b09f5783be471b5ffb1e31822b1272c1d Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Tue, 15 Oct 2024 22:27:31 +0200
Subject: nfs: Annotate struct pnfs_commit_array with __counted_by()

Add the __counted_by compiler attribute to the flexible array member
buckets to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and
CONFIG_FORTIFY_SOURCE.

Compile-tested only.

Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_xdr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 12d8e47bc5a3..559273a0f16d 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1336,7 +1336,7 @@ struct pnfs_commit_array {
 	struct rcu_head rcu;
 	refcount_t refcount;
 	unsigned int nbuckets;
-	struct pnfs_commit_bucket buckets[];
+	struct pnfs_commit_bucket buckets[] __counted_by(nbuckets);
 };
 
 struct pnfs_ds_commit_info {
-- 
cgit v1.2.3


From e32c260195e6ff72940ab7826e38e0a0066fc58f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 5 Nov 2024 11:49:04 -1000
Subject: sched_ext: Enable the ops breather and eject BPF scheduler on
 softlockup

On 2 x Intel Sapphire Rapids machines with 224 logical CPUs, a poorly
behaving BPF scheduler can live-lock the system by making multiple CPUs bang
on the same DSQ to the point where soft-lockup detection triggers before
SCX's own watchdog can take action. It also seems possible that the machine
can be live-locked enough to prevent scx_ops_helper, which is an RT task,
from running in a timely manner.

Implement scx_softlockup() which is called when three quarters of
soft-lockup threshold has passed. The function immediately enables the ops
breather and triggers an ops error to initiate ejection of the BPF
scheduler.

The previous and this patch combined enable the kernel to reliably recover
the system from live-lock conditions that can be triggered by a poorly
behaving BPF scheduler on Intel dual socket systems.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/ext.h         |  2 ++
 kernel/sched/ext.c                | 45 +++++++++++++++++++++++++++++++++++++++
 kernel/watchdog.c                 |  8 +++++++
 tools/sched_ext/scx_show_state.py |  2 ++
 4 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 1ddbde64a31b..65bc0a489cd2 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -205,11 +205,13 @@ struct sched_ext_entity {
 
 void sched_ext_free(struct task_struct *p);
 void print_scx_info(const char *log_lvl, struct task_struct *p);
+void scx_softlockup(u32 dur_s);
 
 #else	/* !CONFIG_SCHED_CLASS_EXT */
 
 static inline void sched_ext_free(struct task_struct *p) {}
 static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
+static inline void scx_softlockup(u32 dur_s) {}
 
 #endif	/* CONFIG_SCHED_CLASS_EXT */
 #endif	/* _LINUX_SCHED_EXT_H */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2d41f1917464..02f39314ef8a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -867,6 +867,7 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static unsigned long scx_in_softlockup;
 static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
 static int scx_ops_bypass_depth;
 static bool scx_ops_init_task_enabled;
@@ -4614,6 +4615,49 @@ bool task_should_scx(struct task_struct *p)
 	return p->policy == SCHED_EXT;
 }
 
+/**
+ * scx_softlockup - sched_ext softlockup handler
+ *
+ * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
+ * live-lock the system by making many CPUs target the same DSQ to the point
+ * where soft-lockup detection triggers. This function is called from
+ * soft-lockup watchdog when the triggering point is close and tries to unjam
+ * the system by enabling the breather and aborting the BPF scheduler.
+ */
+void scx_softlockup(u32 dur_s)
+{
+	switch (scx_ops_enable_state()) {
+	case SCX_OPS_ENABLING:
+	case SCX_OPS_ENABLED:
+		break;
+	default:
+		return;
+	}
+
+	/* allow only one instance, cleared at the end of scx_ops_bypass() */
+	if (test_and_set_bit(0, &scx_in_softlockup))
+		return;
+
+	printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
+			smp_processor_id(), dur_s, scx_ops.name);
+
+	/*
+	 * Some CPUs may be trapped in the dispatch paths. Enable breather
+	 * immediately; otherwise, we might even be able to get to
+	 * scx_ops_bypass().
+	 */
+	atomic_inc(&scx_ops_breather_depth);
+
+	scx_ops_error("soft lockup - CPU#%d stuck for %us",
+		      smp_processor_id(), dur_s);
+}
+
+static void scx_clear_softlockup(void)
+{
+	if (test_and_clear_bit(0, &scx_in_softlockup))
+		atomic_dec(&scx_ops_breather_depth);
+}
+
 /**
  * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
  *
@@ -4724,6 +4768,7 @@ static void scx_ops_bypass(bool bypass)
 	atomic_dec(&scx_ops_breather_depth);
 unlock:
 	raw_spin_unlock_irqrestore(&bypass_lock, flags);
+	scx_clear_softlockup();
 }
 
 static void free_exit_info(struct scx_exit_info *ei)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 262691ba62b7..5a93d4c446b8 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -644,6 +644,14 @@ static int is_softlockup(unsigned long touch_ts,
 		    need_counting_irqs())
 			start_counting_irqs();
 
+		/*
+		 * A poorly behaving BPF scheduler can live-lock the system into
+		 * soft lockups. Tell sched_ext to try ejecting the BPF
+		 * scheduler when close to a soft lockup.
+		 */
+		if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
+			scx_softlockup(now - touch_ts);
+
 		/* Warn about unreasonable delays. */
 		if (time_after(now, period_ts + get_softlockup_thresh()))
 			return now - touch_ts;
diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
index c4b3fdda9a0b..b800d4f5f2e9 100644
--- a/tools/sched_ext/scx_show_state.py
+++ b/tools/sched_ext/scx_show_state.py
@@ -35,6 +35,8 @@ print(f'enabled       : {read_static_key("__scx_ops_enabled")}')
 print(f'switching_all : {read_int("scx_switching_all")}')
 print(f'switched_all  : {read_static_key("__scx_switched_all")}')
 print(f'enable_state  : {ops_state_str(enable_state)} ({enable_state})')
+print(f'in_softlockup : {prog["scx_in_softlockup"].value_()}')
+print(f'breather_depth: {read_atomic("scx_ops_breather_depth")}')
 print(f'bypass_depth  : {prog["scx_ops_bypass_depth"].value_()}')
 print(f'nr_rejected   : {read_atomic("scx_nr_rejected")}')
 print(f'enable_seq    : {read_atomic("scx_enable_seq")}')
-- 
cgit v1.2.3


From 20fd1383cd616d61b2a79967da1221dc6cfb8430 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Thu, 7 Nov 2024 18:52:33 +0000
Subject: iio: Move __private marking before struct element priv in struct
 iio_dev

This is to avoid tripping up kernel-doc which filters it out before
but not after the name.

Note the formatting is less than ideal as a result so we may revisit
in future.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 59c58f455311..ae65890d4567 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -624,7 +624,7 @@ struct iio_dev {
 	const struct iio_info		*info;
 	const struct iio_buffer_setup_ops	*setup_ops;
 
-	void				*priv __private;
+	void				*__private priv;
 };
 
 int iio_device_id(struct iio_dev *indio_dev);
-- 
cgit v1.2.3


From fffb9fff12250018a6f4d3e411f9d289210da329 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 4 Nov 2024 11:34:24 +0200
Subject: gpio: Get rid of GPIOF_ACTIVE_LOW

No more users.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20241104093609.156059-7-andriy.shevchenko@linux.intel.com
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpiolib-legacy.c | 3 ---
 include/linux/gpio.h          | 3 ---
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-legacy.c b/drivers/gpio/gpiolib-legacy.c
index 28f1046fb670..aeae6df8bec9 100644
--- a/drivers/gpio/gpiolib-legacy.c
+++ b/drivers/gpio/gpiolib-legacy.c
@@ -46,9 +46,6 @@ int gpio_request_one(unsigned gpio, unsigned long flags, const char *label)
 	if (err)
 		return err;
 
-	if (flags & GPIOF_ACTIVE_LOW)
-		set_bit(FLAG_ACTIVE_LOW, &desc->flags);
-
 	if (flags & GPIOF_IN)
 		err = gpiod_direction_input(desc);
 	else
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 2d105be7bbc3..6270150f4e29 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -21,9 +21,6 @@ struct device;
 #define GPIOF_OUT_INIT_LOW	((0 << 0) | (0 << 1))
 #define GPIOF_OUT_INIT_HIGH	((0 << 0) | (1 << 1))
 
-/* Gpio pin is active-low */
-#define GPIOF_ACTIVE_LOW        (1 << 2)
-
 /**
  * struct gpio - a structure describing a GPIO with configuration
  * @gpio:	the GPIO number
-- 
cgit v1.2.3


From f7f52738637f4361c108cad36e23ee98959a9006 Mon Sep 17 00:00:00 2001
From: Gilad Naaman <gnaaman@drivenets.com>
Date: Thu, 7 Nov 2024 16:04:43 +0000
Subject: neighbour: Create netdev->neighbour association

Create a mapping between a netdev and its neighoburs,
allowing for much cheaper flushes.

Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20241107160444.2913124-7-gnaaman@drivenets.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../networking/net_cachelines/net_device.rst       |  1 +
 include/linux/netdevice.h                          |  7 ++
 include/net/neighbour.h                            |  9 +-
 include/net/neighbour_tables.h                     | 12 +++
 net/core/neighbour.c                               | 96 +++++++++++++---------
 5 files changed, 80 insertions(+), 45 deletions(-)
 create mode 100644 include/net/neighbour_tables.h

(limited to 'include/linux')

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index ade50d4e67cf..15e31ece675f 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -188,4 +188,5 @@ u64                                 max_pacing_offload_horizon
 struct_napi_config*                 napi_config
 unsigned_long                       gro_flush_timeout
 u32                                 napi_defer_hard_irqs
+struct hlist_head                   neighbours[2]
 =================================== =========================== =================== =================== ===================================================================================
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3c552b648b27..df4483598628 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -52,6 +52,7 @@
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
 #include <net/dropreason-core.h>
+#include <net/neighbour_tables.h>
 
 struct netpoll_info;
 struct device;
@@ -2032,6 +2033,9 @@ enum netdev_reg_state {
  *	@napi_defer_hard_irqs:	If not zero, provides a counter that would
  *				allow to avoid NIC hard IRQ, on busy queues.
  *
+ *	@neighbours:	List heads pointing to this device's neighbours'
+ *			dev_list, one per address-family.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -2440,6 +2444,9 @@ struct net_device {
 	 */
 	struct net_shaper_hierarchy *net_shaper_hierarchy;
 #endif
+
+	struct hlist_head neighbours[NEIGH_NR_TABLES];
+
 	u8			priv[] ____cacheline_aligned
 				       __counted_by(priv_len);
 } ____cacheline_aligned;
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 40aac1e24c68..9a832cab5b1d 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -29,6 +29,7 @@
 #include <linux/sysctl.h>
 #include <linux/workqueue.h>
 #include <net/rtnetlink.h>
+#include <net/neighbour_tables.h>
 
 /*
  * NUD stands for "neighbor unreachability detection"
@@ -136,6 +137,7 @@ struct neigh_statistics {
 
 struct neighbour {
 	struct hlist_node	hash;
+	struct hlist_node	dev_list;
 	struct neigh_table	*tbl;
 	struct neigh_parms	*parms;
 	unsigned long		confirmed;
@@ -236,13 +238,6 @@ struct neigh_table {
 	struct pneigh_entry	**phash_buckets;
 };
 
-enum {
-	NEIGH_ARP_TABLE = 0,
-	NEIGH_ND_TABLE = 1,
-	NEIGH_NR_TABLES,
-	NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */
-};
-
 static inline int neigh_parms_family(struct neigh_parms *p)
 {
 	return p->tbl->family;
diff --git a/include/net/neighbour_tables.h b/include/net/neighbour_tables.h
new file mode 100644
index 000000000000..bcffbe8f7601
--- /dev/null
+++ b/include/net/neighbour_tables.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NET_NEIGHBOUR_TABLES_H
+#define _NET_NEIGHBOUR_TABLES_H
+
+enum {
+	NEIGH_ARP_TABLE = 0,
+	NEIGH_ND_TABLE = 1,
+	NEIGH_NR_TABLES,
+	NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */
+};
+
+#endif
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 59f359c7b5e3..5e572f6eaf2c 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -60,6 +60,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
 static const struct seq_operations neigh_stat_seq_ops;
 #endif
 
+static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family)
+{
+	int i;
+
+	switch (family) {
+	default:
+		DEBUG_NET_WARN_ON_ONCE(1);
+		fallthrough; /* to avoid panic by null-ptr-deref */
+	case AF_INET:
+		i = NEIGH_ARP_TABLE;
+		break;
+	case AF_INET6:
+		i = NEIGH_ND_TABLE;
+		break;
+	}
+
+	return &dev->neighbours[i];
+}
+
 /*
    Neighbour hash table buckets are protected with rwlock tbl->lock.
 
@@ -211,6 +230,7 @@ bool neigh_remove_one(struct neighbour *n)
 	write_lock(&n->lock);
 	if (refcount_read(&n->refcnt) == 1) {
 		hlist_del_rcu(&n->hash);
+		hlist_del_rcu(&n->dev_list);
 		neigh_mark_dead(n);
 		retval = true;
 	}
@@ -351,48 +371,42 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
 static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
 			    bool skip_perm)
 {
-	int i;
-	struct neigh_hash_table *nht;
-
-	nht = rcu_dereference_protected(tbl->nht,
-					lockdep_is_held(&tbl->lock));
+	struct hlist_head *dev_head;
+	struct hlist_node *tmp;
+	struct neighbour *n;
 
-	for (i = 0; i < (1 << nht->hash_shift); i++) {
-		struct hlist_node *tmp;
-		struct neighbour *n;
+	dev_head = neigh_get_dev_table(dev, tbl->family);
 
-		neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
-			if (dev && n->dev != dev)
-				continue;
-			if (skip_perm && n->nud_state & NUD_PERMANENT)
-				continue;
+	hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) {
+		if (skip_perm && n->nud_state & NUD_PERMANENT)
+			continue;
 
-			hlist_del_rcu(&n->hash);
-			write_lock(&n->lock);
-			neigh_del_timer(n);
-			neigh_mark_dead(n);
-			if (refcount_read(&n->refcnt) != 1) {
-				/* The most unpleasant situation.
-				   We must destroy neighbour entry,
-				   but someone still uses it.
-
-				   The destroy will be delayed until
-				   the last user releases us, but
-				   we must kill timers etc. and move
-				   it to safe state.
-				 */
-				__skb_queue_purge(&n->arp_queue);
-				n->arp_queue_len_bytes = 0;
-				WRITE_ONCE(n->output, neigh_blackhole);
-				if (n->nud_state & NUD_VALID)
-					n->nud_state = NUD_NOARP;
-				else
-					n->nud_state = NUD_NONE;
-				neigh_dbg(2, "neigh %p is stray\n", n);
-			}
-			write_unlock(&n->lock);
-			neigh_cleanup_and_release(n);
+		hlist_del_rcu(&n->hash);
+		hlist_del_rcu(&n->dev_list);
+		write_lock(&n->lock);
+		neigh_del_timer(n);
+		neigh_mark_dead(n);
+		if (refcount_read(&n->refcnt) != 1) {
+			/* The most unpleasant situation.
+			 * We must destroy neighbour entry,
+			 * but someone still uses it.
+			 *
+			 * The destroy will be delayed until
+			 * the last user releases us, but
+			 * we must kill timers etc. and move
+			 * it to safe state.
+			 */
+			__skb_queue_purge(&n->arp_queue);
+			n->arp_queue_len_bytes = 0;
+			WRITE_ONCE(n->output, neigh_blackhole);
+			if (n->nud_state & NUD_VALID)
+				n->nud_state = NUD_NOARP;
+			else
+				n->nud_state = NUD_NONE;
+			neigh_dbg(2, "neigh %p is stray\n", n);
 		}
+		write_unlock(&n->lock);
+		neigh_cleanup_and_release(n);
 	}
 }
 
@@ -655,6 +669,10 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
 	if (want_ref)
 		neigh_hold(n);
 	hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]);
+
+	hlist_add_head_rcu(&n->dev_list,
+			   neigh_get_dev_table(dev, tbl->family));
+
 	write_unlock_bh(&tbl->lock);
 	neigh_dbg(2, "neigh %p is created\n", n);
 	rc = n;
@@ -935,6 +953,7 @@ static void neigh_periodic_work(struct work_struct *work)
 			     !time_in_range_open(jiffies, n->used,
 						 n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
 				hlist_del_rcu(&n->hash);
+				hlist_del_rcu(&n->dev_list);
 				neigh_mark_dead(n);
 				write_unlock(&n->lock);
 				neigh_cleanup_and_release(n);
@@ -3054,6 +3073,7 @@ void __neigh_for_each_release(struct neigh_table *tbl,
 			release = cb(n);
 			if (release) {
 				hlist_del_rcu(&n->hash);
+				hlist_del_rcu(&n->dev_list);
 				neigh_mark_dead(n);
 			}
 			write_unlock(&n->lock);
-- 
cgit v1.2.3


From 79bc0af904db647979c735563299c9b0d820e432 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 10 Oct 2024 21:35:42 +0200
Subject: hwmon: Add static visibility member to struct hwmon_ops

Several drivers return the same static value in their is_visible
callback, what results in code duplication. Therefore add an option
for drivers to specify a static visibility directly.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Message-ID: <89690b81-2c73-47ae-9ae9-45c77b45ca0c@gmail.com>
groeck: Renamed hwmon_ops_is_visible -> hwmon_is_visible
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/hwmon.c | 19 +++++++++++++++----
 include/linux/hwmon.h |  5 ++++-
 2 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index 9c35c4d0369d..49b366254529 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -145,6 +145,17 @@ static const struct class hwmon_class = {
 
 static DEFINE_IDA(hwmon_ida);
 
+static umode_t hwmon_is_visible(const struct hwmon_ops *ops,
+				const void *drvdata,
+				enum hwmon_sensor_types type,
+				u32 attr, int channel)
+{
+	if (ops->visible)
+		return ops->visible;
+
+	return ops->is_visible(drvdata, type, attr, channel);
+}
+
 /* Thermal zone handling */
 
 /*
@@ -267,8 +278,8 @@ static int hwmon_thermal_register_sensors(struct device *dev)
 			int err;
 
 			if (!(info[i]->config[j] & HWMON_T_INPUT) ||
-			    !chip->ops->is_visible(drvdata, hwmon_temp,
-						   hwmon_temp_input, j))
+			    !hwmon_is_visible(chip->ops, drvdata, hwmon_temp,
+					      hwmon_temp_input, j))
 				continue;
 
 			err = hwmon_thermal_add_sensor(dev, j);
@@ -506,7 +517,7 @@ static struct attribute *hwmon_genattr(const void *drvdata,
 	const char *name;
 	bool is_string = is_string_attr(type, attr);
 
-	mode = ops->is_visible(drvdata, type, attr, index);
+	mode = hwmon_is_visible(ops, drvdata, type, attr, index);
 	if (!mode)
 		return ERR_PTR(-ENOENT);
 
@@ -1033,7 +1044,7 @@ hwmon_device_register_with_info(struct device *dev, const char *name,
 	if (!dev || !name || !chip)
 		return ERR_PTR(-EINVAL);
 
-	if (!chip->ops || !chip->ops->is_visible || !chip->info)
+	if (!chip->ops || !(chip->ops->visible || chip->ops->is_visible) || !chip->info)
 		return ERR_PTR(-EINVAL);
 
 	return __hwmon_device_register(dev, name, drvdata, chip, extra_groups);
diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index 5c6a421ad580..3a63dff62d03 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -368,7 +368,9 @@ enum hwmon_intrusion_attributes {
 
 /**
  * struct hwmon_ops - hwmon device operations
- * @is_visible: Callback to return attribute visibility. Mandatory.
+ * @visible:	Static visibility. If non-zero, 'is_visible' is ignored.
+ * @is_visible: Callback to return attribute visibility. Mandatory unless
+ *		'visible' is non-zero.
  *		Parameters are:
  *		@const void *drvdata:
  *			Pointer to driver-private data structure passed
@@ -412,6 +414,7 @@ enum hwmon_intrusion_attributes {
  *		The function returns 0 on success or a negative error number.
  */
 struct hwmon_ops {
+	umode_t visible;
 	umode_t (*is_visible)(const void *drvdata, enum hwmon_sensor_types type,
 			      u32 attr, int channel);
 	int (*read)(struct device *dev, enum hwmon_sensor_types type,
-- 
cgit v1.2.3


From 7506ebcd662b868780774d191a7c024c18c557a8 Mon Sep 17 00:00:00 2001
From: Naresh Solanki <naresh.solanki@9elements.com>
Date: Mon, 7 Oct 2024 14:34:24 +0530
Subject: hwmon: (max6639) : Configure based on DT property

Remove platform data & initialize with defaults
configuration & overwrite based on DT properties.

Signed-off-by: Naresh Solanki <naresh.solanki@9elements.com>
Message-ID: <20241007090426.811736-1-naresh.solanki@9elements.com>
[groeck: Dropped some unnecessary empty lines]
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/max6639.c               | 83 +++++++++++++++++++++++++----------
 include/linux/platform_data/max6639.h | 15 -------
 2 files changed, 60 insertions(+), 38 deletions(-)
 delete mode 100644 include/linux/platform_data/max6639.h

(limited to 'include/linux')

diff --git a/drivers/hwmon/max6639.c b/drivers/hwmon/max6639.c
index c955b0f3a8d3..32b4d54b2076 100644
--- a/drivers/hwmon/max6639.c
+++ b/drivers/hwmon/max6639.c
@@ -19,7 +19,6 @@
 #include <linux/hwmon-sysfs.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
-#include <linux/platform_data/max6639.h>
 #include <linux/regmap.h>
 #include <linux/util_macros.h>
 
@@ -531,14 +530,49 @@ static int rpm_range_to_reg(int range)
 	return 1; /* default: 4000 RPM */
 }
 
+static int max6639_probe_child_from_dt(struct i2c_client *client,
+				       struct device_node *child,
+				       struct max6639_data *data)
+
+{
+	struct device *dev = &client->dev;
+	u32 i;
+	int err, val;
+
+	err = of_property_read_u32(child, "reg", &i);
+	if (err) {
+		dev_err(dev, "missing reg property of %pOFn\n", child);
+		return err;
+	}
+
+	if (i > 1) {
+		dev_err(dev, "Invalid fan index reg %d\n", i);
+		return -EINVAL;
+	}
+
+	err = of_property_read_u32(child, "pulses-per-revolution", &val);
+	if (!err) {
+		if (val < 1 || val > 5) {
+			dev_err(dev, "invalid pulses-per-revolution %d of %pOFn\n", val, child);
+			return -EINVAL;
+		}
+		data->ppr[i] = val;
+	}
+
+	err = of_property_read_u32(child, "max-rpm", &val);
+	if (!err)
+		data->rpm_range[i] = rpm_range_to_reg(val);
+
+	return 0;
+}
+
 static int max6639_init_client(struct i2c_client *client,
 			       struct max6639_data *data)
 {
-	struct max6639_platform_data *max6639_info =
-		dev_get_platdata(&client->dev);
-	int i;
-	int rpm_range = 1; /* default: 4000 RPM */
-	int err, ppr;
+	struct device *dev = &client->dev;
+	const struct device_node *np = dev->of_node;
+	struct device_node *child;
+	int i, err;
 
 	/* Reset chip to default values, see below for GCONFIG setup */
 	err = regmap_write(data->regmap, MAX6639_REG_GCONFIG, MAX6639_GCONFIG_POR);
@@ -546,21 +580,29 @@ static int max6639_init_client(struct i2c_client *client,
 		return err;
 
 	/* Fans pulse per revolution is 2 by default */
-	if (max6639_info && max6639_info->ppr > 0 &&
-			max6639_info->ppr < 5)
-		ppr = max6639_info->ppr;
-	else
-		ppr = 2;
+	data->ppr[0] = 2;
+	data->ppr[1] = 2;
+
+	/* default: 4000 RPM */
+	data->rpm_range[0] = 1;
+	data->rpm_range[1] = 1;
 
-	data->ppr[0] = ppr;
-	data->ppr[1] = ppr;
+	for_each_child_of_node(np, child) {
+		if (strcmp(child->name, "fan"))
+			continue;
 
-	if (max6639_info)
-		rpm_range = rpm_range_to_reg(max6639_info->rpm_range);
-	data->rpm_range[0] = rpm_range;
-	data->rpm_range[1] = rpm_range;
+		err = max6639_probe_child_from_dt(client, child, data);
+		if (err) {
+			of_node_put(child);
+			return err;
+		}
+	}
 
 	for (i = 0; i < MAX6639_NUM_CHANNELS; i++) {
+		err = regmap_set_bits(data->regmap, MAX6639_REG_OUTPUT_MASK, BIT(1 - i));
+		if (err)
+			return err;
+
 		/* Set Fan pulse per revolution */
 		err = max6639_set_ppr(data, i, data->ppr[i]);
 		if (err)
@@ -573,12 +615,7 @@ static int max6639_init_client(struct i2c_client *client,
 			return err;
 
 		/* Fans PWM polarity high by default */
-		if (max6639_info) {
-			if (max6639_info->pwm_polarity == 0)
-				err = regmap_write(data->regmap, MAX6639_REG_FAN_CONFIG2a(i), 0x00);
-			else
-				err = regmap_write(data->regmap, MAX6639_REG_FAN_CONFIG2a(i), 0x02);
-		}
+		err = regmap_write(data->regmap, MAX6639_REG_FAN_CONFIG2a(i), 0x00);
 		if (err)
 			return err;
 
diff --git a/include/linux/platform_data/max6639.h b/include/linux/platform_data/max6639.h
deleted file mode 100644
index 65bfdb4fdc15..000000000000
--- a/include/linux/platform_data/max6639.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_MAX6639_H
-#define _LINUX_MAX6639_H
-
-#include <linux/types.h>
-
-/* platform data for the MAX6639 temperature sensor and fan control */
-
-struct max6639_platform_data {
-	bool pwm_polarity;	/* Polarity low (0) or high (1, default) */
-	int ppr;		/* Pulses per rotation 1..4 (default == 2) */
-	int rpm_range;		/* 2000, 4000 (default), 8000 or 16000 */
-};
-
-#endif /* _LINUX_MAX6639_H */
-- 
cgit v1.2.3


From e7ac4daeed91a25382091e73818ea0cddb1afd5e Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Thu, 7 Nov 2024 14:12:46 +1300
Subject: mm: count zeromap read and set for swapout and swapin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the proportion of folios from the zeromap is small, missing their
accounting may not significantly impact profiling.  However, it's easy to
construct a scenario where this becomes an issue—for example, allocating
1 GB of memory, writing zeros from userspace, followed by MADV_PAGEOUT,
and then swapping it back in.  In this case, the swap-out and swap-in
counts seem to vanish into a black hole, potentially causing semantic
ambiguity.

On the other hand, Usama reported that zero-filled pages can exceed 10% in
workloads utilizing zswap, while Hailong noted that some app in Android
have more than 6% zero-filled pages.  Before commit 0ca0c24e3211 ("mm:
store zero pages to be swapped out in a bitmap"), both zswap and zRAM
implemented similar optimizations, leading to these optimized-out pages
being counted in either zswap or zRAM counters (with pswpin/pswpout also
increasing for zRAM).  With zeromap functioning prior to both zswap and
zRAM, userspace will no longer detect these swap-out and swap-in actions.

We have three ways to address this:

1. Introduce a dedicated counter specifically for the zeromap.

2. Use pswpin/pswpout accounting, treating the zero map as a standard
   backend.  This approach aligns with zRAM's current handling of
   same-page fills at the device level.  However, it would mean losing the
   optimized-out page counters previously available in zRAM and would not
   align with systems using zswap.  Additionally, as noted by Nhat Pham,
   pswpin/pswpout counters apply only to I/O done directly to the backend
   device.

3. Count zeromap pages under zswap, aligning with system behavior when
   zswap is enabled.  However, this would not be consistent with zRAM, nor
   would it align with systems lacking both zswap and zRAM.

Given the complications with options 2 and 3, this patch selects
option 1.

We can find these counters from /proc/vmstat (counters for the whole
system) and memcg's memory.stat (counters for the interested memcg).

For example:

$ grep -E 'swpin_zero|swpout_zero' /proc/vmstat
swpin_zero 1648
swpout_zero 33536

$ grep -E 'swpin_zero|swpout_zero' /sys/fs/cgroup/system.slice/memory.stat
swpin_zero 3905
swpout_zero 3985

This patch does not address any specific zeromap bug, but the missing
swpout and swpin counts for zero-filled pages can be highly confusing and
may mislead user-space agents that rely on changes in these counters as
indicators.  Therefore, we add a Fixes tag to encourage the inclusion of
this counter in any kernel versions with zeromap.

Many thanks to Kanchana for the contribution of changing
count_objcg_event() to count_objcg_events() to support large folios[1],
which has now been incorporated into this patch.

[1] https://lkml.kernel.org/r/20241001053222.6944-5-kanchana.p.sridhar@intel.com

Link: https://lkml.kernel.org/r/20241107011246.59137-1-21cnbao@gmail.com
Fixes: 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap")
Co-developed-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Hailong Liu <hailong.liu@oppo.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst |  9 +++++++++
 include/linux/memcontrol.h              | 12 +++++++-----
 include/linux/vm_event_item.h           |  2 ++
 mm/memcontrol.c                         |  4 ++++
 mm/page_io.c                            | 16 ++++++++++++++++
 mm/vmstat.c                             |  2 ++
 mm/zswap.c                              |  6 +++---
 7 files changed, 43 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 69af2173555f..6d02168d78be 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1599,6 +1599,15 @@ The following nested keys are defined.
 	  pglazyfreed (npn)
 		Amount of reclaimed lazyfree pages
 
+	  swpin_zero
+		Number of pages swapped into memory and filled with zero, where I/O
+		was optimized out because the page content was detected to be zero
+		during swapout.
+
+	  swpout_zero
+		Number of zero-filled pages swapped out with I/O skipped due to the
+		content being detected as zero.
+
 	  zswpin
 		Number of pages moved in to memory from zswap.
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 34d2da05f2f1..e1b41554a5fb 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1760,8 +1760,9 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
 
 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
 
-static inline void count_objcg_event(struct obj_cgroup *objcg,
-				     enum vm_event_item idx)
+static inline void count_objcg_events(struct obj_cgroup *objcg,
+				      enum vm_event_item idx,
+				      unsigned long count)
 {
 	struct mem_cgroup *memcg;
 
@@ -1770,7 +1771,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
 
 	rcu_read_lock();
 	memcg = obj_cgroup_memcg(objcg);
-	count_memcg_events(memcg, idx, 1);
+	count_memcg_events(memcg, idx, count);
 	rcu_read_unlock();
 }
 
@@ -1825,8 +1826,9 @@ static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
 	return NULL;
 }
 
-static inline void count_objcg_event(struct obj_cgroup *objcg,
-				     enum vm_event_item idx)
+static inline void count_objcg_events(struct obj_cgroup *objcg,
+				      enum vm_event_item idx,
+				      unsigned long count)
 {
 }
 
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index aed952d04132..f70d0958095c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -134,6 +134,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_SWAP
 		SWAP_RA,
 		SWAP_RA_HIT,
+		SWPIN_ZERO,
+		SWPOUT_ZERO,
 #ifdef CONFIG_KSM
 		KSM_SWPIN_COPY,
 #endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 06df2af97415..53db98d2c4a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -431,6 +431,10 @@ static const unsigned int memcg_vm_event_stat[] = {
 	PGDEACTIVATE,
 	PGLAZYFREE,
 	PGLAZYFREED,
+#ifdef CONFIG_SWAP
+	SWPIN_ZERO,
+	SWPOUT_ZERO,
+#endif
 #ifdef CONFIG_ZSWAP
 	ZSWPIN,
 	ZSWPOUT,
diff --git a/mm/page_io.c b/mm/page_io.c
index 69536a2b3c13..01749b99fb54 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -204,7 +204,9 @@ static bool is_folio_zero_filled(struct folio *folio)
 
 static void swap_zeromap_folio_set(struct folio *folio)
 {
+	struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio);
 	struct swap_info_struct *sis = swp_swap_info(folio->swap);
+	int nr_pages = folio_nr_pages(folio);
 	swp_entry_t entry;
 	unsigned int i;
 
@@ -212,6 +214,12 @@ static void swap_zeromap_folio_set(struct folio *folio)
 		entry = page_swap_entry(folio_page(folio, i));
 		set_bit(swp_offset(entry), sis->zeromap);
 	}
+
+	count_vm_events(SWPOUT_ZERO, nr_pages);
+	if (objcg) {
+		count_objcg_events(objcg, SWPOUT_ZERO, nr_pages);
+		obj_cgroup_put(objcg);
+	}
 }
 
 static void swap_zeromap_folio_clear(struct folio *folio)
@@ -503,6 +511,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
 static bool swap_read_folio_zeromap(struct folio *folio)
 {
 	int nr_pages = folio_nr_pages(folio);
+	struct obj_cgroup *objcg;
 	bool is_zeromap;
 
 	/*
@@ -517,6 +526,13 @@ static bool swap_read_folio_zeromap(struct folio *folio)
 	if (!is_zeromap)
 		return false;
 
+	objcg = get_obj_cgroup_from_folio(folio);
+	count_vm_events(SWPIN_ZERO, nr_pages);
+	if (objcg) {
+		count_objcg_events(objcg, SWPIN_ZERO, nr_pages);
+		obj_cgroup_put(objcg);
+	}
+
 	folio_zero_range(folio, 0, folio_size(folio));
 	folio_mark_uptodate(folio);
 	return true;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b5a4cea423e1..ac6a5aa34eab 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1415,6 +1415,8 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_SWAP
 	"swap_ra",
 	"swap_ra_hit",
+	"swpin_zero",
+	"swpout_zero",
 #ifdef CONFIG_KSM
 	"ksm_swpin_copy",
 #endif
diff --git a/mm/zswap.c b/mm/zswap.c
index 162013952074..0030ce8fecfc 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1053,7 +1053,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 
 	count_vm_event(ZSWPWB);
 	if (entry->objcg)
-		count_objcg_event(entry->objcg, ZSWPWB);
+		count_objcg_events(entry->objcg, ZSWPWB, 1);
 
 	zswap_entry_free(entry);
 
@@ -1483,7 +1483,7 @@ bool zswap_store(struct folio *folio)
 
 	if (objcg) {
 		obj_cgroup_charge_zswap(objcg, entry->length);
-		count_objcg_event(objcg, ZSWPOUT);
+		count_objcg_events(objcg, ZSWPOUT, 1);
 	}
 
 	/*
@@ -1577,7 +1577,7 @@ bool zswap_load(struct folio *folio)
 
 	count_vm_event(ZSWPIN);
 	if (entry->objcg)
-		count_objcg_event(entry->objcg, ZSWPIN);
+		count_objcg_events(entry->objcg, ZSWPIN, 1);
 
 	if (swapcache) {
 		zswap_entry_free(entry);
-- 
cgit v1.2.3


From 69bad21551c9caea8c58800f96da48a704fd311e Mon Sep 17 00:00:00 2001
From: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Date: Mon, 30 Sep 2024 22:32:16 -0700
Subject: mm: define obj_cgroup_get() if CONFIG_MEMCG is not defined

Patch series "mm: zswap swap-out of large folios", v10.

This patch series enables zswap_store() to accept and store large folios.
The most significant contribution in this series is from the earlier RFC
submitted by Ryan Roberts [1].  Ryan's original RFC has been migrated to
mm-unstable as of 9-30-2024 in patch 6 of this series, and adapted based
on code review comments received for the current patch-series.

[1]: [RFC PATCH v1] mm: zswap: Store large folios without splitting
     https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u

The first few patches do the prep work for supporting large folios in
zswap_store.  Patch 6 provides the main functionality to swap-out large
folios in zswap.  Patch 7 adds sysfs per-order hugepages "zswpout"
counters that get incremented upon successful zswap_store of large folios,
and also updates the documentation for this:

/sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpout

This patch series is a prerequisite for zswap compress batching of large
folio swap-out and decompress batching of swap-ins based on
swapin_readahead(), using Intel IAA hardware acceleration, which we would
like to submit in subsequent patch-series, with performance improvement
data.

Thanks to Ying Huang for pre-posting review feedback and suggestions!

Thanks also to Nhat, Yosry, Johannes, Barry, Chengming, Usama, Ying and
Matthew for their helpful feedback, code/data reviews and suggestions!

Co-development signoff request:
===============================
I would like to thank Ryan Roberts for his original RFC [1] and request
his co-developer signoff on patch 6 in this series. Thanks Ryan!


System setup for testing:
=========================
Testing of this patch series was done with mm-unstable as of 9-27-2024,
commit de2fbaa6d9c3576ec7133ed02a370ec9376bf000 (without this patch-series)
and mm-unstable 9-30-2024 commit c121617e3606be6575cdacfdb63cc8d67b46a568
(with this patch-series). Data was gathered on an Intel Sapphire Rapids
server, dual-socket 56 cores per socket, 4 IAA devices per socket, 503 GiB
RAM and 525G SSD disk partition swap. Core frequency was fixed at 2500MHz.

The vm-scalability "usemem" test was run in a cgroup whose memory.high
was fixed at 150G. The is no swap limit set for the cgroup. 30 usemem
processes were run, each allocating and writing 10G of memory, and sleeping
for 10 sec before exiting:

usemem --init-time -w -O -s 10 -n 30 10g

Other kernel configuration parameters:

    zswap compressors : zstd, deflate-iaa
    zswap allocator   : zsmalloc
    vm.page-cluster   : 2

In the experiments where "deflate-iaa" is used as the zswap compressor,
IAA "compression verification" is enabled by default
(cat /sys/bus/dsa/drivers/crypto/verify_compress). Hence each IAA
compression will be decompressed internally by the "iaa_crypto" driver, the
crc-s returned by the hardware will be compared and errors reported in case
of mismatches. Thus "deflate-iaa" helps ensure better data integrity as
compared to the software compressors, and the experimental data listed
below is with verify_compress set to "1".


Metrics reporting methodology:
==============================
Total and average throughput are derived from the individual 30 processes'
throughputs reported by usemem. elapsed/sys times are measured with perf.

All percentage changes are "new" vs. "old"; hence a positive value
denotes an increase in the metric, whether it is throughput or latency,
and a negative value denotes a reduction in the metric. Positive throughput
change percentages and negative latency change percentages denote improvements.

The vm stats and sysfs hugepages stats included with the performance data
provide details on the swapout activity to zswap/swap device.


Testing labels used in data summaries:
======================================
The data refers to these test configurations and the before/after
comparisons that they do:

 before-case1:
 -------------
 mm-unstable 9-27-2024, CONFIG_THP_SWAP=N (compares zswap 4K vs. zswap 64K)

 In this scenario, CONFIG_THP_SWAP=N results in 64K/2M folios to be split
 into 4K folios that get processed by zswap.

 before-case2:
 -------------
 mm-unstable 9-27-2024, CONFIG_THP_SWAP=Y (compares SSD swap large folios vs. zswap large folios)

 In this scenario, CONFIG_THP_SWAP=Y results in zswap rejecting large
 folios, which will then be stored by the SSD swap device.

 after:
 ------
 v10 of this patch-series, CONFIG_THP_SWAP=Y

 The "after" is CONFIG_THP_SWAP=Y and v10 of this patch-series, that results
 in 64K/2M folios to not be split, and to be processed by zswap_store.


Regression Testing:
===================
I ran vm-scalability usemem without large folios, i.e., only 4K folios with
mm-unstable and this patch-series. The main goal was to make sure that
there is no functional or performance regression wrt the earlier zswap
behavior for 4K folios, now that 4K folios will be processed by the new
zswap_store() code.

The data indicates there is no significant regression.

 -------------------------------------------------------------------------------
 4K folios:
 ==========

 zswap compressor                zstd          zstd        zstd       zstd v10
                         before-case1  before-case2       after      vs.     vs.
                                                                   case1   case2
 -------------------------------------------------------------------------------
 Total throughput (KB/s)    4,793,363     4,880,978   4,853,074       1%     -1%
 Average throughput (KB/s)    159,778       162,699     161,769       1%     -1%
 elapsed time (sec)            130.14        123.17      126.29      -3%      3%
 sys time (sec)              3,135.53      2,985.64    3,083.18      -2%      3%
 memcg_high                   446,826       444,626     452,930
 memcg_swap_fail                    0             0           0
 zswpout                   48,932,107    48,931,971  48,931,820
 zswpin                           383           386         397
 pswpout                            0             0           0
 pswpin                             0             0           0
 thp_swpout                         0             0           0
 thp_swpout_fallback                0             0           0
 64kB-mthp_swpout_fallback          0             0           0
 pgmajfault                     3,063         3,077       3,479
 swap_ra                           93            94          96
 swap_ra_hit                       47            47          50
 ZSWPOUT-64kB                     n/a           n/a           0
 SWPOUT-64kB                        0             0           0
 -------------------------------------------------------------------------------


Performance Testing:
====================

We list the data for 64K folios with before/after data per-compressor,
followed by the same for 2M pmd-mappable folios.


 -------------------------------------------------------------------------------
 64K folios: zstd:
 =================

 zswap compressor                zstd          zstd         zstd      zstd v10
                         before-case1  before-case2        after     vs.    vs.
                                                                    case1  case2
 -------------------------------------------------------------------------------
 Total throughput (KB/s)    5,222,213     1,076,611    6,159,776      18%   472%
 Average throughput (KB/s)    174,073        35,887      205,325      18%   472%
 elapsed time (sec)            120.50        347.16       108.33     -10%   -69%
 sys time (sec)              2,930.33        248.16     2,549.65     -13%   927%
 memcg_high                   416,773       552,200      465,874
 memcg_swap_fail            3,192,906         1,293        1,012
 zswpout                   48,931,583        20,903   48,931,218
 zswpin                           384           363          410
 pswpout                            0    40,778,448            0
 pswpin                             0            16            0
 thp_swpout                         0             0            0
 thp_swpout_fallback                0             0            0
 64kB-mthp_swpout_fallback  3,192,906         1,293        1,012
 pgmajfault                     3,452         3,072        3,061
 swap_ra                           90            87          107
 swap_ra_hit                       42            43           57
 ZSWPOUT-64kB                     n/a           n/a    3,057,173
 SWPOUT-64kB                        0     2,548,653            0
 -------------------------------------------------------------------------------


 -------------------------------------------------------------------------------
 64K folios: deflate-iaa:
 ========================

 zswap compressor         deflate-iaa   deflate-iaa  deflate-iaa deflate-iaa v10
                         before-case1  before-case2        after     vs.     vs.
                                                                   case1   case2
 -------------------------------------------------------------------------------
 Total throughput (KB/s)    5,652,608     1,089,180    7,189,778     27%    560%
 Average throughput (KB/s)    188,420        36,306      239,659     27%    560%
 elapsed time (sec)            102.90        343.35        87.05    -15%    -75%
 sys time (sec)              2,246.86        213.53     1,864.16    -17%    773%
 memcg_high                   576,104       502,907      642,083
 memcg_swap_fail            4,016,117         1,407        1,478
 zswpout                   61,163,423        22,444   57,798,716
 zswpin                           401           368          454
 pswpout                            0    40,862,080            0
 pswpin                             0            20            0
 thp_swpout                         0             0            0
 thp_swpout_fallback                0             0            0
 64kB-mthp_swpout_fallback  4,016,117         1,407        1,478
 pgmajfault                     3,063         3,153        3,122
 swap_ra                           96            93          156
 swap_ra_hit                       46            45           83
 ZSWPOUT-64kB                     n/a           n/a    3,611,032
 SWPOUT-64kB                        0     2,553,880            0
 -------------------------------------------------------------------------------


 -------------------------------------------------------------------------------
 2M folios: zstd:
 ================

 zswap compressor                zstd          zstd         zstd      zstd v10
                         before-case1  before-case2        after     vs.    vs.
                                                                   case1  case2
 -------------------------------------------------------------------------------
 Total throughput (KB/s)    5,895,500     1,109,694    6,484,224     10%    484%
 Average throughput (KB/s)    196,516        36,989      216,140     10%    484%
 elapsed time (sec)            108.77        334.28       106.33     -2%    -68%
 sys time (sec)              2,657.14         94.88     2,376.13    -11%   2404%
 memcg_high                    64,200        66,316       56,898
 memcg_swap_fail              101,182            70           27
 zswpout                   48,931,499        36,507   48,890,640
 zswpin                           380           379          377
 pswpout                            0    40,166,400            0
 pswpin                             0             0            0
 thp_swpout                         0        78,450            0
 thp_swpout_fallback          101,182            70           27
 2MB-mthp_swpout_fallback           0             0           27
 pgmajfault                     3,067         3,417        3,311
 swap_ra                           91            90          854
 swap_ra_hit                       45            45          810
 ZSWPOUT-2MB                      n/a           n/a       95,459
 SWPOUT-2MB                         0        78,450            0
 -------------------------------------------------------------------------------


 -------------------------------------------------------------------------------
 2M folios: deflate-iaa:
 =======================

 zswap compressor         deflate-iaa   deflate-iaa  deflate-iaa deflate-iaa v10
                         before-case1  before-case2        after     vs.     vs.
                                                                   case1   case2
 -------------------------------------------------------------------------------
 Total throughput (KB/s)   6,286,587      1,126,785    7,073,464     13%    528%
 Average throughput (KB/s)   209,552         37,559      235,782     13%    528%
 elapsed time (sec)            96.19         333.03        85.79    -11%    -74%
 sys time (sec)             2,141.44          99.96     1,826.67    -15%   1727%
 memcg_high                   99,253         64,666       79,718
 memcg_swap_fail             129,074             53          165
 zswpout                  61,312,794         28,321   56,045,120
 zswpin                          383            406          403
 pswpout                           0     40,048,128            0
 pswpin                            0              0            0
 thp_swpout                        0         78,219            0
 thp_swpout_fallback         129,074             53          165
 2MB-mthp_swpout_fallback          0              0          165
 pgmajfault                    3,430          3,077       31,468
 swap_ra                          91            103       84,373
 swap_ra_hit                      47             46       84,317
 ZSWPOUT-2MB                     n/a            n/a      109,229
 SWPOUT-2MB                        0         78,219            0
 -------------------------------------------------------------------------------


And finally, this is a comparison of deflate-iaa vs. zstd with v10 of this
patch-series:

 ---------------------------------------------
                  zswap_store large folios v10
                  Impr w/ deflate-iaa vs. zstd

                       64K folios    2M folios
 ---------------------------------------------
 Throughput (KB/s)            17%           9%
 elapsed time (sec)          -20%         -19%
 sys time (sec)              -27%         -23%
 ---------------------------------------------


Conclusions based on the performance results:
=============================================

 v10 wrt before-case1:
 ---------------------
 We see significant improvements in throughput, elapsed and sys time for
 zstd and deflate-iaa, when comparing before-case1 (THP_SWAP=N) vs. after
 (THP_SWAP=Y) with zswap_store large folios.

 v10 wrt before-case2:
 ---------------------
 We see even more significant improvements in throughput and elapsed time
 for zstd and deflate-iaa, when comparing before-case2 (large-folio-SSD)
 vs. after (large-folio-zswap). The sys time increases with
 large-folio-zswap as expected, due to the CPU compression time
 vs. asynchronous disk write times, as pointed out by Ying and Yosry.

 In before-case2, when zswap does not store large folios, only allocations
 and cgroup charging due to 4K folio zswap stores count towards the cgroup
 memory limit. However, in the after scenario, with the introduction of
 zswap_store() of large folios, there is an added component of the zswap
 compressed pool usage from large folio stores from potentially all 30
 processes, that gets counted towards the memory limit. As a result, we see
 higher swapout activity in the "after" data.


Summary:
========
The v10 data presented above shows that zswap_store of large folios
demonstrates good throughput/performance improvements compared to
conventional SSD swap of large folios with a sufficiently large 525G SSD
swap device. Hence, it seems reasonable for zswap_store to support large
folios, so that further performance improvements can be implemented.

In the experimental setup used in this patchset, we have enabled IAA
compress verification to ensure additional hardware data integrity CRC
checks not currently done by the software compressors. We see good
throughput/latency improvements with deflate-iaa vs. zstd with zswap_store
of large folios.

Some of the ideas for further reducing latency that have shown promise in
our experiments, are:

1) IAA compress/decompress batching.
2) Distributing compress jobs across all IAA devices on the socket.

The tests run for this patchset are using only 1 IAA device per core, that
avails of 2 compress engines on the device. In our experiments with IAA
batching, we distribute compress jobs from all cores to the 8 compress
engines available per socket. We further compress the pages in each folio
in parallel in the accelerator. As a result, we improve compress latency
and reclaim throughput.

In decompress batching, we use swapin_readahead to generate a prefetch
batch of 4K folios that we decompress in parallel in IAA.

 ------------------------------------------------------------------------------
                          IAA compress/decompress batching
              Further improvements wrt v10 zswap_store Sequential
                          subpage store using "deflate-iaa":

                      "deflate-iaa" Batching  "deflate-iaa-canned" [2] Batching
                          Additional Impr               Additional Impr
                     64K folios    2M folios     64K folios    2M folios
 ------------------------------------------------------------------------------
 Throughput (KB/s)          19%          43%           26%           55%
 elapsed time (sec)         -5%         -14%          -10%          -21%
 sys time (sec)              4%          -7%           -4%          -18%
 ------------------------------------------------------------------------------


With zswap IAA compress/decompress batching, we are able to demonstrate
significant performance improvements and memory savings in server
scalability experiments in highly contended system scenarios under
significant memory pressure; as compared to software compressors. We hope
to submit this work in subsequent patch series. The current patch-series is
a prequisite for these future submissions.

[1] https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u
[2] https://patchwork.kernel.org/project/linux-crypto/cover/cover.1710969449.git.andre.glover@linux.intel.com/


This patch (of 6):

This resolves an issue with obj_cgroup_get() not being defined if
CONFIG_MEMCG is not defined.

Before this patch, we would see build errors if obj_cgroup_get() is called
from code that is agnostic of CONFIG_MEMCG.

The zswap_store() changes for large folios in subsequent commits will
require the use of obj_cgroup_get() in zswap code that falls into this
category.

Link: https://lkml.kernel.org/r/20241001053222.6944-1-kanchana.p.sridhar@intel.com
Link: https://lkml.kernel.org/r/20241001053222.6944-2-kanchana.p.sridhar@intel.com
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Wajdi Feghali <wajdi.k.feghali@intel.com>
Cc: "Zou, Nanhai" <nanhai.zou@intel.com>
Cc: Barry Song <21cnbao@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0bd8f61a5597..5502aa8e138e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1233,6 +1233,10 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
 	return NULL;
 }
 
+static inline void obj_cgroup_get(struct obj_cgroup *objcg)
+{
+}
+
 static inline void obj_cgroup_put(struct obj_cgroup *objcg)
 {
 }
-- 
cgit v1.2.3


From 6e1fa555ec772046ec3b903f507ff7fed5323796 Mon Sep 17 00:00:00 2001
From: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Date: Mon, 30 Sep 2024 22:32:20 -0700
Subject: mm: zswap: modify zswap_stored_pages to be atomic_long_t

For zswap_store() to support large folios, we need to be able to do a
batch update of zswap_stored_pages upon successful store of all pages in
the folio.  For this, we need to add folio_nr_pages(), which returns a
long, to zswap_stored_pages.

Link: https://lkml.kernel.org/r/20241001053222.6944-6-kanchana.p.sridhar@intel.com
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Wajdi Feghali <wajdi.k.feghali@intel.com>
Cc: "Zou, Nanhai" <nanhai.zou@intel.com>
Cc: Barry Song <21cnbao@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/meminfo.c     |  2 +-
 include/linux/zswap.h |  2 +-
 mm/zswap.c            | 19 +++++++++++++------
 3 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 245171d9164b..8ba9b1472390 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -91,7 +91,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_ZSWAP
 	show_val_kb(m, "Zswap:          ", zswap_total_pages());
 	seq_printf(m,  "Zswapped:       %8lu kB\n",
-		   (unsigned long)atomic_read(&zswap_stored_pages) <<
+		   (unsigned long)atomic_long_read(&zswap_stored_pages) <<
 		   (PAGE_SHIFT - 10));
 #endif
 	show_val_kb(m, "Dirty:          ",
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 9cd1beef0654..d961ead91bf1 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -7,7 +7,7 @@
 
 struct lruvec;
 
-extern atomic_t zswap_stored_pages;
+extern atomic_long_t zswap_stored_pages;
 
 #ifdef CONFIG_ZSWAP
 
diff --git a/mm/zswap.c b/mm/zswap.c
index bcb1b9cc9645..1692273ff08c 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,7 +43,7 @@
 * statistics
 **********************************/
 /* The number of compressed pages currently stored in zswap */
-atomic_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_long_t zswap_stored_pages = ATOMIC_INIT(0);
 
 /*
  * The statistics below are not protected from concurrent access for
@@ -802,7 +802,7 @@ static void zswap_entry_free(struct zswap_entry *entry)
 		obj_cgroup_put(entry->objcg);
 	}
 	zswap_entry_cache_free(entry);
-	atomic_dec(&zswap_stored_pages);
+	atomic_long_dec(&zswap_stored_pages);
 }
 
 /*********************************
@@ -1233,7 +1233,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
 		nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
 	} else {
 		nr_backing = zswap_total_pages();
-		nr_stored = atomic_read(&zswap_stored_pages);
+		nr_stored = atomic_long_read(&zswap_stored_pages);
 	}
 
 	if (!nr_stored)
@@ -1502,7 +1502,7 @@ bool zswap_store(struct folio *folio)
 	}
 
 	/* update stats */
-	atomic_inc(&zswap_stored_pages);
+	atomic_long_inc(&zswap_stored_pages);
 	count_vm_event(ZSWPOUT);
 
 	return true;
@@ -1654,6 +1654,13 @@ static int debugfs_get_total_size(void *data, u64 *val)
 }
 DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n");
 
+static int debugfs_get_stored_pages(void *data, u64 *val)
+{
+	*val = atomic_long_read(&zswap_stored_pages);
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n");
+
 static int zswap_debugfs_init(void)
 {
 	if (!debugfs_initialized())
@@ -1677,8 +1684,8 @@ static int zswap_debugfs_init(void)
 			   zswap_debugfs_root, &zswap_written_back_pages);
 	debugfs_create_file("pool_total_size", 0444,
 			    zswap_debugfs_root, NULL, &total_size_fops);
-	debugfs_create_atomic_t("stored_pages", 0444,
-				zswap_debugfs_root, &zswap_stored_pages);
+	debugfs_create_file("stored_pages", 0444,
+			    zswap_debugfs_root, NULL, &stored_pages_fops);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 0c560dd86040556a9e55d88229d9295672428c78 Mon Sep 17 00:00:00 2001
From: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Date: Mon, 30 Sep 2024 22:32:22 -0700
Subject: mm: swap: count successful large folio zswap stores in hugepage
 zswpout stats

Added a new MTHP_STAT_ZSWPOUT entry to the sysfs transparent_hugepage
stats so that successful large folio zswap stores can be accounted under
the per-order sysfs "zswpout" stats:

/sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpout

Other non-zswap swap device swap-out events will be counted under
the existing sysfs "swpout" stats:

/sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/swpout

Also, added documentation for the newly added sysfs per-order hugepage
"zswpout" stats. The documentation clarifies that only non-zswap swapouts
will be accounted in the existing "swpout" stats.

Link: https://lkml.kernel.org/r/20241001053222.6944-8-kanchana.p.sridhar@intel.com
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Wajdi Feghali <wajdi.k.feghali@intel.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: "Zou, Nanhai" <nanhai.zou@intel.com>
Cc: Barry Song <21cnbao@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/transhuge.rst | 8 ++++++--
 include/linux/huge_mm.h                    | 1 +
 mm/huge_memory.c                           | 3 +++
 mm/page_io.c                               | 1 +
 4 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index a1bb495eab59..e8db1543e0b9 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -530,10 +530,14 @@ anon_fault_fallback_charge
 	instead falls back to using huge pages with lower orders or
 	small pages even though the allocation was successful.
 
-swpout
-	is incremented every time a huge page is swapped out in one
+zswpout
+	is incremented every time a huge page is swapped out to zswap in one
 	piece without splitting.
 
+swpout
+	is incremented every time a huge page is swapped out to a non-zswap
+	swap device in one piece without splitting.
+
 swpout_fallback
 	is incremented if a huge page has to be split before swapout.
 	Usually because failed to allocate some continuous swap space
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 006f730545c2..c59e5aa9b081 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -119,6 +119,7 @@ enum mthp_stat_item {
 	MTHP_STAT_ANON_FAULT_ALLOC,
 	MTHP_STAT_ANON_FAULT_FALLBACK,
 	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+	MTHP_STAT_ZSWPOUT,
 	MTHP_STAT_SWPOUT,
 	MTHP_STAT_SWPOUT_FALLBACK,
 	MTHP_STAT_SHMEM_ALLOC,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 61fc407330f2..b26c6503e993 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -615,6 +615,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
 #ifdef CONFIG_SHMEM
@@ -633,6 +634,7 @@ static struct attribute *anon_stats_attrs[] = {
 	&anon_fault_fallback_attr.attr,
 	&anon_fault_fallback_charge_attr.attr,
 #ifndef CONFIG_SHMEM
+	&zswpout_attr.attr,
 	&swpout_attr.attr,
 	&swpout_fallback_attr.attr,
 #endif
@@ -663,6 +665,7 @@ static struct attribute_group file_stats_attr_grp = {
 
 static struct attribute *any_stats_attrs[] = {
 #ifdef CONFIG_SHMEM
+	&zswpout_attr.attr,
 	&swpout_attr.attr,
 	&swpout_fallback_attr.attr,
 #endif
diff --git a/mm/page_io.c b/mm/page_io.c
index 39e8be23bd95..e9be4b436fd8 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -277,6 +277,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		swap_zeromap_folio_clear(folio);
 	}
 	if (zswap_store(folio)) {
+		count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
 		folio_unlock(folio);
 		return 0;
 	}
-- 
cgit v1.2.3


From aaf2914aec0fa67395574f6fa6b726168b049e60 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Sat, 26 Oct 2024 21:24:23 +1300
Subject: mm: add per-order mTHP swpin counters

This helps profile the sizes of folios being swapped in. Currently,
only mTHP swap-out is being counted.
The new interface can be found at:
/sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
         swpin
For example,
cat /sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpin
12809
cat /sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpin
4763

[v-songbaohua@oppo.com: add a blank line in doc]
  Link: https://lkml.kernel.org/r/20241030233423.80759-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20241026082423.26298-1-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/transhuge.rst | 4 ++++
 include/linux/huge_mm.h                    | 1 +
 mm/huge_memory.c                           | 3 +++
 mm/page_io.c                               | 3 +++
 4 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index e8db1543e0b9..abdf10a1c7db 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -534,6 +534,10 @@ zswpout
 	is incremented every time a huge page is swapped out to zswap in one
 	piece without splitting.
 
+swpin
+	is incremented every time a huge page is swapped in from a non-zswap
+	swap device in one piece.
+
 swpout
 	is incremented every time a huge page is swapped out to a non-zswap
 	swap device in one piece without splitting.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c59e5aa9b081..b94c2e8ee918 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -120,6 +120,7 @@ enum mthp_stat_item {
 	MTHP_STAT_ANON_FAULT_FALLBACK,
 	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
 	MTHP_STAT_ZSWPOUT,
+	MTHP_STAT_SWPIN,
 	MTHP_STAT_SWPOUT,
 	MTHP_STAT_SWPOUT_FALLBACK,
 	MTHP_STAT_SHMEM_ALLOC,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b26c6503e993..f92068864469 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -616,6 +616,7 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
+DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
 #ifdef CONFIG_SHMEM
@@ -635,6 +636,7 @@ static struct attribute *anon_stats_attrs[] = {
 	&anon_fault_fallback_charge_attr.attr,
 #ifndef CONFIG_SHMEM
 	&zswpout_attr.attr,
+	&swpin_attr.attr,
 	&swpout_attr.attr,
 	&swpout_fallback_attr.attr,
 #endif
@@ -666,6 +668,7 @@ static struct attribute_group file_stats_attr_grp = {
 static struct attribute *any_stats_attrs[] = {
 #ifdef CONFIG_SHMEM
 	&zswpout_attr.attr,
+	&swpin_attr.attr,
 	&swpout_attr.attr,
 	&swpout_fallback_attr.attr,
 #endif
diff --git a/mm/page_io.c b/mm/page_io.c
index e9be4b436fd8..4b4ea8e49cf6 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -495,6 +495,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
 		for (p = 0; p < sio->pages; p++) {
 			struct folio *folio = page_folio(sio->bvec[p].bv_page);
 
+			count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
 			count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
 			folio_mark_uptodate(folio);
 			folio_unlock(folio);
@@ -589,6 +590,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio,
 	 * attempt to access it in the page fault retry time check.
 	 */
 	get_task_struct(current);
+	count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
 	count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
 	count_vm_events(PSWPIN, folio_nr_pages(folio));
 	submit_bio_wait(&bio);
@@ -605,6 +607,7 @@ static void swap_read_folio_bdev_async(struct folio *folio,
 	bio->bi_iter.bi_sector = swap_folio_sector(folio);
 	bio->bi_end_io = end_swap_bio_read;
 	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
+	count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
 	count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
 	count_vm_events(PSWPIN, folio_nr_pages(folio));
 	submit_bio(bio);
-- 
cgit v1.2.3


From 5f6170a469cd2c13ad4dffe42714cf777b132451 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 28 Oct 2024 14:13:27 +0000
Subject: mm: pagewalk: add the ability to install PTEs

Patch series "implement lightweight guard pages", v4.

Userland library functions such as allocators and threading
implementations often require regions of memory to act as 'guard pages' -
mappings which, when accessed, result in a fatal signal being sent to the
accessing process.

The current means by which these are implemented is via a PROT_NONE mmap()
mapping, which provides the required semantics however incur an overhead
of a VMA for each such region.

With a great many processes and threads, this can rapidly add up and incur
a significant memory penalty.  It also has the added problem of preventing
merges that might otherwise be permitted.

This series takes a different approach - an idea suggested by Vlastimil
Babka (and before him David Hildenbrand and Jann Horn - perhaps more - the
provenance becomes a little tricky to ascertain after this - please
forgive any omissions!) - rather than locating the guard pages at the VMA
layer, instead placing them in page tables mapping the required ranges.

Early testing of the prototype version of this code suggests a 5 times
speed up in memory mapping invocations (in conjunction with use of
process_madvise()) and a 13% reduction in VMAs on an entirely idle android
system and unoptimised code.

We expect with optimisation and a loaded system with a larger number of
guard pages this could significantly increase, but in any case these
numbers are encouraging.

This way, rather than having separate VMAs specifying which parts of a
range are guard pages, instead we have a VMA spanning the entire range of
memory a user is permitted to access and including ranges which are to be
'guarded'.

After mapping this, a user can specify which parts of the range should
result in a fatal signal when accessed.

By restricting the ability to specify guard pages to memory mapped by
existing VMAs, we can rely on the mappings being torn down when the
mappings are ultimately unmapped and everything works simply as if the
memory were not faulted in, from the point of view of the containing VMAs.

This mechanism in effect poisons memory ranges similar to hardware memory
poisoning, only it is an entirely software-controlled form of poisoning.

The mechanism is implemented via madvise() behaviour - MADV_GUARD_INSTALL
which installs page table-level guard page markers - and MADV_GUARD_REMOVE
- which clears them.

Guard markers can be installed across multiple VMAs and any existing
mappings will be cleared, that is zapped, before installing the guard page
markers in the page tables.

There is no concept of 'nested' guard markers, multiple attempts to
install guard markers in a range will, after the first attempt, have no
effect.

Importantly, removing guard markers over a range that contains both guard
markers and ordinary backed memory has no effect on anything but the guard
markers (including leaving huge pages un-split), so a user can safely
remove guard markers over a range of memory leaving the rest intact.

The actual mechanism by which the page table entries are specified makes
use of existing logic - PTE markers, which are used for the userfaultfd
UFFDIO_POISON mechanism.

Unfortunately PTE_MARKER_POISONED is not suited for the guard page
mechanism as it results in VM_FAULT_HWPOISON semantics in the fault
handler, so we add our own specific PTE_MARKER_GUARD and adapt existing
logic to handle it.

We also extend the generic page walk mechanism to allow for installation
of PTEs (carefully restricted to memory management logic only to prevent
unwanted abuse).

We ensure that zapping performed by MADV_DONTNEED and MADV_FREE do not
remove guard markers, nor does forking (except when VM_WIPEONFORK is
specified for a VMA which implies a total removal of memory
characteristics).

It's important to note that the guard page implementation is emphatically
NOT a security feature, so a user can remove the markers if they wish.  We
simply implement it in such a way as to provide the least surprising
behaviour.

An extensive set of self-tests are provided which ensure behaviour is as
expected and additionally self-documents expected behaviour of guard
ranges.


This patch (of 5):

The existing generic pagewalk logic permits the walking of page tables,
invoking callbacks at individual page table levels via user-provided
mm_walk_ops callbacks.

This is useful for traversing existing page table entries, but precludes
the ability to establish new ones.

Existing mechanism for performing a walk which also installs page table
entries if necessary are heavily duplicated throughout the kernel, each
with semantic differences from one another and largely unavailable for use
elsewhere.

Rather than add yet another implementation, we extend the generic pagewalk
logic to enable the installation of page table entries by adding a new
install_pte() callback in mm_walk_ops.  If this is specified, then upon
encountering a missing page table entry, we allocate and install a new one
and continue the traversal.

If a THP huge page is encountered at either the PMD or PUD level we split
it only if there are ops->pte_entry() (or ops->pmd_entry at PUD level),
otherwise if there is only an ops->install_pte(), we avoid the unnecessary
split.

We do not support hugetlb at this stage.

If this function returns an error, or an allocation fails during the
operation, we abort the operation altogether.  It is up to the caller to
deal appropriately with partially populated page table ranges.

If install_pte() is defined, the semantics of pte_entry() change - this
callback is then only invoked if the entry already exists.  This is a
useful property, as it allows a caller to handle existing PTEs while
installing new ones where necessary in the specified range.

If install_pte() is not defined, then there is no functional difference to
this patch, so all existing logic will work precisely as it did before.

As we only permit the installation of PTEs where a mapping does not
already exist there is no need for TLB management, however we do invoke
update_mmu_cache() for architectures which require manual maintenance of
mappings for other CPUs.

We explicitly do not allow the existing page walk API to expose this
feature as it is dangerous and intended for internal mm use only.
Therefore we provide a new walk_page_range_mm() function exposed only to
mm/internal.h.

We take the opportunity to additionally clean up the page walker logic to
be a little easier to follow.

Link: https://lkml.kernel.org/r/cover.1730123433.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/51b432ebef013e3fdf9f92101533435de1bffadf.1730123433.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Suggested-by: Jann Horn <jannh@google.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: Helge Deller <deller@gmx.de>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Vlastimil Babka <vbabkba@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagewalk.h |  18 +++-
 mm/internal.h            |   6 ++
 mm/pagewalk.c            | 246 ++++++++++++++++++++++++++++++++++-------------
 3 files changed, 201 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index f5eb5a32aeed..9700a29f8afb 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -25,12 +25,15 @@ enum page_walk_lock {
  *			this handler is required to be able to handle
  *			pmd_trans_huge() pmds.  They may simply choose to
  *			split_huge_page() instead of handling it explicitly.
- * @pte_entry:		if set, called for each PTE (lowest-level) entry,
- *			including empty ones
+ * @pte_entry:		if set, called for each PTE (lowest-level) entry
+ *			including empty ones, except if @install_pte is set.
+ *			If @install_pte is set, @pte_entry is called only for
+ *			existing PTEs.
  * @pte_hole:		if set, called for each hole at all levels,
  *			depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
  *			Any folded depths (where PTRS_PER_P?D is equal to 1)
- *			are skipped.
+ *			are skipped. If @install_pte is specified, this will
+ *			not trigger for any populated ranges.
  * @hugetlb_entry:	if set, called for each hugetlb entry. This hook
  *			function is called with the vma lock held, in order to
  *			protect against a concurrent freeing of the pte_t* or
@@ -51,6 +54,13 @@ enum page_walk_lock {
  * @pre_vma:            if set, called before starting walk on a non-null vma.
  * @post_vma:           if set, called after a walk on a non-null vma, provided
  *                      that @pre_vma and the vma walk succeeded.
+ * @install_pte:        if set, missing page table entries are installed and
+ *                      thus all levels are always walked in the specified
+ *                      range. This callback is then invoked at the PTE level
+ *                      (having split any THP pages prior), providing the PTE to
+ *                      install. If allocations fail, the walk is aborted. This
+ *                      operation is only available for userland memory. Not
+ *                      usable for hugetlb ranges.
  *
  * p?d_entry callbacks are called even if those levels are folded on a
  * particular architecture/configuration.
@@ -76,6 +86,8 @@ struct mm_walk_ops {
 	int (*pre_vma)(unsigned long start, unsigned long end,
 		       struct mm_walk *walk);
 	void (*post_vma)(struct mm_walk *walk);
+	int (*install_pte)(unsigned long addr, unsigned long next,
+			   pte_t *ptep, struct mm_walk *walk);
 	enum page_walk_lock walk_lock;
 };
 
diff --git a/mm/internal.h b/mm/internal.h
index fd6373cb1c66..d5b93c5b6364 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -12,6 +12,7 @@
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
 #include <linux/pagemap.h>
+#include <linux/pagewalk.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -1502,4 +1503,9 @@ static inline void accept_page(struct page *page)
 }
 #endif /* CONFIG_UNACCEPTED_MEMORY */
 
+/* pagewalk.c */
+int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
+		unsigned long end, const struct mm_walk_ops *ops,
+		void *private);
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 5f9f01532e67..e478777c86e1 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -3,9 +3,14 @@
 #include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/hugetlb.h>
+#include <linux/mmu_context.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
 /*
  * We want to know the real level where a entry is located ignoring any
  * folding of levels which may be happening. For example if p4d is folded then
@@ -29,9 +34,23 @@ static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
 	int err = 0;
 
 	for (;;) {
-		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
-		if (err)
-		       break;
+		if (ops->install_pte && pte_none(ptep_get(pte))) {
+			pte_t new_pte;
+
+			err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte,
+					       walk);
+			if (err)
+				break;
+
+			set_pte_at(walk->mm, addr, pte, new_pte);
+			/* Non-present before, so for arches that need it. */
+			if (!WARN_ON_ONCE(walk->no_vma))
+				update_mmu_cache(walk->vma, addr, pte);
+		} else {
+			err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
+			if (err)
+				break;
+		}
 		if (addr >= end - PAGE_SIZE)
 			break;
 		addr += PAGE_SIZE;
@@ -81,6 +100,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 	pmd_t *pmd;
 	unsigned long next;
 	const struct mm_walk_ops *ops = walk->ops;
+	bool has_handler = ops->pte_entry;
+	bool has_install = ops->install_pte;
 	int err = 0;
 	int depth = real_depth(3);
 
@@ -89,11 +110,14 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 again:
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(*pmd)) {
-			if (ops->pte_hole)
+			if (has_install)
+				err = __pte_alloc(walk->mm, pmd);
+			else if (ops->pte_hole)
 				err = ops->pte_hole(addr, next, depth, walk);
 			if (err)
 				break;
-			continue;
+			if (!has_install)
+				continue;
 		}
 
 		walk->action = ACTION_SUBTREE;
@@ -109,18 +133,25 @@ again:
 
 		if (walk->action == ACTION_AGAIN)
 			goto again;
-
-		/*
-		 * Check this here so we only break down trans_huge
-		 * pages when we _need_ to
-		 */
-		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
-		    walk->action == ACTION_CONTINUE ||
-		    !(ops->pte_entry))
+		if (walk->action == ACTION_CONTINUE)
 			continue;
 
+		if (!has_handler) { /* No handlers for lower page tables. */
+			if (!has_install)
+				continue; /* Nothing to do. */
+			/*
+			 * We are ONLY installing, so avoid unnecessarily
+			 * splitting a present huge page.
+			 */
+			if (pmd_present(*pmd) &&
+			    (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+				continue;
+		}
+
 		if (walk->vma)
 			split_huge_pmd(walk->vma, pmd, addr);
+		else if (pmd_leaf(*pmd) || !pmd_present(*pmd))
+			continue; /* Nothing to do. */
 
 		err = walk_pte_range(pmd, addr, next, walk);
 		if (err)
@@ -140,6 +171,8 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 	pud_t *pud;
 	unsigned long next;
 	const struct mm_walk_ops *ops = walk->ops;
+	bool has_handler = ops->pmd_entry || ops->pte_entry;
+	bool has_install = ops->install_pte;
 	int err = 0;
 	int depth = real_depth(2);
 
@@ -148,11 +181,14 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
  again:
 		next = pud_addr_end(addr, end);
 		if (pud_none(*pud)) {
-			if (ops->pte_hole)
+			if (has_install)
+				err = __pmd_alloc(walk->mm, pud, addr);
+			else if (ops->pte_hole)
 				err = ops->pte_hole(addr, next, depth, walk);
 			if (err)
 				break;
-			continue;
+			if (!has_install)
+				continue;
 		}
 
 		walk->action = ACTION_SUBTREE;
@@ -164,14 +200,26 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 
 		if (walk->action == ACTION_AGAIN)
 			goto again;
-
-		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
-		    walk->action == ACTION_CONTINUE ||
-		    !(ops->pmd_entry || ops->pte_entry))
+		if (walk->action == ACTION_CONTINUE)
 			continue;
 
+		if (!has_handler) { /* No handlers for lower page tables. */
+			if (!has_install)
+				continue; /* Nothing to do. */
+			/*
+			 * We are ONLY installing, so avoid unnecessarily
+			 * splitting a present huge page.
+			 */
+			if (pud_present(*pud) &&
+			    (pud_trans_huge(*pud) || pud_devmap(*pud)))
+				continue;
+		}
+
 		if (walk->vma)
 			split_huge_pud(walk->vma, pud, addr);
+		else if (pud_leaf(*pud) || !pud_present(*pud))
+			continue; /* Nothing to do. */
+
 		if (pud_none(*pud))
 			goto again;
 
@@ -189,6 +237,8 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	p4d_t *p4d;
 	unsigned long next;
 	const struct mm_walk_ops *ops = walk->ops;
+	bool has_handler = ops->pud_entry || ops->pmd_entry || ops->pte_entry;
+	bool has_install = ops->install_pte;
 	int err = 0;
 	int depth = real_depth(1);
 
@@ -196,18 +246,21 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	do {
 		next = p4d_addr_end(addr, end);
 		if (p4d_none_or_clear_bad(p4d)) {
-			if (ops->pte_hole)
+			if (has_install)
+				err = __pud_alloc(walk->mm, p4d, addr);
+			else if (ops->pte_hole)
 				err = ops->pte_hole(addr, next, depth, walk);
 			if (err)
 				break;
-			continue;
+			if (!has_install)
+				continue;
 		}
 		if (ops->p4d_entry) {
 			err = ops->p4d_entry(p4d, addr, next, walk);
 			if (err)
 				break;
 		}
-		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+		if (has_handler || has_install)
 			err = walk_pud_range(p4d, addr, next, walk);
 		if (err)
 			break;
@@ -222,6 +275,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
 	pgd_t *pgd;
 	unsigned long next;
 	const struct mm_walk_ops *ops = walk->ops;
+	bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
+		ops->pte_entry;
+	bool has_install = ops->install_pte;
 	int err = 0;
 
 	if (walk->pgd)
@@ -231,18 +287,21 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd)) {
-			if (ops->pte_hole)
+			if (has_install)
+				err = __p4d_alloc(walk->mm, pgd, addr);
+			else if (ops->pte_hole)
 				err = ops->pte_hole(addr, next, 0, walk);
 			if (err)
 				break;
-			continue;
+			if (!has_install)
+				continue;
 		}
 		if (ops->pgd_entry) {
 			err = ops->pgd_entry(pgd, addr, next, walk);
 			if (err)
 				break;
 		}
-		if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+		if (has_handler || has_install)
 			err = walk_p4d_range(pgd, addr, next, walk);
 		if (err)
 			break;
@@ -334,6 +393,11 @@ static int __walk_page_range(unsigned long start, unsigned long end,
 	int err = 0;
 	struct vm_area_struct *vma = walk->vma;
 	const struct mm_walk_ops *ops = walk->ops;
+	bool is_hugetlb = is_vm_hugetlb_page(vma);
+
+	/* We do not support hugetlb PTE installation. */
+	if (ops->install_pte && is_hugetlb)
+		return -EINVAL;
 
 	if (ops->pre_vma) {
 		err = ops->pre_vma(start, end, walk);
@@ -341,7 +405,7 @@ static int __walk_page_range(unsigned long start, unsigned long end,
 			return err;
 	}
 
-	if (is_vm_hugetlb_page(vma)) {
+	if (is_hugetlb) {
 		if (ops->hugetlb_entry)
 			err = walk_hugetlb_range(start, end, walk);
 	} else
@@ -380,47 +444,14 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
 #endif
 }
 
-/**
- * walk_page_range - walk page table with caller specific callbacks
- * @mm:		mm_struct representing the target process of page table walk
- * @start:	start address of the virtual address range
- * @end:	end address of the virtual address range
- * @ops:	operation to call during the walk
- * @private:	private data for callbacks' usage
- *
- * Recursively walk the page table tree of the process represented by @mm
- * within the virtual address range [@start, @end). During walking, we can do
- * some caller-specific works for each entry, by setting up pmd_entry(),
- * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
- * callbacks, the associated entries/pages are just ignored.
- * The return values of these callbacks are commonly defined like below:
- *
- *  - 0  : succeeded to handle the current entry, and if you don't reach the
- *         end address yet, continue to walk.
- *  - >0 : succeeded to handle the current entry, and return to the caller
- *         with caller specific value.
- *  - <0 : failed to handle the current entry, and return to the caller
- *         with error code.
- *
- * Before starting to walk page table, some callers want to check whether
- * they really want to walk over the current vma, typically by checking
- * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
- * purpose.
- *
- * If operations need to be staged before and committed after a vma is walked,
- * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
- * since it is intended to handle commit-type operations, can't return any
- * errors.
- *
- * struct mm_walk keeps current values of some common data like vma and pmd,
- * which are useful for the access from callbacks. If you want to pass some
- * caller-specific data to callbacks, @private should be helpful.
+/*
+ * See the comment for walk_page_range(), this performs the heavy lifting of the
+ * operation, only sets no restrictions on how the walk proceeds.
  *
- * Locking:
- *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
- *   because these function traverse vma list and/or access to vma's data.
+ * We usually restrict the ability to install PTEs, but this functionality is
+ * available to internal memory management code and provided in mm/internal.h.
  */
-int walk_page_range(struct mm_struct *mm, unsigned long start,
+int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
 		unsigned long end, const struct mm_walk_ops *ops,
 		void *private)
 {
@@ -479,6 +510,80 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
 	return err;
 }
 
+/*
+ * Determine if the walk operations specified are permitted to be used for a
+ * page table walk.
+ *
+ * This check is performed on all functions which are parameterised by walk
+ * operations and exposed in include/linux/pagewalk.h.
+ *
+ * Internal memory management code can use the walk_page_range_mm() function to
+ * be able to use all page walking operations.
+ */
+static bool check_ops_valid(const struct mm_walk_ops *ops)
+{
+	/*
+	 * The installation of PTEs is solely under the control of memory
+	 * management logic and subject to many subtle locking, security and
+	 * cache considerations so we cannot permit other users to do so, and
+	 * certainly not for exported symbols.
+	 */
+	if (ops->install_pte)
+		return false;
+
+	return true;
+}
+
+/**
+ * walk_page_range - walk page table with caller specific callbacks
+ * @mm:		mm_struct representing the target process of page table walk
+ * @start:	start address of the virtual address range
+ * @end:	end address of the virtual address range
+ * @ops:	operation to call during the walk
+ * @private:	private data for callbacks' usage
+ *
+ * Recursively walk the page table tree of the process represented by @mm
+ * within the virtual address range [@start, @end). During walking, we can do
+ * some caller-specific works for each entry, by setting up pmd_entry(),
+ * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
+ * callbacks, the associated entries/pages are just ignored.
+ * The return values of these callbacks are commonly defined like below:
+ *
+ *  - 0  : succeeded to handle the current entry, and if you don't reach the
+ *         end address yet, continue to walk.
+ *  - >0 : succeeded to handle the current entry, and return to the caller
+ *         with caller specific value.
+ *  - <0 : failed to handle the current entry, and return to the caller
+ *         with error code.
+ *
+ * Before starting to walk page table, some callers want to check whether
+ * they really want to walk over the current vma, typically by checking
+ * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
+ * purpose.
+ *
+ * If operations need to be staged before and committed after a vma is walked,
+ * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
+ * since it is intended to handle commit-type operations, can't return any
+ * errors.
+ *
+ * struct mm_walk keeps current values of some common data like vma and pmd,
+ * which are useful for the access from callbacks. If you want to pass some
+ * caller-specific data to callbacks, @private should be helpful.
+ *
+ * Locking:
+ *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
+ *   because these function traverse vma list and/or access to vma's data.
+ */
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+		unsigned long end, const struct mm_walk_ops *ops,
+		void *private)
+{
+	if (!check_ops_valid(ops))
+		return -EINVAL;
+
+	return walk_page_range_mm(mm, start, end, ops, private);
+}
+
 /**
  * walk_page_range_novma - walk a range of pagetables not backed by a vma
  * @mm:		mm_struct representing the target process of page table walk
@@ -494,7 +599,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
  * walking the kernel pages tables or page tables for firmware.
  *
  * Note: Be careful to walk the kernel pages tables, the caller may be need to
- * take other effective approache (mmap lock may be insufficient) to prevent
+ * take other effective approaches (mmap lock may be insufficient) to prevent
  * the intermediate kernel page tables belonging to the specified address range
  * from being freed (e.g. memory hot-remove).
  */
@@ -513,6 +618,8 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
 
 	if (start >= end || !walk.mm)
 		return -EINVAL;
+	if (!check_ops_valid(ops))
+		return -EINVAL;
 
 	/*
 	 * 1) For walking the user virtual address space:
@@ -556,6 +663,8 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
 		return -EINVAL;
 	if (start < vma->vm_start || end > vma->vm_end)
 		return -EINVAL;
+	if (!check_ops_valid(ops))
+		return -EINVAL;
 
 	process_mm_walk_lock(walk.mm, ops->walk_lock);
 	process_vma_walk_lock(vma, ops->walk_lock);
@@ -574,6 +683,8 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
 
 	if (!walk.mm)
 		return -EINVAL;
+	if (!check_ops_valid(ops))
+		return -EINVAL;
 
 	process_mm_walk_lock(walk.mm, ops->walk_lock);
 	process_vma_walk_lock(vma, ops->walk_lock);
@@ -623,6 +734,9 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 	unsigned long start_addr, end_addr;
 	int err = 0;
 
+	if (!check_ops_valid(ops))
+		return -EINVAL;
+
 	lockdep_assert_held(&mapping->i_mmap_rwsem);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
 				  first_index + nr - 1) {
-- 
cgit v1.2.3


From 7c53dfbdb024915f23f03f972b33744309d4608b Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 28 Oct 2024 14:13:28 +0000
Subject: mm: add PTE_MARKER_GUARD PTE marker

Add a new PTE marker that results in any access causing the accessing
process to segfault.

This is preferable to PTE_MARKER_POISONED, which results in the same
handling as hardware poisoned memory, and is thus undesirable for cases
where we simply wish to 'soft' poison a range.

This is in preparation for implementing the ability to specify guard pages
at the page table level, i.e.  ranges that, when accessed, should cause
process termination.

Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the
function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single
purpose was simply incorrect.

We then reuse the same logic to determine whether a zap should clear a
guard entry - this should only be performed on teardown and never on
MADV_DONTNEED or MADV_FREE.

We additionally add a WARN_ON_ONCE() in hugetlb logic should a guard
marker be encountered there, as we explicitly do not support this
operation and this should not occur.

Link: https://lkml.kernel.org/r/f47f3d5acca2dcf9bbf655b6d33f3dc713e4a4a0.1730123433.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabkba@suse.cz>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Suggested-by: Jann Horn <jannh@google.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: Helge Deller <deller@gmx.de>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_inline.h |  2 +-
 include/linux/swapops.h   | 24 +++++++++++++++++++++++-
 mm/hugetlb.c              |  4 ++++
 mm/memory.c               | 18 +++++++++++++++---
 mm/mprotect.c             |  6 ++++--
 5 files changed, 47 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 355cf46a01a6..1b6a917fffa4 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -544,7 +544,7 @@ static inline pte_marker copy_pte_marker(
 {
 	pte_marker srcm = pte_marker_get(entry);
 	/* Always copy error entries. */
-	pte_marker dstm = srcm & PTE_MARKER_POISONED;
+	pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD);
 
 	/* Only copy PTE markers if UFFD register matches. */
 	if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index cb468e418ea1..96f26e29fefe 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -426,9 +426,19 @@ typedef unsigned long pte_marker;
  * "Poisoned" here is meant in the very general sense of "future accesses are
  * invalid", instead of referring very specifically to hardware memory errors.
  * This marker is meant to represent any of various different causes of this.
+ *
+ * Note that, when encountered by the faulting logic, PTEs with this marker will
+ * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error
+ * logic.
  */
 #define  PTE_MARKER_POISONED			BIT(1)
-#define  PTE_MARKER_MASK			(BIT(2) - 1)
+/*
+ * Indicates that, on fault, this PTE will case a SIGSEGV signal to be
+ * sent. This means guard markers behave in effect as if the region were mapped
+ * PROT_NONE, rather than if they were a memory hole or equivalent.
+ */
+#define  PTE_MARKER_GUARD			BIT(2)
+#define  PTE_MARKER_MASK			(BIT(3) - 1)
 
 static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
 {
@@ -464,6 +474,18 @@ static inline int is_poisoned_swp_entry(swp_entry_t entry)
 {
 	return is_pte_marker_entry(entry) &&
 	    (pte_marker_get(entry) & PTE_MARKER_POISONED);
+
+}
+
+static inline swp_entry_t make_guard_swp_entry(void)
+{
+	return make_pte_marker_entry(PTE_MARKER_GUARD);
+}
+
+static inline int is_guard_swp_entry(swp_entry_t entry)
+{
+	return is_pte_marker_entry(entry) &&
+		(pte_marker_get(entry) & PTE_MARKER_GUARD);
 }
 
 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 906294ac85dc..2c8c5da0f5d3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6353,6 +6353,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				ret = VM_FAULT_HWPOISON_LARGE |
 				      VM_FAULT_SET_HINDEX(hstate_index(h));
 				goto out_mutex;
+			} else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
+				/* This isn't supported in hugetlb. */
+				ret = VM_FAULT_SIGSEGV;
+				goto out_mutex;
 			}
 		}
 
diff --git a/mm/memory.c b/mm/memory.c
index 2d32023d4eb8..75c2dfd04f72 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1455,7 +1455,7 @@ static inline bool should_zap_folio(struct zap_details *details,
 	return !folio_test_anon(folio);
 }
 
-static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
+static inline bool zap_drop_markers(struct zap_details *details)
 {
 	if (!details)
 		return false;
@@ -1476,7 +1476,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
 	if (vma_is_anonymous(vma))
 		return;
 
-	if (zap_drop_file_uffd_wp(details))
+	if (zap_drop_markers(details))
 		return;
 
 	for (;;) {
@@ -1671,7 +1671,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			 * drop the marker if explicitly requested.
 			 */
 			if (!vma_is_anonymous(vma) &&
-			    !zap_drop_file_uffd_wp(details))
+			    !zap_drop_markers(details))
+				continue;
+		} else if (is_guard_swp_entry(entry)) {
+			/*
+			 * Ordinary zapping should not remove guard PTE
+			 * markers. Only do so if we should remove PTE markers
+			 * in general.
+			 */
+			if (!zap_drop_markers(details))
 				continue;
 		} else if (is_hwpoison_entry(entry) ||
 			   is_poisoned_swp_entry(entry)) {
@@ -4003,6 +4011,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 	if (marker & PTE_MARKER_POISONED)
 		return VM_FAULT_HWPOISON;
 
+	/* Hitting a guard page is always a fatal condition. */
+	if (marker & PTE_MARKER_GUARD)
+		return VM_FAULT_SIGSEGV;
+
 	if (pte_marker_entry_uffd_wp(entry))
 		return pte_marker_handle_uffd_wp(vmf);
 
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6f450af3252e..516b1d847e2c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -236,9 +236,11 @@ static long change_pte_range(struct mmu_gather *tlb,
 			} else if (is_pte_marker_entry(entry)) {
 				/*
 				 * Ignore error swap entries unconditionally,
-				 * because any access should sigbus anyway.
+				 * because any access should sigbus/sigsegv
+				 * anyway.
 				 */
-				if (is_poisoned_swp_entry(entry))
+				if (is_poisoned_swp_entry(entry) ||
+				    is_guard_swp_entry(entry))
 					continue;
 				/*
 				 * If this is uffd-wp pte marker and we'd like
-- 
cgit v1.2.3


From ab6e8e74e47362bd9d79dd4394a167b2afe0cc77 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 27 Oct 2024 13:14:42 -0700
Subject: mm: delete the unused put_pages_list()

The last user of put_pages_list() converted away from it in 6.10 commit
06c375053cef ("iommu/vt-d: add wrapper functions for page allocations"):
delete put_pages_list().

Link: https://lkml.kernel.org/r/d9985d6a-293e-176b-e63d-82fdfd28c139@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  2 --
 mm/swap.c          | 31 -------------------------------
 2 files changed, 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 78848fbefe94..32888a97ab44 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1286,8 +1286,6 @@ static inline struct folio *virt_to_folio(const void *x)
 
 void __folio_put(struct folio *folio);
 
-void put_pages_list(struct list_head *pages);
-
 void split_page(struct page *page, unsigned int order);
 void folio_copy(struct folio *dst, struct folio *src);
 int folio_mc_copy(struct folio *dst, struct folio *src);
diff --git a/mm/swap.c b/mm/swap.c
index b8e3259ea2c4..638a3f001676 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -127,37 +127,6 @@ void __folio_put(struct folio *folio)
 }
 EXPORT_SYMBOL(__folio_put);
 
-/**
- * put_pages_list() - release a list of pages
- * @pages: list of pages threaded on page->lru
- *
- * Release a list of pages which are strung together on page.lru.
- */
-void put_pages_list(struct list_head *pages)
-{
-	struct folio_batch fbatch;
-	struct folio *folio, *next;
-
-	folio_batch_init(&fbatch);
-	list_for_each_entry_safe(folio, next, pages, lru) {
-		if (!folio_put_testzero(folio))
-			continue;
-		if (folio_test_hugetlb(folio)) {
-			free_huge_folio(folio);
-			continue;
-		}
-		/* LRU flag must be clear because it's passed using the lru */
-		if (folio_batch_add(&fbatch, folio) > 0)
-			continue;
-		free_unref_folios(&fbatch);
-	}
-
-	if (fbatch.nr)
-		free_unref_folios(&fbatch);
-	INIT_LIST_HEAD(pages);
-}
-EXPORT_SYMBOL(put_pages_list);
-
 typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);
 
 static void lru_add(struct lruvec *lruvec, struct folio *folio)
-- 
cgit v1.2.3


From 48b50624aec454ce0fa8f78ef96e2f43bc0be495 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Mon, 21 Oct 2024 12:33:38 +0200
Subject: backing-file: clean up the API

 - Pass iocb to ctx->end_write() instead of file + pos

 - Get rid of ctx->user_file, which is redundant most of the time

 - Instead pass iocb to backing_file_splice_read and
   backing_file_splice_write

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/backing-file.c            | 33 ++++++++++++++++-----------------
 fs/fuse/passthrough.c        | 32 ++++++++++++++++++--------------
 fs/overlayfs/file.c          | 22 +++++++++++++---------
 include/linux/backing-file.h | 11 +++++------
 4 files changed, 52 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/fs/backing-file.c b/fs/backing-file.c
index 09a9be945d45..a38737592ec7 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -80,7 +80,7 @@ struct backing_aio {
 	refcount_t ref;
 	struct kiocb *orig_iocb;
 	/* used for aio completion */
-	void (*end_write)(struct file *, loff_t, ssize_t);
+	void (*end_write)(struct kiocb *iocb, ssize_t);
 	struct work_struct work;
 	long res;
 };
@@ -108,10 +108,10 @@ static void backing_aio_cleanup(struct backing_aio *aio, long res)
 	struct kiocb *iocb = &aio->iocb;
 	struct kiocb *orig_iocb = aio->orig_iocb;
 
+	orig_iocb->ki_pos = iocb->ki_pos;
 	if (aio->end_write)
-		aio->end_write(orig_iocb->ki_filp, iocb->ki_pos, res);
+		aio->end_write(orig_iocb, res);
 
-	orig_iocb->ki_pos = iocb->ki_pos;
 	backing_aio_put(aio);
 }
 
@@ -200,7 +200,7 @@ out:
 	revert_creds(old_cred);
 
 	if (ctx->accessed)
-		ctx->accessed(ctx->user_file);
+		ctx->accessed(iocb->ki_filp);
 
 	return ret;
 }
@@ -219,7 +219,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
 	if (!iov_iter_count(iter))
 		return 0;
 
-	ret = file_remove_privs(ctx->user_file);
+	ret = file_remove_privs(iocb->ki_filp);
 	if (ret)
 		return ret;
 
@@ -239,7 +239,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
 
 		ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
 		if (ctx->end_write)
-			ctx->end_write(ctx->user_file, iocb->ki_pos, ret);
+			ctx->end_write(iocb, ret);
 	} else {
 		struct backing_aio *aio;
 
@@ -270,7 +270,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(backing_file_write_iter);
 
-ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
+ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb,
 				 struct pipe_inode_info *pipe, size_t len,
 				 unsigned int flags,
 				 struct backing_file_ctx *ctx)
@@ -282,19 +282,19 @@ ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
 		return -EIO;
 
 	old_cred = override_creds(ctx->cred);
-	ret = vfs_splice_read(in, ppos, pipe, len, flags);
+	ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
 	revert_creds(old_cred);
 
 	if (ctx->accessed)
-		ctx->accessed(ctx->user_file);
+		ctx->accessed(iocb->ki_filp);
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(backing_file_splice_read);
 
 ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
-				  struct file *out, loff_t *ppos, size_t len,
-				  unsigned int flags,
+				  struct file *out, struct kiocb *iocb,
+				  size_t len, unsigned int flags,
 				  struct backing_file_ctx *ctx)
 {
 	const struct cred *old_cred;
@@ -306,18 +306,18 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
 	if (!out->f_op->splice_write)
 		return -EINVAL;
 
-	ret = file_remove_privs(ctx->user_file);
+	ret = file_remove_privs(iocb->ki_filp);
 	if (ret)
 		return ret;
 
 	old_cred = override_creds(ctx->cred);
 	file_start_write(out);
-	ret = out->f_op->splice_write(pipe, out, ppos, len, flags);
+	ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
 	file_end_write(out);
 	revert_creds(old_cred);
 
 	if (ctx->end_write)
-		ctx->end_write(ctx->user_file, ppos ? *ppos : 0, ret);
+		ctx->end_write(iocb, ret);
 
 	return ret;
 }
@@ -329,8 +329,7 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
 	const struct cred *old_cred;
 	int ret;
 
-	if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) ||
-	    WARN_ON_ONCE(ctx->user_file != vma->vm_file))
+	if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
 		return -EIO;
 
 	if (!file->f_op->mmap)
@@ -343,7 +342,7 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
 	revert_creds(old_cred);
 
 	if (ctx->accessed)
-		ctx->accessed(ctx->user_file);
+		ctx->accessed(vma->vm_file);
 
 	return ret;
 }
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index bbac547dfcb3..607ef735ad4a 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -18,11 +18,11 @@ static void fuse_file_accessed(struct file *file)
 	fuse_invalidate_atime(inode);
 }
 
-static void fuse_passthrough_end_write(struct file *file, loff_t pos, ssize_t ret)
+static void fuse_passthrough_end_write(struct kiocb *iocb, ssize_t ret)
 {
-	struct inode *inode = file_inode(file);
+	struct inode *inode = file_inode(iocb->ki_filp);
 
-	fuse_write_update_attr(inode, pos, ret);
+	fuse_write_update_attr(inode, iocb->ki_pos, ret);
 }
 
 ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -34,7 +34,6 @@ ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	ssize_t ret;
 	struct backing_file_ctx ctx = {
 		.cred = ff->cred,
-		.user_file = file,
 		.accessed = fuse_file_accessed,
 	};
 
@@ -62,7 +61,6 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb,
 	ssize_t ret;
 	struct backing_file_ctx ctx = {
 		.cred = ff->cred,
-		.user_file = file,
 		.end_write = fuse_passthrough_end_write,
 	};
 
@@ -88,15 +86,20 @@ ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos,
 	struct file *backing_file = fuse_file_passthrough(ff);
 	struct backing_file_ctx ctx = {
 		.cred = ff->cred,
-		.user_file = in,
 		.accessed = fuse_file_accessed,
 	};
+	struct kiocb iocb;
+	ssize_t ret;
 
 	pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
-		 backing_file, ppos ? *ppos : 0, len, flags);
+		 backing_file, *ppos, len, flags);
 
-	return backing_file_splice_read(backing_file, ppos, pipe, len, flags,
-					&ctx);
+	init_sync_kiocb(&iocb, in);
+	iocb.ki_pos = *ppos;
+	ret = backing_file_splice_read(backing_file, &iocb, pipe, len, flags, &ctx);
+	*ppos = iocb.ki_pos;
+
+	return ret;
 }
 
 ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
@@ -109,16 +112,18 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
 	ssize_t ret;
 	struct backing_file_ctx ctx = {
 		.cred = ff->cred,
-		.user_file = out,
 		.end_write = fuse_passthrough_end_write,
 	};
+	struct kiocb iocb;
 
 	pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
-		 backing_file, ppos ? *ppos : 0, len, flags);
+		 backing_file, *ppos, len, flags);
 
 	inode_lock(inode);
-	ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags,
-					&ctx);
+	init_sync_kiocb(&iocb, out);
+	iocb.ki_pos = *ppos;
+	ret = backing_file_splice_write(pipe, backing_file, &iocb, len, flags, &ctx);
+	*ppos = iocb.ki_pos;
 	inode_unlock(inode);
 
 	return ret;
@@ -130,7 +135,6 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma)
 	struct file *backing_file = fuse_file_passthrough(ff);
 	struct backing_file_ctx ctx = {
 		.cred = ff->cred,
-		.user_file = file,
 		.accessed = fuse_file_accessed,
 	};
 
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 4444c78e2e0c..12c4d502ff91 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -231,9 +231,9 @@ static void ovl_file_modified(struct file *file)
 	ovl_copyattr(file_inode(file));
 }
 
-static void ovl_file_end_write(struct file *file, loff_t pos, ssize_t ret)
+static void ovl_file_end_write(struct kiocb *iocb, ssize_t ret)
 {
-	ovl_file_modified(file);
+	ovl_file_modified(iocb->ki_filp);
 }
 
 static void ovl_file_accessed(struct file *file)
@@ -271,7 +271,6 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	ssize_t ret;
 	struct backing_file_ctx ctx = {
 		.cred = ovl_creds(file_inode(file)->i_sb),
-		.user_file = file,
 		.accessed = ovl_file_accessed,
 	};
 
@@ -298,7 +297,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	int ifl = iocb->ki_flags;
 	struct backing_file_ctx ctx = {
 		.cred = ovl_creds(inode->i_sb),
-		.user_file = file,
 		.end_write = ovl_file_end_write,
 	};
 
@@ -338,15 +336,18 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
 	ssize_t ret;
 	struct backing_file_ctx ctx = {
 		.cred = ovl_creds(file_inode(in)->i_sb),
-		.user_file = in,
 		.accessed = ovl_file_accessed,
 	};
+	struct kiocb iocb;
 
 	ret = ovl_real_fdget(in, &real);
 	if (ret)
 		return ret;
 
-	ret = backing_file_splice_read(fd_file(real), ppos, pipe, len, flags, &ctx);
+	init_sync_kiocb(&iocb, in);
+	iocb.ki_pos = *ppos;
+	ret = backing_file_splice_read(fd_file(real), &iocb, pipe, len, flags, &ctx);
+	*ppos = iocb.ki_pos;
 	fdput(real);
 
 	return ret;
@@ -368,9 +369,9 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	ssize_t ret;
 	struct backing_file_ctx ctx = {
 		.cred = ovl_creds(inode->i_sb),
-		.user_file = out,
 		.end_write = ovl_file_end_write,
 	};
+	struct kiocb iocb;
 
 	inode_lock(inode);
 	/* Update mode */
@@ -380,9 +381,13 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	if (ret)
 		goto out_unlock;
 
-	ret = backing_file_splice_write(pipe, fd_file(real), ppos, len, flags, &ctx);
+	init_sync_kiocb(&iocb, out);
+	iocb.ki_pos = *ppos;
+	ret = backing_file_splice_write(pipe, fd_file(real), &iocb, len, flags, &ctx);
+	*ppos = iocb.ki_pos;
 	fdput(real);
 
+
 out_unlock:
 	inode_unlock(inode);
 
@@ -420,7 +425,6 @@ static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
 	struct file *realfile = file->private_data;
 	struct backing_file_ctx ctx = {
 		.cred = ovl_creds(file_inode(file)->i_sb),
-		.user_file = file,
 		.accessed = ovl_file_accessed,
 	};
 
diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 2eed0ffb5e8f..1476a6ed1bfd 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -14,9 +14,8 @@
 
 struct backing_file_ctx {
 	const struct cred *cred;
-	struct file *user_file;
-	void (*accessed)(struct file *);
-	void (*end_write)(struct file *, loff_t, ssize_t);
+	void (*accessed)(struct file *file);
+	void (*end_write)(struct kiocb *iocb, ssize_t);
 };
 
 struct file *backing_file_open(const struct path *user_path, int flags,
@@ -31,13 +30,13 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
 ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
 				struct kiocb *iocb, int flags,
 				struct backing_file_ctx *ctx);
-ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
+ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb,
 				 struct pipe_inode_info *pipe, size_t len,
 				 unsigned int flags,
 				 struct backing_file_ctx *ctx);
 ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
-				  struct file *out, loff_t *ppos, size_t len,
-				  unsigned int flags,
+				  struct file *out, struct kiocb *iocb,
+				  size_t len, unsigned int flags,
 				  struct backing_file_ctx *ctx);
 int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
 		      struct backing_file_ctx *ctx);
-- 
cgit v1.2.3


From 49dffdfde462c7823de6ed882f71ce233aaeba63 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Wed, 6 Nov 2024 16:57:17 -0800
Subject: cred: Add a light version of override/revert_creds()

Add a light version of override/revert_creds(), this should only be
used when the credentials in question will outlive the critical
section and the critical section doesn't change the ->usage of the
credentials.

Suggested-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 include/linux/cred.h | 18 ++++++++++++++++++
 kernel/cred.c        |  6 +++---
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 2976f534a7a3..e4a3155fe409 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -172,6 +172,24 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
 					  cred->cap_inheritable));
 }
 
+/*
+ * Override creds without bumping reference count. Caller must ensure
+ * reference remains valid or has taken reference. Almost always not the
+ * interface you want. Use override_creds()/revert_creds() instead.
+ */
+static inline const struct cred *override_creds_light(const struct cred *override_cred)
+{
+	const struct cred *old = current->cred;
+
+	rcu_assign_pointer(current->cred, override_cred);
+	return old;
+}
+
+static inline void revert_creds_light(const struct cred *revert_cred)
+{
+	rcu_assign_pointer(current->cred, revert_cred);
+}
+
 /**
  * get_new_cred_many - Get references on a new set of credentials
  * @cred: The new credentials to reference
diff --git a/kernel/cred.c b/kernel/cred.c
index 075cfa7c896f..da7da250f7c8 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -485,7 +485,7 @@ EXPORT_SYMBOL(abort_creds);
  */
 const struct cred *override_creds(const struct cred *new)
 {
-	const struct cred *old = current->cred;
+	const struct cred *old;
 
 	kdebug("override_creds(%p{%ld})", new,
 	       atomic_long_read(&new->usage));
@@ -499,7 +499,7 @@ const struct cred *override_creds(const struct cred *new)
 	 * visible to other threads under RCU.
 	 */
 	get_new_cred((struct cred *)new);
-	rcu_assign_pointer(current->cred, new);
+	old = override_creds_light(new);
 
 	kdebug("override_creds() = %p{%ld}", old,
 	       atomic_long_read(&old->usage));
@@ -521,7 +521,7 @@ void revert_creds(const struct cred *old)
 	kdebug("revert_creds(%p{%ld})", old,
 	       atomic_long_read(&old->usage));
 
-	rcu_assign_pointer(current->cred, old);
+	revert_creds_light(old);
 	put_cred(override);
 }
 EXPORT_SYMBOL(revert_creds);
-- 
cgit v1.2.3


From 5c2e7736e20d9b348a44cafbfa639fe2653fbc34 Mon Sep 17 00:00:00 2001
From: Eder Zulian <ezulian@redhat.com>
Date: Thu, 7 Nov 2024 17:32:23 +0100
Subject: rust: helpers: Avoid raw_spin_lock initialization for PREEMPT_RT

When PREEMPT_RT=y, spin locks are mapped to rt_mutex types, so using
spinlock_check() + __raw_spin_lock_init() to initialize spin locks is
incorrect, and would cause build errors.

Introduce __spin_lock_init() to initialize a spin lock with lockdep
rquired information for PREEMPT_RT builds, and use it in the Rust
helper.

Fixes: d2d6422f8bd1 ("x86: Allow to enable PREEMPT_RT.")
Closes: https://lore.kernel.org/oe-kbuild-all/202409251238.vetlgXE9-lkp@intel.com/
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Eder Zulian <ezulian@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Tested-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://lore.kernel.org/r/20241107163223.2092690-2-ezulian@redhat.com
---
 include/linux/spinlock_rt.h | 15 +++++++--------
 rust/helpers/spinlock.c     |  8 ++++++--
 2 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
index f9f14e135be7..f6499c37157d 100644
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -16,22 +16,21 @@ static inline void __rt_spin_lock_init(spinlock_t *lock, const char *name,
 }
 #endif
 
-#define spin_lock_init(slock)					\
+#define __spin_lock_init(slock, name, key, percpu)		\
 do {								\
-	static struct lock_class_key __key;			\
-								\
 	rt_mutex_base_init(&(slock)->lock);			\
-	__rt_spin_lock_init(slock, #slock, &__key, false);	\
+	__rt_spin_lock_init(slock, name, key, percpu);		\
 } while (0)
 
-#define local_spin_lock_init(slock)				\
+#define _spin_lock_init(slock, percpu)				\
 do {								\
 	static struct lock_class_key __key;			\
-								\
-	rt_mutex_base_init(&(slock)->lock);			\
-	__rt_spin_lock_init(slock, #slock, &__key, true);	\
+	__spin_lock_init(slock, #slock, &__key, percpu);	\
 } while (0)
 
+#define spin_lock_init(slock)		_spin_lock_init(slock, false)
+#define local_spin_lock_init(slock)	_spin_lock_init(slock, true)
+
 extern void rt_spin_lock(spinlock_t *lock) __acquires(lock);
 extern void rt_spin_lock_nested(spinlock_t *lock, int subclass)	__acquires(lock);
 extern void rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock) __acquires(lock);
diff --git a/rust/helpers/spinlock.c b/rust/helpers/spinlock.c
index acc1376b833c..92f7fc418425 100644
--- a/rust/helpers/spinlock.c
+++ b/rust/helpers/spinlock.c
@@ -7,10 +7,14 @@ void rust_helper___spin_lock_init(spinlock_t *lock, const char *name,
 				  struct lock_class_key *key)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
+# if defined(CONFIG_PREEMPT_RT)
+	__spin_lock_init(lock, name, key, false);
+# else /*!CONFIG_PREEMPT_RT */
 	__raw_spin_lock_init(spinlock_check(lock), name, key, LD_WAIT_CONFIG);
-#else
+# endif /* CONFIG_PREEMPT_RT */
+#else /* !CONFIG_DEBUG_SPINLOCK */
 	spin_lock_init(lock);
-#endif
+#endif /* CONFIG_DEBUG_SPINLOCK */
 }
 
 void rust_helper_spin_lock(spinlock_t *lock)
-- 
cgit v1.2.3


From c554aa9ca976480839af342204e05bb4ce8367d5 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Fri, 1 Nov 2024 22:13:33 +0100
Subject: uprobes: Re-order struct uprobe_task to save some space

On x86_64, with allmodconfig, struct uprobe_task is 72 bytes long, with a
hole and some padding.

	/* size: 72, cachelines: 2, members: 7 */
	/* sum members: 64, holes: 1, sum holes: 4 */
	/* padding: 4 */
	/* forced alignments: 1, forced holes: 1, sum forced holes: 4 */
	/* last cacheline: 8 bytes */

Reorder the structure to fill the hole and avoid the padding.

This way, the whole structure fits in a single cacheline and some memory is
saved when it is allocated.

	/* size: 64, cachelines: 1, members: 7 */
	/* forced alignments: 1 */

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/a9f541d0cedf421f765c77a1fb93d6a979778a88.1730495562.git.christophe.jaillet@wanadoo.fr
---
 include/linux/uprobes.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 7a051b5d2edd..e0a4c2082245 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -121,6 +121,9 @@ struct hprobe {
 struct uprobe_task {
 	enum uprobe_task_state		state;
 
+	unsigned int			depth;
+	struct return_instance		*return_instances;
+
 	union {
 		struct {
 			struct arch_uprobe_task	autask;
@@ -138,9 +141,6 @@ struct uprobe_task {
 	unsigned long			xol_vaddr;
 
 	struct arch_uprobe              *auprobe;
-
-	struct return_instance		*return_instances;
-	unsigned int			depth;
 };
 
 struct return_consumer {
-- 
cgit v1.2.3


From ed76c07c6885b249ce8486dac22fb97151a83185 Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Tue, 5 Nov 2024 16:45:08 -0300
Subject: printk: Introduce FORCE_CON flag

Introduce FORCE_CON flag to printk. The new flag will make it possible to
create a context where printk messages will never be suppressed.

This mechanism will be used in the next patch to create a force_con
context on sysrq handling, removing an existing workaround on the
loglevel global variable. The workaround existed to make sure that sysrq
header messages were sent to all consoles, but this doesn't work with
deferred messages because the loglevel might be restored to its original
value before a console flushes the messages.

Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20241105-printk-loud-con-v2-1-bd3ecdf7b0e4@suse.com
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/printk.h      |  3 +++
 kernel/printk/internal.h    |  3 +++
 kernel/printk/printk.c      | 21 ++++++++++++++++-----
 kernel/printk/printk_safe.c | 18 ++++++++++++++++++
 4 files changed, 40 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index eca9bb2ee637..232e5fd06701 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -166,6 +166,9 @@ __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...);
 extern void __printk_deferred_enter(void);
 extern void __printk_deferred_exit(void);
 
+extern void printk_force_console_enter(void);
+extern void printk_force_console_exit(void);
+
 /*
  * The printk_deferred_enter/exit macros are available only as a hack for
  * some code paths that need to defer all printk console printing. Interrupts
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 3fcb48502adb..c6bb47666aef 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -53,6 +53,8 @@ int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
 
 /* Flags for a single printk record. */
 enum printk_info_flags {
+	/* always show on console, ignore console_loglevel */
+	LOG_FORCE_CON	= 1,
 	LOG_NEWLINE	= 2,	/* text ended with a newline */
 	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
 };
@@ -90,6 +92,7 @@ bool printk_percpu_data_ready(void);
 
 void defer_console_output(void);
 bool is_printk_legacy_deferred(void);
+bool is_printk_force_console(void);
 
 u16 printk_parse_prefix(const char *text, int *level,
 			enum printk_info_flags *flags);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 71e4fe6f9b85..7cb44f9f3825 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1319,11 +1319,11 @@ static void boot_delay_msec(int level)
 {
 	unsigned long long k;
 	unsigned long timeout;
+	bool suppress = !is_printk_force_console() &&
+			suppress_message_printing(level);
 
-	if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING)
-		|| suppress_message_printing(level)) {
+	if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress)
 		return;
-	}
 
 	k = (unsigned long long)loops_per_msec * boot_delay;
 
@@ -2273,6 +2273,9 @@ int vprintk_store(int facility, int level,
 	if (dev_info)
 		flags |= LOG_NEWLINE;
 
+	if (is_printk_force_console())
+		flags |= LOG_FORCE_CON;
+
 	if (flags & LOG_CONT) {
 		prb_rec_init_wr(&r, reserve_size);
 		if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {
@@ -2280,6 +2283,9 @@ int vprintk_store(int facility, int level,
 						 facility, &flags, fmt, args);
 			r.info->text_len += text_len;
 
+			if (flags & LOG_FORCE_CON)
+				r.info->flags |= LOG_FORCE_CON;
+
 			if (flags & LOG_NEWLINE) {
 				r.info->flags |= LOG_NEWLINE;
 				prb_final_commit(&e);
@@ -2947,6 +2953,7 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
 	struct printk_info info;
 	struct printk_record r;
 	size_t len = 0;
+	bool force_con;
 
 	/*
 	 * Formatting extended messages requires a separate buffer, so use the
@@ -2965,9 +2972,13 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
 
 	pmsg->seq = r.info->seq;
 	pmsg->dropped = r.info->seq - seq;
+	force_con = r.info->flags & LOG_FORCE_CON;
 
-	/* Skip record that has level above the console loglevel. */
-	if (may_suppress && suppress_message_printing(r.info->level))
+	/*
+	 * Skip records that are not forced to be printed on consoles and that
+	 * has level above the console loglevel.
+	 */
+	if (!force_con && may_suppress && suppress_message_printing(r.info->level))
 		goto out;
 
 	if (is_extended) {
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 2b35a9d3919d..6f94418d53ff 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -12,6 +12,24 @@
 
 #include "internal.h"
 
+/* Context where printk messages are never suppressed */
+static atomic_t force_con;
+
+void printk_force_console_enter(void)
+{
+	atomic_inc(&force_con);
+}
+
+void printk_force_console_exit(void)
+{
+	atomic_dec(&force_con);
+}
+
+bool is_printk_force_console(void)
+{
+	return atomic_read(&force_con);
+}
+
 static DEFINE_PER_CPU(int, printk_context);
 
 /* Can be preempted by NMI. */
-- 
cgit v1.2.3


From df3b8ca604f224eb4cd51669416ad4d607682273 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 4 Nov 2024 16:12:04 +0000
Subject: io_uring/cmd: let cmds to know about dying task

When the taks that submitted a request is dying, a task work for that
request might get run by a kernel thread or even worse by a half
dismantled task. We can't just cancel the task work without running the
callback as the cmd might need to do some clean up, so pass a flag
instead. If set, it's not safe to access any task resources and the
callback is expected to cancel the cmd ASAP.

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/io_uring_types.h | 1 +
 io_uring/uring_cmd.c           | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 4b9ba523978d..2ee5dc105b58 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -37,6 +37,7 @@ enum io_uring_cmd_flags {
 	/* set when uring wants to cancel a previously issued command */
 	IO_URING_F_CANCEL		= (1 << 11),
 	IO_URING_F_COMPAT		= (1 << 12),
+	IO_URING_F_TASK_DEAD		= (1 << 13),
 };
 
 struct io_wq_work_node {
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 39c3c816ec78..e2e8485932d6 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -119,9 +119,13 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
 static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+	unsigned int flags = IO_URING_F_COMPLETE_DEFER;
+
+	if (current->flags & (PF_EXITING | PF_KTHREAD))
+		flags |= IO_URING_F_TASK_DEAD;
 
 	/* task_work executor checks the deffered list completion */
-	ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER);
+	ioucmd->task_work_cb(ioucmd, flags);
 }
 
 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-- 
cgit v1.2.3


From 0ef2b9e698dbf9ba78f67952a747f35eb7060470 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 4 Nov 2024 07:26:31 +0100
Subject: block: lift bio_is_zone_append to bio.h

Make bio_is_zone_append globally available, because file systems need
to use to check for a zone append bio in their end_io handlers to deal
with the block layer emulation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20241104062647.91160-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk.h         |  9 ---------
 include/linux/bio.h | 17 +++++++++++++++++
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk.h b/block/blk.h
index 57fc035620d6..2c26abf505b8 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -457,11 +457,6 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
 {
 	return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
 }
-static inline bool bio_is_zone_append(struct bio *bio)
-{
-	return bio_op(bio) == REQ_OP_ZONE_APPEND ||
-		bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
-}
 void blk_zone_write_plug_bio_merged(struct bio *bio);
 void blk_zone_write_plug_init_request(struct request *rq);
 static inline void blk_zone_update_request_bio(struct request *rq,
@@ -510,10 +505,6 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
 {
 	return false;
 }
-static inline bool bio_is_zone_append(struct bio *bio)
-{
-	return false;
-}
 static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
 {
 }
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4a1bf43ca53d..60830a6a5939 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -675,6 +675,23 @@ static inline void bio_clear_polled(struct bio *bio)
 	bio->bi_opf &= ~REQ_POLLED;
 }
 
+/**
+ * bio_is_zone_append - is this a zone append bio?
+ * @bio:	bio to check
+ *
+ * Check if @bio is a zone append operation.  Core block layer code and end_io
+ * handlers must use this instead of an open coded REQ_OP_ZONE_APPEND check
+ * because the block layer can rewrite REQ_OP_ZONE_APPEND to REQ_OP_WRITE if
+ * it is not natively supported.
+ */
+static inline bool bio_is_zone_append(struct bio *bio)
+{
+	if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+		return false;
+	return bio_op(bio) == REQ_OP_ZONE_APPEND ||
+		bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
+}
+
 struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
 		unsigned int nr_pages, blk_opf_t opf, gfp_t gfp);
 struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new);
-- 
cgit v1.2.3


From f6b9a69a9e56b2083aca8a925fc1a28eb698e3ed Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 9 Nov 2024 15:14:29 -0800
Subject: bpf: Refactor active lock management

When bpf_spin_lock was introduced originally, there was deliberation on
whether to use an array of lock IDs, but since bpf_spin_lock is limited
to holding a single lock at any given time, we've been using a single ID
to identify the held lock.

In preparation for introducing spin locks that can be taken multiple
times, introduce support for acquiring multiple lock IDs. For this
purpose, reuse the acquired_refs array and store both lock and pointer
references. We tag the entry with REF_TYPE_PTR or REF_TYPE_LOCK to
disambiguate and find the relevant entry. The ptr field is used to track
the map_ptr or btf (for bpf_obj_new allocations) to ensure locks can be
matched with protected fields within the same "allocation", i.e.
bpf_obj_new object or map value.

The struct active_lock is changed to an int as the state is part of the
acquired_refs array, and we only need active_lock as a cheap way of
detecting lock presence.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20241109231430.2475236-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf_verifier.h |  53 ++++++++--------
 kernel/bpf/verifier.c        | 146 +++++++++++++++++++++++++++++++------------
 2 files changed, 132 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 4513372c5bc8..d84beed92ae4 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -48,22 +48,6 @@ enum bpf_reg_liveness {
 	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
 };
 
-/* For every reg representing a map value or allocated object pointer,
- * we consider the tuple of (ptr, id) for them to be unique in verifier
- * context and conside them to not alias each other for the purposes of
- * tracking lock state.
- */
-struct bpf_active_lock {
-	/* This can either be reg->map_ptr or reg->btf. If ptr is NULL,
-	 * there's no active lock held, and other fields have no
-	 * meaning. If non-NULL, it indicates that a lock is held and
-	 * id member has the reg->id of the register which can be >= 0.
-	 */
-	void *ptr;
-	/* This will be reg->id */
-	u32 id;
-};
-
 #define ITER_PREFIX "bpf_iter_"
 
 enum bpf_iter_state {
@@ -266,6 +250,13 @@ struct bpf_stack_state {
 };
 
 struct bpf_reference_state {
+	/* Each reference object has a type. Ensure REF_TYPE_PTR is zero to
+	 * default to pointer reference on zero initialization of a state.
+	 */
+	enum ref_state_type {
+		REF_TYPE_PTR = 0,
+		REF_TYPE_LOCK,
+	} type;
 	/* Track each reference created with a unique id, even if the same
 	 * instruction creates the reference multiple times (eg, via CALL).
 	 */
@@ -274,17 +265,23 @@ struct bpf_reference_state {
 	 * is used purely to inform the user of a reference leak.
 	 */
 	int insn_idx;
-	/* There can be a case like:
-	 * main (frame 0)
-	 *  cb (frame 1)
-	 *   func (frame 3)
-	 *    cb (frame 4)
-	 * Hence for frame 4, if callback_ref just stored boolean, it would be
-	 * impossible to distinguish nested callback refs. Hence store the
-	 * frameno and compare that to callback_ref in check_reference_leak when
-	 * exiting a callback function.
-	 */
-	int callback_ref;
+	union {
+		/* There can be a case like:
+		 * main (frame 0)
+		 *  cb (frame 1)
+		 *   func (frame 3)
+		 *    cb (frame 4)
+		 * Hence for frame 4, if callback_ref just stored boolean, it would be
+		 * impossible to distinguish nested callback refs. Hence store the
+		 * frameno and compare that to callback_ref in check_reference_leak when
+		 * exiting a callback function.
+		 */
+		int callback_ref;
+		/* Use to keep track of the source object of a lock, to ensure
+		 * it matches on unlock.
+		 */
+		void *ptr;
+	};
 };
 
 struct bpf_retval_range {
@@ -332,6 +329,7 @@ struct bpf_func_state {
 
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
+	int active_locks;
 	struct bpf_reference_state *refs;
 	/* The state of the stack. Each element of the array describes BPF_REG_SIZE
 	 * (i.e. 8) bytes worth of stack memory.
@@ -434,7 +432,6 @@ struct bpf_verifier_state {
 	u32 insn_idx;
 	u32 curframe;
 
-	struct bpf_active_lock active_lock;
 	bool speculative;
 	bool active_rcu_lock;
 	u32 active_preempt_lock;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 132fc172961f..d55ca27dc031 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1284,6 +1284,7 @@ static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_fun
 	if (!dst->refs)
 		return -ENOMEM;
 
+	dst->active_locks = src->active_locks;
 	dst->acquired_refs = src->acquired_refs;
 	return 0;
 }
@@ -1354,6 +1355,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
 	if (err)
 		return err;
 	id = ++env->id_gen;
+	state->refs[new_ofs].type = REF_TYPE_PTR;
 	state->refs[new_ofs].id = id;
 	state->refs[new_ofs].insn_idx = insn_idx;
 	state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
@@ -1361,6 +1363,25 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
 	return id;
 }
 
+static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type,
+			      int id, void *ptr)
+{
+	struct bpf_func_state *state = cur_func(env);
+	int new_ofs = state->acquired_refs;
+	int err;
+
+	err = resize_reference_state(state, state->acquired_refs + 1);
+	if (err)
+		return err;
+	state->refs[new_ofs].type = type;
+	state->refs[new_ofs].id = id;
+	state->refs[new_ofs].insn_idx = insn_idx;
+	state->refs[new_ofs].ptr = ptr;
+
+	state->active_locks++;
+	return 0;
+}
+
 /* release function corresponding to acquire_reference_state(). Idempotent. */
 static int release_reference_state(struct bpf_func_state *state, int ptr_id)
 {
@@ -1368,6 +1389,8 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
 
 	last_idx = state->acquired_refs - 1;
 	for (i = 0; i < state->acquired_refs; i++) {
+		if (state->refs[i].type != REF_TYPE_PTR)
+			continue;
 		if (state->refs[i].id == ptr_id) {
 			/* Cannot release caller references in callbacks */
 			if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
@@ -1383,6 +1406,45 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
 	return -EINVAL;
 }
 
+static int release_lock_state(struct bpf_func_state *state, int type, int id, void *ptr)
+{
+	int i, last_idx;
+
+	last_idx = state->acquired_refs - 1;
+	for (i = 0; i < state->acquired_refs; i++) {
+		if (state->refs[i].type != type)
+			continue;
+		if (state->refs[i].id == id && state->refs[i].ptr == ptr) {
+			if (last_idx && i != last_idx)
+				memcpy(&state->refs[i], &state->refs[last_idx],
+				       sizeof(*state->refs));
+			memset(&state->refs[last_idx], 0, sizeof(*state->refs));
+			state->acquired_refs--;
+			state->active_locks--;
+			return 0;
+		}
+	}
+	return -EINVAL;
+}
+
+static struct bpf_reference_state *find_lock_state(struct bpf_verifier_env *env, enum ref_state_type type,
+						   int id, void *ptr)
+{
+	struct bpf_func_state *state = cur_func(env);
+	int i;
+
+	for (i = 0; i < state->acquired_refs; i++) {
+		struct bpf_reference_state *s = &state->refs[i];
+
+		if (s->type == REF_TYPE_PTR || s->type != type)
+			continue;
+
+		if (s->id == id && s->ptr == ptr)
+			return s;
+	}
+	return NULL;
+}
+
 static void free_func_state(struct bpf_func_state *state)
 {
 	if (!state)
@@ -1453,8 +1515,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->active_preempt_lock = src->active_preempt_lock;
 	dst_state->in_sleepable = src->in_sleepable;
 	dst_state->curframe = src->curframe;
-	dst_state->active_lock.ptr = src->active_lock.ptr;
-	dst_state->active_lock.id = src->active_lock.id;
 	dst_state->branches = src->branches;
 	dst_state->parent = src->parent;
 	dst_state->first_insn_idx = src->first_insn_idx;
@@ -5442,7 +5502,7 @@ static bool in_sleepable(struct bpf_verifier_env *env)
 static bool in_rcu_cs(struct bpf_verifier_env *env)
 {
 	return env->cur_state->active_rcu_lock ||
-	       env->cur_state->active_lock.ptr ||
+	       cur_func(env)->active_locks ||
 	       !in_sleepable(env);
 }
 
@@ -7724,19 +7784,20 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
  * Since only one bpf_spin_lock is allowed the checks are simpler than
  * reg_is_refcounted() logic. The verifier needs to remember only
  * one spin_lock instead of array of acquired_refs.
- * cur_state->active_lock remembers which map value element or allocated
+ * cur_func(env)->active_locks remembers which map value element or allocated
  * object got locked and clears it after bpf_spin_unlock.
  */
 static int process_spin_lock(struct bpf_verifier_env *env, int regno,
 			     bool is_lock)
 {
 	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
-	struct bpf_verifier_state *cur = env->cur_state;
 	bool is_const = tnum_is_const(reg->var_off);
+	struct bpf_func_state *cur = cur_func(env);
 	u64 val = reg->var_off.value;
 	struct bpf_map *map = NULL;
 	struct btf *btf = NULL;
 	struct btf_record *rec;
+	int err;
 
 	if (!is_const) {
 		verbose(env,
@@ -7768,16 +7829,23 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
 		return -EINVAL;
 	}
 	if (is_lock) {
-		if (cur->active_lock.ptr) {
+		void *ptr;
+
+		if (map)
+			ptr = map;
+		else
+			ptr = btf;
+
+		if (cur->active_locks) {
 			verbose(env,
 				"Locking two bpf_spin_locks are not allowed\n");
 			return -EINVAL;
 		}
-		if (map)
-			cur->active_lock.ptr = map;
-		else
-			cur->active_lock.ptr = btf;
-		cur->active_lock.id = reg->id;
+		err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr);
+		if (err < 0) {
+			verbose(env, "Failed to acquire lock state\n");
+			return err;
+		}
 	} else {
 		void *ptr;
 
@@ -7786,20 +7854,17 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
 		else
 			ptr = btf;
 
-		if (!cur->active_lock.ptr) {
+		if (!cur->active_locks) {
 			verbose(env, "bpf_spin_unlock without taking a lock\n");
 			return -EINVAL;
 		}
-		if (cur->active_lock.ptr != ptr ||
-		    cur->active_lock.id != reg->id) {
+
+		if (release_lock_state(cur_func(env), REF_TYPE_LOCK, reg->id, ptr)) {
 			verbose(env, "bpf_spin_unlock of different lock\n");
 			return -EINVAL;
 		}
 
 		invalidate_non_owning_refs(env);
-
-		cur->active_lock.ptr = NULL;
-		cur->active_lock.id = 0;
 	}
 	return 0;
 }
@@ -9861,7 +9926,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		const char *sub_name = subprog_name(env, subprog);
 
 		/* Only global subprogs cannot be called with a lock held. */
-		if (env->cur_state->active_lock.ptr) {
+		if (cur_func(env)->active_locks) {
 			verbose(env, "global function calls are not allowed while holding a lock,\n"
 				     "use static function instead\n");
 			return -EINVAL;
@@ -10386,6 +10451,8 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
 		return 0;
 
 	for (i = 0; i < state->acquired_refs; i++) {
+		if (state->refs[i].type != REF_TYPE_PTR)
+			continue;
 		if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
 			continue;
 		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
@@ -10399,7 +10466,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit
 {
 	int err;
 
-	if (check_lock && env->cur_state->active_lock.ptr) {
+	if (check_lock && cur_func(env)->active_locks) {
 		verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix);
 		return -EINVAL;
 	}
@@ -11620,10 +11687,9 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
 
 static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
-	struct bpf_verifier_state *state = env->cur_state;
 	struct btf_record *rec = reg_btf_record(reg);
 
-	if (!state->active_lock.ptr) {
+	if (!cur_func(env)->active_locks) {
 		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
 		return -EFAULT;
 	}
@@ -11720,6 +11786,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o
  */
 static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
+	struct bpf_reference_state *s;
 	void *ptr;
 	u32 id;
 
@@ -11736,10 +11803,10 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
 	}
 	id = reg->id;
 
-	if (!env->cur_state->active_lock.ptr)
+	if (!cur_func(env)->active_locks)
 		return -EINVAL;
-	if (env->cur_state->active_lock.ptr != ptr ||
-	    env->cur_state->active_lock.id != id) {
+	s = find_lock_state(env, REF_TYPE_LOCK, id, ptr);
+	if (!s) {
 		verbose(env, "held lock and object are not in the same allocation\n");
 		return -EINVAL;
 	}
@@ -17635,8 +17702,22 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
 		return false;
 
 	for (i = 0; i < old->acquired_refs; i++) {
-		if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap))
+		if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
+		    old->refs[i].type != cur->refs[i].type)
+			return false;
+		switch (old->refs[i].type) {
+		case REF_TYPE_PTR:
+			if (old->refs[i].callback_ref != cur->refs[i].callback_ref)
+				return false;
+			break;
+		case REF_TYPE_LOCK:
+			if (old->refs[i].ptr != cur->refs[i].ptr)
+				return false;
+			break;
+		default:
+			WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
 			return false;
+		}
 	}
 
 	return true;
@@ -17714,19 +17795,6 @@ static bool states_equal(struct bpf_verifier_env *env,
 	if (old->speculative && !cur->speculative)
 		return false;
 
-	if (old->active_lock.ptr != cur->active_lock.ptr)
-		return false;
-
-	/* Old and cur active_lock's have to be either both present
-	 * or both absent.
-	 */
-	if (!!old->active_lock.id != !!cur->active_lock.id)
-		return false;
-
-	if (old->active_lock.id &&
-	    !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch))
-		return false;
-
 	if (old->active_rcu_lock != cur->active_rcu_lock)
 		return false;
 
@@ -18625,7 +18693,7 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
-				if (env->cur_state->active_lock.ptr) {
+				if (cur_func(env)->active_locks) {
 					if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
 					    (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
 					     (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
-- 
cgit v1.2.3


From ae6e3a273f590a2b64f14a9fab3546c3a8f44ed4 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 9 Nov 2024 15:14:30 -0800
Subject: bpf: Drop special callback reference handling

Logic to prevent callbacks from acquiring new references for the program
(i.e. leaving acquired references), and releasing caller references
(i.e. those acquired in parent frames) was introduced in commit
9d9d00ac29d0 ("bpf: Fix reference state management for synchronous callbacks").

This was necessary because back then, the verifier simulated each
callback once (that could potentially be executed N times, where N can
be zero). This meant that callbacks that left lingering resources or
cleared caller resources could do it more than once, operating on
undefined state or leaking memory.

With the fixes to callback verification in commit
ab5cfac139ab ("bpf: verify callbacks as if they are called unknown number of times"),
all of this extra logic is no longer necessary. Hence, drop it as part
of this commit.

Cc: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20241109231430.2475236-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf_verifier.h                     | 21 ++++----------------
 kernel/bpf/verifier.c                            | 25 +++++-------------------
 tools/testing/selftests/bpf/prog_tests/cb_refs.c |  4 ++--
 3 files changed, 11 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d84beed92ae4..3a74033d49c4 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -265,23 +265,10 @@ struct bpf_reference_state {
 	 * is used purely to inform the user of a reference leak.
 	 */
 	int insn_idx;
-	union {
-		/* There can be a case like:
-		 * main (frame 0)
-		 *  cb (frame 1)
-		 *   func (frame 3)
-		 *    cb (frame 4)
-		 * Hence for frame 4, if callback_ref just stored boolean, it would be
-		 * impossible to distinguish nested callback refs. Hence store the
-		 * frameno and compare that to callback_ref in check_reference_leak when
-		 * exiting a callback function.
-		 */
-		int callback_ref;
-		/* Use to keep track of the source object of a lock, to ensure
-		 * it matches on unlock.
-		 */
-		void *ptr;
-	};
+	/* Use to keep track of the source object of a lock, to ensure
+	 * it matches on unlock.
+	 */
+	void *ptr;
 };
 
 struct bpf_retval_range {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d55ca27dc031..9f5de8d4fbd0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1358,7 +1358,6 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
 	state->refs[new_ofs].type = REF_TYPE_PTR;
 	state->refs[new_ofs].id = id;
 	state->refs[new_ofs].insn_idx = insn_idx;
-	state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
 
 	return id;
 }
@@ -1392,9 +1391,6 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
 		if (state->refs[i].type != REF_TYPE_PTR)
 			continue;
 		if (state->refs[i].id == ptr_id) {
-			/* Cannot release caller references in callbacks */
-			if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
-				return -EINVAL;
 			if (last_idx && i != last_idx)
 				memcpy(&state->refs[i], &state->refs[last_idx],
 				       sizeof(*state->refs));
@@ -10267,17 +10263,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 		caller->regs[BPF_REG_0] = *r0;
 	}
 
-	/* callback_fn frame should have released its own additions to parent's
-	 * reference state at this point, or check_reference_leak would
-	 * complain, hence it must be the same as the caller. There is no need
-	 * to copy it back.
-	 */
-	if (!callee->in_callback_fn) {
-		/* Transfer references to the caller */
-		err = copy_reference_state(caller, callee);
-		if (err)
-			return err;
-	}
+	/* Transfer references to the caller */
+	err = copy_reference_state(caller, callee);
+	if (err)
+		return err;
 
 	/* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
 	 * there function call logic would reschedule callback visit. If iteration
@@ -10447,14 +10436,12 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
 	bool refs_lingering = false;
 	int i;
 
-	if (!exception_exit && state->frameno && !state->in_callback_fn)
+	if (!exception_exit && state->frameno)
 		return 0;
 
 	for (i = 0; i < state->acquired_refs; i++) {
 		if (state->refs[i].type != REF_TYPE_PTR)
 			continue;
-		if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
-			continue;
 		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
 			state->refs[i].id, state->refs[i].insn_idx);
 		refs_lingering = true;
@@ -17707,8 +17694,6 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
 			return false;
 		switch (old->refs[i].type) {
 		case REF_TYPE_PTR:
-			if (old->refs[i].callback_ref != cur->refs[i].callback_ref)
-				return false;
 			break;
 		case REF_TYPE_LOCK:
 			if (old->refs[i].ptr != cur->refs[i].ptr)
diff --git a/tools/testing/selftests/bpf/prog_tests/cb_refs.c b/tools/testing/selftests/bpf/prog_tests/cb_refs.c
index 3bff680de16c..c40df623a8f7 100644
--- a/tools/testing/selftests/bpf/prog_tests/cb_refs.c
+++ b/tools/testing/selftests/bpf/prog_tests/cb_refs.c
@@ -11,8 +11,8 @@ struct {
 	const char *prog_name;
 	const char *err_msg;
 } cb_refs_tests[] = {
-	{ "underflow_prog", "reference has not been acquired before" },
-	{ "leak_prog", "Unreleased reference" },
+	{ "underflow_prog", "must point to scalar, or struct with scalar" },
+	{ "leak_prog", "Possibly NULL pointer passed to helper arg2" },
 	{ "nested_cb", "Unreleased reference id=4 alloc_insn=2" }, /* alloc_insn=2{4,5} */
 	{ "non_cb_transfer_ref", "Unreleased reference id=4 alloc_insn=1" }, /* alloc_insn=1{1,2} */
 };
-- 
cgit v1.2.3


From 559218d43ec9dde3d2847c7aa127e88d6ab1c9ed Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 8 Nov 2024 16:46:51 +0100
Subject: block: pre-calculate max_zone_append_sectors

max_zone_append_sectors differs from all other queue limits in that the
final value used is not stored in the queue_limits but needs to be
obtained using queue_limits_max_zone_append_sectors helper.  This not
only adds (tiny) extra overhead to the I/O path, but also can be easily
forgotten in file system code.

Add a new max_hw_zone_append_sectors value to queue_limits which is
set by the driver, and calculate max_zone_append_sectors from that and
the other inputs in blk_validate_zoned_limits, similar to how
max_sectors is calculated to fix this.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20241104073955.112324-3-hch@lst.de
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20241108154657.845768-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c               |  2 +-
 block/blk-merge.c              |  3 +--
 block/blk-settings.c           | 27 +++++++++++++--------------
 block/blk-sysfs.c              | 17 +++--------------
 drivers/block/null_blk/zoned.c |  2 +-
 drivers/block/ublk_drv.c       |  2 +-
 drivers/block/virtio_blk.c     |  2 +-
 drivers/md/dm-zone.c           |  4 ++--
 drivers/nvme/host/multipath.c  |  2 +-
 drivers/nvme/host/zns.c        |  2 +-
 drivers/scsi/sd_zbc.c          |  2 --
 include/linux/blkdev.h         | 21 +++------------------
 12 files changed, 28 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 4f791a3114a1..0387172e8259 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -607,7 +607,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
 		return BLK_STS_IOERR;
 
 	/* Make sure the BIO is small enough and will not get split */
-	if (nr_sectors > queue_max_zone_append_sectors(q))
+	if (nr_sectors > q->limits.max_zone_append_sectors)
 		return BLK_STS_IOERR;
 
 	bio->bi_opf |= REQ_NOMERGE;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 859875a5ee90..7b0af8317c1c 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -392,11 +392,10 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 struct bio *bio_split_zone_append(struct bio *bio,
 		const struct queue_limits *lim, unsigned *nr_segs)
 {
-	unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim);
 	int split_sectors;
 
 	split_sectors = bio_split_rw_at(bio, lim, nr_segs,
-			max_sectors << SECTOR_SHIFT);
+			lim->max_zone_append_sectors << SECTOR_SHIFT);
 	if (WARN_ON_ONCE(split_sectors > 0))
 		split_sectors = -EINVAL;
 	return bio_submit_split(bio, split_sectors);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5ee3d6d1448d..7d6b296997c2 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -50,7 +50,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
 	lim->max_sectors = UINT_MAX;
 	lim->max_dev_sectors = UINT_MAX;
 	lim->max_write_zeroes_sectors = UINT_MAX;
-	lim->max_zone_append_sectors = UINT_MAX;
+	lim->max_hw_zone_append_sectors = UINT_MAX;
 	lim->max_user_discard_sectors = UINT_MAX;
 }
 EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -91,17 +91,16 @@ static int blk_validate_zoned_limits(struct queue_limits *lim)
 	if (lim->zone_write_granularity < lim->logical_block_size)
 		lim->zone_write_granularity = lim->logical_block_size;
 
-	if (lim->max_zone_append_sectors) {
-		/*
-		 * The Zone Append size is limited by the maximum I/O size
-		 * and the zone size given that it can't span zones.
-		 */
-		lim->max_zone_append_sectors =
-			min3(lim->max_hw_sectors,
-			     lim->max_zone_append_sectors,
-			     lim->chunk_sectors);
-	}
-
+	/*
+	 * The Zone Append size is limited by the maximum I/O size and the zone
+	 * size given that it can't span zones.
+	 *
+	 * If no max_hw_zone_append_sectors limit is provided, the block layer
+	 * will emulated it, else we're also bound by the hardware limit.
+	 */
+	lim->max_zone_append_sectors =
+		min_not_zero(lim->max_hw_zone_append_sectors,
+			min(lim->chunk_sectors, lim->max_hw_sectors));
 	return 0;
 }
 
@@ -527,8 +526,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
 	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
 					b->max_write_zeroes_sectors);
-	t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
-					 queue_limits_max_zone_append_sectors(b));
+	t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
+					b->max_hw_zone_append_sectors);
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
 					    b->seg_boundary_mask);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0ef4e13e247d..d80a202cd170 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -131,6 +131,7 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
 
 #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field)			\
 static ssize_t queue_##_field##_show(struct gendisk *disk, char *page)	\
@@ -178,18 +179,6 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
 	return ret;
 }
 
-/*
- * For zone append queue_max_zone_append_sectors does not just return the
- * underlying queue limits, but actually contains a calculation.  Because of
- * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here.
- */
-static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page)
-{
-	return sprintf(page, "%llu\n",
-		(u64)queue_max_zone_append_sectors(disk->queue) <<
-			SECTOR_SHIFT);
-}
-
 static ssize_t
 queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count)
 {
@@ -479,7 +468,7 @@ QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
 
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
-QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
 
 QUEUE_RO_ENTRY(queue_zoned, "zoned");
@@ -607,7 +596,7 @@ static struct attribute *queue_attrs[] = {
 	&queue_atomic_write_unit_max_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_max_write_zeroes_sectors_entry.attr,
-	&queue_zone_append_max_entry.attr,
+	&queue_max_zone_append_sectors_entry.attr,
 	&queue_zone_write_granularity_entry.attr,
 	&queue_rotational_entry.attr,
 	&queue_zoned_entry.attr,
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 9bc768b2ca56..0d5f9bf95229 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -166,7 +166,7 @@ int null_init_zoned_dev(struct nullb_device *dev,
 
 	lim->features |= BLK_FEAT_ZONED;
 	lim->chunk_sectors = dev->zone_size_sects;
-	lim->max_zone_append_sectors = dev->zone_append_max_sectors;
+	lim->max_hw_zone_append_sectors = dev->zone_append_max_sectors;
 	lim->max_open_zones = dev->zone_max_open;
 	lim->max_active_zones = dev->zone_max_active;
 	return 0;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 4ae4fdb8bb7f..c6d18cd8af44 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2279,7 +2279,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 		lim.features |= BLK_FEAT_ZONED;
 		lim.max_active_zones = p->max_active_zones;
 		lim.max_open_zones =  p->max_open_zones;
-		lim.max_zone_append_sectors = p->max_zone_append_sectors;
+		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
 	}
 
 	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 194417abc105..0e99a4714928 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -784,7 +784,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk *vblk,
 			wg, v);
 		return -ENODEV;
 	}
-	lim->max_zone_append_sectors = v;
+	lim->max_hw_zone_append_sectors = v;
 	dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
 
 	return 0;
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index c0d41c36e06e..20edd3fabbab 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -344,7 +344,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
 		clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
 	} else {
 		set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
-		lim->max_zone_append_sectors = 0;
+		lim->max_hw_zone_append_sectors = 0;
 	}
 
 	/*
@@ -379,7 +379,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
 	if (!zlim.mapped_nr_seq_zones) {
 		lim->max_open_zones = 0;
 		lim->max_active_zones = 0;
-		lim->max_zone_append_sectors = 0;
+		lim->max_hw_zone_append_sectors = 0;
 		lim->zone_write_granularity = 0;
 		lim->chunk_sectors = 0;
 		lim->features &= ~BLK_FEAT_ZONED;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 6a15873055b9..c26cb7d3a2e5 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -636,7 +636,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	if (head->ids.csi == NVME_CSI_ZNS)
 		lim.features |= BLK_FEAT_ZONED;
 	else
-		lim.max_zone_append_sectors = 0;
+		lim.max_hw_zone_append_sectors = 0;
 
 	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
 	if (IS_ERR(head->disk))
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 9a06f9d98cd6..382949e18c6a 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -111,7 +111,7 @@ void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
 	lim->features |= BLK_FEAT_ZONED;
 	lim->max_open_zones = zi->max_open_zones;
 	lim->max_active_zones = zi->max_active_zones;
-	lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
+	lim->max_hw_zone_append_sectors = ns->ctrl->max_zone_append;
 	lim->chunk_sectors = ns->head->zsze =
 		nvme_lba_to_sect(ns->head, zi->zone_size);
 }
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index ee2b74238758..de5c54c057ec 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -634,8 +634,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim,
 		lim->max_open_zones = sdkp->zones_max_open;
 	lim->max_active_zones = 0;
 	lim->chunk_sectors = logical_to_sectors(sdkp->device, zone_blocks);
-	/* Enable block layer zone append emulation */
-	lim->max_zone_append_sectors = 0;
 
 	return 0;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1b51a7c92e9b..65f37ae70712 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -375,6 +375,7 @@ struct queue_limits {
 	unsigned int		max_user_discard_sectors;
 	unsigned int		max_secure_erase_sectors;
 	unsigned int		max_write_zeroes_sectors;
+	unsigned int		max_hw_zone_append_sectors;
 	unsigned int		max_zone_append_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
@@ -1208,25 +1209,9 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
 	return q->limits.max_segment_size;
 }
 
-static inline unsigned int
-queue_limits_max_zone_append_sectors(const struct queue_limits *l)
-{
-	unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
-
-	return min_not_zero(l->max_zone_append_sectors, max_sectors);
-}
-
-static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
-{
-	if (!blk_queue_is_zoned(q))
-		return 0;
-
-	return queue_limits_max_zone_append_sectors(&q->limits);
-}
-
 static inline bool queue_emulates_zone_append(struct request_queue *q)
 {
-	return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+	return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors;
 }
 
 static inline bool bdev_emulates_zone_append(struct block_device *bdev)
@@ -1237,7 +1222,7 @@ static inline bool bdev_emulates_zone_append(struct block_device *bdev)
 static inline unsigned int
 bdev_max_zone_append_sectors(struct block_device *bdev)
 {
-	return queue_max_zone_append_sectors(bdev_get_queue(bdev));
+	return bdev_limits(bdev)->max_zone_append_sectors;
 }
 
 static inline unsigned int bdev_max_segments(struct block_device *bdev)
-- 
cgit v1.2.3


From 5a47c2080a7316f184107464e4f76737c0c05186 Mon Sep 17 00:00:00 2001
From: Guixin Liu <kanie@linux.alibaba.com>
Date: Wed, 6 Nov 2024 15:34:46 +0800
Subject: nvmet: support reservation feature

This patch implements the reservation feature, including:
  1. reservation register(register, unregister and replace).
  2. reservation acquire(acquire, preempt, preempt and abort).
  3. reservation release(release and clear).
  4. reservation report.
  5. set feature and get feature of reservation notify mask.
  6. get log page of reservation event.

Not supported:
  1. persistent reservation through power loss.

Test cases:
  Use nvme-cli and fio to test all implemented sub features:
  1. use nvme resv-register to register host a registrant or
     unregister or replace a new key.
  2. use nvme resv-acquire to set host to the holder, and use fio
     to send read and write io in all reservation type. And also
     test preempt and "preempt and abort".
  3. use nvme resv-report to show all registrants and reservation
     status.
  4. use nvme resv-release to release all registrants.
  5. use nvme get-log to get events generated by the preceding
     operations.

In addition, make reservation configurable, one can set ns to
support reservation before enable ns. The default of resv_enable
is false.

Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/Makefile      |    2 +-
 drivers/nvme/target/admin-cmd.c   |   24 +-
 drivers/nvme/target/configfs.c    |   27 +
 drivers/nvme/target/core.c        |   62 +-
 drivers/nvme/target/fabrics-cmd.c |    4 +-
 drivers/nvme/target/nvmet.h       |   65 ++-
 drivers/nvme/target/pr.c          | 1156 +++++++++++++++++++++++++++++++++++++
 include/linux/nvme.h              |    2 +-
 8 files changed, 1329 insertions(+), 13 deletions(-)
 create mode 100644 drivers/nvme/target/pr.c

(limited to 'include/linux')

diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index c402c44350b2..f2b025bbe10c 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_NVME_TARGET_FCLOOP)	+= nvme-fcloop.o
 obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
-			discovery.o io-cmd-file.o io-cmd-bdev.o
+			discovery.o io-cmd-file.o io-cmd-bdev.o pr.o
 nvmet-$(CONFIG_NVME_TARGET_DEBUGFS)	+= debugfs.o
 nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
 nvmet-$(CONFIG_BLK_DEV_ZONED)		+= zns.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 081f0473cd9e..19428745c795 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -176,6 +176,10 @@ static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log)
 	log->iocs[nvme_cmd_read] =
 	log->iocs[nvme_cmd_flush] =
 	log->iocs[nvme_cmd_dsm]	=
+	log->iocs[nvme_cmd_resv_acquire] =
+	log->iocs[nvme_cmd_resv_register] =
+	log->iocs[nvme_cmd_resv_release] =
+	log->iocs[nvme_cmd_resv_report] =
 		cpu_to_le32(NVME_CMD_EFFECTS_CSUPP);
 	log->iocs[nvme_cmd_write] =
 	log->iocs[nvme_cmd_write_zeroes] =
@@ -340,6 +344,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
 		return nvmet_execute_get_log_cmd_effects_ns(req);
 	case NVME_LOG_ANA:
 		return nvmet_execute_get_log_page_ana(req);
+	case NVME_LOG_RESERVATION:
+		return nvmet_execute_get_log_page_resv(req);
 	}
 	pr_debug("unhandled lid %d on qid %d\n",
 	       req->cmd->get_log_page.lid, req->sq->qid);
@@ -433,7 +439,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 	id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES);
 	id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES);
 	id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM |
-			NVME_CTRL_ONCS_WRITE_ZEROES);
+			NVME_CTRL_ONCS_WRITE_ZEROES |
+			NVME_CTRL_ONCS_RESERVATIONS);
 
 	/* XXX: don't report vwc if the underlying device is write through */
 	id->vwc = NVME_CTRL_VWC_PRESENT;
@@ -551,6 +558,15 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 	id->nmic = NVME_NS_NMIC_SHARED;
 	id->anagrpid = cpu_to_le32(req->ns->anagrpid);
 
+	if (req->ns->pr.enable)
+		id->rescap = NVME_PR_SUPPORT_WRITE_EXCLUSIVE |
+			NVME_PR_SUPPORT_EXCLUSIVE_ACCESS |
+			NVME_PR_SUPPORT_WRITE_EXCLUSIVE_REG_ONLY |
+			NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_REG_ONLY |
+			NVME_PR_SUPPORT_WRITE_EXCLUSIVE_ALL_REGS |
+			NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS |
+			NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF;
+
 	memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid));
 
 	id->lbaf[0].ds = req->ns->blksize_shift;
@@ -861,6 +877,9 @@ void nvmet_execute_set_features(struct nvmet_req *req)
 	case NVME_FEAT_WRITE_PROTECT:
 		status = nvmet_set_feat_write_protect(req);
 		break;
+	case NVME_FEAT_RESV_MASK:
+		status = nvmet_set_feat_resv_notif_mask(req, cdw11);
+		break;
 	default:
 		req->error_loc = offsetof(struct nvme_common_command, cdw10);
 		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
@@ -959,6 +978,9 @@ void nvmet_execute_get_features(struct nvmet_req *req)
 	case NVME_FEAT_WRITE_PROTECT:
 		status = nvmet_get_feat_write_protect(req);
 		break;
+	case NVME_FEAT_RESV_MASK:
+		status = nvmet_get_feat_resv_notif_mask(req);
+		break;
 	default:
 		req->error_loc =
 			offsetof(struct nvme_common_command, cdw10);
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 685e89b35d33..eeee9e9b854c 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -769,6 +769,32 @@ static ssize_t nvmet_ns_revalidate_size_store(struct config_item *item,
 
 CONFIGFS_ATTR_WO(nvmet_ns_, revalidate_size);
 
+static ssize_t nvmet_ns_resv_enable_show(struct config_item *item, char *page)
+{
+	return sysfs_emit(page, "%d\n", to_nvmet_ns(item)->pr.enable);
+}
+
+static ssize_t nvmet_ns_resv_enable_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_ns *ns = to_nvmet_ns(item);
+	bool val;
+
+	if (kstrtobool(page, &val))
+		return -EINVAL;
+
+	mutex_lock(&ns->subsys->lock);
+	if (ns->enabled) {
+		pr_err("the ns:%d is already enabled.\n", ns->nsid);
+		mutex_unlock(&ns->subsys->lock);
+		return -EINVAL;
+	}
+	ns->pr.enable = val;
+	mutex_unlock(&ns->subsys->lock);
+	return count;
+}
+CONFIGFS_ATTR(nvmet_ns_, resv_enable);
+
 static struct configfs_attribute *nvmet_ns_attrs[] = {
 	&nvmet_ns_attr_device_path,
 	&nvmet_ns_attr_device_nguid,
@@ -777,6 +803,7 @@ static struct configfs_attribute *nvmet_ns_attrs[] = {
 	&nvmet_ns_attr_enable,
 	&nvmet_ns_attr_buffered_io,
 	&nvmet_ns_attr_revalidate_size,
+	&nvmet_ns_attr_resv_enable,
 #ifdef CONFIG_PCI_P2PDMA
 	&nvmet_ns_attr_p2pmem,
 #endif
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 15b25f464e77..1f4e9989663b 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -611,6 +611,12 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
 	if (ret)
 		goto out_restore_subsys_maxnsid;
 
+	if (ns->pr.enable) {
+		ret = nvmet_pr_init_ns(ns);
+		if (ret)
+			goto out_remove_from_subsys;
+	}
+
 	subsys->nr_namespaces++;
 
 	nvmet_ns_changed(subsys, ns->nsid);
@@ -620,6 +626,8 @@ out_unlock:
 	mutex_unlock(&subsys->lock);
 	return ret;
 
+out_remove_from_subsys:
+	xa_erase(&subsys->namespaces, ns->nsid);
 out_restore_subsys_maxnsid:
 	subsys->max_nsid = nvmet_max_nsid(subsys);
 	percpu_ref_exit(&ns->ref);
@@ -663,6 +671,9 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 	wait_for_completion(&ns->disable_done);
 	percpu_ref_exit(&ns->ref);
 
+	if (ns->pr.enable)
+		nvmet_pr_exit_ns(ns);
+
 	mutex_lock(&subsys->lock);
 
 	subsys->nr_namespaces--;
@@ -754,6 +765,7 @@ static void nvmet_set_error(struct nvmet_req *req, u16 status)
 static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
 {
 	struct nvmet_ns *ns = req->ns;
+	struct nvmet_pr_per_ctrl_ref *pc_ref = req->pc_ref;
 
 	if (!req->sq->sqhd_disabled)
 		nvmet_update_sq_head(req);
@@ -766,6 +778,9 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
 	trace_nvmet_req_complete(req);
 
 	req->ops->queue_response(req);
+
+	if (pc_ref)
+		nvmet_pr_put_ns_pc_ref(pc_ref);
 	if (ns)
 		nvmet_put_namespace(ns);
 }
@@ -929,18 +944,39 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 		return ret;
 	}
 
+	if (req->ns->pr.enable) {
+		ret = nvmet_parse_pr_cmd(req);
+		if (!ret)
+			return ret;
+	}
+
 	switch (req->ns->csi) {
 	case NVME_CSI_NVM:
 		if (req->ns->file)
-			return nvmet_file_parse_io_cmd(req);
-		return nvmet_bdev_parse_io_cmd(req);
+			ret = nvmet_file_parse_io_cmd(req);
+		else
+			ret = nvmet_bdev_parse_io_cmd(req);
+		break;
 	case NVME_CSI_ZNS:
 		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
-			return nvmet_bdev_zns_parse_io_cmd(req);
-		return NVME_SC_INVALID_IO_CMD_SET;
+			ret = nvmet_bdev_zns_parse_io_cmd(req);
+		else
+			ret = NVME_SC_INVALID_IO_CMD_SET;
+		break;
 	default:
-		return NVME_SC_INVALID_IO_CMD_SET;
+		ret = NVME_SC_INVALID_IO_CMD_SET;
 	}
+	if (ret)
+		return ret;
+
+	if (req->ns->pr.enable) {
+		ret = nvmet_pr_check_cmd_access(req);
+		if (ret)
+			return ret;
+
+		ret = nvmet_pr_get_ns_pc_ref(req);
+	}
+	return ret;
 }
 
 bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
@@ -964,6 +1000,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 	req->ns = NULL;
 	req->error_loc = NVMET_NO_ERROR_LOC;
 	req->error_slba = 0;
+	req->pc_ref = NULL;
 
 	/* no support for fused commands yet */
 	if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
@@ -1015,6 +1052,8 @@ EXPORT_SYMBOL_GPL(nvmet_req_init);
 void nvmet_req_uninit(struct nvmet_req *req)
 {
 	percpu_ref_put(&req->sq->ref);
+	if (req->pc_ref)
+		nvmet_pr_put_ns_pc_ref(req->pc_ref);
 	if (req->ns)
 		nvmet_put_namespace(req->ns);
 }
@@ -1383,7 +1422,8 @@ static void nvmet_fatal_error_handler(struct work_struct *work)
 }
 
 u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
-		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
+		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp,
+		uuid_t *hostid)
 {
 	struct nvmet_subsys *subsys;
 	struct nvmet_ctrl *ctrl;
@@ -1462,6 +1502,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	}
 	ctrl->cntlid = ret;
 
+	uuid_copy(&ctrl->hostid, hostid);
+
 	/*
 	 * Discovery controllers may use some arbitrary high value
 	 * in order to cleanup stale discovery sessions
@@ -1478,6 +1520,9 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	nvmet_start_keep_alive_timer(ctrl);
 
 	mutex_lock(&subsys->lock);
+	ret = nvmet_ctrl_init_pr(ctrl);
+	if (ret)
+		goto init_pr_fail;
 	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
 	nvmet_setup_p2p_ns_map(ctrl, req);
 	nvmet_debugfs_ctrl_setup(ctrl);
@@ -1486,6 +1531,10 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	*ctrlp = ctrl;
 	return 0;
 
+init_pr_fail:
+	mutex_unlock(&subsys->lock);
+	nvmet_stop_keep_alive_timer(ctrl);
+	ida_free(&cntlid_ida, ctrl->cntlid);
 out_free_sqs:
 	kfree(ctrl->sqs);
 out_free_changed_ns_list:
@@ -1504,6 +1553,7 @@ static void nvmet_ctrl_free(struct kref *ref)
 	struct nvmet_subsys *subsys = ctrl->subsys;
 
 	mutex_lock(&subsys->lock);
+	nvmet_ctrl_destroy_pr(ctrl);
 	nvmet_release_p2p_ns_map(ctrl);
 	list_del(&ctrl->subsys_entry);
 	mutex_unlock(&subsys->lock);
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index c4b2eddd5666..28a84af1b4c0 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -245,12 +245,10 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
 	d->subsysnqn[NVMF_NQN_FIELD_LEN - 1] = '\0';
 	d->hostnqn[NVMF_NQN_FIELD_LEN - 1] = '\0';
 	status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
-				  le32_to_cpu(c->kato), &ctrl);
+				  le32_to_cpu(c->kato), &ctrl, &d->hostid);
 	if (status)
 		goto out;
 
-	uuid_copy(&ctrl->hostid, &d->hostid);
-
 	dhchap_status = nvmet_setup_auth(ctrl);
 	if (dhchap_status) {
 		pr_err("Failed to setup authentication, dhchap status %u\n",
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 190f55e6d753..ec379814b16c 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -20,6 +20,7 @@
 #include <linux/blkdev.h>
 #include <linux/radix-tree.h>
 #include <linux/t10-pi.h>
+#include <linux/kfifo.h>
 
 #define NVMET_DEFAULT_VS		NVME_VS(1, 3, 0)
 
@@ -30,6 +31,7 @@
 #define NVMET_MN_MAX_SIZE		40
 #define NVMET_SN_MAX_SIZE		20
 #define NVMET_FR_MAX_SIZE		8
+#define NVMET_PR_LOG_QUEUE_SIZE		64
 
 /*
  * Supported optional AENs:
@@ -56,6 +58,38 @@
 #define IPO_IATTR_CONNECT_SQE(x)	\
 	(cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
 
+struct nvmet_pr_registrant {
+	u64			rkey;
+	uuid_t			hostid;
+	enum nvme_pr_type	rtype;
+	struct list_head	entry;
+	struct rcu_head		rcu;
+};
+
+struct nvmet_pr {
+	bool			enable;
+	unsigned long		notify_mask;
+	atomic_t		generation;
+	struct nvmet_pr_registrant __rcu *holder;
+	/*
+	 * During the execution of the reservation command, mutual
+	 * exclusion is required throughout the process. However,
+	 * while waiting asynchronously for the 'per controller
+	 * percpu_ref' to complete before the 'preempt and abort'
+	 * command finishes, a semaphore is needed to ensure mutual
+	 * exclusion instead of a mutex.
+	 */
+	struct semaphore	pr_sem;
+	struct list_head	registrant_list;
+};
+
+struct nvmet_pr_per_ctrl_ref {
+	struct percpu_ref	ref;
+	struct completion	free_done;
+	struct completion	confirm_done;
+	uuid_t			hostid;
+};
+
 struct nvmet_ns {
 	struct percpu_ref	ref;
 	struct file		*bdev_file;
@@ -85,6 +119,8 @@ struct nvmet_ns {
 	int			pi_type;
 	int			metadata_size;
 	u8			csi;
+	struct nvmet_pr		pr;
+	struct xarray		pr_per_ctrl_refs;
 };
 
 static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
@@ -191,6 +227,13 @@ static inline bool nvmet_port_secure_channel_required(struct nvmet_port *port)
     return nvmet_port_disc_addr_treq_secure_channel(port) == NVMF_TREQ_REQUIRED;
 }
 
+struct nvmet_pr_log_mgr {
+	struct mutex		lock;
+	u64			lost_count;
+	u64			counter;
+	DECLARE_KFIFO(log_queue, struct nvme_pr_log, NVMET_PR_LOG_QUEUE_SIZE);
+};
+
 struct nvmet_ctrl {
 	struct nvmet_subsys	*subsys;
 	struct nvmet_sq		**sqs;
@@ -246,6 +289,7 @@ struct nvmet_ctrl {
 	u8			*dh_key;
 	size_t			dh_keysize;
 #endif
+	struct nvmet_pr_log_mgr pr_log_mgr;
 };
 
 struct nvmet_subsys {
@@ -396,6 +440,9 @@ struct nvmet_req {
 			struct work_struct	zmgmt_work;
 		} z;
 #endif /* CONFIG_BLK_DEV_ZONED */
+		struct {
+			struct work_struct	abort_work;
+		} r;
 	};
 	int			sg_cnt;
 	int			metadata_sg_cnt;
@@ -412,6 +459,7 @@ struct nvmet_req {
 	struct device		*p2p_client;
 	u16			error_loc;
 	u64			error_slba;
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
 };
 
 #define NVMET_MAX_MPOOL_BVEC		16
@@ -498,7 +546,8 @@ void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl);
 
 void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new);
 u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
-		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp);
+		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp,
+		uuid_t *hostid);
 struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn,
 				       const char *hostnqn, u16 cntlid,
 				       struct nvmet_req *req);
@@ -761,4 +810,18 @@ static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl)
 static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; }
 #endif
 
+int nvmet_pr_init_ns(struct nvmet_ns *ns);
+u16 nvmet_parse_pr_cmd(struct nvmet_req *req);
+u16 nvmet_pr_check_cmd_access(struct nvmet_req *req);
+int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl);
+void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl);
+void nvmet_pr_exit_ns(struct nvmet_ns *ns);
+void nvmet_execute_get_log_page_resv(struct nvmet_req *req);
+u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask);
+u16 nvmet_get_feat_resv_notif_mask(struct nvmet_req *req);
+u16 nvmet_pr_get_ns_pc_ref(struct nvmet_req *req);
+static inline void nvmet_pr_put_ns_pc_ref(struct nvmet_pr_per_ctrl_ref *pc_ref)
+{
+	percpu_ref_put(&pc_ref->ref);
+}
 #endif /* _NVMET_H */
diff --git a/drivers/nvme/target/pr.c b/drivers/nvme/target/pr.c
new file mode 100644
index 000000000000..25a02b50d9f3
--- /dev/null
+++ b/drivers/nvme/target/pr.c
@@ -0,0 +1,1156 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics Persist Reservation.
+ * Copyright (c) 2024 Guixin Liu, Alibaba Group.
+ * All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/unaligned.h>
+#include "nvmet.h"
+
+#define NVMET_PR_NOTIFI_MASK_ALL \
+	(1 << NVME_PR_NOTIFY_BIT_REG_PREEMPTED | \
+	 1 << NVME_PR_NOTIFY_BIT_RESV_RELEASED | \
+	 1 << NVME_PR_NOTIFY_BIT_RESV_PREEMPTED)
+
+static inline bool nvmet_pr_parse_ignore_key(u32 cdw10)
+{
+	/* Ignore existing key, bit 03. */
+	return (cdw10 >> 3) & 1;
+}
+
+static inline struct nvmet_ns *nvmet_pr_to_ns(struct nvmet_pr *pr)
+{
+	return container_of(pr, struct nvmet_ns, pr);
+}
+
+static struct nvmet_pr_registrant *
+nvmet_pr_find_registrant(struct nvmet_pr *pr, uuid_t *hostid)
+{
+	struct nvmet_pr_registrant *reg;
+
+	list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+		if (uuid_equal(&reg->hostid, hostid))
+			return reg;
+	}
+	return NULL;
+}
+
+u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask)
+{
+	u32 nsid = le32_to_cpu(req->cmd->common.nsid);
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_ns *ns;
+	unsigned long idx;
+	u16 status;
+
+	if (mask & ~(NVMET_PR_NOTIFI_MASK_ALL)) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw11);
+		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+	}
+
+	if (nsid != U32_MAX) {
+		status = nvmet_req_find_ns(req);
+		if (status)
+			return status;
+		if (!req->ns->pr.enable)
+			return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+
+		WRITE_ONCE(req->ns->pr.notify_mask, mask);
+		goto success;
+	}
+
+	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+		if (ns->pr.enable)
+			WRITE_ONCE(ns->pr.notify_mask, mask);
+	}
+
+success:
+	nvmet_set_result(req, mask);
+	return NVME_SC_SUCCESS;
+}
+
+u16 nvmet_get_feat_resv_notif_mask(struct nvmet_req *req)
+{
+	u16 status;
+
+	status = nvmet_req_find_ns(req);
+	if (status)
+		return status;
+
+	if (!req->ns->pr.enable)
+		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+
+	nvmet_set_result(req, READ_ONCE(req->ns->pr.notify_mask));
+	return status;
+}
+
+void nvmet_execute_get_log_page_resv(struct nvmet_req *req)
+{
+	struct nvmet_pr_log_mgr *log_mgr = &req->sq->ctrl->pr_log_mgr;
+	struct nvme_pr_log next_log = {0};
+	struct nvme_pr_log log = {0};
+	u16 status = NVME_SC_SUCCESS;
+	u64 lost_count;
+	u64 cur_count;
+	u64 next_count;
+
+	mutex_lock(&log_mgr->lock);
+	if (!kfifo_get(&log_mgr->log_queue, &log))
+		goto out;
+
+	/*
+	 * We can't get the last in kfifo.
+	 * Utilize the current count and the count from the next log to
+	 * calculate the number of lost logs, while also addressing cases
+	 * of overflow. If there is no subsequent log, the number of lost
+	 * logs is equal to the lost_count within the nvmet_pr_log_mgr.
+	 */
+	cur_count = le64_to_cpu(log.count);
+	if (kfifo_peek(&log_mgr->log_queue, &next_log)) {
+		next_count = le64_to_cpu(next_log.count);
+		if (next_count > cur_count)
+			lost_count = next_count - cur_count - 1;
+		else
+			lost_count = U64_MAX - cur_count + next_count - 1;
+	} else {
+		lost_count = log_mgr->lost_count;
+	}
+
+	log.count = cpu_to_le64((cur_count + lost_count) == 0 ?
+				1 : (cur_count + lost_count));
+	log_mgr->lost_count -= lost_count;
+
+	log.nr_pages = kfifo_len(&log_mgr->log_queue);
+
+out:
+	status = nvmet_copy_to_sgl(req, 0, &log, sizeof(log));
+	mutex_unlock(&log_mgr->lock);
+	nvmet_req_complete(req, status);
+}
+
+static void nvmet_pr_add_resv_log(struct nvmet_ctrl *ctrl, u8 log_type,
+				  u32 nsid)
+{
+	struct nvmet_pr_log_mgr *log_mgr = &ctrl->pr_log_mgr;
+	struct nvme_pr_log log = {0};
+
+	mutex_lock(&log_mgr->lock);
+	log_mgr->counter++;
+	if (log_mgr->counter == 0)
+		log_mgr->counter = 1;
+
+	log.count = cpu_to_le64(log_mgr->counter);
+	log.type = log_type;
+	log.nsid = cpu_to_le32(nsid);
+
+	if (!kfifo_put(&log_mgr->log_queue, log)) {
+		pr_info("a reservation log lost, cntlid:%d, log_type:%d, nsid:%d\n",
+			ctrl->cntlid, log_type, nsid);
+		log_mgr->lost_count++;
+	}
+
+	mutex_unlock(&log_mgr->lock);
+}
+
+static void nvmet_pr_resv_released(struct nvmet_pr *pr, uuid_t *hostid)
+{
+	struct nvmet_ns *ns = nvmet_pr_to_ns(pr);
+	struct nvmet_subsys *subsys = ns->subsys;
+	struct nvmet_ctrl *ctrl;
+
+	if (test_bit(NVME_PR_NOTIFY_BIT_RESV_RELEASED, &pr->notify_mask))
+		return;
+
+	mutex_lock(&subsys->lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		if (!uuid_equal(&ctrl->hostid, hostid) &&
+		    nvmet_pr_find_registrant(pr, &ctrl->hostid)) {
+			nvmet_pr_add_resv_log(ctrl,
+				NVME_PR_LOG_RESERVATION_RELEASED, ns->nsid);
+			nvmet_add_async_event(ctrl, NVME_AER_CSS,
+				NVME_AEN_RESV_LOG_PAGE_AVALIABLE,
+				NVME_LOG_RESERVATION);
+		}
+	}
+	mutex_unlock(&subsys->lock);
+}
+
+static void nvmet_pr_send_event_to_host(struct nvmet_pr *pr, uuid_t *hostid,
+					  u8 log_type)
+{
+	struct nvmet_ns *ns = nvmet_pr_to_ns(pr);
+	struct nvmet_subsys *subsys = ns->subsys;
+	struct nvmet_ctrl *ctrl;
+
+	mutex_lock(&subsys->lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		if (uuid_equal(hostid, &ctrl->hostid)) {
+			nvmet_pr_add_resv_log(ctrl, log_type, ns->nsid);
+			nvmet_add_async_event(ctrl, NVME_AER_CSS,
+				NVME_AEN_RESV_LOG_PAGE_AVALIABLE,
+				NVME_LOG_RESERVATION);
+		}
+	}
+	mutex_unlock(&subsys->lock);
+}
+
+static void nvmet_pr_resv_preempted(struct nvmet_pr *pr, uuid_t *hostid)
+{
+	if (test_bit(NVME_PR_NOTIFY_BIT_RESV_PREEMPTED, &pr->notify_mask))
+		return;
+
+	nvmet_pr_send_event_to_host(pr, hostid,
+		NVME_PR_LOG_RESERVATOIN_PREEMPTED);
+}
+
+static void nvmet_pr_registration_preempted(struct nvmet_pr *pr,
+					    uuid_t *hostid)
+{
+	if (test_bit(NVME_PR_NOTIFY_BIT_REG_PREEMPTED, &pr->notify_mask))
+		return;
+
+	nvmet_pr_send_event_to_host(pr, hostid,
+		NVME_PR_LOG_REGISTRATION_PREEMPTED);
+}
+
+static inline void nvmet_pr_set_new_holder(struct nvmet_pr *pr, u8 new_rtype,
+					   struct nvmet_pr_registrant *reg)
+{
+	reg->rtype = new_rtype;
+	rcu_assign_pointer(pr->holder, reg);
+}
+
+static u16 nvmet_pr_register(struct nvmet_req *req,
+			     struct nvmet_pr_register_data *d)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr_registrant *new, *reg;
+	struct nvmet_pr *pr = &req->ns->pr;
+	u16 status = NVME_SC_SUCCESS;
+	u64 nrkey = le64_to_cpu(d->nrkey);
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return NVME_SC_INTERNAL;
+
+	down(&pr->pr_sem);
+	reg = nvmet_pr_find_registrant(pr, &ctrl->hostid);
+	if (reg) {
+		if (reg->rkey != nrkey)
+			status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+		kfree(new);
+		goto out;
+	}
+
+	memset(new, 0, sizeof(*new));
+	INIT_LIST_HEAD(&new->entry);
+	new->rkey = nrkey;
+	uuid_copy(&new->hostid, &ctrl->hostid);
+	list_add_tail_rcu(&new->entry, &pr->registrant_list);
+
+out:
+	up(&pr->pr_sem);
+	return status;
+}
+
+static void nvmet_pr_unregister_one(struct nvmet_pr *pr,
+				    struct nvmet_pr_registrant *reg)
+{
+	struct nvmet_pr_registrant *first_reg;
+	struct nvmet_pr_registrant *holder;
+	u8 original_rtype;
+
+	list_del_rcu(&reg->entry);
+
+	holder = rcu_dereference_protected(pr->holder, 1);
+	if (reg != holder)
+		goto out;
+
+	original_rtype = holder->rtype;
+	if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS ||
+	    original_rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
+		first_reg = list_first_or_null_rcu(&pr->registrant_list,
+				struct nvmet_pr_registrant, entry);
+		if (first_reg)
+			first_reg->rtype = original_rtype;
+		rcu_assign_pointer(pr->holder, first_reg);
+	} else {
+		rcu_assign_pointer(pr->holder, NULL);
+
+		if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_REG_ONLY ||
+		    original_rtype == NVME_PR_EXCLUSIVE_ACCESS_REG_ONLY)
+			nvmet_pr_resv_released(pr, &reg->hostid);
+	}
+out:
+	kfree_rcu(reg, rcu);
+}
+
+static u16 nvmet_pr_unregister(struct nvmet_req *req,
+			       struct nvmet_pr_register_data *d,
+			       bool ignore_key)
+{
+	u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *reg;
+
+	down(&pr->pr_sem);
+	list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+		if (uuid_equal(&reg->hostid, &ctrl->hostid)) {
+			if (ignore_key || reg->rkey == le64_to_cpu(d->crkey)) {
+				status = NVME_SC_SUCCESS;
+				nvmet_pr_unregister_one(pr, reg);
+			}
+			break;
+		}
+	}
+	up(&pr->pr_sem);
+
+	return status;
+}
+
+static void nvmet_pr_update_reg_rkey(struct nvmet_pr_registrant *reg,
+				     void *attr)
+{
+	reg->rkey = *(u64 *)attr;
+}
+
+static u16 nvmet_pr_update_reg_attr(struct nvmet_pr *pr,
+			struct nvmet_pr_registrant *reg,
+			void (*change_attr)(struct nvmet_pr_registrant *reg,
+			void *attr),
+			void *attr)
+{
+	struct nvmet_pr_registrant *holder;
+	struct nvmet_pr_registrant *new;
+
+	holder = rcu_dereference_protected(pr->holder, 1);
+	if (reg != holder) {
+		change_attr(reg, attr);
+		return NVME_SC_SUCCESS;
+	}
+
+	new = kmalloc(sizeof(*new), GFP_ATOMIC);
+	if (!new)
+		return NVME_SC_INTERNAL;
+
+	new->rkey = holder->rkey;
+	new->rtype = holder->rtype;
+	uuid_copy(&new->hostid, &holder->hostid);
+	INIT_LIST_HEAD(&new->entry);
+
+	change_attr(new, attr);
+	list_replace_rcu(&holder->entry, &new->entry);
+	rcu_assign_pointer(pr->holder, new);
+	kfree_rcu(holder, rcu);
+
+	return NVME_SC_SUCCESS;
+}
+
+static u16 nvmet_pr_replace(struct nvmet_req *req,
+			    struct nvmet_pr_register_data *d,
+			    bool ignore_key)
+{
+	u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *reg;
+	u64 nrkey = le64_to_cpu(d->nrkey);
+
+	down(&pr->pr_sem);
+	list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+		if (uuid_equal(&reg->hostid, &ctrl->hostid)) {
+			if (ignore_key || reg->rkey == le64_to_cpu(d->crkey))
+				status = nvmet_pr_update_reg_attr(pr, reg,
+						nvmet_pr_update_reg_rkey,
+						&nrkey);
+			break;
+		}
+	}
+	up(&pr->pr_sem);
+	return status;
+}
+
+static void nvmet_execute_pr_register(struct nvmet_req *req)
+{
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+	bool ignore_key = nvmet_pr_parse_ignore_key(cdw10);
+	struct nvmet_pr_register_data *d;
+	u8 reg_act = cdw10 & 0x07; /* Reservation Register Action, bit 02:00 */
+	u16 status;
+
+	d = kmalloc(sizeof(*d), GFP_KERNEL);
+	if (!d) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+	if (status)
+		goto free_data;
+
+	switch (reg_act) {
+	case NVME_PR_REGISTER_ACT_REG:
+		status = nvmet_pr_register(req, d);
+		break;
+	case NVME_PR_REGISTER_ACT_UNREG:
+		status = nvmet_pr_unregister(req, d, ignore_key);
+		break;
+	case NVME_PR_REGISTER_ACT_REPLACE:
+		status = nvmet_pr_replace(req, d, ignore_key);
+		break;
+	default:
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
+		break;
+	}
+free_data:
+	kfree(d);
+out:
+	if (!status)
+		atomic_inc(&req->ns->pr.generation);
+	nvmet_req_complete(req, status);
+}
+
+static u16 nvmet_pr_acquire(struct nvmet_req *req,
+			    struct nvmet_pr_registrant *reg,
+			    u8 rtype)
+{
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *holder;
+
+	holder = rcu_dereference_protected(pr->holder, 1);
+	if (holder && reg != holder)
+		return  NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	if (holder && reg == holder) {
+		if (holder->rtype == rtype)
+			return NVME_SC_SUCCESS;
+		return NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	}
+
+	nvmet_pr_set_new_holder(pr, rtype, reg);
+	return NVME_SC_SUCCESS;
+}
+
+static void nvmet_pr_confirm_ns_pc_ref(struct percpu_ref *ref)
+{
+	struct nvmet_pr_per_ctrl_ref *pc_ref =
+		container_of(ref, struct nvmet_pr_per_ctrl_ref, ref);
+
+	complete(&pc_ref->confirm_done);
+}
+
+static void nvmet_pr_set_ctrl_to_abort(struct nvmet_req *req, uuid_t *hostid)
+{
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
+	struct nvmet_ns *ns = req->ns;
+	unsigned long idx;
+
+	xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) {
+		if (uuid_equal(&pc_ref->hostid, hostid)) {
+			percpu_ref_kill_and_confirm(&pc_ref->ref,
+						nvmet_pr_confirm_ns_pc_ref);
+			wait_for_completion(&pc_ref->confirm_done);
+		}
+	}
+}
+
+static u16 nvmet_pr_unreg_all_host_by_prkey(struct nvmet_req *req, u64 prkey,
+					    uuid_t *send_hostid,
+					    bool abort)
+{
+	u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	struct nvmet_pr_registrant *reg, *tmp;
+	struct nvmet_pr *pr = &req->ns->pr;
+	uuid_t hostid;
+
+	list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+		if (reg->rkey == prkey) {
+			status = NVME_SC_SUCCESS;
+			uuid_copy(&hostid, &reg->hostid);
+			if (abort)
+				nvmet_pr_set_ctrl_to_abort(req, &hostid);
+			nvmet_pr_unregister_one(pr, reg);
+			if (!uuid_equal(&hostid, send_hostid))
+				nvmet_pr_registration_preempted(pr, &hostid);
+		}
+	}
+	return status;
+}
+
+static void nvmet_pr_unreg_all_others_by_prkey(struct nvmet_req *req,
+					       u64 prkey,
+					       uuid_t *send_hostid,
+					       bool abort)
+{
+	struct nvmet_pr_registrant *reg, *tmp;
+	struct nvmet_pr *pr = &req->ns->pr;
+	uuid_t hostid;
+
+	list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+		if (reg->rkey == prkey &&
+		    !uuid_equal(&reg->hostid, send_hostid)) {
+			uuid_copy(&hostid, &reg->hostid);
+			if (abort)
+				nvmet_pr_set_ctrl_to_abort(req, &hostid);
+			nvmet_pr_unregister_one(pr, reg);
+			nvmet_pr_registration_preempted(pr, &hostid);
+		}
+	}
+}
+
+static void nvmet_pr_unreg_all_others(struct nvmet_req *req,
+				      uuid_t *send_hostid,
+				      bool abort)
+{
+	struct nvmet_pr_registrant *reg, *tmp;
+	struct nvmet_pr *pr = &req->ns->pr;
+	uuid_t hostid;
+
+	list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+		if (!uuid_equal(&reg->hostid, send_hostid)) {
+			uuid_copy(&hostid, &reg->hostid);
+			if (abort)
+				nvmet_pr_set_ctrl_to_abort(req, &hostid);
+			nvmet_pr_unregister_one(pr, reg);
+			nvmet_pr_registration_preempted(pr, &hostid);
+		}
+	}
+}
+
+static void nvmet_pr_update_holder_rtype(struct nvmet_pr_registrant *reg,
+					 void *attr)
+{
+	u8 new_rtype = *(u8 *)attr;
+
+	reg->rtype = new_rtype;
+}
+
+static u16 nvmet_pr_preempt(struct nvmet_req *req,
+			    struct nvmet_pr_registrant *reg,
+			    u8 rtype,
+			    struct nvmet_pr_acquire_data *d,
+			    bool abort)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *holder;
+	enum nvme_pr_type original_rtype;
+	u64 prkey = le64_to_cpu(d->prkey);
+	u16 status;
+
+	holder = rcu_dereference_protected(pr->holder, 1);
+	if (!holder)
+		return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
+					&ctrl->hostid, abort);
+
+	original_rtype = holder->rtype;
+	if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS ||
+	    original_rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
+		if (!prkey) {
+			/*
+			 * To prevent possible access from other hosts, and
+			 * avoid terminate the holder, set the new holder
+			 * first before unregistering.
+			 */
+			nvmet_pr_set_new_holder(pr, rtype, reg);
+			nvmet_pr_unreg_all_others(req, &ctrl->hostid, abort);
+			return NVME_SC_SUCCESS;
+		}
+		return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
+				&ctrl->hostid, abort);
+	}
+
+	if (holder == reg) {
+		status = nvmet_pr_update_reg_attr(pr, holder,
+				nvmet_pr_update_holder_rtype, &rtype);
+		if (!status && original_rtype != rtype)
+			nvmet_pr_resv_released(pr, &reg->hostid);
+		return status;
+	}
+
+	if (prkey == holder->rkey) {
+		/*
+		 * Same as before, set the new holder first.
+		 */
+		nvmet_pr_set_new_holder(pr, rtype, reg);
+		nvmet_pr_unreg_all_others_by_prkey(req, prkey, &ctrl->hostid,
+						abort);
+		if (original_rtype != rtype)
+			nvmet_pr_resv_released(pr, &reg->hostid);
+		return NVME_SC_SUCCESS;
+	}
+
+	if (prkey)
+		return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
+					&ctrl->hostid, abort);
+	return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+}
+
+static void nvmet_pr_do_abort(struct work_struct *w)
+{
+	struct nvmet_req *req = container_of(w, struct nvmet_req, r.abort_work);
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
+	struct nvmet_ns *ns = req->ns;
+	unsigned long idx;
+
+	/*
+	 * The target does not support abort, just wait per-controller ref to 0.
+	 */
+	xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) {
+		if (percpu_ref_is_dying(&pc_ref->ref)) {
+			wait_for_completion(&pc_ref->free_done);
+			reinit_completion(&pc_ref->confirm_done);
+			reinit_completion(&pc_ref->free_done);
+			percpu_ref_resurrect(&pc_ref->ref);
+		}
+	}
+
+	up(&ns->pr.pr_sem);
+	nvmet_req_complete(req, NVME_SC_SUCCESS);
+}
+
+static u16 __nvmet_execute_pr_acquire(struct nvmet_req *req,
+				      struct nvmet_pr_registrant *reg,
+				      u8 acquire_act,
+				      u8 rtype,
+				      struct nvmet_pr_acquire_data *d)
+{
+	u16 status;
+
+	switch (acquire_act) {
+	case NVME_PR_ACQUIRE_ACT_ACQUIRE:
+		status = nvmet_pr_acquire(req, reg, rtype);
+		goto out;
+	case NVME_PR_ACQUIRE_ACT_PREEMPT:
+		status = nvmet_pr_preempt(req, reg, rtype, d, false);
+		goto inc_gen;
+	case NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT:
+		status = nvmet_pr_preempt(req, reg, rtype, d, true);
+		goto inc_gen;
+	default:
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
+		goto out;
+	}
+inc_gen:
+	if (!status)
+		atomic_inc(&req->ns->pr.generation);
+out:
+	return status;
+}
+
+static void nvmet_execute_pr_acquire(struct nvmet_req *req)
+{
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+	bool ignore_key = nvmet_pr_parse_ignore_key(cdw10);
+	/* Reservation type, bit 15:08 */
+	u8 rtype = (u8)((cdw10 >> 8) & 0xff);
+	/* Reservation acquire action, bit 02:00 */
+	u8 acquire_act = cdw10 & 0x07;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr_acquire_data *d = NULL;
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *reg;
+	u16 status = NVME_SC_SUCCESS;
+
+	if (ignore_key ||
+	    rtype < NVME_PR_WRITE_EXCLUSIVE ||
+	    rtype > NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
+		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		goto out;
+	}
+
+	d = kmalloc(sizeof(*d), GFP_KERNEL);
+	if (!d) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+	if (status)
+		goto free_data;
+
+	status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	down(&pr->pr_sem);
+	list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+		if (uuid_equal(&reg->hostid, &ctrl->hostid) &&
+		    reg->rkey == le64_to_cpu(d->crkey)) {
+			status = __nvmet_execute_pr_acquire(req, reg,
+					acquire_act, rtype, d);
+			break;
+		}
+	}
+
+	if (!status && acquire_act == NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT) {
+		kfree(d);
+		INIT_WORK(&req->r.abort_work, nvmet_pr_do_abort);
+		queue_work(nvmet_wq, &req->r.abort_work);
+		return;
+	}
+
+	up(&pr->pr_sem);
+
+free_data:
+	kfree(d);
+out:
+	nvmet_req_complete(req, status);
+}
+
+static u16 nvmet_pr_release(struct nvmet_req *req,
+			    struct nvmet_pr_registrant *reg,
+			    u8 rtype)
+{
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *holder;
+	u8 original_rtype;
+
+	holder = rcu_dereference_protected(pr->holder, 1);
+	if (!holder || reg != holder)
+		return NVME_SC_SUCCESS;
+
+	original_rtype = holder->rtype;
+	if (original_rtype != rtype)
+		return NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+
+	rcu_assign_pointer(pr->holder, NULL);
+
+	if (original_rtype != NVME_PR_WRITE_EXCLUSIVE &&
+	    original_rtype != NVME_PR_EXCLUSIVE_ACCESS)
+		nvmet_pr_resv_released(pr, &reg->hostid);
+
+	return NVME_SC_SUCCESS;
+}
+
+static void nvmet_pr_clear(struct nvmet_req *req)
+{
+	struct nvmet_pr_registrant *reg, *tmp;
+	struct nvmet_pr *pr = &req->ns->pr;
+
+	rcu_assign_pointer(pr->holder, NULL);
+
+	list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+		list_del_rcu(&reg->entry);
+		if (!uuid_equal(&req->sq->ctrl->hostid, &reg->hostid))
+			nvmet_pr_resv_preempted(pr, &reg->hostid);
+		kfree_rcu(reg, rcu);
+	}
+
+	atomic_inc(&pr->generation);
+}
+
+static u16 __nvmet_execute_pr_release(struct nvmet_req *req,
+				      struct nvmet_pr_registrant *reg,
+				      u8 release_act, u8 rtype)
+{
+	switch (release_act) {
+	case NVME_PR_RELEASE_ACT_RELEASE:
+		return nvmet_pr_release(req, reg, rtype);
+	case NVME_PR_RELEASE_ACT_CLEAR:
+		nvmet_pr_clear(req);
+		return NVME_SC_SUCCESS;
+	default:
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
+	}
+}
+
+static void nvmet_execute_pr_release(struct nvmet_req *req)
+{
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+	bool ignore_key = nvmet_pr_parse_ignore_key(cdw10);
+	u8 rtype = (u8)((cdw10 >> 8) & 0xff); /* Reservation type, bit 15:08 */
+	u8 release_act = cdw10 & 0x07; /* Reservation release action, bit 02:00 */
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_release_data *d;
+	struct nvmet_pr_registrant *reg;
+	u16 status;
+
+	if (ignore_key) {
+		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		goto out;
+	}
+
+	d = kmalloc(sizeof(*d), GFP_KERNEL);
+	if (!d) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+	if (status)
+		goto free_data;
+
+	status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	down(&pr->pr_sem);
+	list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+		if (uuid_equal(&reg->hostid, &ctrl->hostid) &&
+		    reg->rkey == le64_to_cpu(d->crkey)) {
+			status = __nvmet_execute_pr_release(req, reg,
+					release_act, rtype);
+			break;
+		}
+	}
+	up(&pr->pr_sem);
+free_data:
+	kfree(d);
+out:
+	nvmet_req_complete(req, status);
+}
+
+static void nvmet_execute_pr_report(struct nvmet_req *req)
+{
+	u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11);
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+	u32 num_bytes = 4 * (cdw10 + 1); /* cdw10 is number of dwords */
+	u8 eds = cdw11 & 1; /* Extended data structure, bit 00 */
+	struct nvme_registered_ctrl_ext *ctrl_eds;
+	struct nvme_reservation_status_ext *data;
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *holder;
+	struct nvmet_pr_registrant *reg;
+	u16 num_ctrls = 0;
+	u16 status;
+	u8 rtype;
+
+	/* nvmet hostid(uuid_t) is 128 bit. */
+	if (!eds) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw11);
+		status = NVME_SC_HOST_ID_INCONSIST | NVME_STATUS_DNR;
+		goto out;
+	}
+
+	if (num_bytes < sizeof(struct nvme_reservation_status_ext)) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		goto out;
+	}
+
+	data = kmalloc(num_bytes, GFP_KERNEL);
+	if (!data) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+	memset(data, 0, num_bytes);
+	data->gen = cpu_to_le32(atomic_read(&pr->generation));
+	data->ptpls = 0;
+	ctrl_eds = data->regctl_eds;
+
+	rcu_read_lock();
+	holder = rcu_dereference(pr->holder);
+	rtype = holder ? holder->rtype : 0;
+	data->rtype = rtype;
+
+	list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+		num_ctrls++;
+		/*
+		 * continue to get the number of all registrans.
+		 */
+		if (((void *)ctrl_eds + sizeof(*ctrl_eds)) >
+		    ((void *)data + num_bytes))
+			continue;
+		/*
+		 * Dynamic controller, set cntlid to 0xffff.
+		 */
+		ctrl_eds->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC);
+		if (rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS ||
+		    rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS)
+			ctrl_eds->rcsts = 1;
+		if (reg == holder)
+			ctrl_eds->rcsts = 1;
+		uuid_copy((uuid_t *)&ctrl_eds->hostid, &reg->hostid);
+		ctrl_eds->rkey = cpu_to_le64(reg->rkey);
+		ctrl_eds++;
+	}
+	rcu_read_unlock();
+
+	put_unaligned_le16(num_ctrls, data->regctl);
+	status = nvmet_copy_to_sgl(req, 0, data, num_bytes);
+	kfree(data);
+out:
+	nvmet_req_complete(req, status);
+}
+
+u16 nvmet_parse_pr_cmd(struct nvmet_req *req)
+{
+	struct nvme_command *cmd = req->cmd;
+
+	switch (cmd->common.opcode) {
+	case nvme_cmd_resv_register:
+		req->execute = nvmet_execute_pr_register;
+		break;
+	case nvme_cmd_resv_acquire:
+		req->execute = nvmet_execute_pr_acquire;
+		break;
+	case nvme_cmd_resv_release:
+		req->execute = nvmet_execute_pr_release;
+		break;
+	case nvme_cmd_resv_report:
+		req->execute = nvmet_execute_pr_report;
+		break;
+	default:
+		return 1;
+	}
+	return NVME_SC_SUCCESS;
+}
+
+static bool nvmet_is_req_write_cmd_group(struct nvmet_req *req)
+{
+	u8 opcode = req->cmd->common.opcode;
+
+	if (req->sq->qid) {
+		switch (opcode) {
+		case nvme_cmd_flush:
+		case nvme_cmd_write:
+		case nvme_cmd_write_zeroes:
+		case nvme_cmd_dsm:
+		case nvme_cmd_zone_append:
+		case nvme_cmd_zone_mgmt_send:
+			return true;
+		default:
+			return false;
+		}
+	}
+	return false;
+}
+
+static bool nvmet_is_req_read_cmd_group(struct nvmet_req *req)
+{
+	u8 opcode = req->cmd->common.opcode;
+
+	if (req->sq->qid) {
+		switch (opcode) {
+		case nvme_cmd_read:
+		case nvme_cmd_zone_mgmt_recv:
+			return true;
+		default:
+			return false;
+		}
+	}
+	return false;
+}
+
+u16 nvmet_pr_check_cmd_access(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr_registrant *holder;
+	struct nvmet_ns *ns = req->ns;
+	struct nvmet_pr *pr = &ns->pr;
+	u16 status = NVME_SC_SUCCESS;
+
+	rcu_read_lock();
+	holder = rcu_dereference(pr->holder);
+	if (!holder)
+		goto unlock;
+	if (uuid_equal(&ctrl->hostid, &holder->hostid))
+		goto unlock;
+
+	/*
+	 * The Reservation command group is checked in executing,
+	 * allow it here.
+	 */
+	switch (holder->rtype) {
+	case NVME_PR_WRITE_EXCLUSIVE:
+		if (nvmet_is_req_write_cmd_group(req))
+			status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+		break;
+	case NVME_PR_EXCLUSIVE_ACCESS:
+		if (nvmet_is_req_read_cmd_group(req) ||
+		    nvmet_is_req_write_cmd_group(req))
+			status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+		break;
+	case NVME_PR_WRITE_EXCLUSIVE_REG_ONLY:
+	case NVME_PR_WRITE_EXCLUSIVE_ALL_REGS:
+		if ((nvmet_is_req_write_cmd_group(req)) &&
+		    !nvmet_pr_find_registrant(pr, &ctrl->hostid))
+			status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+		break;
+	case NVME_PR_EXCLUSIVE_ACCESS_REG_ONLY:
+	case NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS:
+		if ((nvmet_is_req_read_cmd_group(req) ||
+		    nvmet_is_req_write_cmd_group(req)) &&
+		    !nvmet_pr_find_registrant(pr, &ctrl->hostid))
+			status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+		break;
+	default:
+		pr_warn("the reservation type is set wrong, type:%d\n",
+			holder->rtype);
+		break;
+	}
+
+unlock:
+	rcu_read_unlock();
+	if (status)
+		req->error_loc = offsetof(struct nvme_common_command, opcode);
+	return status;
+}
+
+u16 nvmet_pr_get_ns_pc_ref(struct nvmet_req *req)
+{
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
+
+	pc_ref = xa_load(&req->ns->pr_per_ctrl_refs,
+			req->sq->ctrl->cntlid);
+	if (unlikely(!percpu_ref_tryget_live(&pc_ref->ref)))
+		return NVME_SC_INTERNAL;
+	req->pc_ref = pc_ref;
+	return NVME_SC_SUCCESS;
+}
+
+static void nvmet_pr_ctrl_ns_all_cmds_done(struct percpu_ref *ref)
+{
+	struct nvmet_pr_per_ctrl_ref *pc_ref =
+		container_of(ref, struct nvmet_pr_per_ctrl_ref, ref);
+
+	complete(&pc_ref->free_done);
+}
+
+static int nvmet_pr_alloc_and_insert_pc_ref(struct nvmet_ns *ns,
+					    unsigned long idx,
+					    uuid_t *hostid)
+{
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
+	int ret;
+
+	pc_ref = kmalloc(sizeof(*pc_ref), GFP_ATOMIC);
+	if (!pc_ref)
+		return  -ENOMEM;
+
+	ret = percpu_ref_init(&pc_ref->ref, nvmet_pr_ctrl_ns_all_cmds_done,
+			PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
+	if (ret)
+		goto free;
+
+	init_completion(&pc_ref->free_done);
+	init_completion(&pc_ref->confirm_done);
+	uuid_copy(&pc_ref->hostid, hostid);
+
+	ret = xa_insert(&ns->pr_per_ctrl_refs, idx, pc_ref, GFP_KERNEL);
+	if (ret)
+		goto exit;
+	return ret;
+exit:
+	percpu_ref_exit(&pc_ref->ref);
+free:
+	kfree(pc_ref);
+	return ret;
+}
+
+int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl)
+{
+	struct nvmet_subsys *subsys = ctrl->subsys;
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
+	struct nvmet_ns *ns = NULL;
+	unsigned long idx;
+	int ret;
+
+	ctrl->pr_log_mgr.counter = 0;
+	ctrl->pr_log_mgr.lost_count = 0;
+	mutex_init(&ctrl->pr_log_mgr.lock);
+	INIT_KFIFO(ctrl->pr_log_mgr.log_queue);
+
+	/*
+	 * Here we are under subsys lock, if an ns not in subsys->namespaces,
+	 * we can make sure that ns is not enabled, and not call
+	 * nvmet_pr_init_ns(), see more details in nvmet_ns_enable().
+	 * So just check ns->pr.enable.
+	 */
+	xa_for_each(&subsys->namespaces, idx, ns) {
+		if (ns->pr.enable) {
+			ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid,
+							&ctrl->hostid);
+			if (ret)
+				goto free_per_ctrl_refs;
+		}
+	}
+	return 0;
+
+free_per_ctrl_refs:
+	xa_for_each(&subsys->namespaces, idx, ns) {
+		if (ns->pr.enable) {
+			pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid);
+			if (pc_ref)
+				percpu_ref_exit(&pc_ref->ref);
+			kfree(pc_ref);
+		}
+	}
+	return ret;
+}
+
+void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl)
+{
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
+	struct nvmet_ns *ns;
+	unsigned long idx;
+
+	kfifo_free(&ctrl->pr_log_mgr.log_queue);
+	mutex_destroy(&ctrl->pr_log_mgr.lock);
+
+	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+		if (ns->pr.enable) {
+			pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid);
+			if (pc_ref)
+				percpu_ref_exit(&pc_ref->ref);
+			kfree(pc_ref);
+		}
+	}
+}
+
+int nvmet_pr_init_ns(struct nvmet_ns *ns)
+{
+	struct nvmet_subsys *subsys = ns->subsys;
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
+	struct nvmet_ctrl *ctrl = NULL;
+	unsigned long idx;
+	int ret;
+
+	ns->pr.holder = NULL;
+	atomic_set(&ns->pr.generation, 0);
+	sema_init(&ns->pr.pr_sem, 1);
+	INIT_LIST_HEAD(&ns->pr.registrant_list);
+	ns->pr.notify_mask = 0;
+
+	xa_init(&ns->pr_per_ctrl_refs);
+
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid,
+						&ctrl->hostid);
+		if (ret)
+			goto free_per_ctrl_refs;
+	}
+	return 0;
+
+free_per_ctrl_refs:
+	xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) {
+		xa_erase(&ns->pr_per_ctrl_refs, idx);
+		percpu_ref_exit(&pc_ref->ref);
+		kfree(pc_ref);
+	}
+	return ret;
+}
+
+void nvmet_pr_exit_ns(struct nvmet_ns *ns)
+{
+	struct nvmet_pr_registrant *reg, *tmp;
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
+	struct nvmet_pr *pr = &ns->pr;
+	unsigned long idx;
+
+	list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+		list_del(&reg->entry);
+		kfree(reg);
+	}
+
+	xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) {
+		/*
+		 * No command on ns here, we can safely free pc_ref.
+		 */
+		pc_ref = xa_erase(&ns->pr_per_ctrl_refs, idx);
+		percpu_ref_exit(&pc_ref->ref);
+		kfree(pc_ref);
+	}
+
+	xa_destroy(&ns->pr_per_ctrl_refs);
+}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 44d048d68503..0179bb6d502d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -2045,7 +2045,7 @@ enum {
 	NVME_PR_LOG_EMPTY_LOG_PAGE			= 0x00,
 	NVME_PR_LOG_REGISTRATION_PREEMPTED		= 0x01,
 	NVME_PR_LOG_RESERVATION_RELEASED		= 0x02,
-	NVME_PR_LOG_RESERVATOPM_PREEMPTED		= 0x03,
+	NVME_PR_LOG_RESERVATOIN_PREEMPTED		= 0x03,
 };
 
 enum {
-- 
cgit v1.2.3


From 61c9967cd63448292a64f9ee9aeb6e2053e3a624 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Mon, 4 Nov 2024 13:24:36 -0800
Subject: nvmet: implement active command set ns list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is required for nvme 2.1 for targets that support multiple command
sets. We support NVM and ZNS, so are required to support this
identification.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Matias Bjørling <matias.bjorling@wdc.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/admin-cmd.c | 9 +++++++--
 include/linux/nvme.h            | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 3bf2e06baad7..c59e21434833 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -592,7 +592,7 @@ out:
 	nvmet_req_complete(req, status);
 }
 
-static void nvmet_execute_identify_nslist(struct nvmet_req *req)
+static void nvmet_execute_identify_nslist(struct nvmet_req *req, bool match_css)
 {
 	static const int buf_size = NVME_IDENTIFY_DATA_SIZE;
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
@@ -622,6 +622,8 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
 	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
 		if (ns->nsid <= min_nsid)
 			continue;
+		if (match_css && req->ns->csi != req->cmd->identify.csi)
+			continue;
 		list[i++] = cpu_to_le32(ns->nsid);
 		if (i == buf_size / sizeof(__le32))
 			break;
@@ -728,7 +730,7 @@ static void nvmet_execute_identify(struct nvmet_req *req)
 		nvmet_execute_identify_ctrl(req);
 		return;
 	case NVME_ID_CNS_NS_ACTIVE_LIST:
-		nvmet_execute_identify_nslist(req);
+		nvmet_execute_identify_nslist(req, false);
 		return;
 	case NVME_ID_CNS_NS_DESC_LIST:
 		nvmet_execute_identify_desclist(req);
@@ -759,6 +761,9 @@ static void nvmet_execute_identify(struct nvmet_req *req)
 			break;
 		}
 		break;
+	case NVME_ID_CNS_NS_ACTIVE_LIST_CS:
+		nvmet_execute_identify_nslist(req, true);
+		return;
 	}
 
 	pr_debug("unhandled identify cns %d on qid %d\n",
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 0179bb6d502d..26de7c5c12be 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -522,6 +522,7 @@ enum {
 	NVME_ID_CNS_NS_DESC_LIST	= 0x03,
 	NVME_ID_CNS_CS_NS		= 0x05,
 	NVME_ID_CNS_CS_CTRL		= 0x06,
+	NVME_ID_CNS_NS_ACTIVE_LIST_CS	= 0x07,
 	NVME_ID_CNS_NS_CS_INDEP		= 0x08,
 	NVME_ID_CNS_NS_PRESENT_LIST	= 0x10,
 	NVME_ID_CNS_NS_PRESENT		= 0x11,
-- 
cgit v1.2.3


From 83acb24e6de7bbb5cb0df1ba0f47a92da9112061 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Mon, 4 Nov 2024 14:00:14 -0800
Subject: nvmet: implement supported log pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This log is required for nvme 2.1.

Reviewed-by: Matias Bjørling <matias.bjorling@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/admin-cmd.c | 28 ++++++++++++++++++++++++++++
 include/linux/nvme.h            |  9 +++++++++
 2 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index c59e21434833..cbbf911c5cc0 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -71,6 +71,32 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
 	nvmet_req_complete(req, 0);
 }
 
+static void nvmet_execute_get_supported_log_pages(struct nvmet_req *req)
+{
+	struct nvme_supported_log *logs;
+	u16 status;
+
+	logs = kzalloc(sizeof(*logs), GFP_KERNEL);
+	if (!logs) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	logs->lids[NVME_LOG_SUPPORTED] = cpu_to_le32(NVME_LIDS_LSUPP);
+	logs->lids[NVME_LOG_ERROR] = cpu_to_le32(NVME_LIDS_LSUPP);
+	logs->lids[NVME_LOG_SMART] = cpu_to_le32(NVME_LIDS_LSUPP);
+	logs->lids[NVME_LOG_FW_SLOT] = cpu_to_le32(NVME_LIDS_LSUPP);
+	logs->lids[NVME_LOG_CHANGED_NS] = cpu_to_le32(NVME_LIDS_LSUPP);
+	logs->lids[NVME_LOG_CMD_EFFECTS] = cpu_to_le32(NVME_LIDS_LSUPP);
+	logs->lids[NVME_LOG_ANA] = cpu_to_le32(NVME_LIDS_LSUPP);
+	logs->lids[NVME_LOG_RESERVATION] = cpu_to_le32(NVME_LIDS_LSUPP);
+
+	status = nvmet_copy_to_sgl(req, 0, logs, sizeof(*logs));
+	kfree(logs);
+out:
+	nvmet_req_complete(req, status);
+}
+
 static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 		struct nvme_smart_log *slog)
 {
@@ -327,6 +353,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
 		return;
 
 	switch (req->cmd->get_log_page.lid) {
+	case NVME_LOG_SUPPORTED:
+		return nvmet_execute_get_supported_log_pages(req);
 	case NVME_LOG_ERROR:
 		return nvmet_execute_get_log_page_error(req);
 	case NVME_LOG_SMART:
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 26de7c5c12be..e9e508bca60f 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1245,6 +1245,7 @@ enum {
 	NVME_FEAT_WRITE_PROTECT	= 0x84,
 	NVME_FEAT_VENDOR_START	= 0xC0,
 	NVME_FEAT_VENDOR_END	= 0xFF,
+	NVME_LOG_SUPPORTED	= 0x00,
 	NVME_LOG_ERROR		= 0x01,
 	NVME_LOG_SMART		= 0x02,
 	NVME_LOG_FW_SLOT	= 0x03,
@@ -1262,6 +1263,14 @@ enum {
 	NVME_FWACT_ACTV		= (2 << 3),
 };
 
+struct nvme_supported_log {
+	__le32	lids[256];
+};
+
+enum {
+	NVME_LIDS_LSUPP	= 1 << 0,
+};
+
 /* NVMe Namespace Write Protect State */
 enum {
 	NVME_NS_NO_WRITE_PROTECT = 0,
-- 
cgit v1.2.3


From e973c91727d49bb128c95210b3aa1960b9421d18 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Mon, 4 Nov 2024 14:07:42 -0800
Subject: nvmet: implement supported features log
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This log is required for nvme 2.1.

Reviewed-by: Matias Bjørling <matias.bjorling@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/admin-cmd.c | 33 +++++++++++++++++++++++++++++++++
 include/linux/nvme.h            | 11 +++++++++++
 2 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index cbbf911c5cc0..f832661a4913 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -89,6 +89,7 @@ static void nvmet_execute_get_supported_log_pages(struct nvmet_req *req)
 	logs->lids[NVME_LOG_CHANGED_NS] = cpu_to_le32(NVME_LIDS_LSUPP);
 	logs->lids[NVME_LOG_CMD_EFFECTS] = cpu_to_le32(NVME_LIDS_LSUPP);
 	logs->lids[NVME_LOG_ANA] = cpu_to_le32(NVME_LIDS_LSUPP);
+	logs->lids[NVME_LOG_FEATURES] = cpu_to_le32(NVME_LIDS_LSUPP);
 	logs->lids[NVME_LOG_RESERVATION] = cpu_to_le32(NVME_LIDS_LSUPP);
 
 	status = nvmet_copy_to_sgl(req, 0, logs, sizeof(*logs));
@@ -347,6 +348,36 @@ out:
 	nvmet_req_complete(req, status);
 }
 
+static void nvmet_execute_get_log_page_features(struct nvmet_req *req)
+{
+	struct nvme_supported_features_log *features;
+	u16 status;
+
+	features = kzalloc(sizeof(*features), GFP_KERNEL);
+	if (!features) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	features->fis[NVME_FEAT_NUM_QUEUES] =
+		cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE);
+	features->fis[NVME_FEAT_KATO] =
+		cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE);
+	features->fis[NVME_FEAT_ASYNC_EVENT] =
+		cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE);
+	features->fis[NVME_FEAT_HOST_ID] =
+		cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE);
+	features->fis[NVME_FEAT_WRITE_PROTECT] =
+		cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_NSCPE);
+	features->fis[NVME_FEAT_RESV_MASK] =
+		cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_NSCPE);
+
+	status = nvmet_copy_to_sgl(req, 0, features, sizeof(*features));
+	kfree(features);
+out:
+	nvmet_req_complete(req, status);
+}
+
 static void nvmet_execute_get_log_page(struct nvmet_req *req)
 {
 	if (!nvmet_check_transfer_len(req, nvmet_get_log_page_len(req->cmd)))
@@ -372,6 +403,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
 		return nvmet_execute_get_log_cmd_effects_ns(req);
 	case NVME_LOG_ANA:
 		return nvmet_execute_get_log_page_ana(req);
+	case NVME_LOG_FEATURES:
+		return nvmet_execute_get_log_page_features(req);
 	case NVME_LOG_RESERVATION:
 		return nvmet_execute_get_log_page_resv(req);
 	}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index e9e508bca60f..31d7ec6d8b93 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1256,6 +1256,7 @@ enum {
 	NVME_LOG_TELEMETRY_CTRL = 0x08,
 	NVME_LOG_ENDURANCE_GROUP = 0x09,
 	NVME_LOG_ANA		= 0x0c,
+	NVME_LOG_FEATURES	= 0x12,
 	NVME_LOG_DISC		= 0x70,
 	NVME_LOG_RESERVATION	= 0x80,
 	NVME_FWACT_REPL		= (0 << 3),
@@ -1271,6 +1272,16 @@ enum {
 	NVME_LIDS_LSUPP	= 1 << 0,
 };
 
+struct nvme_supported_features_log {
+	__le32	fis[256];
+};
+
+enum {
+	NVME_FIS_FSUPP	= 1 << 0,
+	NVME_FIS_NSCPE	= 1 << 20,
+	NVME_FIS_CSCPE	= 1 << 21,
+};
+
 /* NVMe Namespace Write Protect State */
 enum {
 	NVME_NS_NO_WRITE_PROTECT = 0,
-- 
cgit v1.2.3


From 266b652c65b44fb2ccfa17cdb54ce2ef723deb0a Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 1 Nov 2024 14:46:01 -0700
Subject: nvmet: implement endurance groups

Most of the returned information is just stubbed data. The target must
support these in order to report rotational media. Since this driver
doesn't know any better, each namespace is its own endurance group with
the engid value matching the nsid.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c        |  1 +
 drivers/nvme/target/admin-cmd.c | 95 +++++++++++++++++++++++++++++++++++++++++
 include/linux/nvme.h            | 29 ++++++++++++-
 3 files changed, 123 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e9aac07f4c26..426d4b90ecd7 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -5001,6 +5001,7 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+	BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512);
 	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index f832661a4913..366582f52200 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -88,6 +88,7 @@ static void nvmet_execute_get_supported_log_pages(struct nvmet_req *req)
 	logs->lids[NVME_LOG_FW_SLOT] = cpu_to_le32(NVME_LIDS_LSUPP);
 	logs->lids[NVME_LOG_CHANGED_NS] = cpu_to_le32(NVME_LIDS_LSUPP);
 	logs->lids[NVME_LOG_CMD_EFFECTS] = cpu_to_le32(NVME_LIDS_LSUPP);
+	logs->lids[NVME_LOG_ENDURANCE_GROUP] = cpu_to_le32(NVME_LIDS_LSUPP);
 	logs->lids[NVME_LOG_ANA] = cpu_to_le32(NVME_LIDS_LSUPP);
 	logs->lids[NVME_LOG_FEATURES] = cpu_to_le32(NVME_LIDS_LSUPP);
 	logs->lids[NVME_LOG_RESERVATION] = cpu_to_le32(NVME_LIDS_LSUPP);
@@ -303,6 +304,49 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
 	return struct_size(desc, nsids, count);
 }
 
+static void nvmet_execute_get_log_page_endgrp(struct nvmet_req *req)
+{
+	u64 host_reads, host_writes, data_units_read, data_units_written;
+	struct nvme_endurance_group_log *log;
+	u16 status;
+
+	/*
+	 * The target driver emulates each endurance group as its own
+	 * namespace, reusing the nsid as the endurance group identifier.
+	 */
+	req->cmd->common.nsid = cpu_to_le32(le16_to_cpu(
+					    req->cmd->get_log_page.lsi));
+	status = nvmet_req_find_ns(req);
+	if (status)
+		goto out;
+
+	log = kzalloc(sizeof(*log), GFP_KERNEL);
+	if (!log) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	if (!req->ns->bdev)
+		goto copy;
+
+	host_reads = part_stat_read(req->ns->bdev, ios[READ]);
+	data_units_read =
+		DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[READ]), 1000);
+	host_writes = part_stat_read(req->ns->bdev, ios[WRITE]);
+	data_units_written =
+		DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[WRITE]), 1000);
+
+	put_unaligned_le64(host_reads, &log->hrc[0]);
+	put_unaligned_le64(data_units_read, &log->dur[0]);
+	put_unaligned_le64(host_writes, &log->hwc[0]);
+	put_unaligned_le64(data_units_written, &log->duw[0]);
+copy:
+	status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
+	kfree(log);
+out:
+	nvmet_req_complete(req, status);
+}
+
 static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
 {
 	struct nvme_ana_rsp_hdr hdr = { 0, };
@@ -401,6 +445,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
 		return nvmet_execute_get_log_changed_ns(req);
 	case NVME_LOG_CMD_EFFECTS:
 		return nvmet_execute_get_log_cmd_effects_ns(req);
+	case NVME_LOG_ENDURANCE_GROUP:
+		return nvmet_execute_get_log_page_endgrp(req);
 	case NVME_LOG_ANA:
 		return nvmet_execute_get_log_page_ana(req);
 	case NVME_LOG_FEATURES:
@@ -535,6 +581,13 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 
 	id->msdbd = ctrl->ops->msdbd;
 
+	/*
+	 * Endurance group identifier is 16 bits, so we can't let namespaces
+	 * overflow that since we reuse the nsid
+	 */
+	BUILD_BUG_ON(NVMET_MAX_NAMESPACES > USHRT_MAX);
+	id->endgidmax = cpu_to_le16(NVMET_MAX_NAMESPACES);
+
 	id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4);
 	id->anatt = 10; /* random value */
 	id->anagrpmax = cpu_to_le32(NVMET_MAX_ANAGRPS);
@@ -628,6 +681,12 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 			NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS |
 			NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF;
 
+	/*
+	 * Since we don't know any better, every namespace is its own endurance
+	 * group.
+	 */
+	id->endgid = cpu_to_le16(req->ns->nsid);
+
 	memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid));
 
 	id->lbaf[0].ds = req->ns->blksize_shift;
@@ -653,6 +712,39 @@ out:
 	nvmet_req_complete(req, status);
 }
 
+static void nvmet_execute_identify_endgrp_list(struct nvmet_req *req)
+{
+	u16 min_endgid = le16_to_cpu(req->cmd->identify.cnssid);
+	static const int buf_size = NVME_IDENTIFY_DATA_SIZE;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_ns *ns;
+	unsigned long idx;
+	__le16 *list;
+	u16 status;
+	int i = 1;
+
+	list = kzalloc(buf_size, GFP_KERNEL);
+	if (!list) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+		if (ns->nsid <= min_endgid)
+			continue;
+
+		list[i++] = cpu_to_le16(ns->nsid);
+		if (i == buf_size / sizeof(__le16))
+			break;
+	}
+
+	list[0] = cpu_to_le16(i - 1);
+	status = nvmet_copy_to_sgl(req, 0, list, buf_size);
+	kfree(list);
+out:
+	nvmet_req_complete(req, status);
+}
+
 static void nvmet_execute_identify_nslist(struct nvmet_req *req, bool match_css)
 {
 	static const int buf_size = NVME_IDENTIFY_DATA_SIZE;
@@ -825,6 +917,9 @@ static void nvmet_execute_identify(struct nvmet_req *req)
 	case NVME_ID_CNS_NS_ACTIVE_LIST_CS:
 		nvmet_execute_identify_nslist(req, true);
 		return;
+	case NVME_ID_CNS_ENDGRP_LIST:
+		nvmet_execute_identify_endgrp_list(req);
+		return;
 	}
 
 	pr_debug("unhandled identify cns %d on qid %d\n",
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 31d7ec6d8b93..6d5b4299a1b2 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -327,7 +327,8 @@ struct nvme_id_ctrl {
 	__le32			sanicap;
 	__le32			hmminds;
 	__le16			hmmaxd;
-	__u8			rsvd338[4];
+	__le16			nvmsetidmax;
+	__le16			endgidmax;
 	__u8			anatt;
 	__u8			anacap;
 	__le32			anagrpmax;
@@ -531,6 +532,7 @@ enum {
 	NVME_ID_CNS_SCNDRY_CTRL_LIST	= 0x15,
 	NVME_ID_CNS_NS_GRANULARITY	= 0x16,
 	NVME_ID_CNS_UUID_LIST		= 0x17,
+	NVME_ID_CNS_ENDGRP_LIST		= 0x19,
 };
 
 enum {
@@ -618,6 +620,28 @@ enum {
 	NVME_NIDT_CSI		= 0x04,
 };
 
+struct nvme_endurance_group_log {
+	__u8	egcw;
+	__u8	egfeat;
+	__u8	rsvd2;
+	__u8	avsp;
+	__u8	avspt;
+	__u8	pused;
+	__le16	did;
+	__u8	rsvd8[24];
+	__u8	ee[16];
+	__u8	dur[16];
+	__u8	duw[16];
+	__u8	muw[16];
+	__u8	hrc[16];
+	__u8	hwc[16];
+	__u8	mdie[16];
+	__u8	neile[16];
+	__u8	tegcap[16];
+	__u8	uegcap[16];
+	__u8	rsvd192[320];
+};
+
 struct nvme_smart_log {
 	__u8			critical_warning;
 	__u8			temperature[2];
@@ -1302,7 +1326,8 @@ struct nvme_identify {
 	__u8			cns;
 	__u8			rsvd3;
 	__le16			ctrlid;
-	__u8			rsvd11[3];
+	__le16			cnssid;
+	__u8			rsvd11;
 	__u8			csi;
 	__u32			rsvd12[4];
 };
-- 
cgit v1.2.3


From 5fd075cdaf3649000677d960fd9e45c08081b7e0 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 1 Nov 2024 13:48:47 -0700
Subject: nvmet: implement rotational media information log

Most of the information is stubbed. Supporting these commands is a
requirement for supporting rotational media.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c        |  1 +
 drivers/nvme/target/admin-cmd.c | 42 +++++++++++++++++++++++++++++++++++++++++
 include/linux/nvme.h            | 15 ++++++++++++++-
 3 files changed, 57 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 426d4b90ecd7..279b0f445904 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -5002,6 +5002,7 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
 	BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512);
+	BUILD_BUG_ON(sizeof(struct nvme_rotational_media_log) != 512);
 	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 366582f52200..33b7ecfee3fe 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -91,6 +91,7 @@ static void nvmet_execute_get_supported_log_pages(struct nvmet_req *req)
 	logs->lids[NVME_LOG_ENDURANCE_GROUP] = cpu_to_le32(NVME_LIDS_LSUPP);
 	logs->lids[NVME_LOG_ANA] = cpu_to_le32(NVME_LIDS_LSUPP);
 	logs->lids[NVME_LOG_FEATURES] = cpu_to_le32(NVME_LIDS_LSUPP);
+	logs->lids[NVME_LOG_RMI] = cpu_to_le32(NVME_LIDS_LSUPP);
 	logs->lids[NVME_LOG_RESERVATION] = cpu_to_le32(NVME_LIDS_LSUPP);
 
 	status = nvmet_copy_to_sgl(req, 0, logs, sizeof(*logs));
@@ -158,6 +159,45 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
 	return NVME_SC_SUCCESS;
 }
 
+static void nvmet_execute_get_log_page_rmi(struct nvmet_req *req)
+{
+	struct nvme_rotational_media_log *log;
+	struct gendisk *disk;
+	u16 status;
+
+	req->cmd->common.nsid = cpu_to_le32(le16_to_cpu(
+					    req->cmd->get_log_page.lsi));
+	status = nvmet_req_find_ns(req);
+	if (status)
+		goto out;
+
+	if (!req->ns->bdev || bdev_nonrot(req->ns->bdev)) {
+		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		goto out;
+	}
+
+	if (req->transfer_len != sizeof(*log)) {
+		status = NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR;
+		goto out;
+	}
+
+	log = kzalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		goto out;
+
+	log->endgid = req->cmd->get_log_page.lsi;
+	disk = req->ns->bdev->bd_disk;
+	if (disk && disk->ia_ranges)
+		log->numa = cpu_to_le16(disk->ia_ranges->nr_ia_ranges);
+	else
+		log->numa = cpu_to_le16(1);
+
+	status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
+	kfree(log);
+out:
+	nvmet_req_complete(req, status);
+}
+
 static void nvmet_execute_get_log_page_smart(struct nvmet_req *req)
 {
 	struct nvme_smart_log *log;
@@ -451,6 +491,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
 		return nvmet_execute_get_log_page_ana(req);
 	case NVME_LOG_FEATURES:
 		return nvmet_execute_get_log_page_features(req);
+	case NVME_LOG_RMI:
+		return nvmet_execute_get_log_page_rmi(req);
 	case NVME_LOG_RESERVATION:
 		return nvmet_execute_get_log_page_resv(req);
 	}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 6d5b4299a1b2..99cf0ee73714 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -642,6 +642,18 @@ struct nvme_endurance_group_log {
 	__u8	rsvd192[320];
 };
 
+struct nvme_rotational_media_log {
+	__le16	endgid;
+	__le16	numa;
+	__le16	nrs;
+	__u8	rsvd6[2];
+	__le32	spinc;
+	__le32	fspinc;
+	__le32	ldc;
+	__le32	fldc;
+	__u8	rsvd24[488];
+};
+
 struct nvme_smart_log {
 	__u8			critical_warning;
 	__u8			temperature[2];
@@ -1281,6 +1293,7 @@ enum {
 	NVME_LOG_ENDURANCE_GROUP = 0x09,
 	NVME_LOG_ANA		= 0x0c,
 	NVME_LOG_FEATURES	= 0x12,
+	NVME_LOG_RMI		= 0x16,
 	NVME_LOG_DISC		= 0x70,
 	NVME_LOG_RESERVATION	= 0x80,
 	NVME_FWACT_REPL		= (0 << 3),
@@ -1435,7 +1448,7 @@ struct nvme_get_log_page_command {
 	__u8			lsp; /* upper 4 bits reserved */
 	__le16			numdl;
 	__le16			numdu;
-	__u16			rsvd11;
+	__le16			lsi;
 	union {
 		struct {
 			__le32 lpol;
-- 
cgit v1.2.3


From e2758c76a0ab2032a0d11abc1c2ff08661fdf428 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 1 Nov 2024 12:29:40 -0700
Subject: nvmet: support for csi identify ns

Implements reporting the I/O Command Set Independent Identify Namespace
command.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/admin-cmd.c | 32 ++++++++++++++++++++++++++++++++
 include/linux/nvme.h            |  1 +
 2 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 33b7ecfee3fe..0a9fdc533186 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -912,6 +912,35 @@ out:
 	nvmet_req_complete(req, status);
 }
 
+static void nvmet_execute_id_cs_indep(struct nvmet_req *req)
+{
+	struct nvme_id_ns_cs_indep *id;
+	u16 status;
+
+	status = nvmet_req_find_ns(req);
+	if (status)
+		goto out;
+
+	id = kzalloc(sizeof(*id), GFP_KERNEL);
+	if (!id) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	id->nstat = NVME_NSTAT_NRDY;
+	id->anagrpid = cpu_to_le32(req->ns->anagrpid);
+	id->nmic = NVME_NS_NMIC_SHARED;
+	if (req->ns->readonly)
+		id->nsattr |= NVME_NS_ATTR_RO;
+	if (req->ns->bdev && !bdev_nonrot(req->ns->bdev))
+		id->nsfeat |= NVME_NS_ROTATIONAL;
+
+	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
+	kfree(id);
+out:
+	nvmet_req_complete(req, status);
+}
+
 static void nvmet_execute_identify(struct nvmet_req *req)
 {
 	if (!nvmet_check_transfer_len(req, NVME_IDENTIFY_DATA_SIZE))
@@ -959,6 +988,9 @@ static void nvmet_execute_identify(struct nvmet_req *req)
 	case NVME_ID_CNS_NS_ACTIVE_LIST_CS:
 		nvmet_execute_identify_nslist(req, true);
 		return;
+	case NVME_ID_CNS_NS_CS_INDEP:
+		nvmet_execute_id_cs_indep(req);
+		return;
 	case NVME_ID_CNS_ENDGRP_LIST:
 		nvmet_execute_identify_endgrp_list(req);
 		return;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 99cf0ee73714..c136d64c7d73 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -563,6 +563,7 @@ enum {
 	NVME_NS_FLBAS_LBA_SHIFT	= 1,
 	NVME_NS_FLBAS_META_EXT	= 0x10,
 	NVME_NS_NMIC_SHARED	= 1 << 0,
+	NVME_NS_ROTATIONAL	= 1 << 4,
 	NVME_LBAF_RP_BEST	= 0,
 	NVME_LBAF_RP_BETTER	= 1,
 	NVME_LBAF_RP_GOOD	= 2,
-- 
cgit v1.2.3


From 8a825d22a70915bd80c811fa93538cf2540af29d Mon Sep 17 00:00:00 2001
From: Guixin Liu <kanie@linux.alibaba.com>
Date: Mon, 4 Nov 2024 16:55:00 +0800
Subject: nvme: check ns's volatile write cache not present

When the VWC of a namespace does not exist, the BLK_FEAT_WRITE_CACHE
flag should not be set when registering the block device, regardless
of whether the controller supports VWC.

Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 4 +++-
 include/linux/nvme.h     | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6f51dde7de6c..e119ba0f8ab8 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -43,6 +43,7 @@ struct nvme_ns_info {
 	bool is_ready;
 	bool is_removed;
 	bool is_rotational;
+	bool no_vwc;
 };
 
 unsigned int admin_timeout = 60;
@@ -1617,6 +1618,7 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
 		info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
 		info->is_ready = id->nstat & NVME_NSTAT_NRDY;
 		info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
+		info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
 	}
 	kfree(id);
 	return ret;
@@ -2159,7 +2161,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 	    ns->head->ids.csi == NVME_CSI_ZNS)
 		nvme_update_zone_info(ns, &lim, &zi);
 
-	if (ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+	if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc)
 		lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
 	else
 		lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index c136d64c7d73..0a6e22038ce3 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -564,6 +564,7 @@ enum {
 	NVME_NS_FLBAS_META_EXT	= 0x10,
 	NVME_NS_NMIC_SHARED	= 1 << 0,
 	NVME_NS_ROTATIONAL	= 1 << 4,
+	NVME_NS_VWC_NOT_PRESENT = 1 << 5,
 	NVME_LBAF_RP_BEST	= 0,
 	NVME_LBAF_RP_BETTER	= 1,
 	NVME_LBAF_RP_GOOD	= 2,
-- 
cgit v1.2.3


From 1af75b2ad08bd5977c51c2d0fc11741a4c0a48d9 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@oss.qualcomm.com>
Date: Sun, 10 Nov 2024 09:33:40 -0800
Subject: firmware: qcom: scm: Introduce CP_SMMU_APERTURE_ID

The QCOM_SCM_SVC_MP service provides QCOM_SCM_MP_CP_SMMU_APERTURE_ID,
which is used to trigger the mapping of register banks into the SMMU
context for per-processes page tables to function (in case this isn't
statically setup by firmware).

This is necessary on e.g. QCS6490 Rb3Gen2, in order to avoid "CP | AHB
bus error"-errors from the GPU.

Introduce a function to allow the msm driver to invoke this call.

Signed-off-by: Bjorn Andersson <bjorn.andersson@oss.qualcomm.com>
Reviewed-by: Rob Clark <robdclark@gmail.com>
Link: https://lore.kernel.org/r/20241110-adreno-smmu-aparture-v2-1-9b1fb2ee41d4@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/firmware/qcom/qcom_scm.c       | 26 ++++++++++++++++++++++++++
 drivers/firmware/qcom/qcom_scm.h       |  1 +
 include/linux/firmware/qcom/qcom_scm.h |  2 ++
 3 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c
index bdb42eb2e1e8..596a8acf0899 100644
--- a/drivers/firmware/qcom/qcom_scm.c
+++ b/drivers/firmware/qcom/qcom_scm.c
@@ -903,6 +903,32 @@ int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare)
 }
 EXPORT_SYMBOL_GPL(qcom_scm_restore_sec_cfg);
 
+#define QCOM_SCM_CP_APERTURE_CONTEXT_MASK	GENMASK(7, 0)
+
+bool qcom_scm_set_gpu_smmu_aperture_is_available(void)
+{
+	return __qcom_scm_is_call_available(__scm->dev, QCOM_SCM_SVC_MP,
+					    QCOM_SCM_MP_CP_SMMU_APERTURE_ID);
+}
+EXPORT_SYMBOL_GPL(qcom_scm_set_gpu_smmu_aperture_is_available);
+
+int qcom_scm_set_gpu_smmu_aperture(unsigned int context_bank)
+{
+	struct qcom_scm_desc desc = {
+		.svc = QCOM_SCM_SVC_MP,
+		.cmd = QCOM_SCM_MP_CP_SMMU_APERTURE_ID,
+		.arginfo = QCOM_SCM_ARGS(4),
+		.args[0] = 0xffff0000 | FIELD_PREP(QCOM_SCM_CP_APERTURE_CONTEXT_MASK, context_bank),
+		.args[1] = 0xffffffff,
+		.args[2] = 0xffffffff,
+		.args[3] = 0xffffffff,
+		.owner = ARM_SMCCC_OWNER_SIP
+	};
+
+	return qcom_scm_call(__scm->dev, &desc, NULL);
+}
+EXPORT_SYMBOL_GPL(qcom_scm_set_gpu_smmu_aperture);
+
 int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size)
 {
 	struct qcom_scm_desc desc = {
diff --git a/drivers/firmware/qcom/qcom_scm.h b/drivers/firmware/qcom/qcom_scm.h
index 685b8f59e7a6..e36b2f67607f 100644
--- a/drivers/firmware/qcom/qcom_scm.h
+++ b/drivers/firmware/qcom/qcom_scm.h
@@ -116,6 +116,7 @@ struct qcom_tzmem_pool *qcom_scm_get_tzmem_pool(void);
 #define QCOM_SCM_MP_IOMMU_SET_CP_POOL_SIZE	0x05
 #define QCOM_SCM_MP_VIDEO_VAR			0x08
 #define QCOM_SCM_MP_ASSIGN			0x16
+#define QCOM_SCM_MP_CP_SMMU_APERTURE_ID		0x1b
 #define QCOM_SCM_MP_SHM_BRIDGE_ENABLE		0x1c
 #define QCOM_SCM_MP_SHM_BRIDGE_DELETE		0x1d
 #define QCOM_SCM_MP_SHM_BRIDGE_CREATE		0x1e
diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h
index 9f14976399ab..4621aec0328c 100644
--- a/include/linux/firmware/qcom/qcom_scm.h
+++ b/include/linux/firmware/qcom/qcom_scm.h
@@ -85,6 +85,8 @@ int qcom_scm_io_writel(phys_addr_t addr, unsigned int val);
 
 bool qcom_scm_restore_sec_cfg_available(void);
 int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare);
+int qcom_scm_set_gpu_smmu_aperture(unsigned int context_bank);
+bool qcom_scm_set_gpu_smmu_aperture_is_available(void);
 int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size);
 int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare);
 int qcom_scm_iommu_set_cp_pool_size(u32 spare, u32 size);
-- 
cgit v1.2.3


From b376d519bd142c65ba9bba35db12b6be95b46893 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 29 Sep 2024 20:50:16 -0400
Subject: xdrgen: Implement big-endian enums

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/xdr.h                         | 21 +++++++++++++
 tools/net/sunrpc/xdrgen/README                     | 17 +++++++++++
 tools/net/sunrpc/xdrgen/generators/enum.py         | 17 ++++++++---
 tools/net/sunrpc/xdrgen/generators/union.py        | 34 ++++++++++++++++------
 tools/net/sunrpc/xdrgen/grammars/xdr.lark          |  4 ++-
 .../xdrgen/templates/C/enum/decoder/enum_be.j2     | 14 +++++++++
 .../xdrgen/templates/C/enum/definition/close_be.j2 |  3 ++
 .../xdrgen/templates/C/enum/encoder/enum_be.j2     | 14 +++++++++
 .../templates/C/union/decoder/case_spec_be.j2      |  2 ++
 .../templates/C/union/encoder/case_spec_be.j2      |  2 ++
 tools/net/sunrpc/xdrgen/xdr_ast.py                 |  3 ++
 11 files changed, 117 insertions(+), 14 deletions(-)
 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2
 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2
 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/enum/encoder/enum_be.j2
 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/union/decoder/case_spec_be.j2
 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/union/encoder/case_spec_be.j2

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 5f775e104f9a..a2ab813a9800 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -680,6 +680,27 @@ xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr)
 	return 0;
 }
 
+/**
+ * xdr_stream_decode_be32 - Decode a big-endian 32-bit integer
+ * @xdr: pointer to xdr_stream
+ * @ptr: location to store integer
+ *
+ * Return values:
+ *   %0 on success
+ *   %-EBADMSG on XDR buffer overflow
+ */
+static inline ssize_t
+xdr_stream_decode_be32(struct xdr_stream *xdr, __be32 *ptr)
+{
+	const size_t count = sizeof(*ptr);
+	__be32 *p = xdr_inline_decode(xdr, count);
+
+	if (unlikely(!p))
+		return -EBADMSG;
+	*ptr = *p;
+	return 0;
+}
+
 /**
  * xdr_stream_decode_u64 - Decode a 64-bit integer
  * @xdr: pointer to xdr_stream
diff --git a/tools/net/sunrpc/xdrgen/README b/tools/net/sunrpc/xdrgen/README
index 92f7738ad50c..27218a78ab40 100644
--- a/tools/net/sunrpc/xdrgen/README
+++ b/tools/net/sunrpc/xdrgen/README
@@ -150,6 +150,23 @@ Pragma directives specify exceptions to the normal generation of
 encoding and decoding functions. Currently one directive is
 implemented: "public".
 
+Pragma big_endian
+------ ----------
+
+  pragma big_endian <enum> ;
+
+For variables that might contain only a small number values, it
+is more efficient to avoid the byte-swap when encoding or decoding
+on little-endian machines. Such is often the case with error status
+codes. For example:
+
+  pragma big_endian nfsstat3;
+
+In this case, when generating an XDR struct or union containing a
+field of type "nfsstat3", xdrgen will make the type of that field
+"__be32" instead of "enum nfsstat3". XDR unions then switch on the
+non-byte-swapped value of that field.
+
 Pragma exclude
 ------ -------
 
diff --git a/tools/net/sunrpc/xdrgen/generators/enum.py b/tools/net/sunrpc/xdrgen/generators/enum.py
index e37b5c297821..e63f45b8eb74 100644
--- a/tools/net/sunrpc/xdrgen/generators/enum.py
+++ b/tools/net/sunrpc/xdrgen/generators/enum.py
@@ -4,7 +4,7 @@
 """Generate code to handle XDR enum types"""
 
 from generators import SourceGenerator, create_jinja2_environment
-from xdr_ast import _XdrEnum, public_apis
+from xdr_ast import _XdrEnum, public_apis, big_endian
 
 
 class XdrEnumGenerator(SourceGenerator):
@@ -30,15 +30,24 @@ class XdrEnumGenerator(SourceGenerator):
         for enumerator in node.enumerators:
             print(template.render(name=enumerator.name, value=enumerator.value))
 
-        template = self.environment.get_template("definition/close.j2")
+        if node.name in big_endian:
+            template = self.environment.get_template("definition/close_be.j2")
+        else:
+            template = self.environment.get_template("definition/close.j2")
         print(template.render(name=node.name))
 
     def emit_decoder(self, node: _XdrEnum) -> None:
         """Emit one decoder function for an XDR enum type"""
-        template = self.environment.get_template("decoder/enum.j2")
+        if node.name in big_endian:
+            template = self.environment.get_template("decoder/enum_be.j2")
+        else:
+            template = self.environment.get_template("decoder/enum.j2")
         print(template.render(name=node.name))
 
     def emit_encoder(self, node: _XdrEnum) -> None:
         """Emit one encoder function for an XDR enum type"""
-        template = self.environment.get_template("encoder/enum.j2")
+        if node.name in big_endian:
+            template = self.environment.get_template("encoder/enum_be.j2")
+        else:
+            template = self.environment.get_template("encoder/enum.j2")
         print(template.render(name=node.name))
diff --git a/tools/net/sunrpc/xdrgen/generators/union.py b/tools/net/sunrpc/xdrgen/generators/union.py
index 7974967bbb9f..4522a5b7a943 100644
--- a/tools/net/sunrpc/xdrgen/generators/union.py
+++ b/tools/net/sunrpc/xdrgen/generators/union.py
@@ -8,7 +8,7 @@ from jinja2 import Environment
 from generators import SourceGenerator
 from generators import create_jinja2_environment, get_jinja2_template
 
-from xdr_ast import _XdrBasic, _XdrUnion, _XdrVoid
+from xdr_ast import _XdrBasic, _XdrUnion, _XdrVoid, big_endian
 from xdr_ast import _XdrDeclaration, _XdrCaseSpec, public_apis
 
 
@@ -77,13 +77,18 @@ def emit_union_switch_spec_decoder(
     print(template.render(name=node.name, type=node.spec.type_name))
 
 
-def emit_union_case_spec_decoder(environment: Environment, node: _XdrCaseSpec) -> None:
+def emit_union_case_spec_decoder(
+    environment: Environment, node: _XdrCaseSpec, big_endian_discriminant: bool
+) -> None:
     """Emit decoder functions for an XDR union's case arm"""
 
     if isinstance(node.arm, _XdrVoid):
         return
 
-    template = get_jinja2_template(environment, "decoder", "case_spec")
+    if big_endian_discriminant:
+        template = get_jinja2_template(environment, "decoder", "case_spec_be")
+    else:
+        template = get_jinja2_template(environment, "decoder", "case_spec")
     for case in node.values:
         print(template.render(case=case))
 
@@ -136,7 +141,11 @@ def emit_union_decoder(environment: Environment, node: _XdrUnion) -> None:
     emit_union_switch_spec_decoder(environment, node.discriminant)
 
     for case in node.cases:
-        emit_union_case_spec_decoder(environment, case)
+        emit_union_case_spec_decoder(
+            environment,
+            case,
+            node.discriminant.spec.type_name in big_endian,
+        )
 
     emit_union_default_spec_decoder(environment, node)
 
@@ -153,17 +162,21 @@ def emit_union_switch_spec_encoder(
     print(template.render(name=node.name, type=node.spec.type_name))
 
 
-def emit_union_case_spec_encoder(environment: Environment, node: _XdrCaseSpec) -> None:
+def emit_union_case_spec_encoder(
+    environment: Environment, node: _XdrCaseSpec, big_endian_discriminant: bool
+) -> None:
     """Emit encoder functions for an XDR union's case arm"""
 
     if isinstance(node.arm, _XdrVoid):
         return
 
-    template = get_jinja2_template(environment, "encoder", "case_spec")
+    if big_endian_discriminant:
+        template = get_jinja2_template(environment, "encoder", "case_spec_be")
+    else:
+        template = get_jinja2_template(environment, "encoder", "case_spec")
     for case in node.values:
         print(template.render(case=case))
 
-    assert isinstance(node.arm, _XdrBasic)
     template = get_jinja2_template(environment, "encoder", node.arm.template)
     print(
         template.render(
@@ -192,7 +205,6 @@ def emit_union_default_spec_encoder(environment: Environment, node: _XdrUnion) -
         print(template.render())
         return
 
-    assert isinstance(default_case.arm, _XdrBasic)
     template = get_jinja2_template(environment, "encoder", default_case.arm.template)
     print(
         template.render(
@@ -210,7 +222,11 @@ def emit_union_encoder(environment, node: _XdrUnion) -> None:
     emit_union_switch_spec_encoder(environment, node.discriminant)
 
     for case in node.cases:
-        emit_union_case_spec_encoder(environment, case)
+        emit_union_case_spec_encoder(
+            environment,
+            case,
+            node.discriminant.spec.type_name in big_endian,
+        )
 
     emit_union_default_spec_encoder(environment, node)
 
diff --git a/tools/net/sunrpc/xdrgen/grammars/xdr.lark b/tools/net/sunrpc/xdrgen/grammars/xdr.lark
index 0e1aeb02d667..7c2c1b8c86d1 100644
--- a/tools/net/sunrpc/xdrgen/grammars/xdr.lark
+++ b/tools/net/sunrpc/xdrgen/grammars/xdr.lark
@@ -87,12 +87,14 @@ procedure_def           : type_specifier identifier "(" type_specifier ")" "=" c
 
 pragma_def              : "pragma" directive identifier [ identifier ] ";"
 
-directive               : exclude_directive
+directive               : big_endian_directive
+                        | exclude_directive
                         | header_directive
                         | pages_directive
                         | public_directive
                         | skip_directive
 
+big_endian_directive    : "big_endian"
 exclude_directive       : "exclude"
 header_directive        : "header"
 pages_directive         : "pages"
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2
new file mode 100644
index 000000000000..44c391c10b42
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2
@@ -0,0 +1,14 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+
+{% if annotate %}
+/* enum {{ name }} (big-endian) */
+{% endif %}
+{% if name in public_apis %}
+bool
+{% else %}
+static bool __maybe_unused
+{% endif %}
+xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr)
+{
+	return xdr_stream_decode_be32(xdr, ptr) == 0;
+}
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2
new file mode 100644
index 000000000000..2c18948bddf7
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2
@@ -0,0 +1,3 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+};
+typedef __be32 {{ name }};
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/encoder/enum_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/encoder/enum_be.j2
new file mode 100644
index 000000000000..fbbcc45948d6
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/encoder/enum_be.j2
@@ -0,0 +1,14 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+
+{% if annotate %}
+/* enum {{ name }} (big-endian) */
+{% endif %}
+{% if name in public_apis %}
+bool
+{% else %}
+static bool __maybe_unused
+{% endif %}
+xdrgen_encode_{{ name }}(struct xdr_stream *xdr, {{ name }} value)
+{
+	return xdr_stream_encode_be32(xdr, value) == XDR_UNIT;
+}
diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/decoder/case_spec_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/case_spec_be.j2
new file mode 100644
index 000000000000..917f3a1c4588
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/case_spec_be.j2
@@ -0,0 +1,2 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+	case __constant_cpu_to_be32({{ case }}):
diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/encoder/case_spec_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/case_spec_be.j2
new file mode 100644
index 000000000000..917f3a1c4588
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/case_spec_be.j2
@@ -0,0 +1,2 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+	case __constant_cpu_to_be32({{ case }}):
diff --git a/tools/net/sunrpc/xdrgen/xdr_ast.py b/tools/net/sunrpc/xdrgen/xdr_ast.py
index 576e1ecfe1d7..d5f0535ec84c 100644
--- a/tools/net/sunrpc/xdrgen/xdr_ast.py
+++ b/tools/net/sunrpc/xdrgen/xdr_ast.py
@@ -12,6 +12,7 @@ from lark.tree import Meta
 
 this_module = sys.modules[__name__]
 
+big_endian = []
 excluded_apis = []
 header_name = "none"
 public_apis = []
@@ -480,6 +481,8 @@ class ParseToAst(Transformer):
         """Instantiate one _Pragma object"""
         directive = children[0].children[0].data
         match directive:
+            case "big_endian_directive":
+                big_endian.append(children[1].symbol)
             case "exclude_directive":
                 excluded_apis.append(children[1].symbol)
             case "header_directive":
-- 
cgit v1.2.3


From 631c2925bae41c11dcf3915a2ab5f3be9af54277 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Oct 2024 14:54:33 -0400
Subject: xdrgen: Keep track of on-the-wire data type widths

The generic parts of the RPC layer need to know the widths (in
XDR_UNIT increments) of the XDR data types defined for each
protocol.

As a first step, add dictionaries to keep track of the symbolic and
actual maximum XDR width of XDR types.

This makes it straightforward to look up the width of a type by its
name. The built-in dictionaries are pre-loaded with the widths of
the built-in XDR types as defined in RFC 4506.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/xdrgen/_defs.h |  9 ++++++++
 tools/net/sunrpc/xdrgen/xdr_ast.py  | 43 +++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdrgen/_defs.h b/include/linux/sunrpc/xdrgen/_defs.h
index be9e62371758..20c7270aa64d 100644
--- a/include/linux/sunrpc/xdrgen/_defs.h
+++ b/include/linux/sunrpc/xdrgen/_defs.h
@@ -23,4 +23,13 @@ typedef struct {
 	u8 *data;
 } opaque;
 
+#define XDR_void		(0)
+#define XDR_bool		(1)
+#define XDR_int			(1)
+#define XDR_unsigned_int	(1)
+#define XDR_long		(1)
+#define XDR_unsigned_long	(1)
+#define XDR_hyper		(2)
+#define XDR_unsigned_hyper	(2)
+
 #endif /* _SUNRPC_XDRGEN__DEFS_H_ */
diff --git a/tools/net/sunrpc/xdrgen/xdr_ast.py b/tools/net/sunrpc/xdrgen/xdr_ast.py
index b7df45f47707..f1d93a1d0ed8 100644
--- a/tools/net/sunrpc/xdrgen/xdr_ast.py
+++ b/tools/net/sunrpc/xdrgen/xdr_ast.py
@@ -21,6 +21,31 @@ pass_by_reference = set()
 
 constants = {}
 
+symbolic_widths = {
+    "void": ["XDR_void"],
+    "bool": ["XDR_bool"],
+    "int": ["XDR_int"],
+    "unsigned_int": ["XDR_unsigned_int"],
+    "long": ["XDR_long"],
+    "unsigned_long": ["XDR_unsigned_long"],
+    "hyper": ["XDR_hyper"],
+    "unsigned_hyper": ["XDR_unsigned_hyper"],
+}
+
+# Numeric XDR widths are tracked in a dictionary that is keyed
+# by type_name because sometimes a caller has nothing more than
+# the type_name to use to figure out the numeric width.
+max_widths = {
+    "void": 0,
+    "bool": 1,
+    "int": 1,
+    "unsigned_int": 1,
+    "long": 1,
+    "unsigned_long": 1,
+    "hyper": 2,
+    "unsigned_hyper": 2,
+}
+
 
 @dataclass
 class _XdrAst(ast_utils.Ast):
@@ -60,15 +85,24 @@ class _XdrTypeSpecifier(_XdrAst):
 class _XdrDefinedType(_XdrTypeSpecifier):
     """Corresponds to a type defined by the input specification"""
 
+    def symbolic_width(self) -> List:
+        """Return list containing XDR width of type's components"""
+        return [get_header_name().upper() + "_" + self.type_name + "_sz"]
+
     def __post_init__(self):
         if self.type_name in structs:
             self.c_classifier = "struct "
+        symbolic_widths[self.type_name] = self.symbolic_width()
 
 
 @dataclass
 class _XdrBuiltInType(_XdrTypeSpecifier):
     """Corresponds to a built-in XDR type"""
 
+    def symbolic_width(self) -> List:
+        """Return list containing XDR width of type's components"""
+        return symbolic_widths[self.type_name]
+
 
 @dataclass
 class _XdrDeclaration(_XdrAst):
@@ -148,8 +182,17 @@ class _XdrBasic(_XdrDeclaration):
 class _XdrVoid(_XdrDeclaration):
     """A void declaration"""
 
+    name: str = "void"
     template: str = "void"
 
+    def max_width(self) -> int:
+        """Return width of type in XDR_UNITS"""
+        return 0
+
+    def symbolic_width(self) -> List:
+        """Return list containing XDR width of type's components"""
+        return []
+
 
 @dataclass
 class _XdrConstant(_XdrAst):
-- 
cgit v1.2.3


From 65941f10caf2c04781a7defa4ec0ab119dbd235a Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Mon, 28 Oct 2024 19:53:37 +0800
Subject: mm: move the page fragment allocator from page_alloc into its own
 file

Inspired by [1], move the page fragment allocator from page_alloc
into its own c file and header file, as we are about to make more
change for it to replace another page_frag implementation in
sock.c

As this patchset is going to replace 'struct page_frag' with
'struct page_frag_cache' in sched.h, including page_frag_cache.h
in sched.h has a compiler error caused by interdependence between
mm_types.h and mm.h for asm-offsets.c, see [2]. So avoid the compiler
error by moving 'struct page_frag_cache' to mm_types_task.h as
suggested by Alexander, see [3].

1. https://lore.kernel.org/all/20230411160902.4134381-3-dhowells@redhat.com/
2. https://lore.kernel.org/all/15623dac-9358-4597-b3ee-3694a5956920@gmail.com/
3. https://lore.kernel.org/all/CAKgT0UdH1yD=LSCXFJ=YM_aiA4OomD-2wXykO42bizaWMt_HOA@mail.gmail.com/
CC: David Howells <dhowells@redhat.com>
CC: Linux-MM <linux-mm@kvack.org>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/20241028115343.3405838-3-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/gfp.h                                |  22 ----
 include/linux/mm_types.h                           |  18 ---
 include/linux/mm_types_task.h                      |  18 +++
 include/linux/page_frag_cache.h                    |  31 +++++
 include/linux/skbuff.h                             |   1 +
 mm/Makefile                                        |   1 +
 mm/page_alloc.c                                    | 136 -------------------
 mm/page_frag_cache.c                               | 145 +++++++++++++++++++++
 .../selftests/mm/page_frag/page_frag_test.c        |   2 +-
 9 files changed, 197 insertions(+), 177 deletions(-)
 create mode 100644 include/linux/page_frag_cache.h
 create mode 100644 mm/page_frag_cache.c

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index a951de920e20..a0a6d25f883f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -371,28 +371,6 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 
-struct page_frag_cache;
-void page_frag_cache_drain(struct page_frag_cache *nc);
-extern void __page_frag_cache_drain(struct page *page, unsigned int count);
-void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
-			      gfp_t gfp_mask, unsigned int align_mask);
-
-static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
-					  unsigned int fragsz, gfp_t gfp_mask,
-					  unsigned int align)
-{
-	WARN_ON_ONCE(!is_power_of_2(align));
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
-}
-
-static inline void *page_frag_alloc(struct page_frag_cache *nc,
-			     unsigned int fragsz, gfp_t gfp_mask)
-{
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
-}
-
-extern void page_frag_free(void *addr);
-
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6e3bdf8e38bc..92314ef2d978 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -521,9 +521,6 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
  */
 #define STRUCT_PAGE_MAX_SHIFT	(order_base_2(sizeof(struct page)))
 
-#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
-#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
-
 /*
  * page_private can be used on tail pages.  However, PagePrivate is only
  * checked by the VM on the head page.  So page_private on the tail pages
@@ -542,21 +539,6 @@ static inline void *folio_get_private(struct folio *folio)
 	return folio->private;
 }
 
-struct page_frag_cache {
-	void * va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	__u16 offset;
-	__u16 size;
-#else
-	__u32 offset;
-#endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_refcount every time we allocate a fragment.
-	 */
-	unsigned int		pagecnt_bias;
-	bool pfmemalloc;
-};
-
 typedef unsigned long vm_flags_t;
 
 /*
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index bff5706b76e1..0ac6daebdd5c 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -8,6 +8,7 @@
  * (These are defined separately to decouple sched.h from mm_types.h as much as possible.)
  */
 
+#include <linux/align.h>
 #include <linux/types.h>
 
 #include <asm/page.h>
@@ -43,6 +44,23 @@ struct page_frag {
 #endif
 };
 
+#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
+#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
+struct page_frag_cache {
+	void *va;
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	__u16 offset;
+	__u16 size;
+#else
+	__u32 offset;
+#endif
+	/* we maintain a pagecount bias, so that we dont dirty cache line
+	 * containing page->_refcount every time we allocate a fragment.
+	 */
+	unsigned int		pagecnt_bias;
+	bool pfmemalloc;
+};
+
 /* Track pages that require TLB flushes */
 struct tlbflush_unmap_batch {
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
new file mode 100644
index 000000000000..67ac8626ed9b
--- /dev/null
+++ b/include/linux/page_frag_cache.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_PAGE_FRAG_CACHE_H
+#define _LINUX_PAGE_FRAG_CACHE_H
+
+#include <linux/log2.h>
+#include <linux/mm_types_task.h>
+#include <linux/types.h>
+
+void page_frag_cache_drain(struct page_frag_cache *nc);
+void __page_frag_cache_drain(struct page *page, unsigned int count);
+void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
+			      gfp_t gfp_mask, unsigned int align_mask);
+
+static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
+					  unsigned int fragsz, gfp_t gfp_mask,
+					  unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
+}
+
+static inline void *page_frag_alloc(struct page_frag_cache *nc,
+				    unsigned int fragsz, gfp_t gfp_mask)
+{
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
+}
+
+void page_frag_free(void *addr);
+
+#endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e5095d75abba..60535c706851 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -31,6 +31,7 @@
 #include <linux/in6.h>
 #include <linux/if_packet.h>
 #include <linux/llist.h>
+#include <linux/page_frag_cache.h>
 #include <net/flow.h>
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <linux/netfilter/nf_conntrack_common.h>
diff --git a/mm/Makefile b/mm/Makefile
index d5639b036166..dba52bb0da8a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -65,6 +65,7 @@ page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
 memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 
 obj-y += page-alloc.o
+obj-y += page_frag_cache.o
 obj-y += init-mm.o
 obj-y += memblock.o
 obj-y += $(memory-hotplug-y)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94a2ffe28008..34b3eb74630a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4836,142 +4836,6 @@ void free_pages(unsigned long addr, unsigned int order)
 
 EXPORT_SYMBOL(free_pages);
 
-/*
- * Page Fragment:
- *  An arbitrary-length arbitrary-offset area of memory which resides
- *  within a 0 or higher order page.  Multiple fragments within that page
- *  are individually refcounted, in the page's reference counter.
- *
- * The page_frag functions below provide a simple allocation framework for
- * page fragments.  This is used by the network stack and network device
- * drivers to provide a backing region of memory for use as either an
- * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
- */
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
-					     gfp_t gfp_mask)
-{
-	struct page *page = NULL;
-	gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
-		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
-	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-#endif
-	if (unlikely(!page))
-		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-
-	nc->va = page ? page_address(page) : NULL;
-
-	return page;
-}
-
-void page_frag_cache_drain(struct page_frag_cache *nc)
-{
-	if (!nc->va)
-		return;
-
-	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
-	nc->va = NULL;
-}
-EXPORT_SYMBOL(page_frag_cache_drain);
-
-void __page_frag_cache_drain(struct page *page, unsigned int count)
-{
-	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
-
-	if (page_ref_sub_and_test(page, count))
-		free_unref_page(page, compound_order(page));
-}
-EXPORT_SYMBOL(__page_frag_cache_drain);
-
-void *__page_frag_alloc_align(struct page_frag_cache *nc,
-			      unsigned int fragsz, gfp_t gfp_mask,
-			      unsigned int align_mask)
-{
-	unsigned int size = PAGE_SIZE;
-	struct page *page;
-	int offset;
-
-	if (unlikely(!nc->va)) {
-refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
-		if (!page)
-			return NULL;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = size;
-	}
-
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
-		page = virt_to_page(nc->va);
-
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-		if (unlikely(nc->pfmemalloc)) {
-			free_unref_page(page, compound_order(page));
-			goto refill;
-		}
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = size - fragsz;
-		if (unlikely(offset < 0)) {
-			/*
-			 * The caller is trying to allocate a fragment
-			 * with fragsz > PAGE_SIZE but the cache isn't big
-			 * enough to satisfy the request, this may
-			 * happen in low memory conditions.
-			 * We don't release the cache page because
-			 * it could make memory pressure worse
-			 * so we simply return NULL here.
-			 */
-			return NULL;
-		}
-	}
-
-	nc->pagecnt_bias--;
-	offset &= align_mask;
-	nc->offset = offset;
-
-	return nc->va + offset;
-}
-EXPORT_SYMBOL(__page_frag_alloc_align);
-
-/*
- * Frees a page fragment allocated out of either a compound or order 0 page.
- */
-void page_frag_free(void *addr)
-{
-	struct page *page = virt_to_head_page(addr);
-
-	if (unlikely(put_page_testzero(page)))
-		free_unref_page(page, compound_order(page));
-}
-EXPORT_SYMBOL(page_frag_free);
-
 static void *make_alloc_exact(unsigned long addr, unsigned int order,
 		size_t size)
 {
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
new file mode 100644
index 000000000000..609a485cd02a
--- /dev/null
+++ b/mm/page_frag_cache.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Page fragment allocator
+ *
+ * Page Fragment:
+ *  An arbitrary-length arbitrary-offset area of memory which resides within a
+ *  0 or higher order page.  Multiple fragments within that page are
+ *  individually refcounted, in the page's reference counter.
+ *
+ * The page_frag functions provide a simple allocation framework for page
+ * fragments.  This is used by the network stack and network device drivers to
+ * provide a backing region of memory for use as either an sk_buff->head, or to
+ * be used in the "frags" portion of skb_shared_info.
+ */
+
+#include <linux/export.h>
+#include <linux/gfp_types.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/page_frag_cache.h>
+#include "internal.h"
+
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+					     gfp_t gfp_mask)
+{
+	struct page *page = NULL;
+	gfp_t gfp = gfp_mask;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
+	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+				PAGE_FRAG_CACHE_MAX_ORDER);
+	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+#endif
+	if (unlikely(!page))
+		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+	nc->va = page ? page_address(page) : NULL;
+
+	return page;
+}
+
+void page_frag_cache_drain(struct page_frag_cache *nc)
+{
+	if (!nc->va)
+		return;
+
+	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
+	nc->va = NULL;
+}
+EXPORT_SYMBOL(page_frag_cache_drain);
+
+void __page_frag_cache_drain(struct page *page, unsigned int count)
+{
+	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+	if (page_ref_sub_and_test(page, count))
+		free_unref_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(__page_frag_cache_drain);
+
+void *__page_frag_alloc_align(struct page_frag_cache *nc,
+			      unsigned int fragsz, gfp_t gfp_mask,
+			      unsigned int align_mask)
+{
+	unsigned int size = PAGE_SIZE;
+	struct page *page;
+	int offset;
+
+	if (unlikely(!nc->va)) {
+refill:
+		page = __page_frag_cache_refill(nc, gfp_mask);
+		if (!page)
+			return NULL;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
+		/* Even if we own the page, we do not use atomic_set().
+		 * This would break get_page_unless_zero() users.
+		 */
+		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pfmemalloc = page_is_pfmemalloc(page);
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+		nc->offset = size;
+	}
+
+	offset = nc->offset - fragsz;
+	if (unlikely(offset < 0)) {
+		page = virt_to_page(nc->va);
+
+		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+			goto refill;
+
+		if (unlikely(nc->pfmemalloc)) {
+			free_unref_page(page, compound_order(page));
+			goto refill;
+		}
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
+		/* OK, page count is 0, we can safely set it */
+		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+		offset = size - fragsz;
+		if (unlikely(offset < 0)) {
+			/*
+			 * The caller is trying to allocate a fragment
+			 * with fragsz > PAGE_SIZE but the cache isn't big
+			 * enough to satisfy the request, this may
+			 * happen in low memory conditions.
+			 * We don't release the cache page because
+			 * it could make memory pressure worse
+			 * so we simply return NULL here.
+			 */
+			return NULL;
+		}
+	}
+
+	nc->pagecnt_bias--;
+	offset &= align_mask;
+	nc->offset = offset;
+
+	return nc->va + offset;
+}
+EXPORT_SYMBOL(__page_frag_alloc_align);
+
+/*
+ * Frees a page fragment allocated out of either a compound or order 0 page.
+ */
+void page_frag_free(void *addr)
+{
+	struct page *page = virt_to_head_page(addr);
+
+	if (unlikely(put_page_testzero(page)))
+		free_unref_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(page_frag_free);
diff --git a/tools/testing/selftests/mm/page_frag/page_frag_test.c b/tools/testing/selftests/mm/page_frag/page_frag_test.c
index 912d97b99107..13c44133e009 100644
--- a/tools/testing/selftests/mm/page_frag/page_frag_test.c
+++ b/tools/testing/selftests/mm/page_frag/page_frag_test.c
@@ -6,12 +6,12 @@
  * Copyright (C) 2024 Yunsheng Lin <linyunsheng@huawei.com>
  */
 
-#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/cpumask.h>
 #include <linux/completion.h>
 #include <linux/ptr_ring.h>
 #include <linux/kthread.h>
+#include <linux/page_frag_cache.h>
 
 #define TEST_FAILED_PREFIX	"page_frag_test failed: "
 
-- 
cgit v1.2.3


From 3d18dfe69ce46f106af327736d2261d7e3ee81c0 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Mon, 28 Oct 2024 19:53:39 +0800
Subject: mm: page_frag: avoid caller accessing 'page_frag_cache' directly

Use appropriate frag_page API instead of caller accessing
'page_frag_cache' directly.

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Linux-MM <linux-mm@kvack.org>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Link: https://patch.msgid.link/20241028115343.3405838-5-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/vhost/net.c                                   |  2 +-
 include/linux/page_frag_cache.h                       | 10 ++++++++++
 net/core/skbuff.c                                     |  6 +++---
 net/rxrpc/conn_object.c                               |  4 +---
 net/rxrpc/local_object.c                              |  4 +---
 net/sunrpc/svcsock.c                                  |  6 ++----
 tools/testing/selftests/mm/page_frag/page_frag_test.c |  2 +-
 7 files changed, 19 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f16279351db5..9ad37c012189 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 			vqs[VHOST_NET_VQ_RX]);
 
 	f->private_data = n;
-	n->pf_cache.va = NULL;
+	page_frag_cache_init(&n->pf_cache);
 
 	return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 67ac8626ed9b..0a52f7a179c8 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -7,6 +7,16 @@
 #include <linux/mm_types_task.h>
 #include <linux/types.h>
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+	nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+	return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 00afeb90c23a..6841e61a6bd0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -753,14 +753,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 	if (in_hardirq() || irqs_disabled()) {
 		nc = this_cpu_ptr(&netdev_alloc_cache);
 		data = page_frag_alloc(nc, len, gfp_mask);
-		pfmemalloc = nc->pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 	} else {
 		local_bh_disable();
 		local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
 		nc = this_cpu_ptr(&napi_alloc_cache.page);
 		data = page_frag_alloc(nc, len, gfp_mask);
-		pfmemalloc = nc->pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 
 		local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 		local_bh_enable();
@@ -850,7 +850,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 		len = SKB_HEAD_ALIGN(len);
 
 		data = page_frag_alloc(&nc->page, len, gfp_mask);
-		pfmemalloc = nc->page.pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
 	}
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
 	 */
 	rxrpc_purge_queue(&conn->rx_queue);
 
-	if (conn->tx_data_alloc.va)
-		__page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-					conn->tx_data_alloc.pagecnt_bias);
+	page_frag_cache_drain(&conn->tx_data_alloc);
 	call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index f9623ace2201..2792d2304605 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
 	rxrpc_purge_queue(&local->rx_queue);
 	rxrpc_purge_client_connections(local);
-	if (local->tx_alloc.va)
-		__page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-					local->tx_alloc.pagecnt_bias);
+	page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 825ec5357691..b785425c3315 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1608,7 +1608,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
 	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-	struct page_frag_cache *pfc = &svsk->sk_frag_cache;
 	struct socket *sock = svsk->sk_sock;
 
 	trace_svcsock_free(svsk, sock);
@@ -1618,8 +1617,7 @@ static void svc_sock_free(struct svc_xprt *xprt)
 		sockfd_put(sock);
 	else
 		sock_release(sock);
-	if (pfc->va)
-		__page_frag_cache_drain(virt_to_head_page(pfc->va),
-					pfc->pagecnt_bias);
+
+	page_frag_cache_drain(&svsk->sk_frag_cache);
 	kfree(svsk);
 }
diff --git a/tools/testing/selftests/mm/page_frag/page_frag_test.c b/tools/testing/selftests/mm/page_frag/page_frag_test.c
index 13c44133e009..e806c1866e36 100644
--- a/tools/testing/selftests/mm/page_frag/page_frag_test.c
+++ b/tools/testing/selftests/mm/page_frag/page_frag_test.c
@@ -126,7 +126,7 @@ static int __init page_frag_test_init(void)
 	u64 duration;
 	int ret;
 
-	test_nc.va = NULL;
+	page_frag_cache_init(&test_nc);
 	atomic_set(&nthreads, 2);
 	init_completion(&wait);
 
-- 
cgit v1.2.3


From 0c3ce2f50261cd2f654d931eeb933c370a3a7d7a Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Mon, 28 Oct 2024 19:53:41 +0800
Subject: mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'

Currently there is one 'struct page_frag' for every 'struct
sock' and 'struct task_struct', we are about to replace the
'struct page_frag' with 'struct page_frag_cache' for them.
Before begin the replacing, we need to ensure the size of
'struct page_frag_cache' is not bigger than the size of
'struct page_frag', as there may be tens of thousands of
'struct sock' and 'struct task_struct' instances in the
system.

By or'ing the page order & pfmemalloc with lower bits of
'va' instead of using 'u16' or 'u32' for page size and 'u8'
for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
And page address & pfmemalloc & order is unchanged for the
same page in the same 'page_frag_cache' instance, it makes
sense to fit them together.

After this patch, the size of 'struct page_frag_cache' should be
the same as the size of 'struct page_frag'.

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Linux-MM <linux-mm@kvack.org>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/20241028115343.3405838-7-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mm_types_task.h   | 19 ++++++-----
 include/linux/page_frag_cache.h | 24 ++++++++++++--
 mm/page_frag_cache.c            | 70 ++++++++++++++++++++++++++++-------------
 3 files changed, 81 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 0ac6daebdd5c..a82aa80c0ba4 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -47,18 +47,21 @@ struct page_frag {
 #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
 struct page_frag_cache {
-	void *va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	/* encoded_page consists of the virtual address, pfmemalloc bit and
+	 * order of a page.
+	 */
+	unsigned long encoded_page;
+
+	/* we maintain a pagecount bias, so that we dont dirty cache line
+	 * containing page->_refcount every time we allocate a fragment.
+	 */
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
 	__u16 offset;
-	__u16 size;
+	__u16 pagecnt_bias;
 #else
 	__u32 offset;
+	__u32 pagecnt_bias;
 #endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_refcount every time we allocate a fragment.
-	 */
-	unsigned int		pagecnt_bias;
-	bool pfmemalloc;
 };
 
 /* Track pages that require TLB flushes */
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 0a52f7a179c8..41a91df82631 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -3,18 +3,38 @@
 #ifndef _LINUX_PAGE_FRAG_CACHE_H
 #define _LINUX_PAGE_FRAG_CACHE_H
 
+#include <linux/bits.h>
 #include <linux/log2.h>
 #include <linux/mm_types_task.h>
 #include <linux/types.h>
 
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+/* Use a full byte here to enable assembler optimization as the shift
+ * operation is usually expecting a byte.
+ */
+#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
+#else
+/* Compiler should be able to figure out we don't read things as any value
+ * ANDed with 0 is 0.
+ */
+#define PAGE_FRAG_CACHE_ORDER_MASK		0
+#endif
+
+#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		(PAGE_FRAG_CACHE_ORDER_MASK + 1)
+
+static inline bool encoded_page_decode_pfmemalloc(unsigned long encoded_page)
+{
+	return !!(encoded_page & PAGE_FRAG_CACHE_PFMEMALLOC_BIT);
+}
+
 static inline void page_frag_cache_init(struct page_frag_cache *nc)
 {
-	nc->va = NULL;
+	nc->encoded_page = 0;
 }
 
 static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
 {
-	return !!nc->pfmemalloc;
+	return encoded_page_decode_pfmemalloc(nc->encoded_page);
 }
 
 void page_frag_cache_drain(struct page_frag_cache *nc);
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 4c8e04379cb3..a36fd09bf275 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -12,6 +12,7 @@
  * be used in the "frags" portion of skb_shared_info.
  */
 
+#include <linux/build_bug.h>
 #include <linux/export.h>
 #include <linux/gfp_types.h>
 #include <linux/init.h>
@@ -19,9 +20,36 @@
 #include <linux/page_frag_cache.h>
 #include "internal.h"
 
+static unsigned long encoded_page_create(struct page *page, unsigned int order,
+					 bool pfmemalloc)
+{
+	BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK);
+	BUILD_BUG_ON(PAGE_FRAG_CACHE_PFMEMALLOC_BIT >= PAGE_SIZE);
+
+	return (unsigned long)page_address(page) |
+		(order & PAGE_FRAG_CACHE_ORDER_MASK) |
+		((unsigned long)pfmemalloc * PAGE_FRAG_CACHE_PFMEMALLOC_BIT);
+}
+
+static unsigned long encoded_page_decode_order(unsigned long encoded_page)
+{
+	return encoded_page & PAGE_FRAG_CACHE_ORDER_MASK;
+}
+
+static void *encoded_page_decode_virt(unsigned long encoded_page)
+{
+	return (void *)(encoded_page & PAGE_MASK);
+}
+
+static struct page *encoded_page_decode_page(unsigned long encoded_page)
+{
+	return virt_to_page((void *)encoded_page);
+}
+
 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 					     gfp_t gfp_mask)
 {
+	unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
 	struct page *page = NULL;
 	gfp_t gfp = gfp_mask;
 
@@ -30,23 +58,26 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
 	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
 				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
 #endif
-	if (unlikely(!page))
+	if (unlikely(!page)) {
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+		order = 0;
+	}
 
-	nc->va = page ? page_address(page) : NULL;
+	nc->encoded_page = page ?
+		encoded_page_create(page, order, page_is_pfmemalloc(page)) : 0;
 
 	return page;
 }
 
 void page_frag_cache_drain(struct page_frag_cache *nc)
 {
-	if (!nc->va)
+	if (!nc->encoded_page)
 		return;
 
-	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
-	nc->va = NULL;
+	__page_frag_cache_drain(encoded_page_decode_page(nc->encoded_page),
+				nc->pagecnt_bias);
+	nc->encoded_page = 0;
 }
 EXPORT_SYMBOL(page_frag_cache_drain);
 
@@ -63,35 +94,29 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			      unsigned int fragsz, gfp_t gfp_mask,
 			      unsigned int align_mask)
 {
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	unsigned int size = nc->size;
-#else
-	unsigned int size = PAGE_SIZE;
-#endif
-	unsigned int offset;
+	unsigned long encoded_page = nc->encoded_page;
+	unsigned int size, offset;
 	struct page *page;
 
-	if (unlikely(!nc->va)) {
+	if (unlikely(!encoded_page)) {
 refill:
 		page = __page_frag_cache_refill(nc, gfp_mask);
 		if (!page)
 			return NULL;
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
+		encoded_page = nc->encoded_page;
+
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
 		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
 
 		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
 		nc->offset = 0;
 	}
 
+	size = PAGE_SIZE << encoded_page_decode_order(encoded_page);
 	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
 	if (unlikely(offset + fragsz > size)) {
 		if (unlikely(fragsz > PAGE_SIZE)) {
@@ -107,13 +132,14 @@ refill:
 			return NULL;
 		}
 
-		page = virt_to_page(nc->va);
+		page = encoded_page_decode_page(encoded_page);
 
 		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
 			goto refill;
 
-		if (unlikely(nc->pfmemalloc)) {
-			free_unref_page(page, compound_order(page));
+		if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) {
+			free_unref_page(page,
+					encoded_page_decode_order(encoded_page));
 			goto refill;
 		}
 
@@ -128,7 +154,7 @@ refill:
 	nc->pagecnt_bias--;
 	nc->offset = offset + fragsz;
 
-	return nc->va + offset;
+	return encoded_page_decode_virt(encoded_page) + offset;
 }
 EXPORT_SYMBOL(__page_frag_alloc_align);
 
-- 
cgit v1.2.3


From d2bd39c0456b75be9dfc7d774b8d021355c26ae3 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 18 Oct 2024 17:47:49 +0300
Subject: PCI: Store all PCIe Supported Link Speeds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PCIe bandwidth controller added by a subsequent commit will require
selecting PCIe Link Speeds that are lower than the Maximum Link Speed.

The struct pci_bus only stores max_bus_speed. Even if PCIe r6.1 sec 8.2.1
currently disallows gaps in supported Link Speeds, the Implementation Note
in PCIe r6.1 sec 7.5.3.18, recommends determining supported Link Speeds
using the Supported Link Speeds Vector in the Link Capabilities 2 Register
(when available) to "avoid software being confused if a future
specification defines Links that do not require support for all slower
speeds."

Reuse code in pcie_get_speed_cap() to add pcie_get_supported_speeds() to
query the Supported Link Speeds Vector of a PCIe device. The value is taken
directly from the Supported Link Speeds Vector or synthesized from the Max
Link Speed in the Link Capabilities Register when the Link Capabilities 2
Register is not available.

The Supported Link Speeds Vector in the Link Capabilities Register 2
corresponds to the bus below on Root Ports and Downstream Ports, whereas it
corresponds to the bus above on Upstream Ports and Endpoints (PCIe r6.1 sec
7.5.3.18):

  Supported Link Speeds Vector - This field indicates the supported Link
  speed(s) of the associated Port.

Add supported_speeds into the struct pci_dev that caches the
Supported Link Speeds Vector.

supported_speeds contains a set of Link Speeds only in the case where PCIe
Link Speed can be determined. Root Complex Integrated Endpoints do not have
a well-defined Link Speed because they do not implement either of the Link
Capabilities Registers, which is allowed by PCIe r6.1 sec 7.5.3 (the same
limitation applies to determining cur_bus_speed and max_bus_speed that are
PCI_SPEED_UNKNOWN in such case). This is of no concern from PCIe bandwidth
controller point of view because such devices are not attached into a PCIe
Root Port that could be controlled.

The supported_speeds field keeps the extra reserved zero at the least
significant bit to match the Link Capabilities 2 Register layout.

An attempt was made to store supported_speeds field into the struct pci_bus
as an intersection of both ends of the Link, however, the subordinate
struct pci_bus is not available early enough. The Target Speed quirk (in
pcie_failed_link_retrain()) can run either during initial scan or later,
requiring it to use the API provided by the PCIe bandwidth controller to
set the Target Link Speed in order to co-exist with the bandwidth
controller. When the Target Speed quirk is calling the bandwidth controller
during initial scan, the struct pci_bus is not yet initialized. As such,
storing supported_speeds into the struct pci_bus is not viable.

Suggested-by: Lukas Wunner <lukas@wunner.de>
Link: https://lore.kernel.org/r/20241018144755.7875-4-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
[bhelgaas: move pcie_get_supported_speeds() decl to drivers/pci/pci.h]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/pci/pci.c             | 58 +++++++++++++++++++++++++++++++------------
 drivers/pci/pci.h             |  1 +
 drivers/pci/probe.c           |  3 +++
 include/linux/pci.h           | 10 +++++++-
 include/uapi/linux/pci_regs.h |  1 +
 5 files changed, 56 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7d85c04fbba2..3d67e8b50ba2 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -6189,38 +6189,64 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
 EXPORT_SYMBOL(pcie_bandwidth_available);
 
 /**
- * pcie_get_speed_cap - query for the PCI device's link speed capability
+ * pcie_get_supported_speeds - query Supported Link Speed Vector
  * @dev: PCI device to query
  *
- * Query the PCI device speed capability.  Return the maximum link speed
- * supported by the device.
+ * Query @dev supported link speeds.
+ *
+ * Implementation Note in PCIe r6.0 sec 7.5.3.18 recommends determining
+ * supported link speeds using the Supported Link Speeds Vector in the Link
+ * Capabilities 2 Register (when available).
+ *
+ * Link Capabilities 2 was added in PCIe r3.0, sec 7.8.18.
+ *
+ * Without Link Capabilities 2, i.e., prior to PCIe r3.0, Supported Link
+ * Speeds field in Link Capabilities is used and only 2.5 GT/s and 5.0 GT/s
+ * speeds were defined.
+ *
+ * For @dev without Supported Link Speed Vector, the field is synthesized
+ * from the Max Link Speed field in the Link Capabilities Register.
+ *
+ * Return: Supported Link Speeds Vector (+ reserved 0 at LSB).
  */
-enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
+u8 pcie_get_supported_speeds(struct pci_dev *dev)
 {
 	u32 lnkcap2, lnkcap;
+	u8 speeds;
 
 	/*
-	 * Link Capabilities 2 was added in PCIe r3.0, sec 7.8.18.  The
-	 * implementation note there recommends using the Supported Link
-	 * Speeds Vector in Link Capabilities 2 when supported.
-	 *
-	 * Without Link Capabilities 2, i.e., prior to PCIe r3.0, software
-	 * should use the Supported Link Speeds field in Link Capabilities,
-	 * where only 2.5 GT/s and 5.0 GT/s speeds were defined.
+	 * Speeds retain the reserved 0 at LSB before PCIe Supported Link
+	 * Speeds Vector to allow using SLS Vector bit defines directly.
 	 */
 	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2);
+	speeds = lnkcap2 & PCI_EXP_LNKCAP2_SLS;
 
 	/* PCIe r3.0-compliant */
-	if (lnkcap2)
-		return PCIE_LNKCAP2_SLS2SPEED(lnkcap2);
+	if (speeds)
+		return speeds;
 
 	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
+
+	/* Synthesize from the Max Link Speed field */
 	if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_5_0GB)
-		return PCIE_SPEED_5_0GT;
+		speeds = PCI_EXP_LNKCAP2_SLS_5_0GB | PCI_EXP_LNKCAP2_SLS_2_5GB;
 	else if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_2_5GB)
-		return PCIE_SPEED_2_5GT;
+		speeds = PCI_EXP_LNKCAP2_SLS_2_5GB;
 
-	return PCI_SPEED_UNKNOWN;
+	return speeds;
+}
+
+/**
+ * pcie_get_speed_cap - query for the PCI device's link speed capability
+ * @dev: PCI device to query
+ *
+ * Query the PCI device speed capability.
+ *
+ * Return: the maximum link speed supported by the device.
+ */
+enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
+{
+	return PCIE_LNKCAP2_SLS2SPEED(dev->supported_speeds);
 }
 EXPORT_SYMBOL(pcie_get_speed_cap);
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 14d00ce45bfa..d0a46ecf7289 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -373,6 +373,7 @@ static inline int pcie_dev_speed_mbps(enum pci_bus_speed speed)
 	return -EINVAL;
 }
 
+u8 pcie_get_supported_speeds(struct pci_dev *dev);
 const char *pci_speed_string(enum pci_bus_speed speed);
 enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev);
 enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4f68414c3086..af153a8e8225 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1947,6 +1947,9 @@ int pci_setup_device(struct pci_dev *dev)
 
 	set_pcie_untrusted(dev);
 
+	if (pci_is_pcie(dev))
+		dev->supported_speeds = pcie_get_supported_speeds(dev);
+
 	/* "Unknown power state" */
 	dev->current_state = PCI_UNKNOWN;
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index be5ed534c39c..99c6fa30d25b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -318,7 +318,14 @@ struct pci_sriov;
 struct pci_p2pdma;
 struct rcec_ea;
 
-/* The pci_dev structure describes PCI devices */
+/* struct pci_dev - describes a PCI device
+ *
+ * @supported_speeds:	PCIe Supported Link Speeds Vector (+ reserved 0 at
+ *			LSB). 0 when the supported speeds cannot be
+ *			determined (e.g., for Root Complex Integrated
+ *			Endpoints without the relevant Capability
+ *			Registers).
+ */
 struct pci_dev {
 	struct list_head bus_list;	/* Node in per-bus list */
 	struct pci_bus	*bus;		/* Bus this device is on */
@@ -522,6 +529,7 @@ struct pci_dev {
 	struct npem	*npem;		/* Native PCIe Enclosure Management */
 #endif
 	u16		acs_cap;	/* ACS Capability offset */
+	u8		supported_speeds; /* Supported Link Speeds Vector */
 	phys_addr_t	rom;		/* Physical address if not from BAR */
 	size_t		romlen;		/* Length if not from BAR */
 	/*
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 12323b3334a9..f3c9de0a497c 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -678,6 +678,7 @@
 #define PCI_EXP_DEVSTA2		0x2a	/* Device Status 2 */
 #define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2 0x2c	/* end of v2 EPs w/o link */
 #define PCI_EXP_LNKCAP2		0x2c	/* Link Capabilities 2 */
+#define  PCI_EXP_LNKCAP2_SLS		0x000000fe /* Supported Link Speeds Vector */
 #define  PCI_EXP_LNKCAP2_SLS_2_5GB	0x00000002 /* Supported Speed 2.5GT/s */
 #define  PCI_EXP_LNKCAP2_SLS_5_0GB	0x00000004 /* Supported Speed 5GT/s */
 #define  PCI_EXP_LNKCAP2_SLS_8_0GB	0x00000008 /* Supported Speed 8GT/s */
-- 
cgit v1.2.3


From 38a18dfe9035d5a02a53271824de1854129c61dc Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Tue, 22 Oct 2024 15:48:51 -0700
Subject: PCI: Unexport pci_walk_bus_locked()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's only one user of pci_walk_bus_locked(), and it's internal to the
PCI core.  Unexport it and make it private to drivers/pci/.

Link: https://lore.kernel.org/r/20241022224851.340648-6-kbusch@meta.com
Signed-off-by: Keith Busch <kbusch@kernel.org>
[bhelgaas: move decl to drivers/pci/pci.h]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/pci/bus.c   | 1 -
 drivers/pci/pci.h   | 3 +++
 include/linux/pci.h | 2 --
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 8491e9c7f058..30620f3bb0e2 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -435,7 +435,6 @@ void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *
 
 	__pci_walk_bus(top, cb, userdata);
 }
-EXPORT_SYMBOL_GPL(pci_walk_bus_locked);
 
 struct pci_bus *pci_bus_get(struct pci_bus *bus)
 {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 187c5c83412d..28cccfeb8076 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -323,6 +323,9 @@ void __pci_bus_assign_resources(const struct pci_bus *bus,
 				struct list_head *realloc_head,
 				struct list_head *fail_head);
 bool pci_bus_clip_resource(struct pci_dev *dev, int idx);
+void pci_walk_bus_locked(struct pci_bus *top,
+			 int (*cb)(struct pci_dev *, void *),
+			 void *userdata);
 
 const char *pci_resource_name(struct pci_dev *dev, unsigned int i);
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 573b4c4c2be6..056290377273 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1623,8 +1623,6 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max,
 
 void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
 		  void *userdata);
-void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
-			 void *userdata);
 int pci_cfg_space_size(struct pci_dev *dev);
 unsigned char pci_bus_max_busnr(struct pci_bus *bus);
 void pci_setup_bridge(struct pci_bus *bus);
-- 
cgit v1.2.3


From fb56007c9bc38550755a53f7afd71a287340b724 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 6 Nov 2024 06:10:19 +0000
Subject: vt_buffer.h: get rid of dead code in default scr_...() instances

Only 4 architectures define VT_BUF_HAVE_RW (alpha, mips, powerpc, sparc)
and all of them define VT_BUF_HAVE_MEM{SET,CPY,MOVE}W.  In other
words, the code under #ifdef VT_BUF_HAVE_RW in default scr_mem...w()
instances won't be compiled anyway.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/vt_buffer.h | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h
index 919d999a8c1d..b6eeb8cb6070 100644
--- a/include/linux/vt_buffer.h
+++ b/include/linux/vt_buffer.h
@@ -28,45 +28,21 @@
 #ifndef VT_BUF_HAVE_MEMSETW
 static inline void scr_memsetw(u16 *s, u16 c, unsigned int count)
 {
-#ifdef VT_BUF_HAVE_RW
-	count /= 2;
-	while (count--)
-		scr_writew(c, s++);
-#else
 	memset16(s, c, count / 2);
-#endif
 }
 #endif
 
 #ifndef VT_BUF_HAVE_MEMCPYW
 static inline void scr_memcpyw(u16 *d, const u16 *s, unsigned int count)
 {
-#ifdef VT_BUF_HAVE_RW
-	count /= 2;
-	while (count--)
-		scr_writew(scr_readw(s++), d++);
-#else
 	memcpy(d, s, count);
-#endif
 }
 #endif
 
 #ifndef VT_BUF_HAVE_MEMMOVEW
 static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count)
 {
-#ifdef VT_BUF_HAVE_RW
-	if (d < s)
-		scr_memcpyw(d, s, count);
-	else {
-		count /= 2;
-		d += count;
-		s += count;
-		while (count--)
-			scr_writew(scr_readw(--s), --d);
-	}
-#else
 	memmove(d, s, count);
-#endif
 }
 #endif
 
-- 
cgit v1.2.3


From 82ee16cfb290ae259d1cd6658a6988b430258e94 Mon Sep 17 00:00:00 2001
From: Karel Balej <balejk@matfyz.cz>
Date: Sat, 12 Oct 2024 21:31:39 +0200
Subject: rtc: add driver for Marvell 88PM886 PMIC RTC

RTC lives on the chip's base register page. Add the relevant register
definitions and implement a basic set/read time functionality. Tested
with the samsung,coreprimevelte smartphone which contains this PMIC and
whose vendor kernel tree has also served as the sole reference for this.

Signed-off-by: Karel Balej <balejk@matfyz.cz>
Acked-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20241012193345.18594-2-balejk@matfyz.cz
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 MAINTAINERS                 |  1 +
 drivers/rtc/Kconfig         | 10 +++++
 drivers/rtc/Makefile        |  1 +
 drivers/rtc/rtc-88pm886.c   | 97 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/88pm886.h |  9 +++++
 5 files changed, 118 insertions(+)
 create mode 100644 drivers/rtc/rtc-88pm886.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index ef0a949c380d..054425c4afd2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13706,6 +13706,7 @@ F:	Documentation/devicetree/bindings/mfd/marvell,88pm886-a1.yaml
 F:	drivers/input/misc/88pm886-onkey.c
 F:	drivers/mfd/88pm886.c
 F:	drivers/regulator/88pm886-regulator.c
+F:	drivers/rtc/rtc-88pm886.c
 F:	include/linux/mfd/88pm886.h
 
 MARVELL ARMADA 3700 PHY DRIVERS
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 0e6e0b8b45f7..e1153ed25b3f 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -182,6 +182,16 @@ config RTC_DRV_88PM80X
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-88pm80x.
 
+config RTC_DRV_88PM886
+	tristate "Marvell 88PM886 RTC driver"
+	depends on MFD_88PM886_PMIC
+	help
+	  If you say yes here you will get support for the RTC function in the
+	  Marvell 88PM886 chip.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called rtc-88pm886.
+
 config RTC_DRV_ABB5ZES3
 	select REGMAP_I2C
 	tristate "Abracon AB-RTCMC-32.768kHz-B5ZE-S3"
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index b0dc6b648570..411016668753 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_RTC_LIB_KUNIT_TEST)	+= lib_test.o
 
 obj-$(CONFIG_RTC_DRV_88PM80X)	+= rtc-88pm80x.o
 obj-$(CONFIG_RTC_DRV_88PM860X)	+= rtc-88pm860x.o
+obj-$(CONFIG_RTC_DRV_88PM886)	+= rtc-88pm886.o
 obj-$(CONFIG_RTC_DRV_AB8500)	+= rtc-ab8500.o
 obj-$(CONFIG_RTC_DRV_ABB5ZES3)	+= rtc-ab-b5ze-s3.o
 obj-$(CONFIG_RTC_DRV_ABEOZ9)	+= rtc-ab-eoz9.o
diff --git a/drivers/rtc/rtc-88pm886.c b/drivers/rtc/rtc-88pm886.c
new file mode 100644
index 000000000000..57e9b0a66eed
--- /dev/null
+++ b/drivers/rtc/rtc-88pm886.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/limits.h>
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+
+#include <linux/mfd/88pm886.h>
+
+/*
+ * Time is calculated as the sum of a 32-bit read-only advancing counter and a
+ * writeable constant offset stored in the chip's spare registers.
+ */
+
+static int pm886_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+	struct regmap *regmap = dev_get_drvdata(dev);
+	u32 time;
+	u32 buf;
+	int ret;
+
+	ret = regmap_bulk_read(regmap, PM886_REG_RTC_SPARE1, &buf, 4);
+	if (ret)
+		return ret;
+	time = buf;
+
+	ret = regmap_bulk_read(regmap, PM886_REG_RTC_CNT1, &buf, 4);
+	if (ret)
+		return ret;
+	time += buf;
+
+	rtc_time64_to_tm(time, tm);
+
+	return 0;
+}
+
+static int pm886_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+	struct regmap *regmap = dev_get_drvdata(dev);
+	u32 buf;
+	int ret;
+
+	ret = regmap_bulk_read(regmap, PM886_REG_RTC_CNT1, &buf, 4);
+	if (ret)
+		return ret;
+
+	buf = rtc_tm_to_time64(tm) - buf;
+
+	return regmap_bulk_write(regmap, PM886_REG_RTC_SPARE1, &buf, 4);
+}
+
+static const struct rtc_class_ops pm886_rtc_ops = {
+	.read_time = pm886_rtc_read_time,
+	.set_time = pm886_rtc_set_time,
+};
+
+static int pm886_rtc_probe(struct platform_device *pdev)
+{
+	struct pm886_chip *chip = dev_get_drvdata(pdev->dev.parent);
+	struct device *dev = &pdev->dev;
+	struct rtc_device *rtc;
+	int ret;
+
+	platform_set_drvdata(pdev, chip->regmap);
+
+	rtc = devm_rtc_allocate_device(dev);
+	if (IS_ERR(rtc))
+		return dev_err_probe(dev, PTR_ERR(rtc),
+				"Failed to allocate RTC device\n");
+
+	rtc->ops = &pm886_rtc_ops;
+	rtc->range_max = U32_MAX;
+
+	ret = devm_rtc_register_device(rtc);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to register RTC device\n");
+
+	return 0;
+}
+
+static const struct platform_device_id pm886_rtc_id_table[] = {
+	{ "88pm886-rtc", },
+	{ }
+};
+MODULE_DEVICE_TABLE(platform, pm886_rtc_id_table);
+
+static struct platform_driver pm886_rtc_driver = {
+	.driver = {
+		.name = "88pm886-rtc",
+	},
+	.probe = pm886_rtc_probe,
+	.id_table = pm886_rtc_id_table,
+};
+module_platform_driver(pm886_rtc_driver);
+
+MODULE_DESCRIPTION("Marvell 88PM886 RTC driver");
+MODULE_AUTHOR("Karel Balej <balejk@matfyz.cz>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/88pm886.h b/include/linux/mfd/88pm886.h
index 133aa302e492..85eca44f39ab 100644
--- a/include/linux/mfd/88pm886.h
+++ b/include/linux/mfd/88pm886.h
@@ -31,6 +31,15 @@
 #define PM886_INT_WC			BIT(1)
 #define PM886_INT_MASK_MODE		BIT(2)
 
+#define PM886_REG_RTC_CNT1		0xd1
+#define PM886_REG_RTC_CNT2		0xd2
+#define PM886_REG_RTC_CNT3		0xd3
+#define PM886_REG_RTC_CNT4		0xd4
+#define PM886_REG_RTC_SPARE1		0xea
+#define PM886_REG_RTC_SPARE2		0xeb
+#define PM886_REG_RTC_SPARE3		0xec
+#define PM886_REG_RTC_SPARE4		0xed
+#define PM886_REG_RTC_SPARE5		0xee
 #define PM886_REG_RTC_SPARE6		0xef
 
 #define PM886_REG_BUCK_EN		0x08
-- 
cgit v1.2.3


From a7306f3c283bfe03611229bb6280987aae2af8f9 Mon Sep 17 00:00:00 2001
From: Nataniel Farzan <natanielfarzan@gmail.com>
Date: Mon, 4 Nov 2024 19:22:32 -0800
Subject: Improve consistency of '#error' directive messages

Remove the use of contractions and use proper punctuation in #error
directive messages that discourage the direct inclusion of header files.

Link: https://lkml.kernel.org/r/20241105032231.28833-1-natanielfarzan@gmail.com
Signed-off-by: Nataniel Farzan <natanielfarzan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/include/asm/spinlock_types.h          | 2 +-
 arch/arm/include/asm/spinlock_types.h            | 2 +-
 arch/arm64/include/asm/spinlock_types.h          | 2 +-
 arch/hexagon/include/asm/spinlock_types.h        | 2 +-
 arch/powerpc/include/asm/simple_spinlock_types.h | 2 +-
 arch/powerpc/include/asm/spinlock_types.h        | 2 +-
 arch/s390/include/asm/spinlock_types.h           | 2 +-
 arch/sh/include/asm/spinlock_types.h             | 2 +-
 arch/xtensa/include/asm/spinlock_types.h         | 2 +-
 include/acpi/platform/aclinux.h                  | 2 +-
 include/linux/compiler-clang.h                   | 2 +-
 include/linux/compiler-gcc.h                     | 2 +-
 include/linux/pm_wakeup.h                        | 2 +-
 include/linux/rwlock.h                           | 2 +-
 include/linux/rwlock_api_smp.h                   | 2 +-
 include/linux/spinlock_api_smp.h                 | 2 +-
 include/linux/spinlock_types_up.h                | 2 +-
 include/linux/spinlock_up.h                      | 2 +-
 tools/include/linux/compiler-gcc.h               | 2 +-
 19 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h
index 2526fd3be5fd..05a444d77c53 100644
--- a/arch/alpha/include/asm/spinlock_types.h
+++ b/arch/alpha/include/asm/spinlock_types.h
@@ -3,7 +3,7 @@
 #define _ALPHA_SPINLOCK_TYPES_H
 
 #ifndef __LINUX_SPINLOCK_TYPES_RAW_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 typedef struct {
diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h
index 0c14b36ef101..5404a2a96bf3 100644
--- a/arch/arm/include/asm/spinlock_types.h
+++ b/arch/arm/include/asm/spinlock_types.h
@@ -3,7 +3,7 @@
 #define __ASM_SPINLOCK_TYPES_H
 
 #ifndef __LINUX_SPINLOCK_TYPES_RAW_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 #define TICKET_SHIFT	16
diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h
index 11ab1c077697..7cde3d8bd0ad 100644
--- a/arch/arm64/include/asm/spinlock_types.h
+++ b/arch/arm64/include/asm/spinlock_types.h
@@ -6,7 +6,7 @@
 #define __ASM_SPINLOCK_TYPES_H
 
 #if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H)
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 #include <asm-generic/qspinlock_types.h>
diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h
index d5f66495b670..63add2d863e8 100644
--- a/arch/hexagon/include/asm/spinlock_types.h
+++ b/arch/hexagon/include/asm/spinlock_types.h
@@ -9,7 +9,7 @@
 #define _ASM_SPINLOCK_TYPES_H
 
 #ifndef __LINUX_SPINLOCK_TYPES_RAW_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 typedef struct {
diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h
index 08243338069d..391fc19f7272 100644
--- a/arch/powerpc/include/asm/simple_spinlock_types.h
+++ b/arch/powerpc/include/asm/simple_spinlock_types.h
@@ -3,7 +3,7 @@
 #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H
 
 #ifndef __LINUX_SPINLOCK_TYPES_RAW_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 typedef struct {
diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 40b01446cf75..569765fa16bc 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -3,7 +3,7 @@
 #define _ASM_POWERPC_SPINLOCK_TYPES_H
 
 #ifndef __LINUX_SPINLOCK_TYPES_RAW_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 #ifdef CONFIG_PPC_QUEUED_SPINLOCKS
diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h
index b69695e39957..3653ff57d6d9 100644
--- a/arch/s390/include/asm/spinlock_types.h
+++ b/arch/s390/include/asm/spinlock_types.h
@@ -3,7 +3,7 @@
 #define __ASM_SPINLOCK_TYPES_H
 
 #ifndef __LINUX_SPINLOCK_TYPES_RAW_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 typedef struct {
diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h
index 907bda4b1619..7cb50e68448f 100644
--- a/arch/sh/include/asm/spinlock_types.h
+++ b/arch/sh/include/asm/spinlock_types.h
@@ -3,7 +3,7 @@
 #define __ASM_SH_SPINLOCK_TYPES_H
 
 #ifndef __LINUX_SPINLOCK_TYPES_RAW_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 typedef struct {
diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h
index 797aed7df3dd..6baaeac6248b 100644
--- a/arch/xtensa/include/asm/spinlock_types.h
+++ b/arch/xtensa/include/asm/spinlock_types.h
@@ -3,7 +3,7 @@
 #define __ASM_SPINLOCK_TYPES_H
 
 #if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H)
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 #include <asm-generic/qspinlock_types.h>
diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index 565341c826e3..f3249b7df5cb 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -15,7 +15,7 @@
 /* ACPICA external files should not include ACPICA headers directly. */
 
 #if !defined(BUILDING_ACPICA) && !defined(_LINUX_ACPI_H)
-#error "Please don't include <acpi/acpi.h> directly, include <linux/acpi.h> instead."
+#error "Please do not include <acpi/acpi.h> directly, include <linux/acpi.h> instead."
 #endif
 
 #endif
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 4c1a39dcb624..2e7c2c282f3a 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __LINUX_COMPILER_TYPES_H
-#error "Please don't include <linux/compiler-clang.h> directly, include <linux/compiler.h> instead."
+#error "Please do not include <linux/compiler-clang.h> directly, include <linux/compiler.h> instead."
 #endif
 
 /* Compiler specific definitions for Clang compiler */
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index cd6f9aae311f..d0ed9583743f 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __LINUX_COMPILER_TYPES_H
-#error "Please don't include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead."
+#error "Please do not include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead."
 #endif
 
 /*
diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 76cd1f9f1365..222f7530806c 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -10,7 +10,7 @@
 #define _LINUX_PM_WAKEUP_H
 
 #ifndef _DEVICE_H_
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 #include <linux/types.h>
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index c0ef596f340b..5b87c6f4a243 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -2,7 +2,7 @@
 #define __LINUX_RWLOCK_H
 
 #ifndef __LINUX_INSIDE_SPINLOCK_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 /*
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index dceb0a59b692..31d3d1116323 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -2,7 +2,7 @@
 #define __LINUX_RWLOCK_API_SMP_H
 
 #ifndef __LINUX_SPINLOCK_API_SMP_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 /*
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 89eb6f4c659c..9ecb0ab504e3 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -2,7 +2,7 @@
 #define __LINUX_SPINLOCK_API_SMP_H
 
 #ifndef __LINUX_INSIDE_SPINLOCK_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 /*
diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h
index 7f86a2016ac5..fc4e2d017c20 100644
--- a/include/linux/spinlock_types_up.h
+++ b/include/linux/spinlock_types_up.h
@@ -2,7 +2,7 @@
 #define __LINUX_SPINLOCK_TYPES_UP_H
 
 #ifndef __LINUX_SPINLOCK_TYPES_RAW_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 /*
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index c87204247592..1e84e71ca495 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -2,7 +2,7 @@
 #define __LINUX_SPINLOCK_UP_H
 
 #ifndef __LINUX_INSIDE_SPINLOCK_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 #include <asm/processor.h>	/* for cpu_relax() */
diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h
index 62e7c901ac28..e20f98e14e81 100644
--- a/tools/include/linux/compiler-gcc.h
+++ b/tools/include/linux/compiler-gcc.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _TOOLS_LINUX_COMPILER_H_
-#error "Please don't include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead."
+#error "Please do not include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead."
 #endif
 
 /*
-- 
cgit v1.2.3


From bc73b4186736341ab5cd2c199da82db6e1134e13 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <aardelean@baylibre.com>
Date: Tue, 5 Nov 2024 16:54:05 +0200
Subject: util_macros.h: fix/rework find_closest() macros

A bug was found in the find_closest() (find_closest_descending() is also
affected after some testing), where for certain values with small
progressions, the rounding (done by averaging 2 values) causes an
incorrect index to be returned.  The rounding issues occur for
progressions of 1, 2 and 3.  It goes away when the progression/interval
between two values is 4 or larger.

It's particularly bad for progressions of 1.  For example if there's an
array of 'a = { 1, 2, 3 }', using 'find_closest(2, a ...)' would return 0
(the index of '1'), rather than returning 1 (the index of '2').  This
means that for exact values (with a progression of 1), find_closest() will
misbehave and return the index of the value smaller than the one we're
searching for.

For progressions of 2 and 3, the exact values are obtained correctly; but
values aren't approximated correctly (as one would expect).  Starting with
progressions of 4, all seems to be good (one gets what one would expect).

While one could argue that 'find_closest()' should not be used for arrays
with progressions of 1 (i.e. '{1, 2, 3, ...}', the macro should still
behave correctly.

The bug was found while testing the 'drivers/iio/adc/ad7606.c',
specifically the oversampling feature.
For reference, the oversampling values are listed as:
   static const unsigned int ad7606_oversampling_avail[7] = {
          1, 2, 4, 8, 16, 32, 64,
   };

When doing:
  1. $ echo 1 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
     $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
     1  # this is fine
  2. $ echo 2 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
     $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
     1  # this is wrong; 2 should be returned here
  3. $ echo 3 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
     $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
     2  # this is fine
  4. $ echo 4 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
     $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
     4  # this is fine
And from here-on, the values are as correct (one gets what one would
expect.)

While writing a kunit test for this bug, a peculiar issue was found for the
array in the 'drivers/hwmon/ina2xx.c' & 'drivers/iio/adc/ina2xx-adc.c'
drivers. While running the kunit test (for 'ina226_avg_tab' from these
drivers):
  * idx = find_closest([-1 to 2], ina226_avg_tab, ARRAY_SIZE(ina226_avg_tab));
    This returns idx == 0, so value.
  * idx = find_closest(3, ina226_avg_tab, ARRAY_SIZE(ina226_avg_tab));
    This returns idx == 0, value 1; and now one could argue whether 3 is
    closer to 4 or to 1. This quirk only appears for value '3' in this
    array, but it seems to be a another rounding issue.
  * And from 4 onwards the 'find_closest'() works fine (one gets what one
    would expect).

This change reworks the find_closest() macros to also check the difference
between the left and right elements when 'x'. If the distance to the right
is smaller (than the distance to the left), the index is incremented by 1.
This also makes redundant the need for using the DIV_ROUND_CLOSEST() macro.

In order to accommodate for any mix of negative + positive values, the
internal variables '__fc_x', '__fc_mid_x', '__fc_left' & '__fc_right' are
forced to 'long' type. This also addresses any potential bugs/issues with
'x' being of an unsigned type. In those situations any comparison between
signed & unsigned would be promoted to a comparison between 2 unsigned
numbers; this is especially annoying when '__fc_left' & '__fc_right'
underflow.

The find_closest_descending() macro was also reworked and duplicated from
the find_closest(), and it is being iterated in reverse. The main reason
for this is to get the same indices as 'find_closest()' (but in reverse).
The comparison for '__fc_right < __fc_left' favors going the array in
ascending order.
For example for array '{ 1024, 512, 256, 128, 64, 16, 4, 1 }' and x = 3, we
get:
    __fc_mid_x = 2
    __fc_left = -1
    __fc_right = -2
    Then '__fc_right < __fc_left' evaluates to true and '__fc_i++' becomes 7
    which is not quite incorrect, but 3 is closer to 4 than to 1.

This change has been validated with the kunit from the next patch.

Link: https://lkml.kernel.org/r/20241105145406.554365-1-aardelean@baylibre.com
Fixes: 95d119528b0b ("util_macros.h: add find_closest() macro")
Signed-off-by: Alexandru Ardelean <aardelean@baylibre.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/util_macros.h | 56 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h
index 6bb460c3e818..825487fb66fa 100644
--- a/include/linux/util_macros.h
+++ b/include/linux/util_macros.h
@@ -4,19 +4,6 @@
 
 #include <linux/math.h>
 
-#define __find_closest(x, a, as, op)					\
-({									\
-	typeof(as) __fc_i, __fc_as = (as) - 1;				\
-	typeof(x) __fc_x = (x);						\
-	typeof(*a) const *__fc_a = (a);					\
-	for (__fc_i = 0; __fc_i < __fc_as; __fc_i++) {			\
-		if (__fc_x op DIV_ROUND_CLOSEST(__fc_a[__fc_i] +	\
-						__fc_a[__fc_i + 1], 2))	\
-			break;						\
-	}								\
-	(__fc_i);							\
-})
-
 /**
  * find_closest - locate the closest element in a sorted array
  * @x: The reference value.
@@ -25,8 +12,27 @@
  * @as: Size of 'a'.
  *
  * Returns the index of the element closest to 'x'.
+ * Note: If using an array of negative numbers (or mixed positive numbers),
+ *       then be sure that 'x' is of a signed-type to get good results.
  */
-#define find_closest(x, a, as) __find_closest(x, a, as, <=)
+#define find_closest(x, a, as)						\
+({									\
+	typeof(as) __fc_i, __fc_as = (as) - 1;				\
+	long __fc_mid_x, __fc_x = (x);					\
+	long __fc_left, __fc_right;					\
+	typeof(*a) const *__fc_a = (a);					\
+	for (__fc_i = 0; __fc_i < __fc_as; __fc_i++) {			\
+		__fc_mid_x = (__fc_a[__fc_i] + __fc_a[__fc_i + 1]) / 2;	\
+		if (__fc_x <= __fc_mid_x) {				\
+			__fc_left = __fc_x - __fc_a[__fc_i];		\
+			__fc_right = __fc_a[__fc_i + 1] - __fc_x;	\
+			if (__fc_right < __fc_left)			\
+				__fc_i++;				\
+			break;						\
+		}							\
+	}								\
+	(__fc_i);							\
+})
 
 /**
  * find_closest_descending - locate the closest element in a sorted array
@@ -36,9 +42,27 @@
  * @as: Size of 'a'.
  *
  * Similar to find_closest() but 'a' is expected to be sorted in descending
- * order.
+ * order. The iteration is done in reverse order, so that the comparison
+ * of '__fc_right' & '__fc_left' also works for unsigned numbers.
  */
-#define find_closest_descending(x, a, as) __find_closest(x, a, as, >=)
+#define find_closest_descending(x, a, as)				\
+({									\
+	typeof(as) __fc_i, __fc_as = (as) - 1;				\
+	long __fc_mid_x, __fc_x = (x);					\
+	long __fc_left, __fc_right;					\
+	typeof(*a) const *__fc_a = (a);					\
+	for (__fc_i = __fc_as; __fc_i >= 1; __fc_i--) {			\
+		__fc_mid_x = (__fc_a[__fc_i] + __fc_a[__fc_i - 1]) / 2;	\
+		if (__fc_x <= __fc_mid_x) {				\
+			__fc_left = __fc_x - __fc_a[__fc_i];		\
+			__fc_right = __fc_a[__fc_i - 1] - __fc_x;	\
+			if (__fc_right < __fc_left)			\
+				__fc_i--;				\
+			break;						\
+		}							\
+	}								\
+	(__fc_i);							\
+})
 
 /**
  * is_insidevar - check if the @ptr points inside the @var memory range.
-- 
cgit v1.2.3


From 3f28bbe56c7b77e73f1dd0515cad009cfdd64962 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Tue, 5 Nov 2024 01:52:52 +0800
Subject: mm/list_lru: don't pass unnecessary key parameters

Patch series "mm/list_lru: Split list_lru lock into per-cgroup scope".

When LOCKDEP is not enabled, lock_class_key is an empty struct that is
never used.  But the list_lru initialization function still takes a
placeholder pointer as parameter, and the compiler cannot optimize it
because the function is not static and exported.

Remove this parameter and move it inside the list_lru struct.  Only use it
when LOCKDEP is enabled.  Kernel builds with LOCKDEP will be slightly
larger, while !LOCKDEP builds without it will be slightly smaller (the
common case).

Link: https://lkml.kernel.org/r/20241104175257.60853-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20241104175257.60853-2-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/list_lru.h | 18 +++++++++++++++---
 mm/list_lru.c            |  9 +++++----
 mm/workingset.c          |  4 ++--
 3 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 5099a8ccd5f4..eba93f6511f3 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -56,16 +56,28 @@ struct list_lru {
 	bool			memcg_aware;
 	struct xarray		xa;
 #endif
+#ifdef CONFIG_LOCKDEP
+	struct lock_class_key	*key;
+#endif
 };
 
 void list_lru_destroy(struct list_lru *lru);
 int __list_lru_init(struct list_lru *lru, bool memcg_aware,
-		    struct lock_class_key *key, struct shrinker *shrinker);
+		    struct shrinker *shrinker);
 
 #define list_lru_init(lru)				\
-	__list_lru_init((lru), false, NULL, NULL)
+	__list_lru_init((lru), false, NULL)
 #define list_lru_init_memcg(lru, shrinker)		\
-	__list_lru_init((lru), true, NULL, shrinker)
+	__list_lru_init((lru), true, shrinker)
+
+static inline int list_lru_init_memcg_key(struct list_lru *lru, struct shrinker *shrinker,
+					  struct lock_class_key *key)
+{
+#ifdef CONFIG_LOCKDEP
+	lru->key = key;
+#endif
+	return list_lru_init_memcg(lru, shrinker);
+}
 
 int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
 			 gfp_t gfp);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 9b7ff06e9d32..ea7dc9fa4d05 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -562,8 +562,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
 }
 #endif /* CONFIG_MEMCG */
 
-int __list_lru_init(struct list_lru *lru, bool memcg_aware,
-		    struct lock_class_key *key, struct shrinker *shrinker)
+int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker)
 {
 	int i;
 
@@ -583,8 +582,10 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
 
 	for_each_node(i) {
 		spin_lock_init(&lru->node[i].lock);
-		if (key)
-			lockdep_set_class(&lru->node[i].lock, key);
+#ifdef CONFIG_LOCKDEP
+		if (lru->key)
+			lockdep_set_class(&lru->node[i].lock, lru->key);
+#endif
 		init_one_lru(&lru->node[i].lru);
 	}
 
diff --git a/mm/workingset.c b/mm/workingset.c
index 0e38bec261a4..5c8861edbf17 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -813,8 +813,8 @@ static int __init workingset_init(void)
 	if (!workingset_shadow_shrinker)
 		goto err;
 
-	ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
-			      workingset_shadow_shrinker);
+	ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
+				      &shadow_nodes_key);
 	if (ret)
 		goto err_list_lru;
 
-- 
cgit v1.2.3


From fb56fdf8b9a2f7397f8a83dce50189f3f0cf71af Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Tue, 5 Nov 2024 01:52:56 +0800
Subject: mm/list_lru: split the lock to per-cgroup scope

Currently, every list_lru has a per-node lock that protects adding,
deletion, isolation, and reparenting of all list_lru_one instances
belonging to this list_lru on this node.  This lock contention is heavy
when multiple cgroups modify the same list_lru.

This lock can be split into per-cgroup scope to reduce contention.

To achieve this, we need a stable list_lru_one for every cgroup.  This
commit adds a lock to each list_lru_one and introduced a helper function
lock_list_lru_of_memcg, making it possible to pin the list_lru of a memcg.
Then reworked the reparenting process.

Reparenting will switch the list_lru_one instances one by one.  By locking
each instance and marking it dead using the nr_items counter, reparenting
ensures that all items in the corresponding cgroup (on-list or not,
because items have a stable cgroup, see below) will see the list_lru_one
switch synchronously.

Objcg reparent is also moved after list_lru reparent so items will have a
stable mem cgroup until all list_lru_one instances are drained.

The only caller that doesn't work the *_obj interfaces are direct calls to
list_lru_{add,del}.  But it's only used by zswap and that's also based on
objcg, so it's fine.

This also changes the bahaviour of the isolation function when LRU_RETRY
or LRU_REMOVED_RETRY is returned, because now releasing the lock could
unblock reparenting and free the list_lru_one, isolation function will
have to return withoug re-lock the lru.

prepare() {
    mkdir /tmp/test-fs
    modprobe brd rd_nr=1 rd_size=33554432
    mkfs.xfs -f /dev/ram0
    mount -t xfs /dev/ram0 /tmp/test-fs
    for i in $(seq 1 512); do
        mkdir "/tmp/test-fs/$i"
        for j in $(seq 1 10240); do
            echo TEST-CONTENT > "/tmp/test-fs/$i/$j"
        done &
    done; wait
}

do_test() {
    read_worker() {
        sleep 1
        tar -cv "$1" &>/dev/null
    }
    read_in_all() {
        cd "/tmp/test-fs" && ls
        for i in $(seq 1 512); do
            (exec sh -c 'echo "$PPID"') > "/sys/fs/cgroup/benchmark/$i/cgroup.procs"
            read_worker "$i" &
        done; wait
    }
    for i in $(seq 1 512); do
        mkdir -p "/sys/fs/cgroup/benchmark/$i"
    done
    echo +memory > /sys/fs/cgroup/benchmark/cgroup.subtree_control
    echo 512M > /sys/fs/cgroup/benchmark/memory.max
    echo 3 > /proc/sys/vm/drop_caches
    time read_in_all
}

Above script simulates compression of small files in multiple cgroups
with memory pressure. Run prepare() then do_test for 6 times:

Before:
real      0m7.762s user      0m11.340s sys       3m11.224s
real      0m8.123s user      0m11.548s sys       3m2.549s
real      0m7.736s user      0m11.515s sys       3m11.171s
real      0m8.539s user      0m11.508s sys       3m7.618s
real      0m7.928s user      0m11.349s sys       3m13.063s
real      0m8.105s user      0m11.128s sys       3m14.313s

After this commit (about ~15% faster):
real      0m6.953s user      0m11.327s sys       2m42.912s
real      0m7.453s user      0m11.343s sys       2m51.942s
real      0m6.916s user      0m11.269s sys       2m43.957s
real      0m6.894s user      0m11.528s sys       2m45.346s
real      0m6.911s user      0m11.095s sys       2m43.168s
real      0m6.773s user      0m11.518s sys       2m40.774s

Link: https://lkml.kernel.org/r/20241104175257.60853-6-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/android/binder_alloc.c |   1 -
 fs/inode.c                     |   1 -
 fs/xfs/xfs_qm.c                |   1 -
 include/linux/list_lru.h       |   6 +-
 mm/list_lru.c                  | 216 +++++++++++++++++++++++------------------
 mm/memcontrol.c                |   7 +-
 mm/workingset.c                |   1 -
 mm/zswap.c                     |   5 +-
 8 files changed, 135 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index b3acbc4174fb..86bbe40f4bcd 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -1106,7 +1106,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 	mmput_async(mm);
 	__free_page(page_to_free);
 
-	spin_lock(lock);
 	return LRU_REMOVED_RETRY;
 
 err_invalid_vma:
diff --git a/fs/inode.c b/fs/inode.c
index 8dabb224f941..442cb4fc09b2 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -934,7 +934,6 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
 			mm_account_reclaimed_pages(reap);
 		}
 		inode_unpin_lru_isolating(inode);
-		spin_lock(lru_lock);
 		return LRU_RETRY;
 	}
 
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 7e2307921deb..665d26990b78 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -496,7 +496,6 @@ out_unlock_dirty:
 	trace_xfs_dqreclaim_busy(dqp);
 	XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
 	xfs_dqunlock(dqp);
-	spin_lock(lru_lock);
 	return LRU_RETRY;
 }
 
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index eba93f6511f3..10ba9a54d42c 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -32,6 +32,8 @@ struct list_lru_one {
 	struct list_head	list;
 	/* may become negative during memcg reparenting */
 	long			nr_items;
+	/* protects all fields above */
+	spinlock_t		lock;
 };
 
 struct list_lru_memcg {
@@ -41,11 +43,9 @@ struct list_lru_memcg {
 };
 
 struct list_lru_node {
-	/* protects all lists on the node, including per cgroup */
-	spinlock_t		lock;
 	/* global list, used for the root cgroup in cgroup aware lrus */
 	struct list_lru_one	lru;
-	long			nr_items;
+	atomic_long_t		nr_items;
 } ____cacheline_aligned_in_smp;
 
 struct list_lru {
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 172b16146e15..c139202e27f7 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -61,18 +61,51 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
 }
 
 static inline struct list_lru_one *
-list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg)
+lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+		       bool irq, bool skip_empty)
 {
 	struct list_lru_one *l;
+	long nr_items;
+
+	rcu_read_lock();
 again:
 	l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
-	if (likely(l))
-		return l;
-
-	memcg = parent_mem_cgroup(memcg);
+	if (likely(l)) {
+		if (irq)
+			spin_lock_irq(&l->lock);
+		else
+			spin_lock(&l->lock);
+		nr_items = READ_ONCE(l->nr_items);
+		if (likely(nr_items != LONG_MIN)) {
+			WARN_ON(nr_items < 0);
+			rcu_read_unlock();
+			return l;
+		}
+		if (irq)
+			spin_unlock_irq(&l->lock);
+		else
+			spin_unlock(&l->lock);
+	}
+	/*
+	 * Caller may simply bail out if raced with reparenting or
+	 * may iterate through the list_lru and expect empty slots.
+	 */
+	if (skip_empty) {
+		rcu_read_unlock();
+		return NULL;
+	}
 	VM_WARN_ON(!css_is_dying(&memcg->css));
+	memcg = parent_mem_cgroup(memcg);
 	goto again;
 }
+
+static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)
+{
+	if (irq_off)
+		spin_unlock_irq(&l->lock);
+	else
+		spin_unlock(&l->lock);
+}
 #else
 static void list_lru_register(struct list_lru *lru)
 {
@@ -99,31 +132,48 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
 }
 
 static inline struct list_lru_one *
-list_lru_from_memcg(struct list_lru *lru, int nid, int idx)
+lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+		       bool irq, bool skip_empty)
 {
-	return &lru->node[nid].lru;
+	struct list_lru_one *l = &lru->node[nid].lru;
+
+	if (irq)
+		spin_lock_irq(&l->lock);
+	else
+		spin_lock(&l->lock);
+
+	return l;
+}
+
+static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)
+{
+	if (irq_off)
+		spin_unlock_irq(&l->lock);
+	else
+		spin_unlock(&l->lock);
 }
 #endif /* CONFIG_MEMCG */
 
 /* The caller must ensure the memcg lifetime. */
 bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
-		    struct mem_cgroup *memcg)
+		  struct mem_cgroup *memcg)
 {
 	struct list_lru_node *nlru = &lru->node[nid];
 	struct list_lru_one *l;
 
-	spin_lock(&nlru->lock);
+	l = lock_list_lru_of_memcg(lru, nid, memcg, false, false);
+	if (!l)
+		return false;
 	if (list_empty(item)) {
-		l = list_lru_from_memcg(lru, nid, memcg);
 		list_add_tail(item, &l->list);
 		/* Set shrinker bit if the first element was added */
 		if (!l->nr_items++)
 			set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
-		nlru->nr_items++;
-		spin_unlock(&nlru->lock);
+		unlock_list_lru(l, false);
+		atomic_long_inc(&nlru->nr_items);
 		return true;
 	}
-	spin_unlock(&nlru->lock);
+	unlock_list_lru(l, false);
 	return false;
 }
 
@@ -146,24 +196,23 @@ EXPORT_SYMBOL_GPL(list_lru_add_obj);
 
 /* The caller must ensure the memcg lifetime. */
 bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
-		    struct mem_cgroup *memcg)
+		  struct mem_cgroup *memcg)
 {
 	struct list_lru_node *nlru = &lru->node[nid];
 	struct list_lru_one *l;
-
-	spin_lock(&nlru->lock);
+	l = lock_list_lru_of_memcg(lru, nid, memcg, false, false);
+	if (!l)
+		return false;
 	if (!list_empty(item)) {
-		l = list_lru_from_memcg(lru, nid, memcg);
 		list_del_init(item);
 		l->nr_items--;
-		nlru->nr_items--;
-		spin_unlock(&nlru->lock);
+		unlock_list_lru(l, false);
+		atomic_long_dec(&nlru->nr_items);
 		return true;
 	}
-	spin_unlock(&nlru->lock);
+	unlock_list_lru(l, false);
 	return false;
 }
-EXPORT_SYMBOL_GPL(list_lru_del);
 
 bool list_lru_del_obj(struct list_lru *lru, struct list_head *item)
 {
@@ -220,25 +269,24 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
 	struct list_lru_node *nlru;
 
 	nlru = &lru->node[nid];
-	return nlru->nr_items;
+	return atomic_long_read(&nlru->nr_items);
 }
 EXPORT_SYMBOL_GPL(list_lru_count_node);
 
 static unsigned long
-__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+__list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
 		    list_lru_walk_cb isolate, void *cb_arg,
-		    unsigned long *nr_to_walk)
+		    unsigned long *nr_to_walk, bool irq_off)
 {
 	struct list_lru_node *nlru = &lru->node[nid];
-	struct list_lru_one *l;
+	struct list_lru_one *l = NULL;
 	struct list_head *item, *n;
 	unsigned long isolated = 0;
 
 restart:
-	l = list_lru_from_memcg_idx(lru, nid, memcg_idx);
+	l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true);
 	if (!l)
-		goto out;
-
+		return isolated;
 	list_for_each_safe(item, n, &l->list) {
 		enum lru_status ret;
 
@@ -250,19 +298,19 @@ restart:
 			break;
 		--*nr_to_walk;
 
-		ret = isolate(item, l, &nlru->lock, cb_arg);
+		ret = isolate(item, l, &l->lock, cb_arg);
 		switch (ret) {
+		/*
+		 * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru
+		 * lock. List traversal will have to restart from scratch.
+		 */
+		case LRU_RETRY:
+			goto restart;
 		case LRU_REMOVED_RETRY:
-			assert_spin_locked(&nlru->lock);
 			fallthrough;
 		case LRU_REMOVED:
 			isolated++;
-			nlru->nr_items--;
-			/*
-			 * If the lru lock has been dropped, our list
-			 * traversal is now invalid and so we have to
-			 * restart from scratch.
-			 */
+			atomic_long_dec(&nlru->nr_items);
 			if (ret == LRU_REMOVED_RETRY)
 				goto restart;
 			break;
@@ -271,20 +319,13 @@ restart:
 			break;
 		case LRU_SKIP:
 			break;
-		case LRU_RETRY:
-			/*
-			 * The lru lock has been dropped, our list traversal is
-			 * now invalid and so we have to restart from scratch.
-			 */
-			assert_spin_locked(&nlru->lock);
-			goto restart;
 		case LRU_STOP:
-			assert_spin_locked(&nlru->lock);
 			goto out;
 		default:
 			BUG();
 		}
 	}
+	unlock_list_lru(l, irq_off);
 out:
 	return isolated;
 }
@@ -294,14 +335,8 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
 		  list_lru_walk_cb isolate, void *cb_arg,
 		  unsigned long *nr_to_walk)
 {
-	struct list_lru_node *nlru = &lru->node[nid];
-	unsigned long ret;
-
-	spin_lock(&nlru->lock);
-	ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate,
-				  cb_arg, nr_to_walk);
-	spin_unlock(&nlru->lock);
-	return ret;
+	return __list_lru_walk_one(lru, nid, memcg, isolate,
+				   cb_arg, nr_to_walk, false);
 }
 EXPORT_SYMBOL_GPL(list_lru_walk_one);
 
@@ -310,14 +345,8 @@ list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
 		      list_lru_walk_cb isolate, void *cb_arg,
 		      unsigned long *nr_to_walk)
 {
-	struct list_lru_node *nlru = &lru->node[nid];
-	unsigned long ret;
-
-	spin_lock_irq(&nlru->lock);
-	ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate,
-				  cb_arg, nr_to_walk);
-	spin_unlock_irq(&nlru->lock);
-	return ret;
+	return __list_lru_walk_one(lru, nid, memcg, isolate,
+				   cb_arg, nr_to_walk, true);
 }
 
 unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
@@ -332,16 +361,21 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
 #ifdef CONFIG_MEMCG
 	if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
 		struct list_lru_memcg *mlru;
+		struct mem_cgroup *memcg;
 		unsigned long index;
 
 		xa_for_each(&lru->xa, index, mlru) {
-			struct list_lru_node *nlru = &lru->node[nid];
-
-			spin_lock(&nlru->lock);
-			isolated += __list_lru_walk_one(lru, nid, index,
+			rcu_read_lock();
+			memcg = mem_cgroup_from_id(index);
+			if (!mem_cgroup_tryget(memcg)) {
+				rcu_read_unlock();
+				continue;
+			}
+			rcu_read_unlock();
+			isolated += __list_lru_walk_one(lru, nid, memcg,
 							isolate, cb_arg,
-							nr_to_walk);
-			spin_unlock(&nlru->lock);
+							nr_to_walk, false);
+			mem_cgroup_put(memcg);
 
 			if (*nr_to_walk <= 0)
 				break;
@@ -353,14 +387,19 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
 }
 EXPORT_SYMBOL_GPL(list_lru_walk_node);
 
-static void init_one_lru(struct list_lru_one *l)
+static void init_one_lru(struct list_lru *lru, struct list_lru_one *l)
 {
 	INIT_LIST_HEAD(&l->list);
+	spin_lock_init(&l->lock);
 	l->nr_items = 0;
+#ifdef CONFIG_LOCKDEP
+	if (lru->key)
+		lockdep_set_class(&l->lock, lru->key);
+#endif
 }
 
 #ifdef CONFIG_MEMCG
-static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
+static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp)
 {
 	int nid;
 	struct list_lru_memcg *mlru;
@@ -370,7 +409,7 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
 		return NULL;
 
 	for_each_node(nid)
-		init_one_lru(&mlru->node[nid]);
+		init_one_lru(lru, &mlru->node[nid]);
 
 	return mlru;
 }
@@ -398,28 +437,27 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
 	xas_unlock_irq(&xas);
 }
 
-static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
-					 struct list_lru_one *src,
-					 struct mem_cgroup *dst_memcg)
+static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid,
+					struct list_lru_one *src,
+					struct mem_cgroup *dst_memcg)
 {
-	struct list_lru_node *nlru = &lru->node[nid];
+	int dst_idx = dst_memcg->kmemcg_id;
 	struct list_lru_one *dst;
 
-	/*
-	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
-	 * we have to use IRQ-safe primitives here to avoid deadlock.
-	 */
-	spin_lock_irq(&nlru->lock);
-	dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg));
+	spin_lock_irq(&src->lock);
+	dst = list_lru_from_memcg_idx(lru, nid, dst_idx);
+	spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING);
 
 	list_splice_init(&src->list, &dst->list);
-
 	if (src->nr_items) {
 		dst->nr_items += src->nr_items;
 		set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
-		src->nr_items = 0;
 	}
-	spin_unlock_irq(&nlru->lock);
+	/* Mark the list_lru_one dead */
+	src->nr_items = LONG_MIN;
+
+	spin_unlock(&dst->lock);
+	spin_unlock_irq(&src->lock);
 }
 
 void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
@@ -448,7 +486,7 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
 		 * safe to reparent.
 		 */
 		for_each_node(i)
-			memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent);
+			memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent);
 
 		/*
 		 * Here all list_lrus corresponding to the cgroup are guaranteed
@@ -497,7 +535,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
 			parent = parent_mem_cgroup(pos);
 		}
 
-		mlru = memcg_init_list_lru_one(gfp);
+		mlru = memcg_init_list_lru_one(lru, gfp);
 		if (!mlru)
 			return -ENOMEM;
 		xas_set(&xas, pos->kmemcg_id);
@@ -544,14 +582,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shr
 	if (!lru->node)
 		return -ENOMEM;
 
-	for_each_node(i) {
-		spin_lock_init(&lru->node[i].lock);
-#ifdef CONFIG_LOCKDEP
-		if (lru->key)
-			lockdep_set_class(&lru->node[i].lock, lru->key);
-#endif
-		init_one_lru(&lru->node[i].lru);
-	}
+	for_each_node(i)
+		init_one_lru(lru, &lru->node[i].lru);
 
 	memcg_init_list_lru(lru, memcg_aware);
 	list_lru_register(lru);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8b9061b58a72..ed2dd88437ca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3110,8 +3110,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
 	if (!parent)
 		parent = root_mem_cgroup;
 
-	memcg_reparent_objcgs(memcg, parent);
 	memcg_reparent_list_lrus(memcg, parent);
+
+	/*
+	 * Objcg's reparenting must be after list_lru's, make sure list_lru
+	 * helpers won't use parent's list_lru until child is drained.
+	 */
+	memcg_reparent_objcgs(memcg, parent);
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
diff --git a/mm/workingset.c b/mm/workingset.c
index 5c8861edbf17..c187d4a3fbea 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -767,7 +767,6 @@ out_invalid:
 	ret = LRU_REMOVED_RETRY;
 out:
 	cond_resched();
-	spin_lock_irq(lru_lock);
 	return ret;
 }
 
diff --git a/mm/zswap.c b/mm/zswap.c
index 96aeb969d6d6..ba35e4550941 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -711,9 +711,9 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
 	 * Note that it is safe to use rcu_read_lock() here, even in the face of
 	 * concurrent memcg offlining:
 	 *
-	 * 1. list_lru_add() is called before list_lru_memcg is erased. The
+	 * 1. list_lru_add() is called before list_lru_one is dead. The
 	 *    new entry will be reparented to memcg's parent's list_lru.
-	 * 2. list_lru_add() is called after list_lru_memcg is erased. The
+	 * 2. list_lru_add() is called after list_lru_one is dead. The
 	 *    new entry will be added directly to memcg's parent's list_lru.
 	 *
 	 * Similar reasoning holds for list_lru_del().
@@ -1179,7 +1179,6 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 		zswap_written_back_pages++;
 	}
 
-	spin_lock(lock);
 	return ret;
 }
 
-- 
cgit v1.2.3


From da0c02516c501b43bd39ad4aca5779c86153d143 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Tue, 5 Nov 2024 01:52:57 +0800
Subject: mm/list_lru: simplify the list_lru walk callback function

Now isolation no longer takes the list_lru global node lock, only use the
per-cgroup lock instead.  And this lock is inside the list_lru_one being
walked, no longer needed to pass the lock explicitly.

Link: https://lkml.kernel.org/r/20241104175257.60853-7-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/android/binder_alloc.c |  7 +++----
 drivers/android/binder_alloc.h |  2 +-
 fs/dcache.c                    |  4 ++--
 fs/gfs2/quota.c                |  2 +-
 fs/inode.c                     |  4 ++--
 fs/nfs/nfs42xattr.c            |  4 ++--
 fs/nfsd/filecache.c            |  5 +----
 fs/xfs/xfs_buf.c               |  2 --
 fs/xfs/xfs_qm.c                |  5 ++---
 include/linux/list_lru.h       |  2 +-
 mm/list_lru.c                  |  2 +-
 mm/workingset.c                | 15 +++++++--------
 mm/zswap.c                     |  4 ++--
 13 files changed, 25 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 86bbe40f4bcd..a738e7745865 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -1047,7 +1047,7 @@ void binder_alloc_vma_close(struct binder_alloc *alloc)
 /**
  * binder_alloc_free_page() - shrinker callback to free pages
  * @item:   item to free
- * @lock:   lock protecting the item
+ * @lru:    list_lru instance of the item
  * @cb_arg: callback argument
  *
  * Called from list_lru_walk() in binder_shrink_scan() to free
@@ -1055,9 +1055,8 @@ void binder_alloc_vma_close(struct binder_alloc *alloc)
  */
 enum lru_status binder_alloc_free_page(struct list_head *item,
 				       struct list_lru_one *lru,
-				       spinlock_t *lock,
 				       void *cb_arg)
-	__must_hold(lock)
+	__must_hold(&lru->lock)
 {
 	struct binder_lru_page *page = container_of(item, typeof(*page), lru);
 	struct binder_alloc *alloc = page->alloc;
@@ -1092,7 +1091,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 
 	list_lru_isolate(lru, item);
 	spin_unlock(&alloc->lock);
-	spin_unlock(lock);
+	spin_unlock(&lru->lock);
 
 	if (vma) {
 		trace_binder_unmap_user_start(alloc, index);
diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h
index 70387234477e..c02c8ebcb466 100644
--- a/drivers/android/binder_alloc.h
+++ b/drivers/android/binder_alloc.h
@@ -118,7 +118,7 @@ static inline void binder_selftest_alloc(struct binder_alloc *alloc) {}
 #endif
 enum lru_status binder_alloc_free_page(struct list_head *item,
 				       struct list_lru_one *lru,
-				       spinlock_t *lock, void *cb_arg);
+				       void *cb_arg);
 struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
 					   size_t data_size,
 					   size_t offsets_size,
diff --git a/fs/dcache.c b/fs/dcache.c
index 0f6b16ba30d0..d7f6866f5f52 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1089,7 +1089,7 @@ void shrink_dentry_list(struct list_head *list)
 }
 
 static enum lru_status dentry_lru_isolate(struct list_head *item,
-		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+		struct list_lru_one *lru, void *arg)
 {
 	struct list_head *freeable = arg;
 	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
@@ -1170,7 +1170,7 @@ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
 }
 
 static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
-		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+		struct list_lru_one *lru, void *arg)
 {
 	struct list_head *freeable = arg;
 	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 2e6bc77f4f81..72b48f6f5561 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -149,7 +149,7 @@ static void gfs2_qd_list_dispose(struct list_head *list)
 
 
 static enum lru_status gfs2_qd_isolate(struct list_head *item,
-		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+		struct list_lru_one *lru, void *arg)
 {
 	struct list_head *dispose = arg;
 	struct gfs2_quota_data *qd =
diff --git a/fs/inode.c b/fs/inode.c
index 442cb4fc09b2..46fbd5b23482 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -881,7 +881,7 @@ again:
  * with this flag set because they are the inodes that are out of order.
  */
 static enum lru_status inode_lru_isolate(struct list_head *item,
-		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+		struct list_lru_one *lru, void *arg)
 {
 	struct list_head *freeable = arg;
 	struct inode	*inode = container_of(item, struct inode, i_lru);
@@ -923,7 +923,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
 	if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
 		inode_pin_lru_isolating(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(lru_lock);
+		spin_unlock(&lru->lock);
 		if (remove_inode_buffers(inode)) {
 			unsigned long reap;
 			reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index b6e3d8f77b91..37d79400e5f4 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -802,7 +802,7 @@ static struct shrinker *nfs4_xattr_large_entry_shrinker;
 
 static enum lru_status
 cache_lru_isolate(struct list_head *item,
-	struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+	struct list_lru_one *lru, void *arg)
 {
 	struct list_head *dispose = arg;
 	struct inode *inode;
@@ -867,7 +867,7 @@ nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 
 static enum lru_status
 entry_lru_isolate(struct list_head *item,
-	struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+	struct list_lru_one *lru, void *arg)
 {
 	struct list_head *dispose = arg;
 	struct nfs4_xattr_bucket *bucket;
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 2e6783f63712..09c444eb944f 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -487,7 +487,6 @@ void nfsd_file_net_dispose(struct nfsd_net *nn)
  * nfsd_file_lru_cb - Examine an entry on the LRU list
  * @item: LRU entry to examine
  * @lru: controlling LRU
- * @lock: LRU list lock (unused)
  * @arg: dispose list
  *
  * Return values:
@@ -497,9 +496,7 @@ void nfsd_file_net_dispose(struct nfsd_net *nn)
  */
 static enum lru_status
 nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
-		 spinlock_t *lock, void *arg)
-	__releases(lock)
-	__acquires(lock)
+		 void *arg)
 {
 	struct list_head *head = arg;
 	struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index aa4dbda7b536..43b914c1f621 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1857,7 +1857,6 @@ static enum lru_status
 xfs_buftarg_drain_rele(
 	struct list_head	*item,
 	struct list_lru_one	*lru,
-	spinlock_t		*lru_lock,
 	void			*arg)
 
 {
@@ -1956,7 +1955,6 @@ static enum lru_status
 xfs_buftarg_isolate(
 	struct list_head	*item,
 	struct list_lru_one	*lru,
-	spinlock_t		*lru_lock,
 	void			*arg)
 {
 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 665d26990b78..8413ac368042 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -412,9 +412,8 @@ static enum lru_status
 xfs_qm_dquot_isolate(
 	struct list_head	*item,
 	struct list_lru_one	*lru,
-	spinlock_t		*lru_lock,
 	void			*arg)
-		__releases(lru_lock) __acquires(lru_lock)
+		__releases(&lru->lock) __acquires(&lru->lock)
 {
 	struct xfs_dquot	*dqp = container_of(item,
 						struct xfs_dquot, q_lru);
@@ -460,7 +459,7 @@ xfs_qm_dquot_isolate(
 		trace_xfs_dqreclaim_dirty(dqp);
 
 		/* we have to drop the LRU lock to flush the dquot */
-		spin_unlock(lru_lock);
+		spin_unlock(&lru->lock);
 
 		error = xfs_qm_dqflush(dqp, &bp);
 		if (error)
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 10ba9a54d42c..05c166811f6b 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -184,7 +184,7 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
 			   struct list_head *head);
 
 typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
-		struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
+		struct list_lru_one *list, void *cb_arg);
 
 /**
  * list_lru_walk_one: walk a @lru, isolating and disposing freeable items.
diff --git a/mm/list_lru.c b/mm/list_lru.c
index c139202e27f7..f93ada6a207b 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -298,7 +298,7 @@ restart:
 			break;
 		--*nr_to_walk;
 
-		ret = isolate(item, l, &l->lock, cb_arg);
+		ret = isolate(item, l, cb_arg);
 		switch (ret) {
 		/*
 		 * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru
diff --git a/mm/workingset.c b/mm/workingset.c
index c187d4a3fbea..a4705e196545 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -702,8 +702,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 
 static enum lru_status shadow_lru_isolate(struct list_head *item,
 					  struct list_lru_one *lru,
-					  spinlock_t *lru_lock,
-					  void *arg) __must_hold(lru_lock)
+					  void *arg) __must_hold(lru->lock)
 {
 	struct xa_node *node = container_of(item, struct xa_node, private_list);
 	struct address_space *mapping;
@@ -712,20 +711,20 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	/*
 	 * Page cache insertions and deletions synchronously maintain
 	 * the shadow node LRU under the i_pages lock and the
-	 * lru_lock.  Because the page cache tree is emptied before
-	 * the inode can be destroyed, holding the lru_lock pins any
+	 * &lru->lock. Because the page cache tree is emptied before
+	 * the inode can be destroyed, holding the &lru->lock pins any
 	 * address_space that has nodes on the LRU.
 	 *
 	 * We can then safely transition to the i_pages lock to
 	 * pin only the address_space of the particular node we want
-	 * to reclaim, take the node off-LRU, and drop the lru_lock.
+	 * to reclaim, take the node off-LRU, and drop the &lru->lock.
 	 */
 
 	mapping = container_of(node->array, struct address_space, i_pages);
 
 	/* Coming from the list, invert the lock order */
 	if (!xa_trylock(&mapping->i_pages)) {
-		spin_unlock_irq(lru_lock);
+		spin_unlock_irq(&lru->lock);
 		ret = LRU_RETRY;
 		goto out;
 	}
@@ -734,7 +733,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	if (mapping->host != NULL) {
 		if (!spin_trylock(&mapping->host->i_lock)) {
 			xa_unlock(&mapping->i_pages);
-			spin_unlock_irq(lru_lock);
+			spin_unlock_irq(&lru->lock);
 			ret = LRU_RETRY;
 			goto out;
 		}
@@ -743,7 +742,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	list_lru_isolate(lru, item);
 	__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
 
-	spin_unlock(lru_lock);
+	spin_unlock(&lru->lock);
 
 	/*
 	 * The nodes should only contain one or more shadow entries,
diff --git a/mm/zswap.c b/mm/zswap.c
index ba35e4550941..f6316b66fb23 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1102,7 +1102,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
  *    for reclaim by this ratio.
  */
 static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
-				       spinlock_t *lock, void *arg)
+				       void *arg)
 {
 	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
 	bool *encountered_page_in_swapcache = (bool *)arg;
@@ -1158,7 +1158,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 	 * It's safe to drop the lock here because we return either
 	 * LRU_REMOVED_RETRY or LRU_RETRY.
 	 */
-	spin_unlock(lock);
+	spin_unlock(&l->lock);
 
 	writeback_result = zswap_writeback_entry(entry, swpentry);
 
-- 
cgit v1.2.3


From 7591c127f3b17d5879f18819cad7058bf3a2e276 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Mon, 4 Nov 2024 11:19:44 +0000
Subject: kmemleak: iommu/iova: fix transient kmemleak false positive

The introduction of iova_depot_pop() in 911aa1245da8 ("iommu/iova: Make
the rcache depot scale better") confused kmemleak by moving a struct
iova_magazine object from a singly linked list to rcache->depot and
resetting the 'next' pointer referencing it.  Unlike doubly linked lists,
the content of the object being referred is never changed on removal from
a singly linked list and the kmemleak checksum heuristics do not detect
such scenario.  This leads to false positives like:

unreferenced object 0xffff8881a5301000 (size 1024):
  comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00  .........}......
    0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00  ................
  backtrace:
    [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
    [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
    [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
    [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
    [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
    [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
    [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
    [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
    [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5

Introduce kmemleak_transient_leak() which resets the object checksum
requiring another scan pass before it is reported (if still unreferenced).
Call this new API in iova_depot_pop().

Link: https://lkml.kernel.org/r/20241104111944.2207155-1-catalin.marinas@arm.com
Link: https://lore.kernel.org/r/ZY1osaGLyT-sdKE8@shredder/
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Reported-by: Ido Schimmel <idosch@idosch.org>
Tested-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/dev-tools/kmemleak.rst |  1 +
 drivers/iommu/iova.c                 |  6 ++++++
 include/linux/kmemleak.h             |  4 ++++
 mm/kmemleak.c                        | 39 ++++++++++++++++++++++++++++++++++++
 4 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst
index 2cb00b53339f..7d784e03f3f9 100644
--- a/Documentation/dev-tools/kmemleak.rst
+++ b/Documentation/dev-tools/kmemleak.rst
@@ -161,6 +161,7 @@ See the include/linux/kmemleak.h header for the functions prototype.
 - ``kmemleak_free_percpu``	 - notify of a percpu memory block freeing
 - ``kmemleak_update_trace``	 - update object allocation stack trace
 - ``kmemleak_not_leak``	 - mark an object as not a leak
+- ``kmemleak_transient_leak``	 - mark an object as a transient leak
 - ``kmemleak_ignore``		 - do not scan or report an object as leak
 - ``kmemleak_scan_area``	 - add scan areas inside a memory block
 - ``kmemleak_no_scan``	 - do not scan a memory block
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 16c6adff3eb7..5b5400efb657 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/iova.h>
+#include <linux/kmemleak.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
@@ -673,6 +674,11 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
 {
 	struct iova_magazine *mag = rcache->depot;
 
+	/*
+	 * As the mag->next pointer is moved to rcache->depot and reset via
+	 * the mag->size assignment, mark it as a transient false positive.
+	 */
+	kmemleak_transient_leak(mag->next);
 	rcache->depot = mag->next;
 	mag->size = IOVA_MAG_SIZE;
 	rcache->depot_size--;
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index 6a3cd1bf4680..93a73c076d16 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -26,6 +26,7 @@ extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
 extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
 extern void kmemleak_update_trace(const void *ptr) __ref;
 extern void kmemleak_not_leak(const void *ptr) __ref;
+extern void kmemleak_transient_leak(const void *ptr) __ref;
 extern void kmemleak_ignore(const void *ptr) __ref;
 extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
 extern void kmemleak_no_scan(const void *ptr) __ref;
@@ -93,6 +94,9 @@ static inline void kmemleak_update_trace(const void *ptr)
 static inline void kmemleak_not_leak(const void *ptr)
 {
 }
+static inline void kmemleak_transient_leak(const void *ptr)
+{
+}
 static inline void kmemleak_ignore(const void *ptr)
 {
 }
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 17006d8a2afa..2a945c07ae99 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -934,6 +934,28 @@ static void make_black_object(unsigned long ptr, unsigned int objflags)
 	paint_ptr(ptr, KMEMLEAK_BLACK, objflags);
 }
 
+/*
+ * Reset the checksum of an object. The immediate effect is that it will not
+ * be reported as a leak during the next scan until its checksum is updated.
+ */
+static void reset_checksum(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("Not resetting the checksum of an unknown object at 0x%08lx\n",
+			      ptr);
+		return;
+	}
+
+	raw_spin_lock_irqsave(&object->lock, flags);
+	object->checksum = 0;
+	raw_spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
 /*
  * Add a scanning area to the object. If at least one such area is added,
  * kmemleak will only scan these ranges rather than the whole memory block.
@@ -1202,6 +1224,23 @@ void __ref kmemleak_not_leak(const void *ptr)
 }
 EXPORT_SYMBOL(kmemleak_not_leak);
 
+/**
+ * kmemleak_transient_leak - mark an allocated object as transient false positive
+ * @ptr:	pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to not be
+ * reported as a leak temporarily. This may happen, for example, if the object
+ * is part of a singly linked list and the ->next reference to it is changed.
+ */
+void __ref kmemleak_transient_leak(const void *ptr)
+{
+	pr_debug("%s(0x%px)\n", __func__, ptr);
+
+	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+		reset_checksum((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_transient_leak);
+
 /**
  * kmemleak_ignore - ignore an allocated object
  * @ptr:	pointer to beginning of the object
-- 
cgit v1.2.3


From 7269ed4af344184ab9bdf318fe8864cf64849735 Mon Sep 17 00:00:00 2001
From: Bibo Mao <maobibo@loongson.cn>
Date: Mon, 4 Nov 2024 15:07:12 +0800
Subject: mm: define general function pXd_init()

pud_init(), pmd_init() and kernel_pte_init() are duplicated defined in
file kasan.c and sparse-vmemmap.c as weak functions.  Move them to generic
header file pgtable.h, architecture can redefine them.

Link: https://lkml.kernel.org/r/20241104070712.52902-1-maobibo@loongson.cn
Signed-off-by: Bibo Mao <maobibo@loongson.cn>
Reviewed-by: Huacai Chen <chenhuacai@loongson.cn>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/loongarch/include/asm/pgtable.h |  3 +++
 arch/mips/include/asm/pgtable-64.h   |  2 ++
 include/linux/mm.h                   |  3 ---
 include/linux/pgtable.h              | 21 +++++++++++++++++++++
 mm/kasan/init.c                      | 12 ------------
 mm/sparse-vmemmap.c                  | 12 ------------
 6 files changed, 26 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index 20714b73f14c..da346733a1da 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -268,8 +268,11 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pm
  */
 extern void pgd_init(void *addr);
 extern void pud_init(void *addr);
+#define pud_init pud_init
 extern void pmd_init(void *addr);
+#define pmd_init pmd_init
 extern void kernel_pte_init(void *addr);
+#define kernel_pte_init kernel_pte_init
 
 /*
  * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h
index 401c1d9e4409..6e854bb11f37 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -317,7 +317,9 @@ static inline pmd_t *pud_pgtable(pud_t pud)
  */
 extern void pgd_init(void *addr);
 extern void pud_init(void *addr);
+#define pud_init pud_init
 extern void pmd_init(void *addr);
+#define pmd_init pmd_init
 
 /*
  * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 32888a97ab44..5d6cd523c7c0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3819,9 +3819,6 @@ void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
 		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
 		struct dev_pagemap *pgmap);
-void pud_init(void *addr);
-void pmd_init(void *addr);
-void kernel_pte_init(void *addr);
 pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
 p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
 pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 23aeffd89a4e..adef9d6e9b1b 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -90,6 +90,27 @@ static inline unsigned long pud_index(unsigned long address)
 #define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
 #endif
 
+#ifndef kernel_pte_init
+static inline void kernel_pte_init(void *addr)
+{
+}
+#define kernel_pte_init kernel_pte_init
+#endif
+
+#ifndef pmd_init
+static inline void pmd_init(void *addr)
+{
+}
+#define pmd_init pmd_init
+#endif
+
+#ifndef pud_init
+static inline void pud_init(void *addr)
+{
+}
+#define pud_init pud_init
+#endif
+
 #ifndef pte_offset_kernel
 static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
 {
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ac607c306292..ced6b29fcf76 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -106,10 +106,6 @@ static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
 	}
 }
 
-void __weak __meminit kernel_pte_init(void *addr)
-{
-}
-
 static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
 				unsigned long end)
 {
@@ -145,10 +141,6 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
 	return 0;
 }
 
-void __weak __meminit pmd_init(void *addr)
-{
-}
-
 static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
 				unsigned long end)
 {
@@ -187,10 +179,6 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
 	return 0;
 }
 
-void __weak __meminit pud_init(void *addr)
-{
-}
-
 static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
 				unsigned long end)
 {
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c0388b2e959d..cec67c5f37d8 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -184,10 +184,6 @@ static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
 	return p;
 }
 
-void __weak __meminit kernel_pte_init(void *addr)
-{
-}
-
 pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
 {
 	pmd_t *pmd = pmd_offset(pud, addr);
@@ -201,10 +197,6 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
 	return pmd;
 }
 
-void __weak __meminit pmd_init(void *addr)
-{
-}
-
 pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
 {
 	pud_t *pud = pud_offset(p4d, addr);
@@ -218,10 +210,6 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
 	return pud;
 }
 
-void __weak __meminit pud_init(void *addr)
-{
-}
-
 p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
 {
 	p4d_t *p4d = p4d_offset(pgd, addr);
-- 
cgit v1.2.3


From 5dc51ec86df6e2214d8398079c1e31736593ab53 Mon Sep 17 00:00:00 2001
From: Martin Karsten <mkarsten@uwaterloo.ca>
Date: Sat, 9 Nov 2024 05:02:31 +0000
Subject: net: Add napi_struct parameter irq_suspend_timeout

Add a per-NAPI IRQ suspension parameter, which can be get/set with
netdev-genl.

This patch doesn't change any behavior but prepares the code for other
changes in the following commits which use irq_suspend_timeout as a
timeout for IRQ suspension.

Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Co-developed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Joe Damato <jdamato@fastly.com>
Tested-by: Joe Damato <jdamato@fastly.com>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Link: https://patch.msgid.link/20241109050245.191288-2-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml |  7 +++++++
 include/linux/netdevice.h               |  2 ++
 include/uapi/linux/netdev.h             |  1 +
 net/core/dev.c                          |  2 ++
 net/core/dev.h                          | 25 +++++++++++++++++++++++++
 net/core/netdev-genl-gen.c              |  5 +++--
 net/core/netdev-genl.c                  | 12 ++++++++++++
 tools/include/uapi/linux/netdev.h       |  1 +
 8 files changed, 53 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index f9cb97d6106c..cbb544bd6c84 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -263,6 +263,11 @@ attribute-sets:
              the end of a NAPI cycle. This may add receive latency in exchange
              for reducing the number of frames processed by the network stack.
         type: uint
+      -
+        name: irq-suspend-timeout
+        doc: The timeout, in nanoseconds, of how long to suspend irq
+             processing, if event polling finds events
+        type: uint
   -
     name: queue
     attributes:
@@ -653,6 +658,7 @@ operations:
             - pid
             - defer-hard-irqs
             - gro-flush-timeout
+            - irq-suspend-timeout
       dump:
         request:
           attributes:
@@ -704,6 +710,7 @@ operations:
             - id
             - defer-hard-irqs
             - gro-flush-timeout
+            - irq-suspend-timeout
 
 kernel-family:
   headers: [ "linux/list.h"]
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index df4483598628..0aae346d919e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -348,6 +348,7 @@ struct gro_list {
  */
 struct napi_config {
 	u64 gro_flush_timeout;
+	u64 irq_suspend_timeout;
 	u32 defer_hard_irqs;
 	unsigned int napi_id;
 };
@@ -384,6 +385,7 @@ struct napi_struct {
 	struct hrtimer		timer;
 	struct task_struct	*thread;
 	unsigned long		gro_flush_timeout;
+	unsigned long		irq_suspend_timeout;
 	u32			defer_hard_irqs;
 	/* control-path-only fields follow */
 	struct list_head	dev_list;
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index e3ebb49f60d2..e4be227d3ad6 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -124,6 +124,7 @@ enum {
 	NETDEV_A_NAPI_PID,
 	NETDEV_A_NAPI_DEFER_HARD_IRQS,
 	NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+	NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
diff --git a/net/core/dev.c b/net/core/dev.c
index 6a31152e4606..4d910872963f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6666,6 +6666,7 @@ static void napi_restore_config(struct napi_struct *n)
 {
 	n->defer_hard_irqs = n->config->defer_hard_irqs;
 	n->gro_flush_timeout = n->config->gro_flush_timeout;
+	n->irq_suspend_timeout = n->config->irq_suspend_timeout;
 	/* a NAPI ID might be stored in the config, if so use it. if not, use
 	 * napi_hash_add to generate one for us. It will be saved to the config
 	 * in napi_disable.
@@ -6680,6 +6681,7 @@ static void napi_save_config(struct napi_struct *n)
 {
 	n->config->defer_hard_irqs = n->defer_hard_irqs;
 	n->config->gro_flush_timeout = n->gro_flush_timeout;
+	n->config->irq_suspend_timeout = n->irq_suspend_timeout;
 	n->config->napi_id = n->napi_id;
 	napi_hash_del(n);
 }
diff --git a/net/core/dev.h b/net/core/dev.h
index 7881bced70a9..d043dee25a68 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -236,6 +236,31 @@ static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
 		netdev->napi_config[i].gro_flush_timeout = timeout;
 }
 
+/**
+ * napi_get_irq_suspend_timeout - get the irq_suspend_timeout
+ * @n: napi struct to get the irq_suspend_timeout from
+ *
+ * Return: the per-NAPI value of the irq_suspend_timeout field.
+ */
+static inline unsigned long
+napi_get_irq_suspend_timeout(const struct napi_struct *n)
+{
+	return READ_ONCE(n->irq_suspend_timeout);
+}
+
+/**
+ * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi
+ * @n: napi struct to set the irq_suspend_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout
+ */
+static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
+						unsigned long timeout)
+{
+	WRITE_ONCE(n->irq_suspend_timeout, timeout);
+}
+
 int rps_cpumask_housekeeping(struct cpumask *mask);
 
 #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index 21de7e10be16..a89cbd8d87c3 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -92,10 +92,11 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1]
 };
 
 /* NETDEV_CMD_NAPI_SET - do */
-static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = {
+static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = {
 	[NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
 	[NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
 	[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
+	[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
 };
 
 /* Ops table for netdev */
@@ -186,7 +187,7 @@ static const struct genl_split_ops netdev_nl_ops[] = {
 		.cmd		= NETDEV_CMD_NAPI_SET,
 		.doit		= netdev_nl_napi_set_doit,
 		.policy		= netdev_napi_set_nl_policy,
-		.maxattr	= NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+		.maxattr	= NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
 };
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index b49c3b4e5fbe..765ce7c9d73b 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -161,6 +161,7 @@ static int
 netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 			const struct genl_info *info)
 {
+	unsigned long irq_suspend_timeout;
 	unsigned long gro_flush_timeout;
 	u32 napi_defer_hard_irqs;
 	void *hdr;
@@ -196,6 +197,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 			napi_defer_hard_irqs))
 		goto nla_put_failure;
 
+	irq_suspend_timeout = napi_get_irq_suspend_timeout(napi);
+	if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+			 irq_suspend_timeout))
+		goto nla_put_failure;
+
 	gro_flush_timeout = napi_get_gro_flush_timeout(napi);
 	if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
 			 gro_flush_timeout))
@@ -306,6 +312,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 static int
 netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
 {
+	u64 irq_suspend_timeout = 0;
 	u64 gro_flush_timeout = 0;
 	u32 defer = 0;
 
@@ -314,6 +321,11 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
 		napi_set_defer_hard_irqs(napi, defer);
 	}
 
+	if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) {
+		irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]);
+		napi_set_irq_suspend_timeout(napi, irq_suspend_timeout);
+	}
+
 	if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) {
 		gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]);
 		napi_set_gro_flush_timeout(napi, gro_flush_timeout);
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index e3ebb49f60d2..e4be227d3ad6 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -124,6 +124,7 @@ enum {
 	NETDEV_A_NAPI_PID,
 	NETDEV_A_NAPI_DEFER_HARD_IRQS,
 	NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+	NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
-- 
cgit v1.2.3


From 213a695297e1f0c2ed814488757d496b0d7f7267 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Mon, 11 Nov 2024 20:49:11 +0800
Subject: bpf: Replace the document for PTR_TO_BTF_ID_OR_NULL

Commit c25b2ae13603 ("bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX |
PTR_MAYBE_NULL") moved the fields around and misplaced the
documentation for "PTR_TO_BTF_ID_OR_NULL". So, let's replace it in the
proper place.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241111124911.1436911-1-dongml2@chinatelecom.cn
---
 include/linux/bpf.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b84613b10ac..7da41ae2eac8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -932,10 +932,6 @@ enum bpf_reg_type {
 	 * additional context, assume the value is non-null.
 	 */
 	PTR_TO_BTF_ID,
-	/* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not
-	 * been checked for null. Used primarily to inform the verifier
-	 * an explicit null check is required for this struct.
-	 */
 	PTR_TO_MEM,		 /* reg points to valid memory region */
 	PTR_TO_ARENA,
 	PTR_TO_BUF,		 /* reg points to a read/write buffer */
@@ -948,6 +944,10 @@ enum bpf_reg_type {
 	PTR_TO_SOCKET_OR_NULL		= PTR_MAYBE_NULL | PTR_TO_SOCKET,
 	PTR_TO_SOCK_COMMON_OR_NULL	= PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON,
 	PTR_TO_TCP_SOCK_OR_NULL		= PTR_MAYBE_NULL | PTR_TO_TCP_SOCK,
+	/* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not
+	 * been checked for null. Used primarily to inform the verifier
+	 * an explicit null check is required for this struct.
+	 */
 	PTR_TO_BTF_ID_OR_NULL		= PTR_MAYBE_NULL | PTR_TO_BTF_ID,
 
 	/* This must be the last entry. Its purpose is to ensure the enum is
-- 
cgit v1.2.3


From 12079a59ce52e72a342c49cfacf0281213fd6f32 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 7 Nov 2024 08:11:44 -0800
Subject: net: Implement fault injection forcing skb reallocation

Introduce a fault injection mechanism to force skb reallocation. The
primary goal is to catch bugs related to pointer invalidation after
potential skb reallocation.

The fault injection mechanism aims to identify scenarios where callers
retain pointers to various headers in the skb but fail to reload these
pointers after calling a function that may reallocate the data. This
type of bug can lead to memory corruption or crashes if the old,
now-invalid pointers are used.

By forcing reallocation through fault injection, we can stress-test code
paths and ensure proper pointer management after potential skb
reallocations.

Add a hook for fault injection in the following functions:

 * pskb_trim_rcsum()
 * pskb_may_pull_reason()
 * pskb_trim()

As the other fault injection mechanism, protect it under a debug Kconfig
called CONFIG_FAIL_SKB_REALLOC.

This patch was *heavily* inspired by Jakub's proposal from:
https://lore.kernel.org/all/20240719174140.47a868e6@kernel.org/

CC: Akinobu Mita <akinobu.mita@gmail.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20241107-fault_v6-v6-1-1b82cb6ecacd@debian.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/admin-guide/kernel-parameters.txt   |   1 +
 Documentation/fault-injection/fault-injection.rst |  40 ++++++++
 include/linux/skbuff.h                            |   9 ++
 lib/Kconfig.debug                                 |  10 ++
 net/core/Makefile                                 |   1 +
 net/core/skb_fault_injection.c                    | 106 ++++++++++++++++++++++
 6 files changed, 167 insertions(+)
 create mode 100644 net/core/skb_fault_injection.c

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1518343bbe22..2fb830453dcc 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1546,6 +1546,7 @@
 	failslab=
 	fail_usercopy=
 	fail_page_alloc=
+	fail_skb_realloc=
 	fail_make_request=[KNL]
 			General fault injection mechanism.
 			Format: <interval>,<probability>,<space>,<times>
diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst
index 8b8aeea71c68..1c14ba08fbfc 100644
--- a/Documentation/fault-injection/fault-injection.rst
+++ b/Documentation/fault-injection/fault-injection.rst
@@ -45,6 +45,32 @@ Available fault injection capabilities
   ALLOW_ERROR_INJECTION() macro, by setting debugfs entries
   under /sys/kernel/debug/fail_function. No boot option supported.
 
+- fail_skb_realloc
+
+  inject skb (socket buffer) reallocation events into the network path. The
+  primary goal is to identify and prevent issues related to pointer
+  mismanagement in the network subsystem.  By forcing skb reallocation at
+  strategic points, this feature creates scenarios where existing pointers to
+  skb headers become invalid.
+
+  When the fault is injected and the reallocation is triggered, cached pointers
+  to skb headers and data no longer reference valid memory locations. This
+  deliberate invalidation helps expose code paths where proper pointer updating
+  is neglected after a reallocation event.
+
+  By creating these controlled fault scenarios, the system can catch instances
+  where stale pointers are used, potentially leading to memory corruption or
+  system instability.
+
+  To select the interface to act on, write the network name to
+  /sys/kernel/debug/fail_skb_realloc/devname.
+  If this field is left empty (which is the default value), skb reallocation
+  will be forced on all network interfaces.
+
+  The effectiveness of this fault detection is enhanced when KASAN is
+  enabled, as it helps identify invalid memory references and use-after-free
+  (UAF) issues.
+
 - NVMe fault injection
 
   inject NVMe status code and retry flag on devices permitted by setting
@@ -216,6 +242,19 @@ configuration of fault-injection capabilities.
 	use a negative errno, you better use 'printf' instead of 'echo', e.g.:
 	$ printf %#x -12 > retval
 
+- /sys/kernel/debug/fail_skb_realloc/devname:
+
+        Specifies the network interface on which to force SKB reallocation.  If
+        left empty, SKB reallocation will be applied to all network interfaces.
+
+        Example usage::
+
+          # Force skb reallocation on eth0
+          echo "eth0" > /sys/kernel/debug/fail_skb_realloc/devname
+
+          # Clear the selection and force skb reallocation on all interfaces
+          echo "" > /sys/kernel/debug/fail_skb_realloc/devname
+
 Boot option
 ^^^^^^^^^^^
 
@@ -227,6 +266,7 @@ use the boot option::
 	fail_usercopy=
 	fail_make_request=
 	fail_futex=
+	fail_skb_realloc=
 	mmc_core.fail_request=<interval>,<probability>,<space>,<times>
 
 proc entries
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 60535c706851..58009fa66102 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2682,6 +2682,12 @@ static inline void skb_assert_len(struct sk_buff *skb)
 #endif /* CONFIG_DEBUG_NET */
 }
 
+#if defined(CONFIG_FAIL_SKB_REALLOC)
+void skb_might_realloc(struct sk_buff *skb);
+#else
+static inline void skb_might_realloc(struct sk_buff *skb) {}
+#endif
+
 /*
  *	Add data to an sk_buff
  */
@@ -2782,6 +2788,7 @@ static inline enum skb_drop_reason
 pskb_may_pull_reason(struct sk_buff *skb, unsigned int len)
 {
 	DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
+	skb_might_realloc(skb);
 
 	if (likely(len <= skb_headlen(skb)))
 		return SKB_NOT_DROPPED_YET;
@@ -3240,6 +3247,7 @@ static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
 
 static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
 {
+	skb_might_realloc(skb);
 	return (len < skb->len) ? __pskb_trim(skb, len) : 0;
 }
 
@@ -3994,6 +4002,7 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);
 
 static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
 {
+	skb_might_realloc(skb);
 	if (likely(len >= skb->len))
 		return 0;
 	return pskb_trim_rcsum_slow(skb, len);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7312ae7c3cc5..67b669d2e70e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2115,6 +2115,16 @@ config FAIL_SUNRPC
 	  Provide fault-injection capability for SunRPC and
 	  its consumers.
 
+config FAIL_SKB_REALLOC
+	bool "Fault-injection capability forcing skb to reallocate"
+	depends on FAULT_INJECTION_DEBUG_FS
+	help
+	  Provide fault-injection capability that forces the skb to be
+	  reallocated, catching possible invalid pointers to the skb.
+
+	  For more information, check
+	  Documentation/dev-tools/fault-injection/fault-injection.rst
+
 config FAULT_INJECTION_CONFIGFS
 	bool "Configfs interface for fault-injection capabilities"
 	depends on FAULT_INJECTION
diff --git a/net/core/Makefile b/net/core/Makefile
index 5a72a87ee0f1..d9326600e289 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -46,3 +46,4 @@ obj-$(CONFIG_OF)	+= of_net.o
 obj-$(CONFIG_NET_TEST) += net_test.o
 obj-$(CONFIG_NET_DEVMEM) += devmem.o
 obj-$(CONFIG_DEBUG_NET_SMALL_RTNL) += rtnl_net_debug.o
+obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o
diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c
new file mode 100644
index 000000000000..4235db6bdfad
--- /dev/null
+++ b/net/core/skb_fault_injection.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/debugfs.h>
+#include <linux/fault-inject.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+static struct {
+	struct fault_attr attr;
+	char devname[IFNAMSIZ];
+	bool filtered;
+} skb_realloc = {
+	.attr = FAULT_ATTR_INITIALIZER,
+	.filtered = false,
+};
+
+static bool should_fail_net_realloc_skb(struct sk_buff *skb)
+{
+	struct net_device *net = skb->dev;
+
+	if (skb_realloc.filtered &&
+	    strncmp(net->name, skb_realloc.devname, IFNAMSIZ))
+		/* device name filter set, but names do not match */
+		return false;
+
+	if (!should_fail(&skb_realloc.attr, 1))
+		return false;
+
+	return true;
+}
+ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE);
+
+void skb_might_realloc(struct sk_buff *skb)
+{
+	if (!should_fail_net_realloc_skb(skb))
+		return;
+
+	pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_might_realloc);
+
+static int __init fail_skb_realloc_setup(char *str)
+{
+	return setup_fault_attr(&skb_realloc.attr, str);
+}
+__setup("fail_skb_realloc=", fail_skb_realloc_setup);
+
+static void reset_settings(void)
+{
+	skb_realloc.filtered = false;
+	memset(&skb_realloc.devname, 0, IFNAMSIZ);
+}
+
+static ssize_t devname_write(struct file *file, const char __user *buffer,
+			     size_t count, loff_t *ppos)
+{
+	ssize_t ret;
+
+	reset_settings();
+	ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ,
+				     ppos, buffer, count);
+	if (ret < 0)
+		return ret;
+
+	skb_realloc.devname[IFNAMSIZ - 1] = '\0';
+	/* Remove a possible \n at the end of devname */
+	strim(skb_realloc.devname);
+
+	if (strnlen(skb_realloc.devname, IFNAMSIZ))
+		skb_realloc.filtered = true;
+
+	return count;
+}
+
+static ssize_t devname_read(struct file *file,
+			    char __user *buffer,
+			    size_t size, loff_t *ppos)
+{
+	if (!skb_realloc.filtered)
+		return 0;
+
+	return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname,
+				       strlen(skb_realloc.devname));
+}
+
+static const struct file_operations devname_ops = {
+	.write = devname_write,
+	.read = devname_read,
+};
+
+static int __init fail_skb_realloc_debugfs(void)
+{
+	umode_t mode = S_IFREG | 0600;
+	struct dentry *dir;
+
+	dir = fault_create_debugfs_attr("fail_skb_realloc", NULL,
+					&skb_realloc.attr);
+	if (IS_ERR(dir))
+		return PTR_ERR(dir);
+
+	debugfs_create_file("devname", mode, dir, NULL, &devname_ops);
+
+	return 0;
+}
+
+late_initcall(fail_skb_realloc_debugfs);
-- 
cgit v1.2.3


From 406c5548c661df0bff6bb6ee79bf9d49faf23e31 Mon Sep 17 00:00:00 2001
From: MeiChia Chiu <MeiChia.Chiu@mediatek.com>
Date: Tue, 12 Nov 2024 16:38:46 +0800
Subject: wifi: mac80211: Support EHT 1024 aggregation size in TX

Support EHT 1024 aggregation size in TX

The 1024 agg size for RX is supported but not for TX.
This patch adds this support and refactors common parsing logics for
addbaext in both process_addba_resp and process_addba_req into a
function.

Reviewed-by: Shayne Chen <shayne.chen@mediatek.com>
Reviewed-by: Money Wang <money.wang@mediatek.com>
Co-developed-by: Peter Chiu <chui-hao.chiu@mediatek.com>
Signed-off-by: Peter Chiu <chui-hao.chiu@mediatek.com>
Signed-off-by: MeiChia Chiu <MeiChia.Chiu@mediatek.com>
Link: https://patch.msgid.link/20241112083846.32063-1-MeiChia.Chiu@mediatek.com
[pass elems/len instead of mgmt/len/is_req]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  |  2 +
 net/mac80211/agg-rx.c      | 94 ++++++++++++++++++++++++++++------------------
 net/mac80211/agg-tx.c      | 31 ++++++++++-----
 net/mac80211/ht.c          |  2 +-
 net/mac80211/ieee80211_i.h |  9 ++++-
 5 files changed, 90 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 456bca45ff05..05dedc45505c 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1445,6 +1445,8 @@ struct ieee80211_mgmt {
 					__le16 status;
 					__le16 capab;
 					__le16 timeout;
+					/* followed by BA Extension */
+					u8 variable[];
 				} __packed addba_resp;
 				struct{
 					u8 action_code;
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index fe7eab4b681b..f3fbe5a4395e 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -170,28 +170,63 @@ static void sta_rx_agg_reorder_timer_expired(struct timer_list *t)
 	rcu_read_unlock();
 }
 
-static void ieee80211_add_addbaext(struct ieee80211_sub_if_data *sdata,
-				   struct sk_buff *skb,
-				   const struct ieee80211_addba_ext_ie *req,
-				   u16 buf_size)
+void ieee80211_add_addbaext(struct sk_buff *skb,
+			    const u8 req_addba_ext_data,
+			    u16 buf_size)
 {
-	struct ieee80211_addba_ext_ie *resp;
+	struct ieee80211_addba_ext_ie *addba_ext;
 	u8 *pos;
 
 	pos = skb_put_zero(skb, 2 + sizeof(struct ieee80211_addba_ext_ie));
 	*pos++ = WLAN_EID_ADDBA_EXT;
 	*pos++ = sizeof(struct ieee80211_addba_ext_ie);
-	resp = (struct ieee80211_addba_ext_ie *)pos;
-	resp->data = req->data & IEEE80211_ADDBA_EXT_NO_FRAG;
+	addba_ext = (struct ieee80211_addba_ext_ie *)pos;
 
-	resp->data |= u8_encode_bits(buf_size >> IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT,
-				     IEEE80211_ADDBA_EXT_BUF_SIZE_MASK);
+	addba_ext->data = IEEE80211_ADDBA_EXT_NO_FRAG;
+	if (req_addba_ext_data)
+		addba_ext->data &= req_addba_ext_data;
+
+	addba_ext->data |=
+		u8_encode_bits(buf_size >> IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT,
+			       IEEE80211_ADDBA_EXT_BUF_SIZE_MASK);
+}
+
+u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta,
+				     const void *elem_data, ssize_t elem_len,
+				     u16 *buf_size)
+{
+	struct ieee802_11_elems *elems;
+	u8 buf_size_1k, data = 0;
+
+	if (!sta->sta.deflink.he_cap.has_he)
+		return 0;
+
+	if (elem_len <= 0)
+		return 0;
+
+	elems = ieee802_11_parse_elems(elem_data, elem_len, true, NULL);
+
+	if (elems && !elems->parse_error && elems->addba_ext_ie) {
+		data = elems->addba_ext_ie->data;
+
+		if (!sta->sta.deflink.eht_cap.has_eht || !buf_size)
+			goto free;
+
+		buf_size_1k = u8_get_bits(elems->addba_ext_ie->data,
+					  IEEE80211_ADDBA_EXT_BUF_SIZE_MASK);
+		*buf_size |= (u16)buf_size_1k <<
+			     IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT;
+	}
+free:
+	kfree(elems);
+
+	return data;
 }
 
 static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid,
 				      u8 dialog_token, u16 status, u16 policy,
 				      u16 buf_size, u16 timeout,
-				      const struct ieee80211_addba_ext_ie *addbaext)
+				      const u8 req_addba_ext_data)
 {
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	struct ieee80211_local *local = sdata->local;
@@ -223,8 +258,8 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid,
 	mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
 	mgmt->u.action.u.addba_resp.status = cpu_to_le16(status);
 
-	if (sta->sta.deflink.he_cap.has_he && addbaext)
-		ieee80211_add_addbaext(sdata, skb, addbaext, buf_size);
+	if (sta->sta.deflink.he_cap.has_he)
+		ieee80211_add_addbaext(skb, req_addba_ext_data, buf_size);
 
 	ieee80211_tx_skb(sdata, skb);
 }
@@ -233,7 +268,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 				     u8 dialog_token, u16 timeout,
 				     u16 start_seq_num, u16 ba_policy, u16 tid,
 				     u16 buf_size, bool tx, bool auto_seq,
-				     const struct ieee80211_addba_ext_ie *addbaext)
+				     const u8 addba_ext_data)
 {
 	struct ieee80211_local *local = sta->sdata->local;
 	struct tid_ampdu_rx *tid_agg_rx;
@@ -419,7 +454,7 @@ end:
 	if (tx)
 		ieee80211_send_addba_resp(sta, sta->sta.addr, tid,
 					  dialog_token, status, 1, buf_size,
-					  timeout, addbaext);
+					  timeout, addba_ext_data);
 }
 
 void ieee80211_process_addba_request(struct ieee80211_local *local,
@@ -428,9 +463,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 				     size_t len)
 {
 	u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num;
-	struct ieee802_11_elems *elems = NULL;
-	u8 dialog_token;
-	int ies_len;
+	u8 dialog_token, addba_ext_data;
 
 	/* extract session parameters from addba request frame */
 	dialog_token = mgmt->u.action.u.addba_req.dialog_token;
@@ -443,28 +476,17 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 	tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
 	buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
 
-	ies_len = len - offsetof(struct ieee80211_mgmt,
-				 u.action.u.addba_req.variable);
-	if (ies_len) {
-		elems = ieee802_11_parse_elems(mgmt->u.action.u.addba_req.variable,
-					       ies_len, true, NULL);
-		if (!elems || elems->parse_error)
-			goto free;
-	}
-
-	if (sta->sta.deflink.eht_cap.has_eht && elems && elems->addba_ext_ie) {
-		u8 buf_size_1k = u8_get_bits(elems->addba_ext_ie->data,
-					     IEEE80211_ADDBA_EXT_BUF_SIZE_MASK);
-
-		buf_size |= buf_size_1k << IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT;
-	}
+	addba_ext_data =
+		ieee80211_retrieve_addba_ext_data(sta,
+						  mgmt->u.action.u.addba_req.variable,
+						  len -
+						  offsetof(typeof(*mgmt),
+							   u.action.u.addba_req.variable),
+						  &buf_size);
 
 	__ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
 					start_seq_num, ba_policy, tid,
-					buf_size, true, false,
-					elems ? elems->addba_ext_ie : NULL);
-free:
-	kfree(elems);
+					buf_size, true, false, addba_ext_data);
 }
 
 void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif,
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 04cb45cfb310..61f2cac37728 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -58,23 +58,24 @@
  * complete.
  */
 
-static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
-					 const u8 *da, u16 tid,
+static void ieee80211_send_addba_request(struct sta_info *sta, u16 tid,
 					 u8 dialog_token, u16 start_seq_num,
 					 u16 agg_size, u16 timeout)
 {
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
 	u16 capab;
 
-	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
-
+	skb = dev_alloc_skb(sizeof(*mgmt) +
+			    2 + sizeof(struct ieee80211_addba_ext_ie) +
+			    local->hw.extra_tx_headroom);
 	if (!skb)
 		return;
 
 	skb_reserve(skb, local->hw.extra_tx_headroom);
-	mgmt = ieee80211_mgmt_ba(skb, da, sdata);
+	mgmt = ieee80211_mgmt_ba(skb, sta->sta.addr, sdata);
 
 	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req));
 
@@ -93,6 +94,9 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
 	mgmt->u.action.u.addba_req.start_seq_num =
 					cpu_to_le16(start_seq_num << 4);
 
+	if (sta->sta.deflink.he_cap.has_he)
+		ieee80211_add_addbaext(skb, 0, agg_size);
+
 	ieee80211_tx_skb_tid(sdata, skb, tid, -1);
 }
 
@@ -460,8 +464,11 @@ static void ieee80211_send_addba_with_timeout(struct sta_info *sta,
 	sta->ampdu_mlme.addba_req_num[tid]++;
 	spin_unlock_bh(&sta->lock);
 
-	if (sta->sta.deflink.he_cap.has_he) {
+	if (sta->sta.deflink.eht_cap.has_eht) {
 		buf_size = local->hw.max_tx_aggregation_subframes;
+	} else if (sta->sta.deflink.he_cap.has_he) {
+		buf_size = min_t(u16, local->hw.max_tx_aggregation_subframes,
+				 IEEE80211_MAX_AMPDU_BUF_HE);
 	} else {
 		/*
 		 * We really should use what the driver told us it will
@@ -473,9 +480,8 @@ static void ieee80211_send_addba_with_timeout(struct sta_info *sta,
 	}
 
 	/* send AddBA request */
-	ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
-				     tid_tx->dialog_token, tid_tx->ssn,
-				     buf_size, tid_tx->timeout);
+	ieee80211_send_addba_request(sta, tid, tid_tx->dialog_token,
+				     tid_tx->ssn, buf_size, tid_tx->timeout);
 
 	WARN_ON(test_and_set_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state));
 }
@@ -970,6 +976,13 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
 	amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK;
 	tid = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_TID_MASK);
 	buf_size = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK);
+
+	ieee80211_retrieve_addba_ext_data(sta,
+					  mgmt->u.action.u.addba_resp.variable,
+					  len - offsetof(typeof(*mgmt),
+							 u.action.u.addba_resp.variable),
+					  &buf_size);
+
 	buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
 
 	txq = sta->sta.txq[tid];
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 1c2b7dd8976a..32390d8a9d75 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -379,7 +379,7 @@ void ieee80211_ba_session_work(struct wiphy *wiphy, struct wiphy_work *work)
 				       sta->ampdu_mlme.tid_rx_manage_offl))
 			__ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
 							IEEE80211_MAX_AMPDU_BUF_HT,
-							false, true, NULL);
+							false, true, 0);
 
 		if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS,
 				       sta->ampdu_mlme.tid_rx_manage_offl))
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 7dcb46120abc..752297bcde76 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2111,14 +2111,19 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
 			       const u8 *bssid, int link_id);
 bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old,
 				   enum ieee80211_smps_mode smps_mode_new);
-
+void ieee80211_add_addbaext(struct sk_buff *skb,
+			    const u8 req_addba_ext_data,
+			    u16 buf_size);
+u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta,
+				     const void *elem_data, ssize_t elem_len,
+				     u16 *buf_size);
 void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 				    u16 initiator, u16 reason, bool stop);
 void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 				     u8 dialog_token, u16 timeout,
 				     u16 start_seq_num, u16 ba_policy, u16 tid,
 				     u16 buf_size, bool tx, bool auto_seq,
-				     const struct ieee80211_addba_ext_ie *addbaext);
+				     const u8 addba_ext_data);
 void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
 					 enum ieee80211_agg_stop_reason reason);
 void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
-- 
cgit v1.2.3


From 8182a8b39aa227f5b99b8d4d18f296b82ce4b94c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Nov 2024 06:43:55 +0100
Subject: writeback: wbc_attach_fdatawrite_inode out of line

This allows exporting this high-level interface only while keeping
wbc_attach_and_unlock_inode private in fs-writeback.c and unexporting
__inode_attach_wb.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20241112054403.1470586-3-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c         | 31 +++++++++++++++++++++++++++----
 include/linux/writeback.h | 28 ++--------------------------
 2 files changed, 29 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3fb115ae44b1..77db1f10023e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -290,7 +290,6 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio)
 	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
 		wb_put(wb);
 }
-EXPORT_SYMBOL_GPL(__inode_attach_wb);
 
 /**
  * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
@@ -731,8 +730,8 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
  * writeback completion, wbc_detach_inode() should be called.  This is used
  * to track the cgroup writeback context.
  */
-void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
-				 struct inode *inode)
+static void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+		struct inode *inode)
 	__releases(&inode->i_lock)
 {
 	if (!inode_cgwb_enabled(inode)) {
@@ -763,7 +762,24 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 	if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
 		inode_switch_wbs(inode, wbc->wb_id);
 }
-EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
+
+/**
+ * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * This function is to be used by __filemap_fdatawrite_range(), which is an
+ * alternative entry point into writeback code, and first ensures @inode is
+ * associated with a bdi_writeback and attaches it to @wbc.
+ */
+void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+		struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	inode_attach_wb(inode, NULL);
+	wbc_attach_and_unlock_inode(wbc, inode);
+}
+EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode);
 
 /**
  * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
@@ -1228,6 +1244,13 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 	}
 }
 
+static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+	__releases(&inode->i_lock)
+{
+	spin_unlock(&inode->i_lock);
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d6db822e4bb3..aee3e1b4c50f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -213,9 +213,6 @@ static inline void wait_on_inode(struct inode *inode)
 #include <linux/bio.h>
 
 void __inode_attach_wb(struct inode *inode, struct folio *folio);
-void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
-				 struct inode *inode)
-	__releases(&inode->i_lock);
 void wbc_detach_inode(struct writeback_control *wbc);
 void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 			      size_t bytes);
@@ -254,22 +251,8 @@ static inline void inode_detach_wb(struct inode *inode)
 	}
 }
 
-/**
- * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
- * @wbc: writeback_control of interest
- * @inode: target inode
- *
- * This function is to be used by __filemap_fdatawrite_range(), which is an
- * alternative entry point into writeback code, and first ensures @inode is
- * associated with a bdi_writeback and attaches it to @wbc.
- */
-static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
-					       struct inode *inode)
-{
-	spin_lock(&inode->i_lock);
-	inode_attach_wb(inode, NULL);
-	wbc_attach_and_unlock_inode(wbc, inode);
-}
+void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+		struct inode *inode);
 
 /**
  * wbc_init_bio - writeback specific initializtion of bio
@@ -303,13 +286,6 @@ static inline void inode_detach_wb(struct inode *inode)
 {
 }
 
-static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
-					       struct inode *inode)
-	__releases(&inode->i_lock)
-{
-	spin_unlock(&inode->i_lock);
-}
-
 static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
 					       struct inode *inode)
 {
-- 
cgit v1.2.3


From 365f34483be33a9d0151c06ac39627d7927210d9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 15 Oct 2024 09:11:00 -0700
Subject: srcu: Renaming in preparation for additional reader flavor

Currently, there are only two flavors of readers, normal and NMI-safe.
A number of fields, functions, and types reflect this restriction.
This renaming-only commit prepares for the addition of light-weight
(as in memory-barrier-free) readers.  OK, OK, there is also a drive-by
white-space fixeup!

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: <bpf@vger.kernel.org>
Reviewed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcu.h     | 21 ++++++++++-----------
 include/linux/srcutree.h |  2 +-
 kernel/rcu/srcutree.c    | 22 +++++++++++-----------
 3 files changed, 22 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 835bbb2d1f88..06728ef6f32a 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -181,10 +181,9 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
 #define SRCU_NMI_SAFE		0x2
 
 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU)
-void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe);
+void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);
 #else
-static inline void srcu_check_nmi_safety(struct srcu_struct *ssp,
-					 bool nmi_safe) { }
+static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { }
 #endif
 
 
@@ -245,7 +244,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
 {
 	int retval;
 
-	srcu_check_nmi_safety(ssp, false);
+	srcu_check_read_flavor(ssp, false);
 	retval = __srcu_read_lock(ssp);
 	srcu_lock_acquire(&ssp->dep_map);
 	return retval;
@@ -262,7 +261,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp
 {
 	int retval;
 
-	srcu_check_nmi_safety(ssp, true);
+	srcu_check_read_flavor(ssp, true);
 	retval = __srcu_read_lock_nmisafe(ssp);
 	rcu_try_lock_acquire(&ssp->dep_map);
 	return retval;
@@ -274,7 +273,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
 {
 	int retval;
 
-	srcu_check_nmi_safety(ssp, false);
+	srcu_check_read_flavor(ssp, false);
 	retval = __srcu_read_lock(ssp);
 	return retval;
 }
@@ -303,7 +302,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
 static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp)
 {
 	WARN_ON_ONCE(in_nmi());
-	srcu_check_nmi_safety(ssp, false);
+	srcu_check_read_flavor(ssp, false);
 	return __srcu_read_lock(ssp);
 }
 
@@ -318,7 +317,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
 	__releases(ssp)
 {
 	WARN_ON_ONCE(idx & ~0x1);
-	srcu_check_nmi_safety(ssp, false);
+	srcu_check_read_flavor(ssp, false);
 	srcu_lock_release(&ssp->dep_map);
 	__srcu_read_unlock(ssp, idx);
 }
@@ -334,7 +333,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
 	__releases(ssp)
 {
 	WARN_ON_ONCE(idx & ~0x1);
-	srcu_check_nmi_safety(ssp, true);
+	srcu_check_read_flavor(ssp, true);
 	rcu_lock_release(&ssp->dep_map);
 	__srcu_read_unlock_nmisafe(ssp, idx);
 }
@@ -343,7 +342,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
 static inline notrace void
 srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
 {
-	srcu_check_nmi_safety(ssp, false);
+	srcu_check_read_flavor(ssp, false);
 	__srcu_read_unlock(ssp, idx);
 }
 
@@ -360,7 +359,7 @@ static inline void srcu_up_read(struct srcu_struct *ssp, int idx)
 {
 	WARN_ON_ONCE(idx & ~0x1);
 	WARN_ON_ONCE(in_nmi());
-	srcu_check_nmi_safety(ssp, false);
+	srcu_check_read_flavor(ssp, false);
 	__srcu_read_unlock(ssp, idx);
 }
 
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index ed57598394de..ab7d8d215b84 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -25,7 +25,7 @@ struct srcu_data {
 	/* Read-side state. */
 	atomic_long_t srcu_lock_count[2];	/* Locks per CPU. */
 	atomic_long_t srcu_unlock_count[2];	/* Unlocks per CPU. */
-	int srcu_nmi_safety;			/* NMI-safe srcu_struct structure? */
+	int srcu_reader_flavor;			/* Reader flavor for srcu_struct structure? */
 
 	/* Update-side state. */
 	spinlock_t __private lock ____cacheline_internodealigned_in_smp;
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 5b1a315f77bc..f259dd834272 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -460,7 +460,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
 
 		sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]);
 		if (IS_ENABLED(CONFIG_PROVE_RCU))
-			mask = mask | READ_ONCE(cpuc->srcu_nmi_safety);
+			mask = mask | READ_ONCE(cpuc->srcu_reader_flavor);
 	}
 	WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)),
 		  "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
@@ -699,25 +699,25 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
 
 #ifdef CONFIG_PROVE_RCU
 /*
- * Check for consistent NMI safety.
+ * Check for consistent reader flavor.
  */
-void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe)
+void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
 {
-	int nmi_safe_mask = 1 << nmi_safe;
-	int old_nmi_safe_mask;
+	int reader_flavor_mask = 1 << read_flavor;
+	int old_reader_flavor_mask;
 	struct srcu_data *sdp;
 
 	/* NMI-unsafe use in NMI is a bad sign */
-	WARN_ON_ONCE(!nmi_safe && in_nmi());
+	WARN_ON_ONCE(!read_flavor && in_nmi());
 	sdp = raw_cpu_ptr(ssp->sda);
-	old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety);
-	if (!old_nmi_safe_mask) {
-		WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask);
+	old_reader_flavor_mask = READ_ONCE(sdp->srcu_reader_flavor);
+	if (!old_reader_flavor_mask) {
+		WRITE_ONCE(sdp->srcu_reader_flavor, reader_flavor_mask);
 		return;
 	}
-	WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask);
+	WARN_ONCE(old_reader_flavor_mask != reader_flavor_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_reader_flavor_mask, reader_flavor_mask);
 }
-EXPORT_SYMBOL_GPL(srcu_check_nmi_safety);
+EXPORT_SYMBOL_GPL(srcu_check_read_flavor);
 #endif /* CONFIG_PROVE_RCU */
 
 /*
-- 
cgit v1.2.3


From c071b8e5351453c2cb2d12f425d928f3a24ed2d3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 15 Oct 2024 09:11:03 -0700
Subject: srcu: Improve srcu_read_lock{,_nmisafe}() comments

This commit adds some additional usage constraints to the kernel-doc
headers of srcu_read_lock() and srcu_read_lock_nmi_safe().

Suggested-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcu.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 06728ef6f32a..46fd06b212ba 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -235,10 +235,13 @@ static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flav
  * a mutex that is held elsewhere while calling synchronize_srcu() or
  * synchronize_srcu_expedited().
  *
- * Note that srcu_read_lock() and the matching srcu_read_unlock() must
- * occur in the same context, for example, it is illegal to invoke
- * srcu_read_unlock() in an irq handler if the matching srcu_read_lock()
- * was invoked in process context.
+ * The return value from srcu_read_lock() must be passed unaltered
+ * to the matching srcu_read_unlock().  Note that srcu_read_lock() and
+ * the matching srcu_read_unlock() must occur in the same context, for
+ * example, it is illegal to invoke srcu_read_unlock() in an irq handler
+ * if the matching srcu_read_lock() was invoked in process context.  Or,
+ * for that matter to invoke srcu_read_unlock() from one task and the
+ * matching srcu_read_lock() from another.
  */
 static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
 {
@@ -256,6 +259,10 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
  *
  * Enter an SRCU read-side critical section, but in an NMI-safe manner.
  * See srcu_read_lock() for more information.
+ *
+ * If srcu_read_lock_nmisafe() is ever used on an srcu_struct structure,
+ * then none of the other flavors may be used, whether before, during,
+ * or after.
  */
 static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp)
 {
-- 
cgit v1.2.3


From 05829be27fe6f64e0675dc3be3a12d43b52492e1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 15 Oct 2024 09:11:04 -0700
Subject: srcu: Create CPP macros for normal and NMI-safe SRCU readers

This commit creates SRCU_READ_FLAVOR_NORMAL and SRCU_READ_FLAVOR_NMI
C-preprocessor macros for srcu_read_lock() and srcu_read_lock_nmisafe(),
respectively.  These replace the old true/false values that were
previously passed to srcu_check_read_flavor().  In addition, the
srcu_check_read_flavor() function itself requires a bit of rework to
handle bitmasks instead of true/false values.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: <bpf@vger.kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcu.h     | 20 ++++++++------------
 include/linux/srcutree.h |  4 ++++
 kernel/rcu/srcutree.c    | 21 +++++++++++----------
 3 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 46fd06b212ba..84daaa33ea0a 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -176,10 +176,6 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
 
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
-#define SRCU_NMI_UNKNOWN	0x0
-#define SRCU_NMI_UNSAFE		0x1
-#define SRCU_NMI_SAFE		0x2
-
 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU)
 void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);
 #else
@@ -247,7 +243,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
 {
 	int retval;
 
-	srcu_check_read_flavor(ssp, false);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
 	retval = __srcu_read_lock(ssp);
 	srcu_lock_acquire(&ssp->dep_map);
 	return retval;
@@ -268,7 +264,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp
 {
 	int retval;
 
-	srcu_check_read_flavor(ssp, true);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI);
 	retval = __srcu_read_lock_nmisafe(ssp);
 	rcu_try_lock_acquire(&ssp->dep_map);
 	return retval;
@@ -280,7 +276,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
 {
 	int retval;
 
-	srcu_check_read_flavor(ssp, false);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
 	retval = __srcu_read_lock(ssp);
 	return retval;
 }
@@ -309,7 +305,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
 static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp)
 {
 	WARN_ON_ONCE(in_nmi());
-	srcu_check_read_flavor(ssp, false);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
 	return __srcu_read_lock(ssp);
 }
 
@@ -324,7 +320,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
 	__releases(ssp)
 {
 	WARN_ON_ONCE(idx & ~0x1);
-	srcu_check_read_flavor(ssp, false);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
 	srcu_lock_release(&ssp->dep_map);
 	__srcu_read_unlock(ssp, idx);
 }
@@ -340,7 +336,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
 	__releases(ssp)
 {
 	WARN_ON_ONCE(idx & ~0x1);
-	srcu_check_read_flavor(ssp, true);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI);
 	rcu_lock_release(&ssp->dep_map);
 	__srcu_read_unlock_nmisafe(ssp, idx);
 }
@@ -349,7 +345,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
 static inline notrace void
 srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
 {
-	srcu_check_read_flavor(ssp, false);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
 	__srcu_read_unlock(ssp, idx);
 }
 
@@ -366,7 +362,7 @@ static inline void srcu_up_read(struct srcu_struct *ssp, int idx)
 {
 	WARN_ON_ONCE(idx & ~0x1);
 	WARN_ON_ONCE(in_nmi());
-	srcu_check_read_flavor(ssp, false);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
 	__srcu_read_unlock(ssp, idx);
 }
 
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index ab7d8d215b84..79ad809c7f03 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -43,6 +43,10 @@ struct srcu_data {
 	struct srcu_struct *ssp;
 };
 
+/* Values for ->srcu_reader_flavor. */
+#define SRCU_READ_FLAVOR_NORMAL	0x1		// srcu_read_lock().
+#define SRCU_READ_FLAVOR_NMI	0x2		// srcu_read_lock_nmisafe().
+
 /*
  * Node in SRCU combining tree, similar in function to rcu_data.
  */
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index b85da944d794..602b4b8c4b89 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -463,7 +463,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
 			mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
 	}
 	WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
-		  "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
+		  "Mixed reader flavors for srcu_struct at %ps.\n", ssp);
 	return sum;
 }
 
@@ -703,20 +703,21 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
  */
 void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
 {
-	int reader_flavor_mask = 1 << read_flavor;
-	int old_reader_flavor_mask;
+	int old_read_flavor;
 	struct srcu_data *sdp;
 
-	/* NMI-unsafe use in NMI is a bad sign */
-	WARN_ON_ONCE(!read_flavor && in_nmi());
+	/* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. */
+	WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi());
+	WARN_ON_ONCE(read_flavor & (read_flavor - 1));
+
 	sdp = raw_cpu_ptr(ssp->sda);
-	old_reader_flavor_mask = READ_ONCE(sdp->srcu_reader_flavor);
-	if (!old_reader_flavor_mask) {
-		old_reader_flavor_mask = cmpxchg(&sdp->srcu_reader_flavor, 0, reader_flavor_mask);
-		if (!old_reader_flavor_mask)
+	old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor);
+	if (!old_read_flavor) {
+		old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor);
+		if (!old_read_flavor)
 			return;
 	}
-	WARN_ONCE(old_reader_flavor_mask != reader_flavor_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_reader_flavor_mask, reader_flavor_mask);
+	WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor);
 }
 EXPORT_SYMBOL_GPL(srcu_check_read_flavor);
 #endif /* CONFIG_PROVE_RCU */
-- 
cgit v1.2.3


From 6364dd8191d27230176ac4b1b4daaecaf4807399 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 15 Oct 2024 09:11:05 -0700
Subject: srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite()

This patch adds srcu_read_lock_lite() and srcu_read_unlock_lite(), which
dispense with the read-side smp_mb() but also are restricted to code
regions that RCU is watching.  If a given srcu_struct structure uses
srcu_read_lock_lite() and srcu_read_unlock_lite(), it is not permitted
to use any other SRCU read-side marker, before, during, or after.

Another price of light-weight readers is heavier weight grace periods.
Such readers mean that SRCU grace periods on srcu_struct structures
used by light-weight readers will incur at least two calls to
synchronize_rcu().  In addition, normal SRCU grace periods for
light-weight-reader srcu_struct structures never auto-expedite.
Note that expedited SRCU grace periods for light-weight-reader
srcu_struct structures still invoke synchronize_rcu(), not
synchronize_srcu_expedited().  Something about wishing to keep
the IPIs down to a dull roar.

The srcu_read_lock_lite() and srcu_read_unlock_lite() functions may not
(repeat, *not*) be used from NMI handlers, but if this is needed, an
additional flavor of SRCU reader can be added by some future commit.

[ paulmck: Apply Alexei Starovoitov expediting feedback. ]
[ paulmck: Apply kernel test robot feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: kernel test robot <oliver.sang@intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: <bpf@vger.kernel.org>
Reviewed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcu.h     | 51 +++++++++++++++++++++++++++++-
 include/linux/srcutree.h |  1 +
 kernel/rcu/srcutree.c    | 82 +++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 122 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 84daaa33ea0a..4ba96e2cfa40 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -56,6 +56,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
 void cleanup_srcu_struct(struct srcu_struct *ssp);
 int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
 void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
+#ifdef CONFIG_TINY_SRCU
+#define __srcu_read_lock_lite __srcu_read_lock
+#define __srcu_read_unlock_lite __srcu_read_unlock
+#else // #ifdef CONFIG_TINY_SRCU
+int __srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp);
+void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) __releases(ssp);
+#endif // #else // #ifdef CONFIG_TINY_SRCU
 void synchronize_srcu(struct srcu_struct *ssp);
 
 #define SRCU_GET_STATE_COMPLETED 0x1
@@ -179,7 +186,7 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU)
 void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);
 #else
-static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { }
+#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0)
 #endif
 
 
@@ -249,6 +256,32 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
 	return retval;
 }
 
+/**
+ * srcu_read_lock_lite - register a new reader for an SRCU-protected structure.
+ * @ssp: srcu_struct in which to register the new reader.
+ *
+ * Enter an SRCU read-side critical section, but for a light-weight
+ * smp_mb()-free reader.  See srcu_read_lock() for more information.
+ *
+ * If srcu_read_lock_lite() is ever used on an srcu_struct structure,
+ * then none of the other flavors may be used, whether before, during,
+ * or after.  Note that grace-period auto-expediting is disabled for _lite
+ * srcu_struct structures because auto-expedited grace periods invoke
+ * synchronize_rcu_expedited(), IPIs and all.
+ *
+ * Note that srcu_read_lock_lite() can be invoked only from those contexts
+ * where RCU is watching.  Otherwise, lockdep will complain.
+ */
+static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp)
+{
+	int retval;
+
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE);
+	retval = __srcu_read_lock_lite(ssp);
+	rcu_try_lock_acquire(&ssp->dep_map);
+	return retval;
+}
+
 /**
  * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure.
  * @ssp: srcu_struct in which to register the new reader.
@@ -325,6 +358,22 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
 	__srcu_read_unlock(ssp, idx);
 }
 
+/**
+ * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure.
+ * @ssp: srcu_struct in which to unregister the old reader.
+ * @idx: return value from corresponding srcu_read_lock().
+ *
+ * Exit a light-weight SRCU read-side critical section.
+ */
+static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
+	__releases(ssp)
+{
+	WARN_ON_ONCE(idx & ~0x1);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE);
+	srcu_lock_release(&ssp->dep_map);
+	__srcu_read_unlock(ssp, idx);
+}
+
 /**
  * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure.
  * @ssp: srcu_struct in which to unregister the old reader.
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 79ad809c7f03..8074138cbd62 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -46,6 +46,7 @@ struct srcu_data {
 /* Values for ->srcu_reader_flavor. */
 #define SRCU_READ_FLAVOR_NORMAL	0x1		// srcu_read_lock().
 #define SRCU_READ_FLAVOR_NMI	0x2		// srcu_read_lock_nmisafe().
+#define SRCU_READ_FLAVOR_LITE	0x4		// srcu_read_lock_lite().
 
 /*
  * Node in SRCU combining tree, similar in function to rcu_data.
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 602b4b8c4b89..8632a3caeb33 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -429,20 +429,29 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp)
 }
 
 /*
- * Returns approximate total of the readers' ->srcu_lock_count[] values
- * for the rank of per-CPU counters specified by idx.
+ * Computes approximate total of the readers' ->srcu_lock_count[] values
+ * for the rank of per-CPU counters specified by idx, and returns true if
+ * the caller did the proper barrier (gp), and if the count of the locks
+ * matches that of the unlocks passed in.
  */
-static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
+static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks)
 {
 	int cpu;
+	unsigned long mask = 0;
 	unsigned long sum = 0;
 
 	for_each_possible_cpu(cpu) {
 		struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
 
 		sum += atomic_long_read(&sdp->srcu_lock_count[idx]);
+		if (IS_ENABLED(CONFIG_PROVE_RCU))
+			mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
 	}
-	return sum;
+	WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
+		  "Mixed reader flavors for srcu_struct at %ps.\n", ssp);
+	if (mask & SRCU_READ_FLAVOR_LITE && !gp)
+		return false;
+	return sum == unlocks;
 }
 
 /*
@@ -473,6 +482,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
  */
 static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
 {
+	bool did_gp = !!(raw_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE);
 	unsigned long unlocks;
 
 	unlocks = srcu_readers_unlock_idx(ssp, idx);
@@ -482,13 +492,16 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
 	 * unlock is counted. Needs to be a smp_mb() as the read side may
 	 * contain a read from a variable that is written to before the
 	 * synchronize_srcu() in the write side. In this case smp_mb()s
-	 * A and B act like the store buffering pattern.
+	 * A and B (or X and Y) act like the store buffering pattern.
 	 *
-	 * This smp_mb() also pairs with smp_mb() C to prevent accesses
-	 * after the synchronize_srcu() from being executed before the
-	 * grace period ends.
+	 * This smp_mb() also pairs with smp_mb() C (or, in the case of X,
+	 * Z) to prevent accesses after the synchronize_srcu() from being
+	 * executed before the grace period ends.
 	 */
-	smp_mb(); /* A */
+	if (!did_gp)
+		smp_mb(); /* A */
+	else
+		synchronize_rcu(); /* X */
 
 	/*
 	 * If the locks are the same as the unlocks, then there must have
@@ -546,7 +559,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
 	 * which are unlikely to be configured with an address space fully
 	 * populated with memory, at least not anytime soon.
 	 */
-	return srcu_readers_lock_idx(ssp, idx) == unlocks;
+	return srcu_readers_lock_idx(ssp, idx, did_gp, unlocks);
 }
 
 /**
@@ -750,6 +763,47 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct.  Returns an index that must be passed to the matching
+ * srcu_read_unlock_lite().
+ *
+ * Note that this_cpu_inc() is an RCU read-side critical section either
+ * because it disables interrupts, because it is a single instruction,
+ * or because it is a read-modify-write atomic operation, depending on
+ * the whims of the architecture.
+ */
+int __srcu_read_lock_lite(struct srcu_struct *ssp)
+{
+	int idx;
+
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite().");
+	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
+	this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */
+	barrier(); /* Avoid leaking the critical section. */
+	return idx;
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock_lite);
+
+/*
+ * Removes the count for the old reader from the appropriate
+ * per-CPU element of the srcu_struct.  Note that this may well be a
+ * different CPU than that which was incremented by the corresponding
+ * srcu_read_lock_lite(), but it must be within the same task.
+ *
+ * Note that this_cpu_inc() is an RCU read-side critical section either
+ * because it disables interrupts, because it is a single instruction,
+ * or because it is a read-modify-write atomic operation, depending on
+ * the whims of the architecture.
+ */
+void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
+{
+	barrier();  /* Avoid leaking the critical section. */
+	this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);  /* Z */
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite().");
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock_lite);
+
 #ifdef CONFIG_NEED_SRCU_NMI_SAFE
 
 /*
@@ -1134,6 +1188,8 @@ static void srcu_flip(struct srcu_struct *ssp)
 	 * it stays until either (1) Compilers learn about this sort of
 	 * control dependency or (2) Some production workload running on
 	 * a production system is unduly delayed by this slowpath smp_mb().
+	 * Except for _lite() readers, where it is inoperative, which
+	 * means that it is a good thing that it is redundant.
 	 */
 	smp_mb(); /* E */  /* Pairs with B and C. */
 
@@ -1152,7 +1208,8 @@ static void srcu_flip(struct srcu_struct *ssp)
 
 /*
  * If SRCU is likely idle, in other words, the next SRCU grace period
- * should be expedited, return true, otherwise return false.
+ * should be expedited, return true, otherwise return false.  Except that
+ * in the presence of _lite() readers, always return false.
  *
  * Note that it is OK for several current from-idle requests for a new
  * grace period from idle to specify expediting because they will all end
@@ -1181,6 +1238,9 @@ static bool srcu_should_expedite(struct srcu_struct *ssp)
 	unsigned long tlast;
 
 	check_init_srcu_struct(ssp);
+	/* If _lite() readers, don't do unsolicited expediting. */
+	if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)
+		return false;
 	/* If the local srcu_data structure has callbacks, not idle.  */
 	sdp = raw_cpu_ptr(ssp->sda);
 	spin_lock_irqsave_rcu_node(sdp, flags);
-- 
cgit v1.2.3


From bb94b12e4503bdce003a74e95ee4214eba923f86 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 15 Oct 2024 09:11:06 -0700
Subject: srcu: Allow inlining of __srcu_read_{,un}lock_lite()

This commit moves __srcu_read_lock_lite() and __srcu_read_unlock_lite()
into include/linux/srcu.h and marks them "static inline" so that they
can be inlined into srcu_read_lock_lite() and srcu_read_unlock_lite(),
respectively.  They are not hand-inlined due to Tree SRCU and Tiny SRCU
having different implementations.

The earlier removal of smp_mb() combined with the inlining produce
significant single-percentage performance wins.

Link: https://lore.kernel.org/all/CAEf4BzYgiNmSb=ZKQ65tm6nJDi1UX2Gq26cdHSH1mPwXJYZj5g@mail.gmail.com/

Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: <bpf@vger.kernel.org>
Reviewed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcutree.h | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/rcu/srcutree.c    | 41 -----------------------------------------
 2 files changed, 39 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 8074138cbd62..778eb61542e1 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -209,4 +209,43 @@ void synchronize_srcu_expedited(struct srcu_struct *ssp);
 void srcu_barrier(struct srcu_struct *ssp);
 void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf);
 
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct.  Returns an index that must be passed to the matching
+ * srcu_read_unlock_lite().
+ *
+ * Note that this_cpu_inc() is an RCU read-side critical section either
+ * because it disables interrupts, because it is a single instruction,
+ * or because it is a read-modify-write atomic operation, depending on
+ * the whims of the architecture.
+ */
+static inline int __srcu_read_lock_lite(struct srcu_struct *ssp)
+{
+	int idx;
+
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite().");
+	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
+	this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */
+	barrier(); /* Avoid leaking the critical section. */
+	return idx;
+}
+
+/*
+ * Removes the count for the old reader from the appropriate
+ * per-CPU element of the srcu_struct.  Note that this may well be a
+ * different CPU than that which was incremented by the corresponding
+ * srcu_read_lock_lite(), but it must be within the same task.
+ *
+ * Note that this_cpu_inc() is an RCU read-side critical section either
+ * because it disables interrupts, because it is a single instruction,
+ * or because it is a read-modify-write atomic operation, depending on
+ * the whims of the architecture.
+ */
+static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
+{
+	barrier();  /* Avoid leaking the critical section. */
+	this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);  /* Z */
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite().");
+}
+
 #endif
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 8632a3caeb33..d3a0c76ce590 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -763,47 +763,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
-/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct.  Returns an index that must be passed to the matching
- * srcu_read_unlock_lite().
- *
- * Note that this_cpu_inc() is an RCU read-side critical section either
- * because it disables interrupts, because it is a single instruction,
- * or because it is a read-modify-write atomic operation, depending on
- * the whims of the architecture.
- */
-int __srcu_read_lock_lite(struct srcu_struct *ssp)
-{
-	int idx;
-
-	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite().");
-	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
-	this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */
-	barrier(); /* Avoid leaking the critical section. */
-	return idx;
-}
-EXPORT_SYMBOL_GPL(__srcu_read_lock_lite);
-
-/*
- * Removes the count for the old reader from the appropriate
- * per-CPU element of the srcu_struct.  Note that this may well be a
- * different CPU than that which was incremented by the corresponding
- * srcu_read_lock_lite(), but it must be within the same task.
- *
- * Note that this_cpu_inc() is an RCU read-side critical section either
- * because it disables interrupts, because it is a single instruction,
- * or because it is a read-modify-write atomic operation, depending on
- * the whims of the architecture.
- */
-void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
-{
-	barrier();  /* Avoid leaking the critical section. */
-	this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);  /* Z */
-	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite().");
-}
-EXPORT_SYMBOL_GPL(__srcu_read_unlock_lite);
-
 #ifdef CONFIG_NEED_SRCU_NMI_SAFE
 
 /*
-- 
cgit v1.2.3


From 768b1f87098a4a586353898b074989808b1b27ad Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 15 Oct 2024 09:11:12 -0700
Subject: srcu: Improve srcu_read_lock_lite() kernel-doc comment

Where RCU is watching is where it is OK to invoke rcu_read_lock().

Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcu.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 4ba96e2cfa40..bab1dae3f69e 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -270,7 +270,8 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
  * synchronize_rcu_expedited(), IPIs and all.
  *
  * Note that srcu_read_lock_lite() can be invoked only from those contexts
- * where RCU is watching.  Otherwise, lockdep will complain.
+ * where RCU is watching, that is, from contexts where it would be legal
+ * to invoke rcu_read_lock().  Otherwise, lockdep will complain.
  */
 static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp)
 {
-- 
cgit v1.2.3


From 7d4f46c2372d5a850e28f63001013de50843b6e6 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:04:18 -0800
Subject: iommufd: Move _iommufd_object_alloc helper to a sharable file

The following patch will add a new vIOMMU allocator that will require this
_iommufd_object_alloc to be sharable with IOMMU drivers (and iommufd too).

Add a new driver.c file that will be built with CONFIG_IOMMUFD_DRIVER_CORE
selected by CONFIG_IOMMUFD, and put the CONFIG_DRIVER under that remaining
to be selectable for drivers to build the existing iova_bitmap.c file.

Link: https://patch.msgid.link/r/2f4f6e116dc49ffb67ff6c5e8a7a8e789ab9e98e.1730836219.git.nicolinc@nvidia.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/Kconfig           |  4 ++++
 drivers/iommu/iommufd/Makefile          |  3 +++
 drivers/iommu/iommufd/driver.c          | 40 +++++++++++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |  4 ----
 drivers/iommu/iommufd/main.c            | 32 --------------------------
 include/linux/iommufd.h                 | 13 +++++++++++
 6 files changed, 60 insertions(+), 36 deletions(-)
 create mode 100644 drivers/iommu/iommufd/driver.c

(limited to 'include/linux')

diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig
index 76656fe0470d..0a07f9449fd9 100644
--- a/drivers/iommu/iommufd/Kconfig
+++ b/drivers/iommu/iommufd/Kconfig
@@ -1,4 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0-only
+config IOMMUFD_DRIVER_CORE
+	tristate
+	default (IOMMUFD_DRIVER || IOMMUFD) if IOMMUFD!=n
+
 config IOMMUFD
 	tristate "IOMMU Userspace API"
 	select INTERVAL_TREE
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index cf4605962bea..83df9077063e 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -13,3 +13,6 @@ iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
 
 obj-$(CONFIG_IOMMUFD) += iommufd.o
 obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o
+
+iommufd_driver-y := driver.o
+obj-$(CONFIG_IOMMUFD_DRIVER_CORE) += iommufd_driver.o
diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
new file mode 100644
index 000000000000..2bc47d92a0ab
--- /dev/null
+++ b/drivers/iommu/iommufd/driver.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "iommufd_private.h"
+
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+					     size_t size,
+					     enum iommufd_object_type type)
+{
+	struct iommufd_object *obj;
+	int rc;
+
+	obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
+	if (!obj)
+		return ERR_PTR(-ENOMEM);
+	obj->type = type;
+	/* Starts out bias'd by 1 until it is removed from the xarray */
+	refcount_set(&obj->shortterm_users, 1);
+	refcount_set(&obj->users, 1);
+
+	/*
+	 * Reserve an ID in the xarray but do not publish the pointer yet since
+	 * the caller hasn't initialized it yet. Once the pointer is published
+	 * in the xarray and visible to other threads we can't reliably destroy
+	 * it anymore, so the caller must complete all errorable operations
+	 * before calling iommufd_object_finalize().
+	 */
+	rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b,
+		      GFP_KERNEL_ACCOUNT);
+	if (rc)
+		goto out_free;
+	return obj;
+out_free:
+	kfree(obj);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_object_alloc, IOMMUFD);
+
+MODULE_DESCRIPTION("iommufd code shared with builtin modules");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 94cfcab7e9de..be347f726fda 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -206,10 +206,6 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx,
 	iommufd_object_remove(ictx, obj, obj->id, 0);
 }
 
-struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
-					     size_t size,
-					     enum iommufd_object_type type);
-
 #define __iommufd_object_alloc(ictx, ptr, type, obj)                           \
 	container_of(_iommufd_object_alloc(                                    \
 			     ictx,                                             \
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 826a2b2be52f..3c32b440471b 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -29,38 +29,6 @@ struct iommufd_object_ops {
 static const struct iommufd_object_ops iommufd_object_ops[];
 static struct miscdevice vfio_misc_dev;
 
-struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
-					     size_t size,
-					     enum iommufd_object_type type)
-{
-	struct iommufd_object *obj;
-	int rc;
-
-	obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
-	if (!obj)
-		return ERR_PTR(-ENOMEM);
-	obj->type = type;
-	/* Starts out bias'd by 1 until it is removed from the xarray */
-	refcount_set(&obj->shortterm_users, 1);
-	refcount_set(&obj->users, 1);
-
-	/*
-	 * Reserve an ID in the xarray but do not publish the pointer yet since
-	 * the caller hasn't initialized it yet. Once the pointer is published
-	 * in the xarray and visible to other threads we can't reliably destroy
-	 * it anymore, so the caller must complete all errorable operations
-	 * before calling iommufd_object_finalize().
-	 */
-	rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY,
-		      xa_limit_31b, GFP_KERNEL_ACCOUNT);
-	if (rc)
-		goto out_free;
-	return obj;
-out_free:
-	kfree(obj);
-	return ERR_PTR(rc);
-}
-
 /*
  * Allow concurrent access to the object.
  *
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 22948dd03d67..94522d4029ca 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -135,4 +135,17 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
 	return -EOPNOTSUPP;
 }
 #endif /* CONFIG_IOMMUFD */
+
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE)
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+					     size_t size,
+					     enum iommufd_object_type type);
+#else /* !CONFIG_IOMMUFD_DRIVER_CORE */
+static inline struct iommufd_object *
+_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size,
+		      enum iommufd_object_type type)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+#endif /* CONFIG_IOMMUFD_DRIVER_CORE */
 #endif
-- 
cgit v1.2.3


From 6b22d562fcd6e3d1cc1c265b0596840946d16a09 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:04:19 -0800
Subject: iommufd: Introduce IOMMUFD_OBJ_VIOMMU and its related struct

Add a new IOMMUFD_OBJ_VIOMMU with an iommufd_viommu structure to represent
a slice of physical IOMMU device passed to or shared with a user space VM.
This slice, now a vIOMMU object, is a group of virtualization resources of
a physical IOMMU's, such as:
 - Security namespace for guest owned ID, e.g. guest-controlled cache tags
 - Non-device-affiliated event reporting, e.g. invalidation queue errors
 - Access to a sharable nesting parent pagetable across physical IOMMUs
 - Virtualization of various platforms IDs, e.g. RIDs and others
 - Delivery of paravirtualized invalidation
 - Direct assigned invalidation queues
 - Direct assigned interrupts

Add a new viommu_alloc op in iommu_ops, for drivers to allocate their own
vIOMMU structures. And this allocation also needs a free(), so add struct
iommufd_viommu_ops.

To simplify a vIOMMU allocation, provide a iommufd_viommu_alloc() helper.
It's suggested that a driver should embed a core-level viommu structure in
its driver-level viommu struct and call the iommufd_viommu_alloc() helper,
meanwhile the driver can also implement a viommu ops:
    struct my_driver_viommu {
        struct iommufd_viommu core;
        /* driver-owned properties/features */
        ....
    };

    static const struct iommufd_viommu_ops my_driver_viommu_ops = {
        .free = my_driver_viommu_free,
        /* future ops for virtualization features */
        ....
    };

    static struct iommufd_viommu my_driver_viommu_alloc(...)
    {
        struct my_driver_viommu *my_viommu =
                iommufd_viommu_alloc(ictx, my_driver_viommu, core,
                                     my_driver_viommu_ops);
        /* Init my_viommu and related HW feature */
        ....
        return &my_viommu->core;
    }

    static struct iommu_domain_ops my_driver_domain_ops = {
        ....
        .viommu_alloc = my_driver_viommu_alloc,
    };

Link: https://patch.msgid.link/r/64685e2b79dea0f1dc56f6ede04809b72d578935.1730836219.git.nicolinc@nvidia.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h   | 14 ++++++++++++++
 include/linux/iommufd.h | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bd722f473635..2574fc8abaf2 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -42,6 +42,8 @@ struct notifier_block;
 struct iommu_sva;
 struct iommu_dma_cookie;
 struct iommu_fault_param;
+struct iommufd_ctx;
+struct iommufd_viommu;
 
 #define IOMMU_FAULT_PERM_READ	(1 << 0) /* read */
 #define IOMMU_FAULT_PERM_WRITE	(1 << 1) /* write */
@@ -542,6 +544,14 @@ static inline int __iommu_copy_struct_from_user_array(
  * @remove_dev_pasid: Remove any translation configurations of a specific
  *                    pasid, so that any DMA transactions with this pasid
  *                    will be blocked by the hardware.
+ * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind
+ *                the @dev, as the set of virtualization resources shared/passed
+ *                to user space IOMMU instance. And associate it with a nesting
+ *                @parent_domain. The @viommu_type must be defined in the header
+ *                include/uapi/linux/iommufd.h
+ *                It is required to call iommufd_viommu_alloc() helper for
+ *                a bundled allocation of the core and the driver structures,
+ *                using the given @ictx pointer.
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  * @owner: Driver module providing these ops
  * @identity_domain: An always available, always attachable identity
@@ -591,6 +601,10 @@ struct iommu_ops {
 	void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid,
 				 struct iommu_domain *domain);
 
+	struct iommufd_viommu *(*viommu_alloc)(
+		struct device *dev, struct iommu_domain *parent_domain,
+		struct iommufd_ctx *ictx, unsigned int viommu_type);
+
 	const struct iommu_domain_ops *default_domain_ops;
 	unsigned long pgsize_bitmap;
 	struct module *owner;
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 94522d4029ca..4fc2ce332f10 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -17,6 +17,7 @@ struct iommu_group;
 struct iommufd_access;
 struct iommufd_ctx;
 struct iommufd_device;
+struct iommufd_viommu_ops;
 struct page;
 
 enum iommufd_object_type {
@@ -28,6 +29,7 @@ enum iommufd_object_type {
 	IOMMUFD_OBJ_IOAS,
 	IOMMUFD_OBJ_ACCESS,
 	IOMMUFD_OBJ_FAULT,
+	IOMMUFD_OBJ_VIOMMU,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
@@ -78,6 +80,26 @@ void iommufd_access_detach(struct iommufd_access *access);
 
 void iommufd_ctx_get(struct iommufd_ctx *ictx);
 
+struct iommufd_viommu {
+	struct iommufd_object obj;
+	struct iommufd_ctx *ictx;
+	struct iommu_device *iommu_dev;
+	struct iommufd_hwpt_paging *hwpt;
+
+	const struct iommufd_viommu_ops *ops;
+
+	unsigned int type;
+};
+
+/**
+ * struct iommufd_viommu_ops - vIOMMU specific operations
+ * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory
+ *           of the vIOMMU will be free-ed by iommufd core after calling this op
+ */
+struct iommufd_viommu_ops {
+	void (*destroy)(struct iommufd_viommu *viommu);
+};
+
 #if IS_ENABLED(CONFIG_IOMMUFD)
 struct iommufd_ctx *iommufd_ctx_from_file(struct file *file);
 struct iommufd_ctx *iommufd_ctx_from_fd(int fd);
@@ -148,4 +170,22 @@ _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 #endif /* CONFIG_IOMMUFD_DRIVER_CORE */
+
+/*
+ * Helpers for IOMMU driver to allocate driver structures that will be freed by
+ * the iommufd core. The free op will be called prior to freeing the memory.
+ */
+#define iommufd_viommu_alloc(ictx, drv_struct, member, viommu_ops)             \
+	({                                                                     \
+		drv_struct *ret;                                               \
+									       \
+		static_assert(__same_type(struct iommufd_viommu,               \
+					  ((drv_struct *)NULL)->member));      \
+		static_assert(offsetof(drv_struct, member.obj) == 0);          \
+		ret = (drv_struct *)_iommufd_object_alloc(                     \
+			ictx, sizeof(drv_struct), IOMMUFD_OBJ_VIOMMU);         \
+		if (!IS_ERR(ret))                                              \
+			ret->member.ops = viommu_ops;                          \
+		ret;                                                           \
+	})
 #endif
-- 
cgit v1.2.3


From 69d2689e57f5cb235c0609dd2f8fa10c1a832f87 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:04:22 -0800
Subject: iommufd: Add alloc_domain_nested op to iommufd_viommu_ops

Allow IOMMU driver to use a vIOMMU object that holds a nesting parent
hwpt/domain to allocate a nested domain.

Link: https://patch.msgid.link/r/2dcdb5e405dc0deb68230564530d989d285d959c.1730836219.git.nicolinc@nvidia.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommufd.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 4fc2ce332f10..de9b56265c9c 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -14,6 +14,7 @@
 struct device;
 struct file;
 struct iommu_group;
+struct iommu_user_data;
 struct iommufd_access;
 struct iommufd_ctx;
 struct iommufd_device;
@@ -95,9 +96,17 @@ struct iommufd_viommu {
  * struct iommufd_viommu_ops - vIOMMU specific operations
  * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory
  *           of the vIOMMU will be free-ed by iommufd core after calling this op
+ * @alloc_domain_nested: Allocate a IOMMU_DOMAIN_NESTED on a vIOMMU that holds a
+ *                       nesting parent domain (IOMMU_DOMAIN_PAGING). @user_data
+ *                       must be defined in include/uapi/linux/iommufd.h.
+ *                       It must fully initialize the new iommu_domain before
+ *                       returning. Upon failure, ERR_PTR must be returned.
  */
 struct iommufd_viommu_ops {
 	void (*destroy)(struct iommufd_viommu *viommu);
+	struct iommu_domain *(*alloc_domain_nested)(
+		struct iommufd_viommu *viommu, u32 flags,
+		const struct iommu_user_data *user_data);
 };
 
 #if IS_ENABLED(CONFIG_IOMMUFD)
-- 
cgit v1.2.3


From 0ce5c2477af2e2284b9c70474e4dae85db211680 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:05:09 -0800
Subject: iommufd/viommu: Add IOMMUFD_OBJ_VDEVICE and IOMMU_VDEVICE_ALLOC ioctl

Introduce a new IOMMUFD_OBJ_VDEVICE to represent a physical device (struct
device) against a vIOMMU (struct iommufd_viommu) object in a VM.

This vDEVICE object (and its structure) holds all the infos and attributes
in the VM, regarding the device related to the vIOMMU.

As an initial patch, add a per-vIOMMU virtual ID. This can be:
 - Virtual StreamID on a nested ARM SMMUv3, an index to a Stream Table
 - Virtual DeviceID on a nested AMD IOMMU, an index to a Device Table
 - Virtual RID on a nested Intel VT-D IOMMU, an index to a Context Table
Potentially, this vDEVICE structure would hold some vData for Confidential
Compute Architecture (CCA). Use this virtual ID to index an "vdevs" xarray
that belongs to a vIOMMU object.

Add a new ioctl for vDEVICE allocations. Since a vDEVICE is a connection
of a device object and an iommufd_viommu object, take two refcounts in the
ioctl handler.

Link: https://patch.msgid.link/r/cda8fd2263166e61b8191a3b3207e0d2b08545bf.1730836308.git.nicolinc@nvidia.com
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h | 18 ++++++++
 drivers/iommu/iommufd/main.c            |  6 +++
 drivers/iommu/iommufd/viommu.c          | 76 +++++++++++++++++++++++++++++++++
 include/linux/iommufd.h                 |  4 ++
 include/uapi/linux/iommufd.h            | 22 ++++++++++
 5 files changed, 126 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index e8f5ef550cc9..062656c19a07 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -507,8 +507,26 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
 	return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
 }
 
+static inline struct iommufd_viommu *
+iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
+{
+	return container_of(iommufd_get_object(ucmd->ictx, id,
+					       IOMMUFD_OBJ_VIOMMU),
+			    struct iommufd_viommu, obj);
+}
+
 int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_viommu_destroy(struct iommufd_object *obj);
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_vdevice_destroy(struct iommufd_object *obj);
+
+struct iommufd_vdevice {
+	struct iommufd_object obj;
+	struct iommufd_ctx *ictx;
+	struct iommufd_viommu *viommu;
+	struct device *dev;
+	u64 id; /* per-vIOMMU virtual ID */
+};
 
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index cc514f9bc3e6..d735fe04197f 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -308,6 +308,7 @@ union ucmd_buffer {
 	struct iommu_option option;
 	struct iommu_vfio_ioas vfio_ioas;
 	struct iommu_viommu_alloc viommu;
+	struct iommu_vdevice_alloc vdev;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
 #endif
@@ -363,6 +364,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 __reserved),
 	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
 		 struct iommu_viommu_alloc, out_viommu_id),
+	IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl,
+		 struct iommu_vdevice_alloc, virt_id),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
@@ -501,6 +504,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 	[IOMMUFD_OBJ_VIOMMU] = {
 		.destroy = iommufd_viommu_destroy,
 	},
+	[IOMMUFD_OBJ_VDEVICE] = {
+		.destroy = iommufd_vdevice_destroy,
+	},
 #ifdef CONFIG_IOMMUFD_TEST
 	[IOMMUFD_OBJ_SELFTEST] = {
 		.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 888239b78667..69b88e8c7c26 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -11,6 +11,7 @@ void iommufd_viommu_destroy(struct iommufd_object *obj)
 	if (viommu->ops && viommu->ops->destroy)
 		viommu->ops->destroy(viommu);
 	refcount_dec(&viommu->hwpt->common.obj.users);
+	xa_destroy(&viommu->vdevs);
 }
 
 int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
@@ -53,6 +54,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		goto out_put_hwpt;
 	}
 
+	xa_init(&viommu->vdevs);
 	viommu->type = cmd->type;
 	viommu->ictx = ucmd->ictx;
 	viommu->hwpt = hwpt_paging;
@@ -79,3 +81,77 @@ out_put_idev:
 	iommufd_put_object(ucmd->ictx, &idev->obj);
 	return rc;
 }
+
+void iommufd_vdevice_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_vdevice *vdev =
+		container_of(obj, struct iommufd_vdevice, obj);
+	struct iommufd_viommu *viommu = vdev->viommu;
+
+	/* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
+	xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
+	refcount_dec(&viommu->obj.users);
+	put_device(vdev->dev);
+}
+
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_vdevice_alloc *cmd = ucmd->cmd;
+	struct iommufd_vdevice *vdev, *curr;
+	struct iommufd_viommu *viommu;
+	struct iommufd_device *idev;
+	u64 virt_id = cmd->virt_id;
+	int rc = 0;
+
+	/* virt_id indexes an xarray */
+	if (virt_id > ULONG_MAX)
+		return -EINVAL;
+
+	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+	if (IS_ERR(viommu))
+		return PTR_ERR(viommu);
+
+	idev = iommufd_get_device(ucmd, cmd->dev_id);
+	if (IS_ERR(idev)) {
+		rc = PTR_ERR(idev);
+		goto out_put_viommu;
+	}
+
+	if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) {
+		rc = -EINVAL;
+		goto out_put_idev;
+	}
+
+	vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE);
+	if (IS_ERR(vdev)) {
+		rc = PTR_ERR(vdev);
+		goto out_put_idev;
+	}
+
+	vdev->id = virt_id;
+	vdev->dev = idev->dev;
+	get_device(idev->dev);
+	vdev->viommu = viommu;
+	refcount_inc(&viommu->obj.users);
+
+	curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
+	if (curr) {
+		rc = xa_err(curr) ?: -EEXIST;
+		goto out_abort;
+	}
+
+	cmd->out_vdevice_id = vdev->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_abort;
+	iommufd_object_finalize(ucmd->ictx, &vdev->obj);
+	goto out_put_idev;
+
+out_abort:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
+out_put_idev:
+	iommufd_put_object(ucmd->ictx, &idev->obj);
+out_put_viommu:
+	iommufd_put_object(ucmd->ictx, &viommu->obj);
+	return rc;
+}
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index de9b56265c9c..71fa1e343023 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -10,6 +10,7 @@
 #include <linux/errno.h>
 #include <linux/refcount.h>
 #include <linux/types.h>
+#include <linux/xarray.h>
 
 struct device;
 struct file;
@@ -31,6 +32,7 @@ enum iommufd_object_type {
 	IOMMUFD_OBJ_ACCESS,
 	IOMMUFD_OBJ_FAULT,
 	IOMMUFD_OBJ_VIOMMU,
+	IOMMUFD_OBJ_VDEVICE,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
@@ -89,6 +91,8 @@ struct iommufd_viommu {
 
 	const struct iommufd_viommu_ops *ops;
 
+	struct xarray vdevs;
+
 	unsigned int type;
 };
 
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index a498d4838f9a..9b5236004b8e 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -53,6 +53,7 @@ enum {
 	IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
 	IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f,
 	IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
+	IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
 };
 
 /**
@@ -864,4 +865,25 @@ struct iommu_viommu_alloc {
 	__u32 out_viommu_id;
 };
 #define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
+
+/**
+ * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC)
+ * @size: sizeof(struct iommu_vdevice_alloc)
+ * @viommu_id: vIOMMU ID to associate with the virtual device
+ * @dev_id: The physical device to allocate a virtual instance on the vIOMMU
+ * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTORY
+ * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID
+ *           of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table
+ *
+ * Allocate a virtual device instance (for a physical device) against a vIOMMU.
+ * This instance holds the device's information (related to its vIOMMU) in a VM.
+ */
+struct iommu_vdevice_alloc {
+	__u32 size;
+	__u32 viommu_id;
+	__u32 dev_id;
+	__u32 out_vdevice_id;
+	__aligned_u64 virt_id;
+};
+#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC)
 #endif
-- 
cgit v1.2.3


From 67db79dc1a411335f8a7e53a5e206d96b237d9a8 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:05:11 -0800
Subject: iommu/viommu: Add cache_invalidate to iommufd_viommu_ops

This per-vIOMMU cache_invalidate op is like the cache_invalidate_user op
in struct iommu_domain_ops, but wider, supporting device cache (e.g. PCI
ATC invaldiations).

Link: https://patch.msgid.link/r/90138505850fa6b165135e78a87b4cc7022869a4.1730836308.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommufd.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 71fa1e343023..2bc735ff9511 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -16,6 +16,7 @@ struct device;
 struct file;
 struct iommu_group;
 struct iommu_user_data;
+struct iommu_user_data_array;
 struct iommufd_access;
 struct iommufd_ctx;
 struct iommufd_device;
@@ -105,12 +106,21 @@ struct iommufd_viommu {
  *                       must be defined in include/uapi/linux/iommufd.h.
  *                       It must fully initialize the new iommu_domain before
  *                       returning. Upon failure, ERR_PTR must be returned.
+ * @cache_invalidate: Flush hardware cache used by a vIOMMU. It can be used for
+ *                    any IOMMU hardware specific cache: TLB and device cache.
+ *                    The @array passes in the cache invalidation requests, in
+ *                    form of a driver data structure. A driver must update the
+ *                    array->entry_num to report the number of handled requests.
+ *                    The data structure of the array entry must be defined in
+ *                    include/uapi/linux/iommufd.h
  */
 struct iommufd_viommu_ops {
 	void (*destroy)(struct iommufd_viommu *viommu);
 	struct iommu_domain *(*alloc_domain_nested)(
 		struct iommufd_viommu *viommu, u32 flags,
 		const struct iommu_user_data *user_data);
+	int (*cache_invalidate)(struct iommufd_viommu *viommu,
+				struct iommu_user_data_array *array);
 };
 
 #if IS_ENABLED(CONFIG_IOMMUFD)
-- 
cgit v1.2.3


From 4f2e59ccb69866c5165525bd7aee47cd5cd00acc Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 5 Nov 2024 12:05:13 -0800
Subject: iommu: Add iommu_copy_struct_from_full_user_array helper

The iommu_copy_struct_from_user_array helper can be used to copy a single
entry from a user array which might not be efficient if the array is big.

Add a new iommu_copy_struct_from_full_user_array to copy the entire user
array at once. Update the existing iommu_copy_struct_from_user_array kdoc
accordingly.

Link: https://patch.msgid.link/r/5cd773d9c26920c5807d232b21d415ea79172e49.1730836308.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 2574fc8abaf2..11de66237eaa 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -493,7 +493,9 @@ static inline int __iommu_copy_struct_from_user_array(
  * @index: Index to the location in the array to copy user data from
  * @min_last: The last member of the data structure @kdst points in the
  *            initial version.
- * Return 0 for success, otherwise -error.
+ *
+ * Copy a single entry from a user array. Return 0 for success, otherwise
+ * -error.
  */
 #define iommu_copy_struct_from_user_array(kdst, user_array, data_type, index, \
 					  min_last)                           \
@@ -501,6 +503,50 @@ static inline int __iommu_copy_struct_from_user_array(
 		kdst, user_array, data_type, index, sizeof(*(kdst)),          \
 		offsetofend(typeof(*(kdst)), min_last))
 
+/**
+ * iommu_copy_struct_from_full_user_array - Copy iommu driver specific user
+ *         space data from an iommu_user_data_array
+ * @kdst: Pointer to an iommu driver specific user data that is defined in
+ *        include/uapi/linux/iommufd.h
+ * @kdst_entry_size: sizeof(*kdst)
+ * @user_array: Pointer to a struct iommu_user_data_array for a user space
+ *              array
+ * @data_type: The data type of the @kdst. Must match with @user_array->type
+ *
+ * Copy the entire user array. kdst must have room for kdst_entry_size *
+ * user_array->entry_num bytes. Return 0 for success, otherwise -error.
+ */
+static inline int
+iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size,
+				       struct iommu_user_data_array *user_array,
+				       unsigned int data_type)
+{
+	unsigned int i;
+	int ret;
+
+	if (user_array->type != data_type)
+		return -EINVAL;
+	if (!user_array->entry_num)
+		return -EINVAL;
+	if (likely(user_array->entry_len == kdst_entry_size)) {
+		if (copy_from_user(kdst, user_array->uptr,
+				   user_array->entry_num *
+					   user_array->entry_len))
+			return -EFAULT;
+	}
+
+	/* Copy item by item */
+	for (i = 0; i != user_array->entry_num; i++) {
+		ret = copy_struct_from_user(
+			kdst + kdst_entry_size * i, kdst_entry_size,
+			user_array->uptr + user_array->entry_len * i,
+			user_array->entry_len);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
-- 
cgit v1.2.3


From c747e67978ff473e6830afe0b1f3986dd1f444b5 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:05:14 -0800
Subject: iommufd/viommu: Add iommufd_viommu_find_dev helper

This avoids a bigger trouble of exposing struct iommufd_device and struct
iommufd_vdevice in the public header.

Link: https://patch.msgid.link/r/84fa7c624db4d4508067ccfdf42059533950180a.1730836308.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/driver.c | 13 +++++++++++++
 include/linux/iommufd.h        |  8 ++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
index 2bc47d92a0ab..7b67fdf44134 100644
--- a/drivers/iommu/iommufd/driver.c
+++ b/drivers/iommu/iommufd/driver.c
@@ -36,5 +36,18 @@ out_free:
 }
 EXPORT_SYMBOL_NS_GPL(_iommufd_object_alloc, IOMMUFD);
 
+/* Caller should xa_lock(&viommu->vdevs) to protect the return value */
+struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
+				       unsigned long vdev_id)
+{
+	struct iommufd_vdevice *vdev;
+
+	lockdep_assert_held(&viommu->vdevs.xa_lock);
+
+	vdev = xa_load(&viommu->vdevs, vdev_id);
+	return vdev ? vdev->dev : NULL;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, IOMMUFD);
+
 MODULE_DESCRIPTION("iommufd code shared with builtin modules");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 2bc735ff9511..11110c749200 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -185,6 +185,8 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
 struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
 					     size_t size,
 					     enum iommufd_object_type type);
+struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
+				       unsigned long vdev_id);
 #else /* !CONFIG_IOMMUFD_DRIVER_CORE */
 static inline struct iommufd_object *
 _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size,
@@ -192,6 +194,12 @@ _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
+
+static inline struct device *
+iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id)
+{
+	return NULL;
+}
 #endif /* CONFIG_IOMMUFD_DRIVER_CORE */
 
 /*
-- 
cgit v1.2.3


From ad8d1e323dd37f91d0c973e2a74c7b9054219adc Mon Sep 17 00:00:00 2001
From: Kunwu Chan <chentao@kylinos.cn>
Date: Mon, 2 Sep 2024 07:39:20 +0100
Subject: ARM: 9415/1: amba: Add dev_is_amba() function and export it for
 modules

Add dev_is_amba() function to determine
whether the device is a AMBA device.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
---
 drivers/amba/bus.c       | 6 ++++++
 include/linux/amba/bus.h | 5 +++++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index 0230c43377c1..8ef259b4d037 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -449,6 +449,12 @@ const struct bus_type amba_bustype = {
 };
 EXPORT_SYMBOL_GPL(amba_bustype);
 
+bool dev_is_amba(const struct device *dev)
+{
+	return dev->bus == &amba_bustype;
+}
+EXPORT_SYMBOL_GPL(dev_is_amba);
+
 static int __init amba_init(void)
 {
 	return bus_register(&amba_bustype);
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index dda2f3ea89cb..9946276aff73 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -121,6 +121,7 @@ extern const struct bus_type amba_bustype;
 #ifdef CONFIG_ARM_AMBA
 int __amba_driver_register(struct amba_driver *, struct module *);
 void amba_driver_unregister(struct amba_driver *);
+bool dev_is_amba(const struct device *dev);
 #else
 static inline int __amba_driver_register(struct amba_driver *drv,
 					 struct module *owner)
@@ -130,6 +131,10 @@ static inline int __amba_driver_register(struct amba_driver *drv,
 static inline void amba_driver_unregister(struct amba_driver *drv)
 {
 }
+static inline bool dev_is_amba(const struct device *dev)
+{
+	return false;
+}
 #endif
 
 struct amba_device *amba_device_alloc(const char *, resource_size_t, size_t);
-- 
cgit v1.2.3


From 67e4fe3985138325c9b21193be52266750616182 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 30 Oct 2024 21:20:54 -0300
Subject: iommu/arm-smmu-v3: Use S2FWB for NESTED domains

Force Write Back (FWB) changes how the S2 IOPTE's MemAttr field
works. When S2FWB is supported and enabled the IOPTE will force cachable
access to IOMMU_CACHE memory when nesting with a S1 and deny cachable
access when !IOMMU_CACHE.

When using a single stage of translation, a simple S2 domain, it doesn't
change things for PCI devices as it is just a different encoding for the
existing mapping of the IOMMU protection flags to cachability attributes.
For non-PCI it also changes the combining rules when incoming transactions
have inconsistent attributes.

However, when used with a nested S1, FWB has the effect of preventing the
guest from choosing a MemAttr in it's S1 that would cause ordinary DMA to
bypass the cache. Consistent with KVM we wish to deny the guest the
ability to become incoherent with cached memory the hypervisor believes is
cachable so we don't have to flush it.

Allow NESTED domains to be created if the SMMU has S2FWB support and use
S2FWB for NESTING_PARENTS. This is an additional option to CANWBS.

Link: https://patch.msgid.link/r/10-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Reviewed-by: Donald Dutile <ddutile@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c    |  7 ++++--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c        |  8 ++++++-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h        |  3 +++
 drivers/iommu/io-pgtable-arm.c                     | 27 +++++++++++++++++-----
 include/linux/io-pgtable.h                         |  2 ++
 5 files changed, 38 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 91247a2a2d2c..a1c8fcd4797c 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -220,9 +220,12 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev,
 	 * Must support some way to prevent the VM from bypassing the cache
 	 * because VFIO currently does not do any cache maintenance. canwbs
 	 * indicates the device is fully coherent and no cache maintenance is
-	 * ever required, even for PCI No-Snoop.
+	 * ever required, even for PCI No-Snoop. S2FWB means the S1 can't make
+	 * things non-coherent using the memattr, but No-Snoop behavior is not
+	 * effected.
 	 */
-	if (!arm_smmu_master_canwbs(master))
+	if (!arm_smmu_master_canwbs(master) &&
+	    !(smmu->features & ARM_SMMU_FEAT_S2FWB))
 		return ERR_PTR(-EOPNOTSUPP);
 
 	vsmmu = iommufd_viommu_alloc(ictx, struct arm_vsmmu, core,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 53f12b9d78ab..de598d66b5c2 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1046,7 +1046,8 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
 	/* S2 translates */
 	if (cfg & BIT(1)) {
 		used_bits[1] |=
-			cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG);
+			cpu_to_le64(STRTAB_STE_1_S2FWB | STRTAB_STE_1_EATS |
+				    STRTAB_STE_1_SHCFG);
 		used_bits[2] |=
 			cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR |
 				    STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI |
@@ -1654,6 +1655,8 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
 		FIELD_PREP(STRTAB_STE_1_EATS,
 			   ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
 
+	if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_S2FWB)
+		target->data[1] |= cpu_to_le64(STRTAB_STE_1_S2FWB);
 	if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR)
 		target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
 							  STRTAB_STE_1_SHCFG_INCOMING));
@@ -2472,6 +2475,9 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain,
 		pgtbl_cfg.oas = smmu->oas;
 		fmt = ARM_64_LPAE_S2;
 		finalise_stage_fn = arm_smmu_domain_finalise_s2;
+		if ((smmu->features & ARM_SMMU_FEAT_S2FWB) &&
+		    (flags & IOMMU_HWPT_ALLOC_NEST_PARENT))
+			pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_S2FWB;
 		break;
 	default:
 		return -EINVAL;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 3fabe187ea78..5a025d310dbe 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -58,6 +58,7 @@ struct arm_smmu_device;
 #define IDR1_SIDSIZE			GENMASK(5, 0)
 
 #define ARM_SMMU_IDR3			0xc
+#define IDR3_FWB			(1 << 8)
 #define IDR3_RIL			(1 << 10)
 
 #define ARM_SMMU_IDR5			0x14
@@ -265,6 +266,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
 #define STRTAB_STE_1_S1COR		GENMASK_ULL(5, 4)
 #define STRTAB_STE_1_S1CSH		GENMASK_ULL(7, 6)
 
+#define STRTAB_STE_1_S2FWB		(1UL << 25)
 #define STRTAB_STE_1_S1STALLD		(1UL << 27)
 
 #define STRTAB_STE_1_EATS		GENMASK_ULL(29, 28)
@@ -740,6 +742,7 @@ struct arm_smmu_device {
 #define ARM_SMMU_FEAT_ATTR_TYPES_OVR	(1 << 20)
 #define ARM_SMMU_FEAT_HA		(1 << 21)
 #define ARM_SMMU_FEAT_HD		(1 << 22)
+#define ARM_SMMU_FEAT_S2FWB		(1 << 23)
 	u32				features;
 
 #define ARM_SMMU_OPT_SKIP_PREFETCH	(1 << 0)
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 0e67f1721a3d..74f58c6ac30c 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -106,6 +106,18 @@
 #define ARM_LPAE_PTE_HAP_FAULT		(((arm_lpae_iopte)0) << 6)
 #define ARM_LPAE_PTE_HAP_READ		(((arm_lpae_iopte)1) << 6)
 #define ARM_LPAE_PTE_HAP_WRITE		(((arm_lpae_iopte)2) << 6)
+/*
+ * For !FWB these code to:
+ *  1111 = Normal outer write back cachable / Inner Write Back Cachable
+ *         Permit S1 to override
+ *  0101 = Normal Non-cachable / Inner Non-cachable
+ *  0001 = Device / Device-nGnRE
+ * For S2FWB these code:
+ *  0110 Force Normal Write Back
+ *  0101 Normal* is forced Normal-NC, Device unchanged
+ *  0001 Force Device-nGnRE
+ */
+#define ARM_LPAE_PTE_MEMATTR_FWB_WB	(((arm_lpae_iopte)0x6) << 2)
 #define ARM_LPAE_PTE_MEMATTR_OIWB	(((arm_lpae_iopte)0xf) << 2)
 #define ARM_LPAE_PTE_MEMATTR_NC		(((arm_lpae_iopte)0x5) << 2)
 #define ARM_LPAE_PTE_MEMATTR_DEV	(((arm_lpae_iopte)0x1) << 2)
@@ -458,12 +470,16 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
 	 */
 	if (data->iop.fmt == ARM_64_LPAE_S2 ||
 	    data->iop.fmt == ARM_32_LPAE_S2) {
-		if (prot & IOMMU_MMIO)
+		if (prot & IOMMU_MMIO) {
 			pte |= ARM_LPAE_PTE_MEMATTR_DEV;
-		else if (prot & IOMMU_CACHE)
-			pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
-		else
+		} else if (prot & IOMMU_CACHE) {
+			if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_S2FWB)
+				pte |= ARM_LPAE_PTE_MEMATTR_FWB_WB;
+			else
+				pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
+		} else {
 			pte |= ARM_LPAE_PTE_MEMATTR_NC;
+		}
 	} else {
 		if (prot & IOMMU_MMIO)
 			pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
@@ -1035,8 +1051,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
 	struct arm_lpae_io_pgtable *data;
 	typeof(&cfg->arm_lpae_s2_cfg.vtcr) vtcr = &cfg->arm_lpae_s2_cfg.vtcr;
 
-	/* The NS quirk doesn't apply at stage 2 */
-	if (cfg->quirks)
+	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_S2FWB))
 		return NULL;
 
 	data = arm_lpae_alloc_pgtable(cfg);
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index b1ecfc3cd5bc..ce86b09ae80f 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -87,6 +87,7 @@ struct io_pgtable_cfg {
 	 *	attributes set in the TCR for a non-coherent page-table walker.
 	 *
 	 * IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable.
+	 * IO_PGTABLE_QUIRK_ARM_S2FWB: Use the FWB format for the MemAttrs bits
 	 */
 	#define IO_PGTABLE_QUIRK_ARM_NS			BIT(0)
 	#define IO_PGTABLE_QUIRK_NO_PERMS		BIT(1)
@@ -95,6 +96,7 @@ struct io_pgtable_cfg {
 	#define IO_PGTABLE_QUIRK_ARM_TTBR1		BIT(5)
 	#define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA		BIT(6)
 	#define IO_PGTABLE_QUIRK_ARM_HD			BIT(7)
+	#define IO_PGTABLE_QUIRK_ARM_S2FWB		BIT(8)
 	unsigned long			quirks;
 	unsigned long			pgsize_bitmap;
 	unsigned int			ias;
-- 
cgit v1.2.3


From 61952bb73486fff0f5550bccdf4062d9dd0fb163 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Nov 2024 18:00:38 +0100
Subject: block: remove the write_hint field from struct request

The write_hint is only used for read/write requests, which must have a
bio attached to them.  Just use the bio field instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20241112170050.1612998-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c      | 16 ++++++++++------
 block/blk-mq.c         |  2 --
 drivers/scsi/sd.c      |  6 +++---
 include/linux/blk-mq.h |  1 -
 4 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 7b0af8317c1c..2306014c108d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -867,9 +867,11 @@ static struct request *attempt_merge(struct request_queue *q,
 	if (rq_data_dir(req) != rq_data_dir(next))
 		return NULL;
 
-	/* Don't merge requests with different write hints. */
-	if (req->write_hint != next->write_hint)
-		return NULL;
+	if (req->bio && next->bio) {
+		/* Don't merge requests with different write hints. */
+		if (req->bio->bi_write_hint != next->bio->bi_write_hint)
+			return NULL;
+	}
 
 	if (req->ioprio != next->ioprio)
 		return NULL;
@@ -1001,9 +1003,11 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (!bio_crypt_rq_ctx_compatible(rq, bio))
 		return false;
 
-	/* Don't merge requests with different write hints. */
-	if (rq->write_hint != bio->bi_write_hint)
-		return false;
+	if (rq->bio) {
+		/* Don't merge requests with different write hints. */
+		if (rq->bio->bi_write_hint != bio->bi_write_hint)
+			return false;
+	}
 
 	if (rq->ioprio != bio_prio(bio))
 		return false;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5e240a4b6be0..65e6b86d341c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2660,7 +2660,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
 		rq->cmd_flags |= REQ_FAILFAST_MASK;
 
 	rq->__sector = bio->bi_iter.bi_sector;
-	rq->write_hint = bio->bi_write_hint;
 	blk_rq_bio_prep(rq, bio, nr_segs);
 	if (bio_integrity(bio))
 		rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
@@ -3308,7 +3307,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 	}
 	rq->nr_phys_segments = rq_src->nr_phys_segments;
 	rq->ioprio = rq_src->ioprio;
-	rq->write_hint = rq_src->write_hint;
 
 	if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
 		goto free_and_out;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index ca4bc0ac76ad..8947dab132d7 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1190,8 +1190,8 @@ static u8 sd_group_number(struct scsi_cmnd *cmd)
 	if (!sdkp->rscs)
 		return 0;
 
-	return min3((u32)rq->write_hint, (u32)sdkp->permanent_stream_count,
-		    0x3fu);
+	return min3((u32)rq->bio->bi_write_hint,
+		    (u32)sdkp->permanent_stream_count, 0x3fu);
 }
 
 static blk_status_t sd_setup_rw32_cmnd(struct scsi_cmnd *cmd, bool write,
@@ -1389,7 +1389,7 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd)
 		ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks,
 					 protect | fua, dld);
 	} else if ((nr_blocks > 0xff) || (lba > 0x1fffff) ||
-		   sdp->use_10_for_rw || protect || rq->write_hint) {
+		   sdp->use_10_for_rw || protect || rq->bio->bi_write_hint) {
 		ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks,
 					 protect | fua);
 	} else {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2035fad3131f..2804fe181d9d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -156,7 +156,6 @@ struct request {
 	struct blk_crypto_keyslot *crypt_keyslot;
 #endif
 
-	enum rw_hint write_hint;
 	unsigned short ioprio;
 
 	enum mq_rq_state state;
-- 
cgit v1.2.3


From 6975c1a486a40446b5bc77a89d9c520f8296fd08 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Nov 2024 18:00:39 +0100
Subject: block: remove the ioprio field from struct request

The request ioprio is only initialized from the first attached bio,
so requests without a bio already never set it.  Directly use the
bio field instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20241112170050.1612998-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c            | 10 ++++------
 block/blk-mq.c               |  3 +--
 include/linux/blk-mq.h       |  7 +++----
 include/trace/events/block.h |  6 +++---
 4 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2306014c108d..df36f83f3738 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -871,11 +871,10 @@ static struct request *attempt_merge(struct request_queue *q,
 		/* Don't merge requests with different write hints. */
 		if (req->bio->bi_write_hint != next->bio->bi_write_hint)
 			return NULL;
+		if (req->bio->bi_ioprio != next->bio->bi_ioprio)
+			return NULL;
 	}
 
-	if (req->ioprio != next->ioprio)
-		return NULL;
-
 	if (!blk_atomic_write_mergeable_rqs(req, next))
 		return NULL;
 
@@ -1007,11 +1006,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 		/* Don't merge requests with different write hints. */
 		if (rq->bio->bi_write_hint != bio->bi_write_hint)
 			return false;
+		if (rq->bio->bi_ioprio != bio->bi_ioprio)
+			return false;
 	}
 
-	if (rq->ioprio != bio_prio(bio))
-		return false;
-
 	if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
 		return false;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 65e6b86d341c..3c6cadba75e3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -842,7 +842,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status)
 		blk_op_str(req_op(req)),
 		(__force u32)(req->cmd_flags & ~REQ_OP_MASK),
 		req->nr_phys_segments,
-		IOPRIO_PRIO_CLASS(req->ioprio));
+		IOPRIO_PRIO_CLASS(req_get_ioprio(req)));
 }
 
 /*
@@ -3306,7 +3306,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 		rq->special_vec = rq_src->special_vec;
 	}
 	rq->nr_phys_segments = rq_src->nr_phys_segments;
-	rq->ioprio = rq_src->ioprio;
 
 	if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
 		goto free_and_out;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2804fe181d9d..a28264442948 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -156,8 +156,6 @@ struct request {
 	struct blk_crypto_keyslot *crypt_keyslot;
 #endif
 
-	unsigned short ioprio;
-
 	enum mq_rq_state state;
 	atomic_t ref;
 
@@ -221,7 +219,9 @@ static inline bool blk_rq_is_passthrough(struct request *rq)
 
 static inline unsigned short req_get_ioprio(struct request *req)
 {
-	return req->ioprio;
+	if (req->bio)
+		return req->bio->bi_ioprio;
+	return 0;
 }
 
 #define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)
@@ -984,7 +984,6 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
 	rq->nr_phys_segments = nr_segs;
 	rq->__data_len = bio->bi_iter.bi_size;
 	rq->bio = rq->biotail = bio;
-	rq->ioprio = bio_prio(bio);
 }
 
 void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 1527d5d45e01..bd0ea07338eb 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -99,7 +99,7 @@ TRACE_EVENT(block_rq_requeue,
 		__entry->dev	   = rq->q->disk ? disk_devt(rq->q->disk) : 0;
 		__entry->sector    = blk_rq_trace_sector(rq);
 		__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
-		__entry->ioprio    = rq->ioprio;
+		__entry->ioprio    = req_get_ioprio(rq);
 
 		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
 		__get_str(cmd)[0] = '\0';
@@ -136,7 +136,7 @@ DECLARE_EVENT_CLASS(block_rq_completion,
 		__entry->sector    = blk_rq_pos(rq);
 		__entry->nr_sector = nr_bytes >> 9;
 		__entry->error     = blk_status_to_errno(error);
-		__entry->ioprio    = rq->ioprio;
+		__entry->ioprio    = req_get_ioprio(rq);
 
 		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
 		__get_str(cmd)[0] = '\0';
@@ -209,7 +209,7 @@ DECLARE_EVENT_CLASS(block_rq,
 		__entry->sector    = blk_rq_trace_sector(rq);
 		__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
 		__entry->bytes     = blk_rq_bytes(rq);
-		__entry->ioprio	   = rq->ioprio;
+		__entry->ioprio	   = req_get_ioprio(rq);
 
 		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
 		__get_str(cmd)[0] = '\0';
-- 
cgit v1.2.3


From 174dd22a781b97cb355b1037f0931436a254d3be Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 11 Nov 2024 09:38:26 -0800
Subject: srcu: Remove smp_mb() from srcu_read_unlock_lite()

The srcu_read_unlock_lite() function invokes __srcu_read_unlock() instead
of __srcu_read_unlock_lite(), which means that it is doing an unnecessary
smp_mb().  This is harmless other than the performance degradation.

This commit therefore switches to __srcu_read_unlock_lite().

Reported-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Closes: https://lore.kernel.org/all/d07e8f4a-d5ff-4c8e-8e61-50db285c57e9@amd.com/
Fixes: c0f08d6b5a61 ("srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite()")
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index bab1dae3f69e..56f83237de4d 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -372,7 +372,7 @@ static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
 	WARN_ON_ONCE(idx & ~0x1);
 	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE);
 	srcu_lock_release(&ssp->dep_map);
-	__srcu_read_unlock(ssp, idx);
+	__srcu_read_unlock_lite(ssp, idx);
 }
 
 /**
-- 
cgit v1.2.3


From a76ab5731e32d50ff5b1ae97e9dc4b23f41c23f5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Tue, 12 Nov 2024 08:39:07 -0800
Subject: bpf: Find eligible subprogs for private stack support

Private stack will be allocated with percpu allocator in jit time.
To avoid complexity at runtime, only one copy of private stack is
available per cpu per prog. So runtime recursion check is necessary
to avoid stack corruption.

Current private stack only supports kprobe/perf_event/tp/raw_tp
which has recursion check in the kernel, and prog types that use
bpf trampoline recursion check. For trampoline related prog types,
currently only tracing progs have recursion checking.

To avoid complexity, all async_cb subprogs use normal kernel stack
including those subprogs used by both main prog subtree and async_cb
subtree. Any prog having tail call also uses kernel stack.

To avoid jit penalty with private stack support, a subprog stack
size threshold is set such that only if the stack size is no less
than the threshold, private stack is supported. The current threshold
is 64 bytes. This avoids jit penality if the stack usage is small.

A useless 'continue' is also removed from a loop in func
check_max_stack_depth().

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20241112163907.2223839-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  7 ++++
 include/linux/filter.h       |  1 +
 kernel/bpf/core.c            |  5 +++
 kernel/bpf/verifier.c        | 96 +++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 99 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3a74033d49c4..d62bb2ca1828 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -633,6 +633,12 @@ struct bpf_subprog_arg_info {
 	};
 };
 
+enum priv_stack_mode {
+	PRIV_STACK_UNKNOWN,
+	NO_PRIV_STACK,
+	PRIV_STACK_ADAPTIVE,
+};
+
 struct bpf_subprog_info {
 	/* 'start' has to be the first field otherwise find_subprog() won't work */
 	u32 start; /* insn idx of function entry point */
@@ -653,6 +659,7 @@ struct bpf_subprog_info {
 	/* true if bpf_fastcall stack region is used by functions that can't be inlined */
 	bool keep_fastcall_stack: 1;
 
+	enum priv_stack_mode priv_stack_mode;
 	u8 arg_cnt;
 	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
 };
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7d7578a8eac1..3a21947f2fd4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
 bool bpf_jit_supports_arena(void);
 bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
+bool bpf_jit_supports_private_stack(void);
 u64 bpf_arch_uaddress_limit(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 233ea78f8f1b..14d9288441f2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_private_stack(void)
+{
+	return false;
+}
+
 void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
 {
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9f5de8d4fbd0..fb23793ac53d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
 
 #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
 
+#define BPF_PRIV_STACK_MIN_SIZE		64
+
 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
 static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
 static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
@@ -6090,6 +6092,34 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 					   strict);
 }
 
+static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
+{
+	if (!bpf_jit_supports_private_stack())
+		return NO_PRIV_STACK;
+
+	/* bpf_prog_check_recur() checks all prog types that use bpf trampoline
+	 * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked
+	 * explicitly.
+	 */
+	switch (prog->type) {
+	case BPF_PROG_TYPE_KPROBE:
+	case BPF_PROG_TYPE_TRACEPOINT:
+	case BPF_PROG_TYPE_PERF_EVENT:
+	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+		return PRIV_STACK_ADAPTIVE;
+	case BPF_PROG_TYPE_TRACING:
+	case BPF_PROG_TYPE_LSM:
+	case BPF_PROG_TYPE_STRUCT_OPS:
+		if (bpf_prog_check_recur(prog))
+			return PRIV_STACK_ADAPTIVE;
+		fallthrough;
+	default:
+		break;
+	}
+
+	return NO_PRIV_STACK;
+}
+
 static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
 {
 	if (env->prog->jit_requested)
@@ -6107,17 +6137,20 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
  * Since recursion is prevented by check_cfg() this algorithm
  * only needs a local stack of MAX_CALL_FRAMES to remember callsites
  */
-static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+					 bool priv_stack_supported)
 {
 	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
-	int depth = 0, frame = 0, i, subprog_end;
+	int depth = 0, frame = 0, i, subprog_end, subprog_depth;
 	bool tail_call_reachable = false;
 	int ret_insn[MAX_CALL_FRAMES];
 	int ret_prog[MAX_CALL_FRAMES];
 	int j;
 
 	i = subprog[idx].start;
+	if (!priv_stack_supported)
+		subprog[idx].priv_stack_mode = NO_PRIV_STACK;
 process_func:
 	/* protect against potential stack overflow that might happen when
 	 * bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -6144,11 +6177,31 @@ process_func:
 			depth);
 		return -EACCES;
 	}
-	depth += round_up_stack_depth(env, subprog[idx].stack_depth);
-	if (depth > MAX_BPF_STACK) {
-		verbose(env, "combined stack size of %d calls is %d. Too large\n",
-			frame + 1, depth);
-		return -EACCES;
+
+	subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
+	if (priv_stack_supported) {
+		/* Request private stack support only if the subprog stack
+		 * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to
+		 * avoid jit penalty if the stack usage is small.
+		 */
+		if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN &&
+		    subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
+			subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE;
+	}
+
+	if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+		if (subprog_depth > MAX_BPF_STACK) {
+			verbose(env, "stack size of subprog %d is %d. Too large\n",
+				idx, subprog_depth);
+			return -EACCES;
+		}
+	} else {
+		depth += subprog_depth;
+		if (depth > MAX_BPF_STACK) {
+			verbose(env, "combined stack size of %d calls is %d. Too large\n",
+				frame + 1, depth);
+			return -EACCES;
+		}
 	}
 continue_func:
 	subprog_end = subprog[idx + 1].start;
@@ -6205,6 +6258,8 @@ continue_func:
 		}
 		i = next_insn;
 		idx = sidx;
+		if (!priv_stack_supported)
+			subprog[idx].priv_stack_mode = NO_PRIV_STACK;
 
 		if (subprog[idx].has_tail_call)
 			tail_call_reachable = true;
@@ -6238,7 +6293,8 @@ continue_func:
 	 */
 	if (frame == 0)
 		return 0;
-	depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
+	if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
+		depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
 	frame--;
 	i = ret_insn[frame];
 	idx = ret_prog[frame];
@@ -6247,16 +6303,36 @@ continue_func:
 
 static int check_max_stack_depth(struct bpf_verifier_env *env)
 {
+	enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN;
 	struct bpf_subprog_info *si = env->subprog_info;
+	bool priv_stack_supported;
 	int ret;
 
 	for (int i = 0; i < env->subprog_cnt; i++) {
+		if (si[i].has_tail_call) {
+			priv_stack_mode = NO_PRIV_STACK;
+			break;
+		}
+	}
+
+	if (priv_stack_mode == PRIV_STACK_UNKNOWN)
+		priv_stack_mode = bpf_enable_priv_stack(env->prog);
+
+	/* All async_cb subprogs use normal kernel stack. If a particular
+	 * subprog appears in both main prog and async_cb subtree, that
+	 * subprog will use normal kernel stack to avoid potential nesting.
+	 * The reverse subprog traversal ensures when main prog subtree is
+	 * checked, the subprogs appearing in async_cb subtrees are already
+	 * marked as using normal kernel stack, so stack size checking can
+	 * be done properly.
+	 */
+	for (int i = env->subprog_cnt - 1; i >= 0; i--) {
 		if (!i || si[i].is_async_cb) {
-			ret = check_max_stack_depth_subprog(env, i);
+			priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
+			ret = check_max_stack_depth_subprog(env, i, priv_stack_supported);
 			if (ret < 0)
 				return ret;
 		}
-		continue;
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From e00931c02568dc6ac76f94b1ab471de05e6fdfe8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Tue, 12 Nov 2024 08:39:12 -0800
Subject: bpf: Enable private stack for eligible subprogs

If private stack is used by any subprog, set that subprog
prog->aux->jits_use_priv_stack to be true so later jit can allocate
private stack for that subprog properly.

Also set env->prog->aux->jits_use_priv_stack to be true if
any subprog uses private stack. This is a use case for a
single main prog (no subprogs) to use private stack, and
also a use case for later struct-ops progs where
env->prog->aux->jits_use_priv_stack will enable recursion
check if any subprog uses private stack.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20241112163912.2224007-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  1 +
 kernel/bpf/verifier.c | 11 +++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7da41ae2eac8..129b29e85cec 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1523,6 +1523,7 @@ struct bpf_prog_aux {
 	bool exception_cb;
 	bool exception_boundary;
 	bool is_extended; /* true if extended by freplace program */
+	bool jits_use_priv_stack;
 	u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
 	struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
 	struct bpf_arena *arena;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fb23793ac53d..176d19ad9d07 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6334,6 +6334,14 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 				return ret;
 		}
 	}
+
+	for (int i = 0; i < env->subprog_cnt; i++) {
+		if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+			env->prog->aux->jits_use_priv_stack = true;
+			break;
+		}
+	}
+
 	return 0;
 }
 
@@ -20274,6 +20282,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 
 		func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
+		if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+			func[i]->aux->jits_use_priv_stack = true;
+
 		func[i]->jit_requested = 1;
 		func[i]->blinding_requested = prog->blinding_requested;
 		func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
-- 
cgit v1.2.3


From 7d1cd70d4b16ff0216a5f6c2ae7d0fa9fa978c07 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Tue, 12 Nov 2024 08:39:22 -0800
Subject: bpf, x86: Support private stack in jit

Private stack is allocated in function bpf_int_jit_compile() with
alignment 8. Private stack allocation size includes the stack size
determined by verifier and additional space to protect stack overflow
and underflow. See below an illustration:
  ---> memory address increasing
  [8 bytes to protect overflow] [normal stack] [8 bytes to protect underflow]
If overflow/underflow is detected, kernel messages will be
emited in dmesg like
  BPF private stack overflow/underflow detected for prog Fx
  BPF Private stack overflow/underflow detected for prog bpf_prog_a41699c234a1567a_subprog1x
Those messages are generated when I made some changes to jitted code
to intentially cause overflow for some progs.

For the jited prog, The x86 register 9 (X86_REG_R9) is used to replace
bpf frame register (BPF_REG_10). The private stack is used per
subprog per cpu. The X86_REG_R9 is saved and restored around every
func call (not including tailcall) to maintain correctness of
X86_REG_R9.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20241112163922.2224385-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 136 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/bpf.h         |   1 +
 2 files changed, 137 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 3ff638c37999..8f896c32172c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -325,6 +325,22 @@ struct jit_context {
 /* Number of bytes that will be skipped on tailcall */
 #define X86_TAIL_CALL_OFFSET	(12 + ENDBR_INSN_SIZE)
 
+static void push_r9(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x51);   /* push r9 */
+	*pprog = prog;
+}
+
+static void pop_r9(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x59);   /* pop r9 */
+	*pprog = prog;
+}
+
 static void push_r12(u8 **pprog)
 {
 	u8 *prog = *pprog;
@@ -1404,6 +1420,24 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 	*pprog = prog;
 }
 
+static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
+{
+	u8 *prog = *pprog;
+
+	/* movabs r9, priv_frame_ptr */
+	emit_mov_imm64(&prog, X86_REG_R9, (__force long) priv_frame_ptr >> 32,
+		       (u32) (__force long) priv_frame_ptr);
+
+#ifdef CONFIG_SMP
+	/* add <r9>, gs:[<off>] */
+	EMIT2(0x65, 0x4c);
+	EMIT3(0x03, 0x0c, 0x25);
+	EMIT((u32)(unsigned long)&this_cpu_off, 4);
+#endif
+
+	*pprog = prog;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 #define __LOAD_TCC_PTR(off)			\
@@ -1412,6 +1446,10 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 #define LOAD_TAIL_CALL_CNT_PTR(stack)				\
 	__LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
 
+/* Memory size/value to protect private stack overflow/underflow */
+#define PRIV_STACK_GUARD_SZ    8
+#define PRIV_STACK_GUARD_VAL   0xEB9F12345678eb9fULL
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
 		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
 {
@@ -1421,7 +1459,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	int insn_cnt = bpf_prog->len;
 	bool seen_exit = false;
 	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	void __percpu *priv_frame_ptr = NULL;
 	u64 arena_vm_start, user_vm_start;
+	void __percpu *priv_stack_ptr;
 	int i, excnt = 0;
 	int ilen, proglen = 0;
 	u8 *prog = temp;
@@ -1429,6 +1469,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	int err;
 
 	stack_depth = bpf_prog->aux->stack_depth;
+	priv_stack_ptr = bpf_prog->aux->priv_stack_ptr;
+	if (priv_stack_ptr) {
+		priv_frame_ptr = priv_stack_ptr + PRIV_STACK_GUARD_SZ + round_up(stack_depth, 8);
+		stack_depth = 0;
+	}
 
 	arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
 	user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
@@ -1457,6 +1502,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		emit_mov_imm64(&prog, X86_REG_R12,
 			       arena_vm_start >> 32, (u32) arena_vm_start);
 
+	if (priv_frame_ptr)
+		emit_priv_frame_ptr(&prog, priv_frame_ptr);
+
 	ilen = prog - temp;
 	if (rw_image)
 		memcpy(rw_image + proglen, temp, ilen);
@@ -1476,6 +1524,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		u8 *func;
 		int nops;
 
+		if (priv_frame_ptr) {
+			if (src_reg == BPF_REG_FP)
+				src_reg = X86_REG_R9;
+
+			if (dst_reg == BPF_REG_FP)
+				dst_reg = X86_REG_R9;
+		}
+
 		switch (insn->code) {
 			/* ALU */
 		case BPF_ALU | BPF_ADD | BPF_X:
@@ -2136,9 +2192,15 @@ populate_extable:
 			}
 			if (!imm32)
 				return -EINVAL;
+			if (priv_frame_ptr) {
+				push_r9(&prog);
+				ip += 2;
+			}
 			ip += x86_call_depth_emit_accounting(&prog, func, ip);
 			if (emit_call(&prog, func, ip))
 				return -EINVAL;
+			if (priv_frame_ptr)
+				pop_r9(&prog);
 			break;
 		}
 
@@ -3306,6 +3368,42 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 	return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf);
 }
 
+static const char *bpf_get_prog_name(struct bpf_prog *prog)
+{
+	if (prog->aux->ksym.prog)
+		return prog->aux->ksym.name;
+	return prog->aux->name;
+}
+
+static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
+{
+	int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+	u64 *stack_ptr;
+
+	for_each_possible_cpu(cpu) {
+		stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+		stack_ptr[0] = PRIV_STACK_GUARD_VAL;
+		stack_ptr[underflow_idx] = PRIV_STACK_GUARD_VAL;
+	}
+}
+
+static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size,
+				   struct bpf_prog *prog)
+{
+	int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+	u64 *stack_ptr;
+
+	for_each_possible_cpu(cpu) {
+		stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+		if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
+		    stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) {
+			pr_err("BPF private stack overflow/underflow detected for prog %sx\n",
+			       bpf_get_prog_name(prog));
+			break;
+		}
+	}
+}
+
 struct x64_jit_data {
 	struct bpf_binary_header *rw_header;
 	struct bpf_binary_header *header;
@@ -3323,7 +3421,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	struct bpf_binary_header *rw_header = NULL;
 	struct bpf_binary_header *header = NULL;
 	struct bpf_prog *tmp, *orig_prog = prog;
+	void __percpu *priv_stack_ptr = NULL;
 	struct x64_jit_data *jit_data;
+	int priv_stack_alloc_sz;
 	int proglen, oldproglen = 0;
 	struct jit_context ctx = {};
 	bool tmp_blinded = false;
@@ -3359,6 +3459,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		}
 		prog->aux->jit_data = jit_data;
 	}
+	priv_stack_ptr = prog->aux->priv_stack_ptr;
+	if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+		/* Allocate actual private stack size with verifier-calculated
+		 * stack size plus two memory guards to protect overflow and
+		 * underflow.
+		 */
+		priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) +
+				      2 * PRIV_STACK_GUARD_SZ;
+		priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 8, GFP_KERNEL);
+		if (!priv_stack_ptr) {
+			prog = orig_prog;
+			goto out_priv_stack;
+		}
+
+		priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz);
+		prog->aux->priv_stack_ptr = priv_stack_ptr;
+	}
 	addrs = jit_data->addrs;
 	if (addrs) {
 		ctx = jit_data->ctx;
@@ -3494,6 +3611,11 @@ out_image:
 			bpf_prog_fill_jited_linfo(prog, addrs + 1);
 out_addrs:
 		kvfree(addrs);
+		if (!image && priv_stack_ptr) {
+			free_percpu(priv_stack_ptr);
+			prog->aux->priv_stack_ptr = NULL;
+		}
+out_priv_stack:
 		kfree(jit_data);
 		prog->aux->jit_data = NULL;
 	}
@@ -3532,6 +3654,8 @@ void bpf_jit_free(struct bpf_prog *prog)
 	if (prog->jited) {
 		struct x64_jit_data *jit_data = prog->aux->jit_data;
 		struct bpf_binary_header *hdr;
+		void __percpu *priv_stack_ptr;
+		int priv_stack_alloc_sz;
 
 		/*
 		 * If we fail the final pass of JIT (from jit_subprogs),
@@ -3547,6 +3671,13 @@ void bpf_jit_free(struct bpf_prog *prog)
 		prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
 		hdr = bpf_jit_binary_pack_hdr(prog);
 		bpf_jit_binary_pack_free(hdr, NULL);
+		priv_stack_ptr = prog->aux->priv_stack_ptr;
+		if (priv_stack_ptr) {
+			priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) +
+					      2 * PRIV_STACK_GUARD_SZ;
+			priv_stack_check_guard(priv_stack_ptr, priv_stack_alloc_sz, prog);
+			free_percpu(prog->aux->priv_stack_ptr);
+		}
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
 	}
 
@@ -3562,6 +3693,11 @@ bool bpf_jit_supports_exceptions(void)
 	return IS_ENABLED(CONFIG_UNWINDER_ORC);
 }
 
+bool bpf_jit_supports_private_stack(void)
+{
+	return true;
+}
+
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
 {
 #if defined(CONFIG_UNWINDER_ORC)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 129b29e85cec..d32cc373dfd1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1507,6 +1507,7 @@ struct bpf_prog_aux {
 	u32 max_rdwr_access;
 	struct btf *attach_btf;
 	const struct bpf_ctx_arg_aux *ctx_arg_info;
+	void __percpu *priv_stack_ptr;
 	struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */
 	struct bpf_prog *dst_prog;
 	struct bpf_trampoline *dst_trampoline;
-- 
cgit v1.2.3


From 5bd36da1e37e7a78e8b38efd287de6e1394b7d6e Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Tue, 12 Nov 2024 08:39:33 -0800
Subject: bpf: Support private stack for struct_ops progs

For struct_ops progs, whether a particular prog uses private stack
depends on prog->aux->priv_stack_requested setting before actual
insn-level verification for that prog. One particular implementation
is to piggyback on struct_ops->check_member(). The next patch has
an example for this. The struct_ops->check_member() sets
prog->aux->priv_stack_requested to be true which enables private stack
usage.

The struct_ops prog follows the same rule as kprobe/tracing progs after
function bpf_enable_priv_stack(). For example, even a struct_ops prog
requests private stack, it could still use normal kernel stack if
the stack size is small (< 64 bytes).

Similar to tracing progs, nested same cpu same prog run will be skipped.
A field (recursion_detected()) is added to bpf_prog_aux structure.
If bpf_prog->aux->recursion_detected is implemented by the struct_ops
subsystem and nested same cpu/prog happens, the function will be
triggered to report an error, collect related info, etc.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20241112163933.2224962-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          | 2 ++
 include/linux/bpf_verifier.h | 1 +
 kernel/bpf/trampoline.c      | 4 ++++
 kernel/bpf/verifier.c        | 7 ++++++-
 4 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d32cc373dfd1..10945c8858ce 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1525,9 +1525,11 @@ struct bpf_prog_aux {
 	bool exception_boundary;
 	bool is_extended; /* true if extended by freplace program */
 	bool jits_use_priv_stack;
+	bool priv_stack_requested;
 	u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
 	struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
 	struct bpf_arena *arena;
+	void (*recursion_detected)(struct bpf_prog *prog); /* callback if recursion is detected */
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
 	/* function name for valid attach_btf_id */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d62bb2ca1828..6b7c91629176 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -879,6 +879,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog)
 	case BPF_PROG_TYPE_TRACING:
 		return prog->expected_attach_type != BPF_TRACE_ITER;
 	case BPF_PROG_TYPE_STRUCT_OPS:
+		return prog->aux->jits_use_priv_stack;
 	case BPF_PROG_TYPE_LSM:
 		return false;
 	default:
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 9f36c049f4c2..a8d188b31da5 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -899,6 +899,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
 
 	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
 		bpf_prog_inc_misses_counter(prog);
+		if (prog->aux->recursion_detected)
+			prog->aux->recursion_detected(prog);
 		return 0;
 	}
 	return bpf_prog_start_time();
@@ -975,6 +977,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 
 	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
 		bpf_prog_inc_misses_counter(prog);
+		if (prog->aux->recursion_detected)
+			prog->aux->recursion_detected(prog);
 		return 0;
 	}
 	return bpf_prog_start_time();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 176d19ad9d07..f4c39bb50511 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6110,7 +6110,7 @@ static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
 	case BPF_PROG_TYPE_TRACING:
 	case BPF_PROG_TYPE_LSM:
 	case BPF_PROG_TYPE_STRUCT_OPS:
-		if (bpf_prog_check_recur(prog))
+		if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog))
 			return PRIV_STACK_ADAPTIVE;
 		fallthrough;
 	default:
@@ -22053,6 +22053,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 		}
 	}
 
+	if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) {
+		verbose(env, "Private stack not supported by jit\n");
+		return -EACCES;
+	}
+
 	/* btf_ctx_access() used this to provide argument type info */
 	prog->aux->ctx_arg_info =
 		st_ops_desc->arg_info[member_idx].info;
-- 
cgit v1.2.3


From 7c8ce4ffb684676039b1ff9ff81c126794e8d88e Mon Sep 17 00:00:00 2001
From: Xu Kuohai <xukuohai@huawei.com>
Date: Tue, 12 Nov 2024 22:58:49 +0800
Subject: bpf: Add kernel symbol for struct_ops trampoline

Without kernel symbols for struct_ops trampoline, the unwinder may
produce unexpected stacktraces.

For example, the x86 ORC and FP unwinders check if an IP is in kernel
text by verifying the presence of the IP's kernel symbol. When a
struct_ops trampoline address is encountered, the unwinder stops due
to the absence of symbol, resulting in an incomplete stacktrace that
consists only of direct and indirect child functions called from the
trampoline.

The arm64 unwinder is another example. While the arm64 unwinder can
proceed across a struct_ops trampoline address, the corresponding
symbol name is displayed as "unknown", which is confusing.

Thus, add kernel symbol for struct_ops trampoline. The name is
bpf__<struct_ops_name>_<member_name>, where <struct_ops_name> is the
type name of the struct_ops, and <member_name> is the name of
the member that the trampoline is linked to.

Below is a comparison of stacktraces captured on x86 by perf record,
before and after this patch.

Before:
ffffffff8116545d __lock_acquire+0xad ([kernel.kallsyms])
ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms])
ffffffff813088f4 __bpf_prog_enter+0x34 ([kernel.kallsyms])

After:
ffffffff811656bd __lock_acquire+0x30d ([kernel.kallsyms])
ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms])
ffffffff81309024 __bpf_prog_enter+0x34 ([kernel.kallsyms])
ffffffffc000d7e9 bpf__tcp_congestion_ops_cong_avoid+0x3e ([kernel.kallsyms])
ffffffff81f250a5 tcp_ack+0x10d5 ([kernel.kallsyms])
ffffffff81f27c66 tcp_rcv_established+0x3b6 ([kernel.kallsyms])
ffffffff81f3ad03 tcp_v4_do_rcv+0x193 ([kernel.kallsyms])
ffffffff81d65a18 __release_sock+0xd8 ([kernel.kallsyms])
ffffffff81d65af4 release_sock+0x34 ([kernel.kallsyms])
ffffffff81f15c4b tcp_sendmsg+0x3b ([kernel.kallsyms])
ffffffff81f663d7 inet_sendmsg+0x47 ([kernel.kallsyms])
ffffffff81d5ab40 sock_write_iter+0x160 ([kernel.kallsyms])
ffffffff8149c67b vfs_write+0x3fb ([kernel.kallsyms])
ffffffff8149caf6 ksys_write+0xc6 ([kernel.kallsyms])
ffffffff8149cb5d __x64_sys_write+0x1d ([kernel.kallsyms])
ffffffff81009200 x64_sys_call+0x1d30 ([kernel.kallsyms])
ffffffff82232d28 do_syscall_64+0x68 ([kernel.kallsyms])
ffffffff8240012f entry_SYSCALL_64_after_hwframe+0x76 ([kernel.kallsyms])

Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS")
Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20241112145849.3436772-4-xukuohai@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h         |  3 +-
 kernel/bpf/bpf_struct_ops.c | 79 ++++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/dispatcher.c     |  3 +-
 kernel/bpf/trampoline.c     |  9 ++++--
 4 files changed, 89 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 10945c8858ce..3ace0d6227e3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1402,7 +1402,8 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 				struct bpf_prog *to);
 /* Called only from JIT-enabled code, so there's no need for stubs. */
-void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym);
+void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym);
+void bpf_image_ksym_add(struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index ff94c8120ebb..606efe32485a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -31,6 +31,8 @@ struct bpf_struct_ops_map {
 	 * (in kvalue.data).
 	 */
 	struct bpf_link **links;
+	/* ksyms for bpf trampolines */
+	struct bpf_ksym **ksyms;
 	u32 funcs_cnt;
 	u32 image_pages_cnt;
 	/* image_pages is an array of pages that has all the trampolines
@@ -585,6 +587,49 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 	return 0;
 }
 
+static void bpf_struct_ops_ksym_init(const char *tname, const char *mname,
+				     void *image, unsigned int size,
+				     struct bpf_ksym *ksym)
+{
+	snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname);
+	INIT_LIST_HEAD_RCU(&ksym->lnode);
+	bpf_image_ksym_init(image, size, ksym);
+}
+
+static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map)
+{
+	u32 i;
+
+	for (i = 0; i < st_map->funcs_cnt; i++) {
+		if (!st_map->ksyms[i])
+			break;
+		bpf_image_ksym_add(st_map->ksyms[i]);
+	}
+}
+
+static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map)
+{
+	u32 i;
+
+	for (i = 0; i < st_map->funcs_cnt; i++) {
+		if (!st_map->ksyms[i])
+			break;
+		bpf_image_ksym_del(st_map->ksyms[i]);
+	}
+}
+
+static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map)
+{
+	u32 i;
+
+	for (i = 0; i < st_map->funcs_cnt; i++) {
+		if (!st_map->ksyms[i])
+			break;
+		kfree(st_map->ksyms[i]);
+		st_map->ksyms[i] = NULL;
+	}
+}
+
 static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 					   void *value, u64 flags)
 {
@@ -601,6 +646,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	u32 i, trampoline_start, image_off = 0;
 	void *cur_image = NULL, *image = NULL;
 	struct bpf_link **plink;
+	struct bpf_ksym **pksym;
+	const char *tname, *mname;
 
 	if (flags)
 		return -EINVAL;
@@ -640,14 +687,18 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	kdata = &kvalue->data;
 
 	plink = st_map->links;
+	pksym = st_map->ksyms;
+	tname = btf_name_by_offset(st_map->btf, t->name_off);
 	module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
 	for_each_member(i, t, member) {
 		const struct btf_type *mtype, *ptype;
 		struct bpf_prog *prog;
 		struct bpf_tramp_link *link;
+		struct bpf_ksym *ksym;
 		u32 moff;
 
 		moff = __btf_member_bit_offset(t, member) / 8;
+		mname = btf_name_by_offset(st_map->btf, member->name_off);
 		ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
 		if (ptype == module_type) {
 			if (*(void **)(udata + moff))
@@ -717,6 +768,13 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 			      &bpf_struct_ops_link_lops, prog);
 		*plink++ = &link->link;
 
+		ksym = kzalloc(sizeof(*ksym), GFP_USER);
+		if (!ksym) {
+			err = -ENOMEM;
+			goto reset_unlock;
+		}
+		*pksym++ = ksym;
+
 		trampoline_start = image_off;
 		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
 						&st_ops->func_models[i],
@@ -736,6 +794,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
 		/* put prog_id to udata */
 		*(unsigned long *)(udata + moff) = prog->aux->id;
+
+		/* init ksym for this trampoline */
+		bpf_struct_ops_ksym_init(tname, mname,
+					 image + trampoline_start,
+					 image_off - trampoline_start,
+					 ksym);
 	}
 
 	if (st_ops->validate) {
@@ -784,6 +848,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	 */
 
 reset_unlock:
+	bpf_struct_ops_map_free_ksyms(st_map);
 	bpf_struct_ops_map_free_image(st_map);
 	bpf_struct_ops_map_put_progs(st_map);
 	memset(uvalue, 0, map->value_size);
@@ -791,6 +856,8 @@ reset_unlock:
 unlock:
 	kfree(tlinks);
 	mutex_unlock(&st_map->lock);
+	if (!err)
+		bpf_struct_ops_map_add_ksyms(st_map);
 	return err;
 }
 
@@ -850,7 +917,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
 
 	if (st_map->links)
 		bpf_struct_ops_map_put_progs(st_map);
+	if (st_map->ksyms)
+		bpf_struct_ops_map_free_ksyms(st_map);
 	bpf_map_area_free(st_map->links);
+	bpf_map_area_free(st_map->ksyms);
 	bpf_struct_ops_map_free_image(st_map);
 	bpf_map_area_free(st_map->uvalue);
 	bpf_map_area_free(st_map);
@@ -867,6 +937,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 	if (btf_is_module(st_map->btf))
 		module_put(st_map->st_ops_desc->st_ops->owner);
 
+	bpf_struct_ops_map_del_ksyms(st_map);
+
 	/* The struct_ops's function may switch to another struct_ops.
 	 *
 	 * For example, bpf_tcp_cc_x->init() may switch to
@@ -979,7 +1051,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	st_map->links =
 		bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *),
 				   NUMA_NO_NODE);
-	if (!st_map->uvalue || !st_map->links) {
+
+	st_map->ksyms =
+		bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *),
+				   NUMA_NO_NODE);
+	if (!st_map->uvalue || !st_map->links || !st_map->ksyms) {
 		ret = -ENOMEM;
 		goto errout_free;
 	}
@@ -1009,6 +1085,7 @@ static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
 			vt->size - sizeof(struct bpf_struct_ops_value);
 	usage += vt->size;
 	usage += st_map->funcs_cnt * sizeof(struct bpf_link *);
+	usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *);
 	usage += PAGE_SIZE;
 	return usage;
 }
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 70fb82bf1637..b77db7413f8c 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -154,7 +154,8 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 			d->image = NULL;
 			goto out;
 		}
-		bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym);
+		bpf_image_ksym_init(d->image, PAGE_SIZE, &d->ksym);
+		bpf_image_ksym_add(&d->ksym);
 	}
 
 	prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index a8d188b31da5..c4b1a98ff726 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -115,10 +115,14 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 		(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
 }
 
-void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
+void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
 {
 	ksym->start = (unsigned long) data;
 	ksym->end = ksym->start + size;
+}
+
+void bpf_image_ksym_add(struct bpf_ksym *ksym)
+{
 	bpf_ksym_add(ksym);
 	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
 			   PAGE_SIZE, false, ksym->name);
@@ -377,7 +381,8 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
 	ksym = &im->ksym;
 	INIT_LIST_HEAD_RCU(&ksym->lnode);
 	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
-	bpf_image_ksym_add(image, size, ksym);
+	bpf_image_ksym_init(image, size, ksym);
+	bpf_image_ksym_add(ksym);
 	return im;
 
 out_free_image:
-- 
cgit v1.2.3


From 12bbabd3cab8a7dab0ddad8ed1e671f40c7cdeeb Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 12 Nov 2024 18:01:25 +0200
Subject: usb: cdns3: Synchronise PCI IDs via common data base

There are a few places in the kernel where PCI IDs for different Cadence
USB controllers are being used. Besides different naming, they duplicate
each other. Make this all in order by providing common definitions via
PCI IDs database and use in all users. While doing that, rename
definitions as Roger suggested.

Suggested-by: Roger Quadros <rogerq@kernel.org>
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20241112160125.2340972-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/cdns3/cdns3-pci-wrap.c       |  4 +---
 drivers/usb/cdns3/cdnsp-pci.c            | 26 ++++++++++----------------
 drivers/usb/gadget/udc/cdns2/cdns2-pci.c |  3 +--
 drivers/usb/host/xhci-pci.c              |  5 ++---
 include/linux/pci_ids.h                  |  4 ++++
 5 files changed, 18 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/cdns3/cdns3-pci-wrap.c b/drivers/usb/cdns3/cdns3-pci-wrap.c
index 591d149de8f3..3b3b3dc75f35 100644
--- a/drivers/usb/cdns3/cdns3-pci-wrap.c
+++ b/drivers/usb/cdns3/cdns3-pci-wrap.c
@@ -37,8 +37,6 @@ struct cdns3_wrap {
 #define PCI_DRIVER_NAME		"cdns3-pci-usbss"
 #define PLAT_DRIVER_NAME	"cdns-usb3"
 
-#define PCI_DEVICE_ID_CDNS_USB3	0x0100
-
 static struct pci_dev *cdns3_get_second_fun(struct pci_dev *pdev)
 {
 	struct pci_dev *func;
@@ -189,7 +187,7 @@ static void cdns3_pci_remove(struct pci_dev *pdev)
 }
 
 static const struct pci_device_id cdns3_pci_ids[] = {
-	{ PCI_VDEVICE(CDNS, PCI_DEVICE_ID_CDNS_USB3) },
+	{ PCI_VDEVICE(CDNS, PCI_DEVICE_ID_CDNS_USBSS) },
 	{ 0, }
 };
 
diff --git a/drivers/usb/cdns3/cdnsp-pci.c b/drivers/usb/cdns3/cdnsp-pci.c
index 2d05368a6745..a51144504ff3 100644
--- a/drivers/usb/cdns3/cdnsp-pci.c
+++ b/drivers/usb/cdns3/cdnsp-pci.c
@@ -28,12 +28,6 @@
 #define PCI_DRIVER_NAME		"cdns-pci-usbssp"
 #define PLAT_DRIVER_NAME	"cdns-usbssp"
 
-#define PCI_DEVICE_ID_CDNS_USB3		0x0100
-#define PCI_DEVICE_ID_CDNS_UDC		0x0200
-
-#define PCI_CLASS_SERIAL_USB_CDNS_USB3	(PCI_CLASS_SERIAL_USB << 8 | 0x80)
-#define PCI_CLASS_SERIAL_USB_CDNS_UDC	PCI_CLASS_SERIAL_USB_DEVICE
-
 static struct pci_dev *cdnsp_get_second_fun(struct pci_dev *pdev)
 {
 	/*
@@ -41,10 +35,10 @@ static struct pci_dev *cdnsp_get_second_fun(struct pci_dev *pdev)
 	 * Platform has two function. The fist keeps resources for
 	 * Host/Device while the secon keeps resources for DRD/OTG.
 	 */
-	if (pdev->device == PCI_DEVICE_ID_CDNS_UDC)
-		return pci_get_device(pdev->vendor, PCI_DEVICE_ID_CDNS_USB3, NULL);
-	if (pdev->device == PCI_DEVICE_ID_CDNS_USB3)
-		return pci_get_device(pdev->vendor, PCI_DEVICE_ID_CDNS_UDC, NULL);
+	if (pdev->device == PCI_DEVICE_ID_CDNS_USBSSP)
+		return pci_get_device(pdev->vendor, PCI_DEVICE_ID_CDNS_USBSS, NULL);
+	if (pdev->device == PCI_DEVICE_ID_CDNS_USBSS)
+		return pci_get_device(pdev->vendor, PCI_DEVICE_ID_CDNS_USBSSP, NULL);
 
 	return NULL;
 }
@@ -221,12 +215,12 @@ static const struct dev_pm_ops cdnsp_pci_pm_ops = {
 };
 
 static const struct pci_device_id cdnsp_pci_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_CDNS, PCI_DEVICE_ID_CDNS_UDC),
-	  .class = PCI_CLASS_SERIAL_USB_CDNS_UDC },
-	{ PCI_DEVICE(PCI_VENDOR_ID_CDNS, PCI_DEVICE_ID_CDNS_UDC),
-	  .class = PCI_CLASS_SERIAL_USB_CDNS_USB3 },
-	{ PCI_DEVICE(PCI_VENDOR_ID_CDNS, PCI_DEVICE_ID_CDNS_USB3),
-	  .class = PCI_CLASS_SERIAL_USB_CDNS_USB3 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CDNS, PCI_DEVICE_ID_CDNS_USBSSP),
+	  .class = PCI_CLASS_SERIAL_USB_DEVICE },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CDNS, PCI_DEVICE_ID_CDNS_USBSSP),
+	  .class = PCI_CLASS_SERIAL_USB_CDNS },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CDNS, PCI_DEVICE_ID_CDNS_USBSS),
+	  .class = PCI_CLASS_SERIAL_USB_CDNS },
 	{ 0, }
 };
 
diff --git a/drivers/usb/gadget/udc/cdns2/cdns2-pci.c b/drivers/usb/gadget/udc/cdns2/cdns2-pci.c
index b1a8f772467c..e589593b4cbf 100644
--- a/drivers/usb/gadget/udc/cdns2/cdns2-pci.c
+++ b/drivers/usb/gadget/udc/cdns2/cdns2-pci.c
@@ -15,7 +15,6 @@
 #include "cdns2-gadget.h"
 
 #define PCI_DRIVER_NAME		"cdns-pci-usbhs"
-#define PCI_DEVICE_ID_CDNS_USB2	0x0120
 #define PCI_BAR_DEV		0
 #define PCI_DEV_FN_DEVICE	0
 
@@ -113,7 +112,7 @@ static const struct dev_pm_ops cdns2_pci_pm_ops = {
 };
 
 static const struct pci_device_id cdns2_pci_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_CDNS, PCI_DEVICE_ID_CDNS_USB2),
+	{ PCI_DEVICE(PCI_VENDOR_ID_CDNS, PCI_DEVICE_ID_CDNS_USB),
 	  .class = PCI_CLASS_SERIAL_USB_DEVICE },
 	{ 0, }
 };
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 47c4f70793e4..b21474e81482 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -82,8 +82,6 @@
 #define PCI_DEVICE_ID_ASMEDIA_3042_XHCI			0x3042
 #define PCI_DEVICE_ID_ASMEDIA_3242_XHCI			0x3242
 
-#define PCI_DEVICE_ID_CDNS_SSP				0x0200
-
 static const char hcd_name[] = "xhci_hcd";
 
 static struct hc_driver __read_mostly xhci_pci_hc_driver;
@@ -475,8 +473,9 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
 		if (pdev->device == 0x9203)
 			xhci->quirks |= XHCI_ZHAOXIN_TRB_FETCH;
 	}
+
 	if (pdev->vendor == PCI_VENDOR_ID_CDNS &&
-	    pdev->device == PCI_DEVICE_ID_CDNS_SSP)
+	    pdev->device == PCI_DEVICE_ID_CDNS_USBSSP)
 		xhci->quirks |= XHCI_CDNS_SCTX_QUIRK;
 
 	/* xHC spec requires PCI devices to support D3hot and D3cold */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 4cf6aaed5f35..eff9eccc52fc 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -121,6 +121,7 @@
 #define PCI_CLASS_SERIAL_USB_OHCI	0x0c0310
 #define PCI_CLASS_SERIAL_USB_EHCI	0x0c0320
 #define PCI_CLASS_SERIAL_USB_XHCI	0x0c0330
+#define PCI_CLASS_SERIAL_USB_CDNS	0x0c0380
 #define PCI_CLASS_SERIAL_USB_DEVICE	0x0c03fe
 #define PCI_CLASS_SERIAL_FIBER		0x0c04
 #define PCI_CLASS_SERIAL_SMBUS		0x0c05
@@ -2421,6 +2422,9 @@
 #define PCI_VENDOR_ID_QCOM		0x17cb
 
 #define PCI_VENDOR_ID_CDNS		0x17cd
+#define PCI_DEVICE_ID_CDNS_USBSS	0x0100
+#define PCI_DEVICE_ID_CDNS_USB		0x0120
+#define PCI_DEVICE_ID_CDNS_USBSSP	0x0200
 
 #define PCI_VENDOR_ID_ARECA		0x17d3
 #define PCI_DEVICE_ID_ARECA_1110	0x1110
-- 
cgit v1.2.3


From 948ccbd95016f50ce01df5eef9440eede3b8c713 Mon Sep 17 00:00:00 2001
From: Xianglai Li <lixianglai@loongson.cn>
Date: Wed, 13 Nov 2024 16:18:26 +0800
Subject: LoongArch: KVM: Add iocsr and mmio bus simulation in kernel

Add iocsr and mmio memory read and write simulation to the kernel. When
the VM accesses the device address space through iocsr instructions or
mmio, it does not need to return to the qemu user mode but can directly
completes the access in the kernel mode.

Signed-off-by: Tianrui Zhao <zhaotianrui@loongson.cn>
Signed-off-by: Xianglai Li <lixianglai@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/kvm/exit.c  | 82 ++++++++++++++++++++++++++++++++--------------
 include/linux/kvm_host.h   |  1 +
 include/trace/events/kvm.h | 35 ++++++++++++++++++++
 3 files changed, 94 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index 90894f70ff4a..69f3e3782cc9 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -157,7 +157,7 @@ static int kvm_handle_csr(struct kvm_vcpu *vcpu, larch_inst inst)
 int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	int ret;
-	unsigned long val;
+	unsigned long *val;
 	u32 addr, rd, rj, opcode;
 
 	/*
@@ -170,6 +170,7 @@ int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu)
 	ret = EMULATE_DO_IOCSR;
 	run->iocsr_io.phys_addr = addr;
 	run->iocsr_io.is_write = 0;
+	val = &vcpu->arch.gprs[rd];
 
 	/* LoongArch is Little endian */
 	switch (opcode) {
@@ -202,16 +203,25 @@ int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu)
 		run->iocsr_io.is_write = 1;
 		break;
 	default:
-		ret = EMULATE_FAIL;
-		break;
+		return EMULATE_FAIL;
 	}
 
-	if (ret == EMULATE_DO_IOCSR) {
-		if (run->iocsr_io.is_write) {
-			val = vcpu->arch.gprs[rd];
-			memcpy(run->iocsr_io.data, &val, run->iocsr_io.len);
-		}
-		vcpu->arch.io_gpr = rd;
+	if (run->iocsr_io.is_write) {
+		if (!kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, run->iocsr_io.len, val))
+			ret = EMULATE_DONE;
+		else
+			/* Save data and let user space to write it */
+			memcpy(run->iocsr_io.data, val, run->iocsr_io.len);
+
+		trace_kvm_iocsr(KVM_TRACE_IOCSR_WRITE, run->iocsr_io.len, addr, val);
+	} else {
+		if (!kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, run->iocsr_io.len, val))
+			ret = EMULATE_DONE;
+		else
+			/* Save register id for iocsr read completion */
+			vcpu->arch.io_gpr = rd;
+
+		trace_kvm_iocsr(KVM_TRACE_IOCSR_READ, run->iocsr_io.len, addr, NULL);
 	}
 
 	return ret;
@@ -447,19 +457,33 @@ int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst)
 	}
 
 	if (ret == EMULATE_DO_MMIO) {
+		trace_kvm_mmio(KVM_TRACE_MMIO_READ, run->mmio.len, run->mmio.phys_addr, NULL);
+
+		/*
+		 * If mmio device such as PCH-PIC is emulated in KVM,
+		 * it need not return to user space to handle the mmio
+		 * exception.
+		 */
+		ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, vcpu->arch.badv,
+				      run->mmio.len, &vcpu->arch.gprs[rd]);
+		if (!ret) {
+			update_pc(&vcpu->arch);
+			vcpu->mmio_needed = 0;
+			return EMULATE_DONE;
+		}
+
 		/* Set for kvm_complete_mmio_read() use */
 		vcpu->arch.io_gpr = rd;
 		run->mmio.is_write = 0;
 		vcpu->mmio_is_write = 0;
-		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, run->mmio.len,
-				run->mmio.phys_addr, NULL);
-	} else {
-		kvm_err("Read not supported Inst=0x%08x @%lx BadVaddr:%#lx\n",
-			inst.word, vcpu->arch.pc, vcpu->arch.badv);
-		kvm_arch_vcpu_dump_regs(vcpu);
-		vcpu->mmio_needed = 0;
+		return EMULATE_DO_MMIO;
 	}
 
+	kvm_err("Read not supported Inst=0x%08x @%lx BadVaddr:%#lx\n",
+			inst.word, vcpu->arch.pc, vcpu->arch.badv);
+	kvm_arch_vcpu_dump_regs(vcpu);
+	vcpu->mmio_needed = 0;
+
 	return ret;
 }
 
@@ -600,19 +624,29 @@ int kvm_emu_mmio_write(struct kvm_vcpu *vcpu, larch_inst inst)
 	}
 
 	if (ret == EMULATE_DO_MMIO) {
+		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, run->mmio.len, run->mmio.phys_addr, data);
+
+		/*
+		 * If mmio device such as PCH-PIC is emulated in KVM,
+		 * it need not return to user space to handle the mmio
+		 * exception.
+		 */
+		ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, vcpu->arch.badv, run->mmio.len, data);
+		if (!ret)
+			return EMULATE_DONE;
+
 		run->mmio.is_write = 1;
 		vcpu->mmio_needed = 1;
 		vcpu->mmio_is_write = 1;
-		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, run->mmio.len,
-				run->mmio.phys_addr, data);
-	} else {
-		vcpu->arch.pc = curr_pc;
-		kvm_err("Write not supported Inst=0x%08x @%lx BadVaddr:%#lx\n",
-			inst.word, vcpu->arch.pc, vcpu->arch.badv);
-		kvm_arch_vcpu_dump_regs(vcpu);
-		/* Rollback PC if emulation was unsuccessful */
+		return EMULATE_DO_MMIO;
 	}
 
+	vcpu->arch.pc = curr_pc;
+	kvm_err("Write not supported Inst=0x%08x @%lx BadVaddr:%#lx\n",
+			inst.word, vcpu->arch.pc, vcpu->arch.badv);
+	kvm_arch_vcpu_dump_regs(vcpu);
+	/* Rollback PC if emulation was unsuccessful */
+
 	return ret;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 45be36e5285f..164d9725cb08 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -219,6 +219,7 @@ enum kvm_bus {
 	KVM_PIO_BUS,
 	KVM_VIRTIO_CCW_NOTIFY_BUS,
 	KVM_FAST_MMIO_BUS,
+	KVM_IOCSR_BUS,
 	KVM_NR_BUSES
 };
 
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 74e40d5d4af4..fc7d0f8ff078 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -236,6 +236,41 @@ TRACE_EVENT(kvm_mmio,
 		  __entry->len, __entry->gpa, __entry->val)
 );
 
+#define KVM_TRACE_IOCSR_READ_UNSATISFIED 0
+#define KVM_TRACE_IOCSR_READ 1
+#define KVM_TRACE_IOCSR_WRITE 2
+
+#define kvm_trace_symbol_iocsr \
+	{ KVM_TRACE_IOCSR_READ_UNSATISFIED, "unsatisfied-read" }, \
+	{ KVM_TRACE_IOCSR_READ, "read" }, \
+	{ KVM_TRACE_IOCSR_WRITE, "write" }
+
+TRACE_EVENT(kvm_iocsr,
+	TP_PROTO(int type, int len, u64 gpa, void *val),
+	TP_ARGS(type, len, gpa, val),
+
+	TP_STRUCT__entry(
+		__field(	u32,	type	)
+		__field(	u32,	len	)
+		__field(	u64,	gpa	)
+		__field(	u64,	val	)
+	),
+
+	TP_fast_assign(
+		__entry->type		= type;
+		__entry->len		= len;
+		__entry->gpa		= gpa;
+		__entry->val		= 0;
+		if (val)
+			memcpy(&__entry->val, val,
+			       min_t(u32, sizeof(__entry->val), len));
+	),
+
+	TP_printk("iocsr %s len %u gpa 0x%llx val 0x%llx",
+		  __print_symbolic(__entry->type, kvm_trace_symbol_iocsr),
+		  __entry->len, __entry->gpa, __entry->val)
+);
+
 #define kvm_fpu_load_symbol	\
 	{0, "unload"},		\
 	{1, "load"}
-- 
cgit v1.2.3


From da115c4ee29f589bb72ec2e86eb5e196b6bbcb41 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 12 Nov 2024 15:29:15 +0100
Subject: printk: add dummy printk_force_console_enter/exit helpers

The newly added interface is broken when PRINTK is disabled:

drivers/tty/sysrq.c: In function '__handle_sysrq':
drivers/tty/sysrq.c:601:9: error: implicit declaration of function 'printk_force_console_enter' [-Wimplicit-function-declaration]
  601 |         printk_force_console_enter();
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/tty/sysrq.c:611:25: error: implicit declaration of function 'printk_force_console_exit' [-Wimplicit-function-declaration]
  611 |                         printk_force_console_exit();
      |                         ^~~~~~~~~~~~~~~~~~~~~~~~~

Add empty stub functions for both.

Fixes: ed76c07c6885 ("printk: Introduce FORCE_CON flag")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Tested-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Link: https://lore.kernel.org/r/20241112142939.724093-1-arnd@kernel.org
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/printk.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 232e5fd06701..4217a9f412b2 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -232,6 +232,14 @@ static inline void printk_deferred_exit(void)
 {
 }
 
+static inline void printk_force_console_enter(void)
+{
+}
+
+static inline void printk_force_console_exit(void)
+{
+}
+
 static inline int printk_ratelimit(void)
 {
 	return 0;
-- 
cgit v1.2.3


From 1d58f7f3a1373734b2e86a246edcf1cd39359f3e Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Fri, 25 Oct 2024 21:31:01 +0100
Subject: clocksource/drivers/dw_apb: Remove unused dw_apb_clockevent functions

dw_apb_clockevent_pause(), dw_apb_clockevent_resume() and
dw_apb_clockevent_stop() have been unused since 2021's
commit 1b79fc4f2bfd ("x86/apb_timer: Remove driver for deprecated
platform")

Remove them.

(Some of the other clockevent functions are still called by
dw_apb_timer_of.c  so I guess it is still in use?)

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Link: https://lore.kernel.org/r/20241025203101.241709-1-linux@treblig.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/clocksource/dw_apb_timer.c | 39 --------------------------------------
 include/linux/dw_apb_timer.h       |  3 ---
 2 files changed, 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clocksource/dw_apb_timer.c b/drivers/clocksource/dw_apb_timer.c
index f5f24a95ee82..3a55ae5fe225 100644
--- a/drivers/clocksource/dw_apb_timer.c
+++ b/drivers/clocksource/dw_apb_timer.c
@@ -68,25 +68,6 @@ static inline void apbt_writel_relaxed(struct dw_apb_timer *timer, u32 val,
 	writel_relaxed(val, timer->base + offs);
 }
 
-static void apbt_disable_int(struct dw_apb_timer *timer)
-{
-	u32 ctrl = apbt_readl(timer, APBTMR_N_CONTROL);
-
-	ctrl |= APBTMR_CONTROL_INT;
-	apbt_writel(timer, ctrl, APBTMR_N_CONTROL);
-}
-
-/**
- * dw_apb_clockevent_pause() - stop the clock_event_device from running
- *
- * @dw_ced:	The APB clock to stop generating events.
- */
-void dw_apb_clockevent_pause(struct dw_apb_clock_event_device *dw_ced)
-{
-	disable_irq(dw_ced->timer.irq);
-	apbt_disable_int(&dw_ced->timer);
-}
-
 static void apbt_eoi(struct dw_apb_timer *timer)
 {
 	apbt_readl_relaxed(timer, APBTMR_N_EOI);
@@ -284,26 +265,6 @@ dw_apb_clockevent_init(int cpu, const char *name, unsigned rating,
 	return dw_ced;
 }
 
-/**
- * dw_apb_clockevent_resume() - resume a clock that has been paused.
- *
- * @dw_ced:	The APB clock to resume.
- */
-void dw_apb_clockevent_resume(struct dw_apb_clock_event_device *dw_ced)
-{
-	enable_irq(dw_ced->timer.irq);
-}
-
-/**
- * dw_apb_clockevent_stop() - stop the clock_event_device and release the IRQ.
- *
- * @dw_ced:	The APB clock to stop generating the events.
- */
-void dw_apb_clockevent_stop(struct dw_apb_clock_event_device *dw_ced)
-{
-	free_irq(dw_ced->timer.irq, &dw_ced->ced);
-}
-
 /**
  * dw_apb_clockevent_register() - register the clock with the generic layer
  *
diff --git a/include/linux/dw_apb_timer.h b/include/linux/dw_apb_timer.h
index 82ebf9223948..f8811c46b89e 100644
--- a/include/linux/dw_apb_timer.h
+++ b/include/linux/dw_apb_timer.h
@@ -34,9 +34,6 @@ struct dw_apb_clocksource {
 };
 
 void dw_apb_clockevent_register(struct dw_apb_clock_event_device *dw_ced);
-void dw_apb_clockevent_pause(struct dw_apb_clock_event_device *dw_ced);
-void dw_apb_clockevent_resume(struct dw_apb_clock_event_device *dw_ced);
-void dw_apb_clockevent_stop(struct dw_apb_clock_event_device *dw_ced);
 
 struct dw_apb_clock_event_device *
 dw_apb_clockevent_init(int cpu, const char *name, unsigned rating,
-- 
cgit v1.2.3


From 2bd9077b6261b8f1281d0fa74d51afe090319263 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Fri, 1 Nov 2024 14:45:23 -0600
Subject: jbd2: avoid dozens of -Wflex-array-member-not-at-end warnings

-Wflex-array-member-not-at-end was introduced in GCC-14, and we
are getting ready to enable it, globally.

Use the `DEFINE_RAW_FLEX()` helper for an on-stack definition of
a flexible structure (`struct shash_desc`) where the size of the
flexible-array member (`__ctx`) is known at compile-time, and
refactor the rest of the code, accordingly.

So, with this, fix 77 of the following warnings:

include/linux/jbd2.h:1800:35: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/ZyU94w0IALVhc9Jy@kspp
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/jbd2.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 8aef9bb6ad57..50f7ea8714bf 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1796,22 +1796,21 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal)
 static inline u32 jbd2_chksum(journal_t *journal, u32 crc,
 			      const void *address, unsigned int length)
 {
-	struct {
-		struct shash_desc shash;
-		char ctx[JBD_MAX_CHECKSUM_SIZE];
-	} desc;
+	DEFINE_RAW_FLEX(struct shash_desc, desc, __ctx,
+		DIV_ROUND_UP(JBD_MAX_CHECKSUM_SIZE,
+			     sizeof(*((struct shash_desc *)0)->__ctx)));
 	int err;
 
 	BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) >
 		JBD_MAX_CHECKSUM_SIZE);
 
-	desc.shash.tfm = journal->j_chksum_driver;
-	*(u32 *)desc.ctx = crc;
+	desc->tfm = journal->j_chksum_driver;
+	*(u32 *)desc->__ctx = crc;
 
-	err = crypto_shash_update(&desc.shash, address, length);
+	err = crypto_shash_update(desc, address, length);
 	BUG_ON(err);
 
-	return *(u32 *)desc.ctx;
+	return *(u32 *)desc->__ctx;
 }
 
 /* Return most recent uncommitted transaction */
-- 
cgit v1.2.3


From 470d2bc3a0bc19a849cc7478c02d3f5ecaa1233e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 13 Nov 2024 09:45:35 +0100
Subject: block: export blk_validate_limits

While block drivers do the validation as part of committing them to the
queue, users that use the limit outside of a block device context have
to validate the limits and fill in the calculated values as well.

So far btrfs is the only user of queue limits without a block device,
and it has gotten away with that more or less by accident.  But with
commit 559218d43ec9 ("block: pre-calculate max_zone_append_sectors")
this became fatal for setups that have small max zone append size,
as it won't be limited now.

Export blk_validate_limits so that it can be called directly from btrfs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20241113084541.34315-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-settings.c   | 3 ++-
 include/linux/blkdev.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 7d6b296997c2..f1d4dfdc37a7 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -222,7 +222,7 @@ unsupported:
  * Check that the limits in lim are valid, initialize defaults for unset
  * values, and cap values based on others where needed.
  */
-static int blk_validate_limits(struct queue_limits *lim)
+int blk_validate_limits(struct queue_limits *lim)
 {
 	unsigned int max_hw_sectors;
 	unsigned int logical_block_sectors;
@@ -365,6 +365,7 @@ static int blk_validate_limits(struct queue_limits *lim)
 		return err;
 	return blk_validate_zoned_limits(lim);
 }
+EXPORT_SYMBOL_GPL(blk_validate_limits);
 
 /*
  * Set the default limits for a newly allocated queue.  @lim contains the
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 65f37ae70712..cd905afaf51a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -948,6 +948,7 @@ queue_limits_start_update(struct request_queue *q)
 int queue_limits_commit_update(struct request_queue *q,
 		struct queue_limits *lim);
 int queue_limits_set(struct request_queue *q, struct queue_limits *lim);
+int blk_validate_limits(struct queue_limits *lim);
 
 /**
  * queue_limits_cancel_update - cancel an atomic update of queue limits
-- 
cgit v1.2.3


From f3dd9ae7f03aefa5bb12a4606f3d6cca87863622 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sun, 1 Sep 2024 17:17:09 +0200
Subject: dax: Remove an unused field in struct dax_operations

.dax_supported() was apparently removed by commit 7b0800d00dae ("dax:
remove dax_capable") on 2021-11.

Remove the now unused function pointer from the struct dax_operations.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/56b92b722ca0a6fd1387c871a6ec01bcb9bd525e.1725203804.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 include/linux/dax.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9d3e3327af4c..df41a0017b31 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -27,12 +27,6 @@ struct dax_operations {
 	 */
 	long (*direct_access)(struct dax_device *, pgoff_t, long,
 			enum dax_access_mode, void **, pfn_t *);
-	/*
-	 * Validate whether this device is usable as an fsdax backing
-	 * device.
-	 */
-	bool (*dax_supported)(struct dax_device *, struct block_device *, int,
-			sector_t, sector_t);
 	/* zero_page_range: required operation. Zero page range   */
 	int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
 	/*
-- 
cgit v1.2.3


From e8225ab15006fbcdb14cef426a0a54475292fbbc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 13 Nov 2024 16:20:43 +0100
Subject: block: remove rq_list_move

Unused now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20241113152050.157179-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a28264442948..ad26a41d13f9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -268,23 +268,6 @@ static inline unsigned short req_get_ioprio(struct request *req)
 #define rq_list_next(rq)	(rq)->rq_next
 #define rq_list_empty(list)	((list) == (struct request *) NULL)
 
-/**
- * rq_list_move() - move a struct request from one list to another
- * @src: The source list @rq is currently in
- * @dst: The destination list that @rq will be appended to
- * @rq: The request to move
- * @prev: The request preceding @rq in @src (NULL if @rq is the head)
- */
-static inline void rq_list_move(struct request **src, struct request **dst,
-				struct request *rq, struct request *prev)
-{
-	if (prev)
-		prev->rq_next = rq->rq_next;
-	else
-		*src = rq->rq_next;
-	rq_list_add(dst, rq);
-}
-
 /**
  * enum blk_eh_timer_return - How the timeout handler should proceed
  * @BLK_EH_DONE: The block driver completed the command or will complete it at
-- 
cgit v1.2.3


From a3396b99990d8b4e5797e7b16fdeb64c15ae97bb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 13 Nov 2024 16:20:44 +0100
Subject: block: add a rq_list type

Replace the semi-open coded request list helpers with a proper rq_list
type that mirrors the bio_list and has head and tail pointers.  Besides
better type safety this actually allows to insert at the tail of the
list, which will be useful soon.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20241113152050.157179-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c              |  6 +--
 block/blk-merge.c             |  2 +-
 block/blk-mq.c                | 40 ++++++++++----------
 block/blk-mq.h                |  2 +-
 drivers/block/null_blk/main.c |  9 ++---
 drivers/block/virtio_blk.c    | 13 +++----
 drivers/nvme/host/apple.c     |  2 +-
 drivers/nvme/host/pci.c       | 15 ++++----
 include/linux/blk-mq.h        | 88 +++++++++++++++++++++++++------------------
 include/linux/blkdev.h        | 11 ++++--
 io_uring/rw.c                 |  4 +-
 11 files changed, 104 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 0387172e8259..666efe8fa202 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1120,8 +1120,8 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
 		return;
 
 	plug->cur_ktime = 0;
-	plug->mq_list = NULL;
-	plug->cached_rq = NULL;
+	rq_list_init(&plug->mq_list);
+	rq_list_init(&plug->cached_rqs);
 	plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
 	plug->rq_count = 0;
 	plug->multiple_queues = false;
@@ -1217,7 +1217,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
 	 * queue for cached requests, we don't want a blocked task holding
 	 * up a queue freeze/quiesce event.
 	 */
-	if (unlikely(!rq_list_empty(plug->cached_rq)))
+	if (unlikely(!rq_list_empty(&plug->cached_rqs)))
 		blk_mq_free_plug_rqs(plug);
 
 	plug->cur_ktime = 0;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index df36f83f3738..e0b28e9298c9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -1179,7 +1179,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	struct blk_plug *plug = current->plug;
 	struct request *rq;
 
-	if (!plug || rq_list_empty(plug->mq_list))
+	if (!plug || rq_list_empty(&plug->mq_list))
 		return false;
 
 	rq_list_for_each(&plug->mq_list, rq) {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3c6cadba75e3..ff0b819e35fc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -478,7 +478,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
 		prefetch(tags->static_rqs[tag]);
 		tag_mask &= ~(1UL << i);
 		rq = blk_mq_rq_ctx_init(data, tags, tag);
-		rq_list_add(data->cached_rq, rq);
+		rq_list_add_head(data->cached_rqs, rq);
 		nr++;
 	}
 	if (!(data->rq_flags & RQF_SCHED_TAGS))
@@ -487,7 +487,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
 	percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
 	data->nr_tags -= nr;
 
-	return rq_list_pop(data->cached_rq);
+	return rq_list_pop(data->cached_rqs);
 }
 
 static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
@@ -584,7 +584,7 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
 		.flags		= flags,
 		.cmd_flags	= opf,
 		.nr_tags	= plug->nr_ios,
-		.cached_rq	= &plug->cached_rq,
+		.cached_rqs	= &plug->cached_rqs,
 	};
 	struct request *rq;
 
@@ -609,14 +609,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
 	if (!plug)
 		return NULL;
 
-	if (rq_list_empty(plug->cached_rq)) {
+	if (rq_list_empty(&plug->cached_rqs)) {
 		if (plug->nr_ios == 1)
 			return NULL;
 		rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
 		if (!rq)
 			return NULL;
 	} else {
-		rq = rq_list_peek(&plug->cached_rq);
+		rq = rq_list_peek(&plug->cached_rqs);
 		if (!rq || rq->q != q)
 			return NULL;
 
@@ -625,7 +625,7 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
 		if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
 			return NULL;
 
-		plug->cached_rq = rq_list_next(rq);
+		rq_list_pop(&plug->cached_rqs);
 		blk_mq_rq_time_init(rq, blk_time_get_ns());
 	}
 
@@ -802,7 +802,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug)
 {
 	struct request *rq;
 
-	while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
+	while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL)
 		blk_mq_free_request(rq);
 }
 
@@ -1392,8 +1392,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
 	 */
 	if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
 		plug->has_elevator = true;
-	rq->rq_next = NULL;
-	rq_list_add(&plug->mq_list, rq);
+	rq_list_add_head(&plug->mq_list, rq);
 	plug->rq_count++;
 }
 
@@ -2785,7 +2784,7 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug)
 	blk_status_t ret = BLK_STS_OK;
 
 	while ((rq = rq_list_pop(&plug->mq_list))) {
-		bool last = rq_list_empty(plug->mq_list);
+		bool last = rq_list_empty(&plug->mq_list);
 
 		if (hctx != rq->mq_hctx) {
 			if (hctx) {
@@ -2828,8 +2827,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
 {
 	struct blk_mq_hw_ctx *this_hctx = NULL;
 	struct blk_mq_ctx *this_ctx = NULL;
-	struct request *requeue_list = NULL;
-	struct request **requeue_lastp = &requeue_list;
+	struct rq_list requeue_list = {};
 	unsigned int depth = 0;
 	bool is_passthrough = false;
 	LIST_HEAD(list);
@@ -2843,12 +2841,12 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
 			is_passthrough = blk_rq_is_passthrough(rq);
 		} else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
 			   is_passthrough != blk_rq_is_passthrough(rq)) {
-			rq_list_add_tail(&requeue_lastp, rq);
+			rq_list_add_tail(&requeue_list, rq);
 			continue;
 		}
 		list_add(&rq->queuelist, &list);
 		depth++;
-	} while (!rq_list_empty(plug->mq_list));
+	} while (!rq_list_empty(&plug->mq_list));
 
 	plug->mq_list = requeue_list;
 	trace_block_unplug(this_hctx->queue, depth, !from_sched);
@@ -2903,19 +2901,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		if (q->mq_ops->queue_rqs) {
 			blk_mq_run_dispatch_ops(q,
 				__blk_mq_flush_plug_list(q, plug));
-			if (rq_list_empty(plug->mq_list))
+			if (rq_list_empty(&plug->mq_list))
 				return;
 		}
 
 		blk_mq_run_dispatch_ops(q,
 				blk_mq_plug_issue_direct(plug));
-		if (rq_list_empty(plug->mq_list))
+		if (rq_list_empty(&plug->mq_list))
 			return;
 	}
 
 	do {
 		blk_mq_dispatch_plug_list(plug, from_schedule);
-	} while (!rq_list_empty(plug->mq_list));
+	} while (!rq_list_empty(&plug->mq_list));
 }
 
 static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
@@ -2980,7 +2978,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
 	if (plug) {
 		data.nr_tags = plug->nr_ios;
 		plug->nr_ios = 1;
-		data.cached_rq = &plug->cached_rq;
+		data.cached_rqs = &plug->cached_rqs;
 	}
 
 	rq = __blk_mq_alloc_requests(&data);
@@ -3003,7 +3001,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
 
 	if (!plug)
 		return NULL;
-	rq = rq_list_peek(&plug->cached_rq);
+	rq = rq_list_peek(&plug->cached_rqs);
 	if (!rq || rq->q != q)
 		return NULL;
 	if (type != rq->mq_hctx->type &&
@@ -3017,14 +3015,14 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
 static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
 		struct bio *bio)
 {
-	WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
+	if (rq_list_pop(&plug->cached_rqs) != rq)
+		WARN_ON_ONCE(1);
 
 	/*
 	 * If any qos ->throttle() end up blocking, we will have flushed the
 	 * plug and hence killed the cached_rq list as well. Pop this entry
 	 * before we throttle.
 	 */
-	plug->cached_rq = rq_list_next(rq);
 	rq_qos_throttle(rq->q, bio);
 
 	blk_mq_rq_time_init(rq, blk_time_get_ns());
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f4ac1af77a26..89a20fffa4b1 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -155,7 +155,7 @@ struct blk_mq_alloc_data {
 
 	/* allocate multiple requests/tags in one go */
 	unsigned int nr_tags;
-	struct request **cached_rq;
+	struct rq_list *cached_rqs;
 
 	/* input & output parameter */
 	struct blk_mq_ctx *ctx;
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 2f0431e42c49..3c3d8d200abb 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1638,10 +1638,9 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_STS_OK;
 }
 
-static void null_queue_rqs(struct request **rqlist)
+static void null_queue_rqs(struct rq_list *rqlist)
 {
-	struct request *requeue_list = NULL;
-	struct request **requeue_lastp = &requeue_list;
+	struct rq_list requeue_list = {};
 	struct blk_mq_queue_data bd = { };
 	blk_status_t ret;
 
@@ -1651,8 +1650,8 @@ static void null_queue_rqs(struct request **rqlist)
 		bd.rq = rq;
 		ret = null_queue_rq(rq->mq_hctx, &bd);
 		if (ret != BLK_STS_OK)
-			rq_list_add_tail(&requeue_lastp, rq);
-	} while (!rq_list_empty(*rqlist));
+			rq_list_add_tail(&requeue_list, rq);
+	} while (!rq_list_empty(rqlist));
 
 	*rqlist = requeue_list;
 }
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index b25f7c06a28e..a19f24c19140 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -472,7 +472,7 @@ static bool virtblk_prep_rq_batch(struct request *req)
 }
 
 static void virtblk_add_req_batch(struct virtio_blk_vq *vq,
-					struct request **rqlist)
+		struct rq_list *rqlist)
 {
 	struct request *req;
 	unsigned long flags;
@@ -499,11 +499,10 @@ static void virtblk_add_req_batch(struct virtio_blk_vq *vq,
 		virtqueue_notify(vq->vq);
 }
 
-static void virtio_queue_rqs(struct request **rqlist)
+static void virtio_queue_rqs(struct rq_list *rqlist)
 {
-	struct request *submit_list = NULL;
-	struct request *requeue_list = NULL;
-	struct request **requeue_lastp = &requeue_list;
+	struct rq_list submit_list = { };
+	struct rq_list requeue_list = { };
 	struct virtio_blk_vq *vq = NULL;
 	struct request *req;
 
@@ -515,9 +514,9 @@ static void virtio_queue_rqs(struct request **rqlist)
 		vq = this_vq;
 
 		if (virtblk_prep_rq_batch(req))
-			rq_list_add(&submit_list, req); /* reverse order */
+			rq_list_add_head(&submit_list, req); /* reverse order */
 		else
-			rq_list_add_tail(&requeue_lastp, req);
+			rq_list_add_tail(&requeue_list, req);
 	}
 
 	if (vq)
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index b1387dc459a3..7cd1102a8d2c 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -649,7 +649,7 @@ static bool apple_nvme_handle_cq(struct apple_nvme_queue *q, bool force)
 
 	found = apple_nvme_poll_cq(q, &iob);
 
-	if (!rq_list_empty(iob.req_list))
+	if (!rq_list_empty(&iob.req_list))
 		apple_nvme_complete_batch(&iob);
 
 	return found;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ec1c44c75d92..707dbe8be6a2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -904,7 +904,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_STS_OK;
 }
 
-static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
+static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct rq_list *rqlist)
 {
 	struct request *req;
 
@@ -932,11 +932,10 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
 	return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
 }
 
-static void nvme_queue_rqs(struct request **rqlist)
+static void nvme_queue_rqs(struct rq_list *rqlist)
 {
-	struct request *submit_list = NULL;
-	struct request *requeue_list = NULL;
-	struct request **requeue_lastp = &requeue_list;
+	struct rq_list submit_list = { };
+	struct rq_list requeue_list = { };
 	struct nvme_queue *nvmeq = NULL;
 	struct request *req;
 
@@ -946,9 +945,9 @@ static void nvme_queue_rqs(struct request **rqlist)
 		nvmeq = req->mq_hctx->driver_data;
 
 		if (nvme_prep_rq_batch(nvmeq, req))
-			rq_list_add(&submit_list, req); /* reverse order */
+			rq_list_add_head(&submit_list, req); /* reverse order */
 		else
-			rq_list_add_tail(&requeue_lastp, req);
+			rq_list_add_tail(&requeue_list, req);
 	}
 
 	if (nvmeq)
@@ -1080,7 +1079,7 @@ static irqreturn_t nvme_irq(int irq, void *data)
 	DEFINE_IO_COMP_BATCH(iob);
 
 	if (nvme_poll_cq(nvmeq, &iob)) {
-		if (!rq_list_empty(iob.req_list))
+		if (!rq_list_empty(&iob.req_list))
 			nvme_pci_complete_batch(&iob);
 		return IRQ_HANDLED;
 	}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ad26a41d13f9..c61e04365677 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -229,44 +229,60 @@ static inline unsigned short req_get_ioprio(struct request *req)
 #define rq_dma_dir(rq) \
 	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
 
-#define rq_list_add(listptr, rq)	do {		\
-	(rq)->rq_next = *(listptr);			\
-	*(listptr) = rq;				\
-} while (0)
-
-#define rq_list_add_tail(lastpptr, rq)	do {		\
-	(rq)->rq_next = NULL;				\
-	**(lastpptr) = rq;				\
-	*(lastpptr) = &rq->rq_next;			\
-} while (0)
-
-#define rq_list_pop(listptr)				\
-({							\
-	struct request *__req = NULL;			\
-	if ((listptr) && *(listptr))	{		\
-		__req = *(listptr);			\
-		*(listptr) = __req->rq_next;		\
-	}						\
-	__req;						\
-})
+static inline int rq_list_empty(const struct rq_list *rl)
+{
+	return rl->head == NULL;
+}
 
-#define rq_list_peek(listptr)				\
-({							\
-	struct request *__req = NULL;			\
-	if ((listptr) && *(listptr))			\
-		__req = *(listptr);			\
-	__req;						\
-})
+static inline void rq_list_init(struct rq_list *rl)
+{
+	rl->head = NULL;
+	rl->tail = NULL;
+}
+
+static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq)
+{
+	rq->rq_next = NULL;
+	if (rl->tail)
+		rl->tail->rq_next = rq;
+	else
+		rl->head = rq;
+	rl->tail = rq;
+}
+
+static inline void rq_list_add_head(struct rq_list *rl, struct request *rq)
+{
+	rq->rq_next = rl->head;
+	rl->head = rq;
+	if (!rl->tail)
+		rl->tail = rq;
+}
+
+static inline struct request *rq_list_pop(struct rq_list *rl)
+{
+	struct request *rq = rl->head;
+
+	if (rq) {
+		rl->head = rl->head->rq_next;
+		if (!rl->head)
+			rl->tail = NULL;
+		rq->rq_next = NULL;
+	}
+
+	return rq;
+}
 
-#define rq_list_for_each(listptr, pos)			\
-	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))
+static inline struct request *rq_list_peek(struct rq_list *rl)
+{
+	return rl->head;
+}
 
-#define rq_list_for_each_safe(listptr, pos, nxt)			\
-	for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos);	\
-		pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL)
+#define rq_list_for_each(rl, pos)					\
+	for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next)
 
-#define rq_list_next(rq)	(rq)->rq_next
-#define rq_list_empty(list)	((list) == (struct request *) NULL)
+#define rq_list_for_each_safe(rl, pos, nxt)				\
+	for (pos = rq_list_peek((rl)), nxt = pos->rq_next;		\
+		pos; pos = nxt, nxt = pos ? pos->rq_next : NULL)
 
 /**
  * enum blk_eh_timer_return - How the timeout handler should proceed
@@ -559,7 +575,7 @@ struct blk_mq_ops {
 	 * empty the @rqlist completely, then the rest will be queued
 	 * individually by the block layer upon return.
 	 */
-	void (*queue_rqs)(struct request **rqlist);
+	void (*queue_rqs)(struct rq_list *rqlist);
 
 	/**
 	 * @get_budget: Reserve budget before queue request, once .queue_rq is
@@ -868,7 +884,7 @@ static inline bool blk_mq_add_to_batch(struct request *req,
 	else if (iob->complete != complete)
 		return false;
 	iob->need_ts |= blk_mq_need_time_stamp(req);
-	rq_list_add(&iob->req_list, req);
+	rq_list_add_head(&iob->req_list, req);
 	return true;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cd905afaf51a..00212e96261a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1007,6 +1007,11 @@ extern void blk_put_queue(struct request_queue *);
 void blk_mark_disk_dead(struct gendisk *disk);
 
 #ifdef CONFIG_BLOCK
+struct rq_list {
+	struct request *head;
+	struct request *tail;
+};
+
 /*
  * blk_plug permits building a queue of related requests by holding the I/O
  * fragments for a short period. This allows merging of sequential requests
@@ -1019,10 +1024,10 @@ void blk_mark_disk_dead(struct gendisk *disk);
  * blk_flush_plug() is called.
  */
 struct blk_plug {
-	struct request *mq_list; /* blk-mq requests */
+	struct rq_list mq_list; /* blk-mq requests */
 
 	/* if ios_left is > 1, we can batch tag/rq allocations */
-	struct request *cached_rq;
+	struct rq_list cached_rqs;
 	u64 cur_ktime;
 	unsigned short nr_ios;
 
@@ -1684,7 +1689,7 @@ int bdev_thaw(struct block_device *bdev);
 void bdev_fput(struct file *bdev_file);
 
 struct io_comp_batch {
-	struct request *req_list;
+	struct rq_list req_list;
 	bool need_ts;
 	void (*complete)(struct io_comp_batch *);
 };
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 354c4e175654..9daef985543e 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1160,12 +1160,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 			poll_flags |= BLK_POLL_ONESHOT;
 
 		/* iopoll may have completed current req */
-		if (!rq_list_empty(iob.req_list) ||
+		if (!rq_list_empty(&iob.req_list) ||
 		    READ_ONCE(req->iopoll_completed))
 			break;
 	}
 
-	if (!rq_list_empty(iob.req_list))
+	if (!rq_list_empty(&iob.req_list))
 		iob.complete(&iob);
 	else if (!pos)
 		return 0;
-- 
cgit v1.2.3


From 00e8d290b55f2fa5c5a0500b4dccf9e090650447 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 13 Nov 2024 16:20:46 +0100
Subject: block: don't reorder requests in blk_mq_add_to_batch

LIFO ordering for batched completions is a bit unexpected and also
defeats some merging optimizations in e.g. the XFS buffered write
code.  Now that we can easily add the request to the tail of the list
do that.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20241113152050.157179-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c61e04365677..c596e0e4cb75 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -884,7 +884,7 @@ static inline bool blk_mq_add_to_batch(struct request *req,
 	else if (iob->complete != complete)
 		return false;
 	iob->need_ts |= blk_mq_need_time_stamp(req);
-	rq_list_add_head(&iob->req_list, req);
+	rq_list_add_tail(&iob->req_list, req);
 	return true;
 }
 
-- 
cgit v1.2.3


From 27184f8905ba680f22abf1707fbed24036a67119 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Wed, 13 Nov 2024 07:54:12 +0200
Subject: tpm: Opt-in in disable PCR integrity protection

The initial HMAC session feature added TPM bus encryption and/or integrity
protection to various in-kernel TPM operations. This can cause performance
bottlenecks with IMA, as it heavily utilizes PCR extend operations.

In order to mitigate this performance issue, introduce a kernel
command-line parameter to the TPM driver for disabling the integrity
protection for PCR extend operations (i.e. TPM2_PCR_Extend).

Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Link: https://lore.kernel.org/linux-integrity/20241015193916.59964-1-zohar@linux.ibm.com/
Fixes: 6519fea6fd37 ("tpm: add hmac checks to tpm2_pcr_extend()")
Tested-by: Mimi Zohar <zohar@linux.ibm.com>
Co-developed-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Co-developed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  9 +++++
 drivers/char/tpm/tpm-buf.c                      | 20 ++++++++++
 drivers/char/tpm/tpm2-cmd.c                     | 30 +++++++++++----
 drivers/char/tpm/tpm2-sessions.c                | 51 ++++++++++++++-----------
 include/linux/tpm.h                             |  3 ++
 5 files changed, 82 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1666576acc0e..d401577b5a6a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6727,6 +6727,15 @@
 	torture.verbose_sleep_duration= [KNL]
 			Duration of each verbose-printk() sleep in jiffies.
 
+	tpm.disable_pcr_integrity= [HW,TPM]
+			Do not protect PCR registers from unintended physical
+			access, or interposers in the bus by the means of
+			having an integrity protected session wrapped around
+			TPM2_PCR_Extend command. Consider this in a situation
+			where TPM is heavily utilized by IMA, thus protection
+			causing a major performance hit, and the space where
+			machines are deployed is by other means guarded.
+
 	tpm_suspend_pcr=[HW,TPM]
 			Format: integer pcr id
 			Specify that at suspend time, the tpm driver
diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c
index cad0048bcc3c..e49a19fea3bd 100644
--- a/drivers/char/tpm/tpm-buf.c
+++ b/drivers/char/tpm/tpm-buf.c
@@ -146,6 +146,26 @@ void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value)
 }
 EXPORT_SYMBOL_GPL(tpm_buf_append_u32);
 
+/**
+ * tpm_buf_append_handle() - Add a handle
+ * @chip:	&tpm_chip instance
+ * @buf:	&tpm_buf instance
+ * @handle:	a TPM object handle
+ *
+ * Add a handle to the buffer, and increase the count tracking the number of
+ * handles in the command buffer. Works only for command buffers.
+ */
+void tpm_buf_append_handle(struct tpm_chip *chip, struct tpm_buf *buf, u32 handle)
+{
+	if (buf->flags & TPM_BUF_TPM2B) {
+		dev_err(&chip->dev, "Invalid buffer type (TPM2B)\n");
+		return;
+	}
+
+	tpm_buf_append_u32(buf, handle);
+	buf->handles++;
+}
+
 /**
  * tpm_buf_read() - Read from a TPM buffer
  * @buf:	&tpm_buf instance
diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index 1e856259219e..dfdcbd009720 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -14,6 +14,10 @@
 #include "tpm.h"
 #include <crypto/hash_info.h>
 
+static bool disable_pcr_integrity;
+module_param(disable_pcr_integrity, bool, 0444);
+MODULE_PARM_DESC(disable_pcr_integrity, "Disable integrity protection of TPM2_PCR_Extend");
+
 static struct tpm2_hash tpm2_hash_map[] = {
 	{HASH_ALGO_SHA1, TPM_ALG_SHA1},
 	{HASH_ALGO_SHA256, TPM_ALG_SHA256},
@@ -232,18 +236,26 @@ int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
 	int rc;
 	int i;
 
-	rc = tpm2_start_auth_session(chip);
-	if (rc)
-		return rc;
+	if (!disable_pcr_integrity) {
+		rc = tpm2_start_auth_session(chip);
+		if (rc)
+			return rc;
+	}
 
 	rc = tpm_buf_init(&buf, TPM2_ST_SESSIONS, TPM2_CC_PCR_EXTEND);
 	if (rc) {
-		tpm2_end_auth_session(chip);
+		if (!disable_pcr_integrity)
+			tpm2_end_auth_session(chip);
 		return rc;
 	}
 
-	tpm_buf_append_name(chip, &buf, pcr_idx, NULL);
-	tpm_buf_append_hmac_session(chip, &buf, 0, NULL, 0);
+	if (!disable_pcr_integrity) {
+		tpm_buf_append_name(chip, &buf, pcr_idx, NULL);
+		tpm_buf_append_hmac_session(chip, &buf, 0, NULL, 0);
+	} else {
+		tpm_buf_append_handle(chip, &buf, pcr_idx);
+		tpm_buf_append_auth(chip, &buf, 0, NULL, 0);
+	}
 
 	tpm_buf_append_u32(&buf, chip->nr_allocated_banks);
 
@@ -253,9 +265,11 @@ int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
 			       chip->allocated_banks[i].digest_size);
 	}
 
-	tpm_buf_fill_hmac_session(chip, &buf);
+	if (!disable_pcr_integrity)
+		tpm_buf_fill_hmac_session(chip, &buf);
 	rc = tpm_transmit_cmd(chip, &buf, 0, "attempting extend a PCR value");
-	rc = tpm_buf_check_hmac_response(chip, &buf, rc);
+	if (!disable_pcr_integrity)
+		rc = tpm_buf_check_hmac_response(chip, &buf, rc);
 
 	tpm_buf_destroy(&buf);
 
diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index 0739830904b2..52d3523042fb 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -237,9 +237,7 @@ void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
 #endif
 
 	if (!tpm2_chip_auth(chip)) {
-		tpm_buf_append_u32(buf, handle);
-		/* count the number of handles in the upper bits of flags */
-		buf->handles++;
+		tpm_buf_append_handle(chip, buf, handle);
 		return;
 	}
 
@@ -272,6 +270,31 @@ void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
 }
 EXPORT_SYMBOL_GPL(tpm_buf_append_name);
 
+void tpm_buf_append_auth(struct tpm_chip *chip, struct tpm_buf *buf,
+			 u8 attributes, u8 *passphrase, int passphrase_len)
+{
+	/* offset tells us where the sessions area begins */
+	int offset = buf->handles * 4 + TPM_HEADER_SIZE;
+	u32 len = 9 + passphrase_len;
+
+	if (tpm_buf_length(buf) != offset) {
+		/* not the first session so update the existing length */
+		len += get_unaligned_be32(&buf->data[offset]);
+		put_unaligned_be32(len, &buf->data[offset]);
+	} else {
+		tpm_buf_append_u32(buf, len);
+	}
+	/* auth handle */
+	tpm_buf_append_u32(buf, TPM2_RS_PW);
+	/* nonce */
+	tpm_buf_append_u16(buf, 0);
+	/* attributes */
+	tpm_buf_append_u8(buf, 0);
+	/* passphrase */
+	tpm_buf_append_u16(buf, passphrase_len);
+	tpm_buf_append(buf, passphrase, passphrase_len);
+}
+
 /**
  * tpm_buf_append_hmac_session() - Append a TPM session element
  * @chip: the TPM chip structure
@@ -309,26 +332,8 @@ void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
 #endif
 
 	if (!tpm2_chip_auth(chip)) {
-		/* offset tells us where the sessions area begins */
-		int offset = buf->handles * 4 + TPM_HEADER_SIZE;
-		u32 len = 9 + passphrase_len;
-
-		if (tpm_buf_length(buf) != offset) {
-			/* not the first session so update the existing length */
-			len += get_unaligned_be32(&buf->data[offset]);
-			put_unaligned_be32(len, &buf->data[offset]);
-		} else {
-			tpm_buf_append_u32(buf, len);
-		}
-		/* auth handle */
-		tpm_buf_append_u32(buf, TPM2_RS_PW);
-		/* nonce */
-		tpm_buf_append_u16(buf, 0);
-		/* attributes */
-		tpm_buf_append_u8(buf, 0);
-		/* passphrase */
-		tpm_buf_append_u16(buf, passphrase_len);
-		tpm_buf_append(buf, passphrase, passphrase_len);
+		tpm_buf_append_auth(chip, buf, attributes, passphrase,
+				    passphrase_len);
 		return;
 	}
 
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 587b96b4418e..20a40ade8030 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -421,6 +421,7 @@ void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value);
 u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset);
 u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset);
 u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset);
+void tpm_buf_append_handle(struct tpm_chip *chip, struct tpm_buf *buf, u32 handle);
 
 /*
  * Check if TPM device is in the firmware upgrade mode.
@@ -505,6 +506,8 @@ void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
 void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
 				 u8 attributes, u8 *passphrase,
 				 int passphraselen);
+void tpm_buf_append_auth(struct tpm_chip *chip, struct tpm_buf *buf,
+			 u8 attributes, u8 *passphrase, int passphraselen);
 static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip,
 						   struct tpm_buf *buf,
 						   u8 attributes,
-- 
cgit v1.2.3


From 704806ca400e5daa86c110f14bfdda9d28203bb7 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Wed, 13 Nov 2024 13:51:55 +0200
Subject: virtio: Extend the admin command to include the result size

Extend the admin command by incorporating a result size field.

This allows higher layers to determine the actual result size from the
backend when this information is not included in the result_sg.

The additional information introduced here will be used in subsequent
patches of this series.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20241113115200.209269-3-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/virtio/virtio_pci_modern.c | 4 +++-
 include/linux/virtio.h             | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 9193c30d640a..487d04610ecb 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -64,8 +64,10 @@ void vp_modern_avq_done(struct virtqueue *vq)
 	spin_lock_irqsave(&admin_vq->lock, flags);
 	do {
 		virtqueue_disable_cb(vq);
-		while ((cmd = virtqueue_get_buf(vq, &len)))
+		while ((cmd = virtqueue_get_buf(vq, &len))) {
+			cmd->result_sg_size = len;
 			complete(&cmd->completion);
+		}
 	} while (!virtqueue_enable_cb(vq));
 	spin_unlock_irqrestore(&admin_vq->lock, flags);
 }
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 306137a15d07..b5f7a611715a 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -111,6 +111,7 @@ struct virtio_admin_cmd {
 	struct scatterlist *data_sg;
 	struct scatterlist *result_sg;
 	struct completion completion;
+	u32 result_sg_size;
 	int ret;
 };
 
-- 
cgit v1.2.3


From 52a22c0ed03ce23f20df81f79b23cb6637716fae Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Wed, 13 Nov 2024 13:51:57 +0200
Subject: virtio-pci: Introduce APIs to execute device parts admin commands

Introduce APIs to handle the execution of device parts admin commands.

These APIs cover functionalities such as mode setting, object creation
and destruction, and operations like parts get/set and metadata
retrieval.

These APIs will be utilized in upcoming patches within this series.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20241113115200.209269-5-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/virtio/virtio_pci_common.h |   8 +-
 drivers/virtio/virtio_pci_modern.c | 348 +++++++++++++++++++++++++++++++++++++
 include/linux/virtio_pci_admin.h   |  11 ++
 3 files changed, 366 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 04b1d17663b3..0d00740cca07 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -173,7 +173,13 @@ struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev);
 #define VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP \
 	(BIT_ULL(VIRTIO_ADMIN_CMD_CAP_ID_LIST_QUERY) | \
 	 BIT_ULL(VIRTIO_ADMIN_CMD_DRIVER_CAP_SET) | \
-	 BIT_ULL(VIRTIO_ADMIN_CMD_DEVICE_CAP_GET))
+	 BIT_ULL(VIRTIO_ADMIN_CMD_DEVICE_CAP_GET) | \
+	 BIT_ULL(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_CREATE) | \
+	 BIT_ULL(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_DESTROY) | \
+	 BIT_ULL(VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_GET) | \
+	 BIT_ULL(VIRTIO_ADMIN_CMD_DEV_PARTS_GET) | \
+	 BIT_ULL(VIRTIO_ADMIN_CMD_DEV_PARTS_SET) | \
+	 BIT_ULL(VIRTIO_ADMIN_CMD_DEV_MODE_SET))
 
 /* Unlike modern drivers which support hardware virtio devices, legacy drivers
  * assume software-based devices: e.g. they don't use proper memory barriers
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 8ddac2829bc8..3f5aba71cfde 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -15,6 +15,7 @@
  */
 
 #include <linux/delay.h>
+#include <linux/virtio_pci_admin.h>
 #define VIRTIO_PCI_NO_LEGACY
 #define VIRTIO_RING_NO_LEGACY
 #include "virtio_pci_common.h"
@@ -875,6 +876,353 @@ static bool vp_get_shm_region(struct virtio_device *vdev,
 	return true;
 }
 
+/*
+ * virtio_pci_admin_has_dev_parts - Checks whether the device parts
+ * functionality is supported
+ * @pdev: VF pci_dev
+ *
+ * Returns true on success.
+ */
+bool virtio_pci_admin_has_dev_parts(struct pci_dev *pdev)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct virtio_pci_device *vp_dev;
+
+	if (!virtio_dev)
+		return false;
+
+	if (!virtio_has_feature(virtio_dev, VIRTIO_F_ADMIN_VQ))
+		return false;
+
+	vp_dev = to_vp_device(virtio_dev);
+
+	if (!((vp_dev->admin_vq.supported_cmds & VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP) ==
+		VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP))
+		return false;
+
+	return vp_dev->admin_vq.max_dev_parts_objects;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_has_dev_parts);
+
+/*
+ * virtio_pci_admin_mode_set - Sets the mode of a member device
+ * @pdev: VF pci_dev
+ * @flags: device mode's flags
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_mode_set(struct pci_dev *pdev, u8 flags)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct virtio_admin_cmd_dev_mode_set_data *data;
+	struct virtio_admin_cmd cmd = {};
+	struct scatterlist data_sg;
+	int vf_id;
+	int ret;
+
+	if (!virtio_dev)
+		return -ENODEV;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return vf_id;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->flags = flags;
+	sg_init_one(&data_sg, data, sizeof(*data));
+	cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_MODE_SET);
+	cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+	cmd.group_member_id = cpu_to_le64(vf_id + 1);
+	cmd.data_sg = &data_sg;
+	ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+
+	kfree(data);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_mode_set);
+
+/*
+ * virtio_pci_admin_obj_create - Creates an object for a given type and operation,
+ * following the max objects that can be created for that request.
+ * @pdev: VF pci_dev
+ * @obj_type: Object type
+ * @operation_type: Operation type
+ * @obj_id: Output unique object id
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_obj_create(struct pci_dev *pdev, u16 obj_type, u8 operation_type,
+				u32 *obj_id)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	u16 data_size = sizeof(struct virtio_admin_cmd_resource_obj_create_data);
+	struct virtio_admin_cmd_resource_obj_create_data *obj_create_data;
+	struct virtio_resource_obj_dev_parts obj_dev_parts = {};
+	struct virtio_pci_admin_vq *avq;
+	struct virtio_admin_cmd cmd = {};
+	struct scatterlist data_sg;
+	void *data;
+	int id = -1;
+	int vf_id;
+	int ret;
+
+	if (!virtio_dev)
+		return -ENODEV;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return vf_id;
+
+	if (obj_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS)
+		return -EOPNOTSUPP;
+
+	if (operation_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET &&
+	    operation_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET)
+		return -EINVAL;
+
+	avq = &to_vp_device(virtio_dev)->admin_vq;
+	if (!avq->max_dev_parts_objects)
+		return -EOPNOTSUPP;
+
+	id = ida_alloc_range(&avq->dev_parts_ida, 0,
+			     avq->max_dev_parts_objects - 1, GFP_KERNEL);
+	if (id < 0)
+		return id;
+
+	*obj_id = id;
+	data_size += sizeof(obj_dev_parts);
+	data = kzalloc(data_size, GFP_KERNEL);
+	if (!data) {
+		ret = -ENOMEM;
+		goto end;
+	}
+
+	obj_create_data = data;
+	obj_create_data->hdr.type = cpu_to_le16(obj_type);
+	obj_create_data->hdr.id = cpu_to_le32(*obj_id);
+	obj_dev_parts.type = operation_type;
+	memcpy(obj_create_data->resource_obj_specific_data, &obj_dev_parts,
+	       sizeof(obj_dev_parts));
+	sg_init_one(&data_sg, data, data_size);
+	cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_CREATE);
+	cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+	cmd.group_member_id = cpu_to_le64(vf_id + 1);
+	cmd.data_sg = &data_sg;
+	ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+
+	kfree(data);
+end:
+	if (ret)
+		ida_free(&avq->dev_parts_ida, id);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_obj_create);
+
+/*
+ * virtio_pci_admin_obj_destroy - Destroys an object of a given type and id
+ * @pdev: VF pci_dev
+ * @obj_type: Object type
+ * @id: Object id
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_obj_destroy(struct pci_dev *pdev, u16 obj_type, u32 id)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct virtio_admin_cmd_resource_obj_cmd_hdr *data;
+	struct virtio_pci_device *vp_dev;
+	struct virtio_admin_cmd cmd = {};
+	struct scatterlist data_sg;
+	int vf_id;
+	int ret;
+
+	if (!virtio_dev)
+		return -ENODEV;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return vf_id;
+
+	if (obj_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS)
+		return -EINVAL;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->type = cpu_to_le16(obj_type);
+	data->id = cpu_to_le32(id);
+	sg_init_one(&data_sg, data, sizeof(*data));
+	cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_DESTROY);
+	cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+	cmd.group_member_id = cpu_to_le64(vf_id + 1);
+	cmd.data_sg = &data_sg;
+	ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+	if (!ret) {
+		vp_dev = to_vp_device(virtio_dev);
+		ida_free(&vp_dev->admin_vq.dev_parts_ida, id);
+	}
+
+	kfree(data);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_obj_destroy);
+
+/*
+ * virtio_pci_admin_dev_parts_metadata_get - Gets the metadata of the device parts
+ * identified by the below attributes.
+ * @pdev: VF pci_dev
+ * @obj_type: Object type
+ * @id: Object id
+ * @metadata_type: Metadata type
+ * @out: Upon success holds the output for 'metadata type size'
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_dev_parts_metadata_get(struct pci_dev *pdev, u16 obj_type,
+					    u32 id, u8 metadata_type, u32 *out)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct virtio_admin_cmd_dev_parts_metadata_result *result;
+	struct virtio_admin_cmd_dev_parts_metadata_data *data;
+	struct scatterlist data_sg, result_sg;
+	struct virtio_admin_cmd cmd = {};
+	int vf_id;
+	int ret;
+
+	if (!virtio_dev)
+		return -ENODEV;
+
+	if (metadata_type != VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE)
+		return -EOPNOTSUPP;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return vf_id;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	result = kzalloc(sizeof(*result), GFP_KERNEL);
+	if (!result) {
+		ret = -ENOMEM;
+		goto end;
+	}
+
+	data->hdr.type = cpu_to_le16(obj_type);
+	data->hdr.id = cpu_to_le32(id);
+	data->type = metadata_type;
+	sg_init_one(&data_sg, data, sizeof(*data));
+	sg_init_one(&result_sg, result, sizeof(*result));
+	cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_GET);
+	cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+	cmd.group_member_id = cpu_to_le64(vf_id + 1);
+	cmd.data_sg = &data_sg;
+	cmd.result_sg = &result_sg;
+	ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+	if (!ret)
+		*out = le32_to_cpu(result->parts_size.size);
+
+	kfree(result);
+end:
+	kfree(data);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_dev_parts_metadata_get);
+
+/*
+ * virtio_pci_admin_dev_parts_get - Gets the device parts identified by the below attributes.
+ * @pdev: VF pci_dev
+ * @obj_type: Object type
+ * @id: Object id
+ * @get_type: Get type
+ * @res_sg: Upon success holds the output result data
+ * @res_size: Upon success holds the output result size
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_dev_parts_get(struct pci_dev *pdev, u16 obj_type, u32 id,
+				   u8 get_type, struct scatterlist *res_sg,
+				   u32 *res_size)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct virtio_admin_cmd_dev_parts_get_data *data;
+	struct scatterlist data_sg;
+	struct virtio_admin_cmd cmd = {};
+	int vf_id;
+	int ret;
+
+	if (!virtio_dev)
+		return -ENODEV;
+
+	if (get_type != VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL)
+		return -EOPNOTSUPP;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return vf_id;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->hdr.type = cpu_to_le16(obj_type);
+	data->hdr.id = cpu_to_le32(id);
+	data->type = get_type;
+	sg_init_one(&data_sg, data, sizeof(*data));
+	cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_PARTS_GET);
+	cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+	cmd.group_member_id = cpu_to_le64(vf_id + 1);
+	cmd.data_sg = &data_sg;
+	cmd.result_sg = res_sg;
+	ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+	if (!ret)
+		*res_size = cmd.result_sg_size;
+
+	kfree(data);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_dev_parts_get);
+
+/*
+ * virtio_pci_admin_dev_parts_set - Sets the device parts identified by the below attributes.
+ * @pdev: VF pci_dev
+ * @data_sg: The device parts data, its layout follows struct virtio_admin_cmd_dev_parts_set_data
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_dev_parts_set(struct pci_dev *pdev, struct scatterlist *data_sg)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct virtio_admin_cmd cmd = {};
+	int vf_id;
+
+	if (!virtio_dev)
+		return -ENODEV;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return vf_id;
+
+	cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_PARTS_SET);
+	cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+	cmd.group_member_id = cpu_to_le64(vf_id + 1);
+	cmd.data_sg = data_sg;
+	return vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_dev_parts_set);
+
 static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
 	.get		= NULL,
 	.set		= NULL,
diff --git a/include/linux/virtio_pci_admin.h b/include/linux/virtio_pci_admin.h
index f4a100a0fe2e..dffc92c17ad2 100644
--- a/include/linux/virtio_pci_admin.h
+++ b/include/linux/virtio_pci_admin.h
@@ -20,4 +20,15 @@ int virtio_pci_admin_legacy_io_notify_info(struct pci_dev *pdev,
 					   u64 *bar_offset);
 #endif
 
+bool virtio_pci_admin_has_dev_parts(struct pci_dev *pdev);
+int virtio_pci_admin_mode_set(struct pci_dev *pdev, u8 mode);
+int virtio_pci_admin_obj_create(struct pci_dev *pdev, u16 obj_type, u8 operation_type,
+				u32 *obj_id);
+int virtio_pci_admin_obj_destroy(struct pci_dev *pdev, u16 obj_type, u32 id);
+int virtio_pci_admin_dev_parts_metadata_get(struct pci_dev *pdev, u16 obj_type,
+					    u32 id, u8 metadata_type, u32 *out);
+int virtio_pci_admin_dev_parts_get(struct pci_dev *pdev, u16 obj_type, u32 id,
+				   u8 get_type, struct scatterlist *res_sg, u32 *res_size);
+int virtio_pci_admin_dev_parts_set(struct pci_dev *pdev, struct scatterlist *data_sg);
+
 #endif /* _LINUX_VIRTIO_PCI_ADMIN_H */
-- 
cgit v1.2.3


From eb94b7bb10109a14a5431a67e5d8e31cfa06b395 Mon Sep 17 00:00:00 2001
From: Michal Luczaj <mhal@rbox.co>
Date: Mon, 11 Nov 2024 00:17:34 +0100
Subject: net: Make copy_safe_from_sockptr() match documentation

copy_safe_from_sockptr()
  return copy_from_sockptr()
    return copy_from_sockptr_offset()
      return copy_from_user()

copy_from_user() does not return an error on fault. Instead, it returns a
number of bytes that were not copied. Have it handled.

Patch has a side effect: it un-breaks garbage input handling of
nfc_llcp_setsockopt() and mISDN's data_sock_setsockopt().

Fixes: 6309863b31dd ("net: add copy_safe_from_sockptr() helper")
Signed-off-by: Michal Luczaj <mhal@rbox.co>
Link: https://patch.msgid.link/20241111-sockptr-copy-ret-fix-v1-1-a520083a93fb@rbox.co
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/sockptr.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
index fc5a206c4043..195debe2b1db 100644
--- a/include/linux/sockptr.h
+++ b/include/linux/sockptr.h
@@ -77,7 +77,9 @@ static inline int copy_safe_from_sockptr(void *dst, size_t ksize,
 {
 	if (optlen < ksize)
 		return -EINVAL;
-	return copy_from_sockptr(dst, optval, ksize);
+	if (copy_from_sockptr(dst, optval, ksize))
+		return -EFAULT;
+	return 0;
 }
 
 static inline int copy_struct_from_sockptr(void *dst, size_t ksize,
-- 
cgit v1.2.3


From 16220cb315a0a10d2a003d9af534988686e675ea Mon Sep 17 00:00:00 2001
From: Tristram Ha <tristram.ha@microchip.com>
Date: Fri, 8 Nov 2024 17:57:05 -0800
Subject: net: dsa: microchip: Add LAN9646 switch support to KSZ DSA driver

LAN9646 switch is a 6-port switch with functions like KSZ9897.  It has
4 internal PHYs and 1 SGMII port.  The chip id read from hardware is
same as KSZ9477, so software driver needs to create a new chip id and
group allowable functions under its chip data structure to
differentiate the product.

Signed-off-by: Tristram Ha <tristram.ha@microchip.com>
Link: https://patch.msgid.link/20241109015705.82685-3-Tristram.Ha@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/microchip/ksz9477.c         |  4 +++
 drivers/net/dsa/microchip/ksz9477_i2c.c     | 14 +++++++-
 drivers/net/dsa/microchip/ksz_common.c      | 50 +++++++++++++++++++++++++++--
 drivers/net/dsa/microchip/ksz_common.h      |  1 +
 drivers/net/dsa/microchip/ksz_spi.c         |  7 ++++
 include/linux/platform_data/microchip-ksz.h |  1 +
 6 files changed, 74 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c
index 0ba658a72d8f..d16817e0476f 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -1131,6 +1131,10 @@ void ksz9477_config_cpu_port(struct dsa_switch *ds)
 		if (i == dev->cpu_port)
 			continue;
 		ksz_port_stp_state_set(ds, i, BR_STATE_DISABLED);
+
+		/* Power down the internal PHY if port is unused. */
+		if (dsa_is_unused_port(ds, i) && dev->info->internal_phy[i])
+			ksz_pwrite16(dev, i, 0x100, BMCR_PDOWN);
 	}
 }
 
diff --git a/drivers/net/dsa/microchip/ksz9477_i2c.c b/drivers/net/dsa/microchip/ksz9477_i2c.c
index 7d7560f23a73..1c6d7fc16772 100644
--- a/drivers/net/dsa/microchip/ksz9477_i2c.c
+++ b/drivers/net/dsa/microchip/ksz9477_i2c.c
@@ -2,7 +2,7 @@
 /*
  * Microchip KSZ9477 series register access through I2C
  *
- * Copyright (C) 2018-2019 Microchip Technology Inc.
+ * Copyright (C) 2018-2024 Microchip Technology Inc.
  */
 
 #include <linux/i2c.h>
@@ -16,6 +16,8 @@ KSZ_REGMAP_TABLE(ksz9477, not_used, 16, 0, 0);
 
 static int ksz9477_i2c_probe(struct i2c_client *i2c)
 {
+	const struct ksz_chip_data *chip;
+	struct device *ddev = &i2c->dev;
 	struct regmap_config rc;
 	struct ksz_device *dev;
 	int i, ret;
@@ -24,6 +26,12 @@ static int ksz9477_i2c_probe(struct i2c_client *i2c)
 	if (!dev)
 		return -ENOMEM;
 
+	chip = device_get_match_data(ddev);
+	if (!chip)
+		return -EINVAL;
+
+	/* Save chip id to do special initialization when probing. */
+	dev->chip_id = chip->chip_id;
 	for (i = 0; i < __KSZ_NUM_REGMAPS; i++) {
 		rc = ksz9477_regmap_config[i];
 		rc.lock_arg = &dev->regmap_mutex;
@@ -111,6 +119,10 @@ static const struct of_device_id ksz9477_dt_ids[] = {
 		.compatible = "microchip,ksz9567",
 		.data = &ksz_switch_chips[KSZ9567]
 	},
+	{
+		.compatible = "microchip,lan9646",
+		.data = &ksz_switch_chips[LAN9646]
+	},
 	{},
 };
 MODULE_DEVICE_TABLE(of, ksz9477_dt_ids);
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index d57782cdda59..920443ee8ffd 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -1908,6 +1908,41 @@ const struct ksz_chip_data ksz_switch_chips[] = {
 		.internal_phy	= {true, true, true, true,
 				   false, false, true, true},
 	},
+
+	[LAN9646] = {
+		.chip_id = LAN9646_CHIP_ID,
+		.dev_name = "LAN9646",
+		.num_vlans = 4096,
+		.num_alus = 4096,
+		.num_statics = 16,
+		.cpu_ports = 0x7F,	/* can be configured as cpu port */
+		.port_cnt = 7,		/* total physical port count */
+		.port_nirqs = 4,
+		.num_tx_queues = 4,
+		.num_ipms = 8,
+		.ops = &ksz9477_dev_ops,
+		.phylink_mac_ops = &ksz9477_phylink_mac_ops,
+		.phy_errata_9477 = true,
+		.mib_names = ksz9477_mib_names,
+		.mib_cnt = ARRAY_SIZE(ksz9477_mib_names),
+		.reg_mib_cnt = MIB_COUNTER_NUM,
+		.regs = ksz9477_regs,
+		.masks = ksz9477_masks,
+		.shifts = ksz9477_shifts,
+		.xmii_ctrl0 = ksz9477_xmii_ctrl0,
+		.xmii_ctrl1 = ksz9477_xmii_ctrl1,
+		.supports_mii	= {false, false, false, false,
+				   false, true, true},
+		.supports_rmii	= {false, false, false, false,
+				   false, true, true},
+		.supports_rgmii = {false, false, false, false,
+				   false, true, true},
+		.internal_phy	= {true, true, true, true,
+				   true, false, false},
+		.gbit_capable	= {true, true, true, true, true, true, true},
+		.wr_table = &ksz9477_register_set,
+		.rd_table = &ksz9477_register_set,
+	},
 };
 EXPORT_SYMBOL_GPL(ksz_switch_chips);
 
@@ -2970,6 +3005,7 @@ static u32 ksz_get_phy_flags(struct dsa_switch *ds, int port)
 	case KSZ9896_CHIP_ID:
 		/* KSZ9896C Errata DS80000757A Module 3 */
 	case KSZ9897_CHIP_ID:
+	case LAN9646_CHIP_ID:
 		/* KSZ9897R Errata DS80000758C Module 4 */
 		/* Energy Efficient Ethernet (EEE) feature select must be manually disabled
 		 *   The EEE feature is enabled by default, but it is not fully
@@ -3230,6 +3266,7 @@ static void ksz_port_teardown(struct dsa_switch *ds, int port)
 	case KSZ9893_CHIP_ID:
 	case KSZ9896_CHIP_ID:
 	case KSZ9897_CHIP_ID:
+	case LAN9646_CHIP_ID:
 		if (dsa_is_user_port(ds, port))
 			ksz9477_port_acl_free(dev, port);
 	}
@@ -3286,7 +3323,8 @@ static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds,
 	    dev->chip_id == KSZ9477_CHIP_ID ||
 	    dev->chip_id == KSZ9896_CHIP_ID ||
 	    dev->chip_id == KSZ9897_CHIP_ID ||
-	    dev->chip_id == KSZ9567_CHIP_ID)
+	    dev->chip_id == KSZ9567_CHIP_ID ||
+	    dev->chip_id == LAN9646_CHIP_ID)
 		proto = DSA_TAG_PROTO_KSZ9477;
 
 	if (is_lan937x(dev))
@@ -3405,6 +3443,7 @@ static int ksz_max_mtu(struct dsa_switch *ds, int port)
 	case LAN9372_CHIP_ID:
 	case LAN9373_CHIP_ID:
 	case LAN9374_CHIP_ID:
+	case LAN9646_CHIP_ID:
 		return KSZ9477_MAX_FRAME_SIZE - VLAN_ETH_HLEN - ETH_FCS_LEN;
 	}
 
@@ -3427,6 +3466,7 @@ static int ksz_validate_eee(struct dsa_switch *ds, int port)
 	case KSZ9893_CHIP_ID:
 	case KSZ9896_CHIP_ID:
 	case KSZ9897_CHIP_ID:
+	case LAN9646_CHIP_ID:
 		return 0;
 	}
 
@@ -3779,7 +3819,10 @@ static int ksz_switch_detect(struct ksz_device *dev)
 		case LAN9372_CHIP_ID:
 		case LAN9373_CHIP_ID:
 		case LAN9374_CHIP_ID:
-			dev->chip_id = id32;
+
+			/* LAN9646 does not have its own chip id. */
+			if (dev->chip_id != LAN9646_CHIP_ID)
+				dev->chip_id = id32;
 			break;
 		case KSZ9893_CHIP_ID:
 			ret = ksz_read8(dev, REG_CHIP_ID4,
@@ -3818,6 +3861,7 @@ static int ksz_cls_flower_add(struct dsa_switch *ds, int port,
 	case KSZ9893_CHIP_ID:
 	case KSZ9896_CHIP_ID:
 	case KSZ9897_CHIP_ID:
+	case LAN9646_CHIP_ID:
 		return ksz9477_cls_flower_add(ds, port, cls, ingress);
 	}
 
@@ -3838,6 +3882,7 @@ static int ksz_cls_flower_del(struct dsa_switch *ds, int port,
 	case KSZ9893_CHIP_ID:
 	case KSZ9896_CHIP_ID:
 	case KSZ9897_CHIP_ID:
+	case LAN9646_CHIP_ID:
 		return ksz9477_cls_flower_del(ds, port, cls, ingress);
 	}
 
@@ -4925,6 +4970,7 @@ static int ksz_parse_drive_strength(struct ksz_device *dev)
 	case KSZ9893_CHIP_ID:
 	case KSZ9896_CHIP_ID:
 	case KSZ9897_CHIP_ID:
+	case LAN9646_CHIP_ID:
 		return ksz9477_drive_strength_write(dev, of_props,
 						    ARRAY_SIZE(of_props));
 	default:
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index bbb548af201e..b3bb75ca0796 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -236,6 +236,7 @@ enum ksz_model {
 	LAN9372,
 	LAN9373,
 	LAN9374,
+	LAN9646,
 };
 
 enum ksz_regs {
diff --git a/drivers/net/dsa/microchip/ksz_spi.c b/drivers/net/dsa/microchip/ksz_spi.c
index 1c6652f2b9fe..108a958dc356 100644
--- a/drivers/net/dsa/microchip/ksz_spi.c
+++ b/drivers/net/dsa/microchip/ksz_spi.c
@@ -54,6 +54,8 @@ static int ksz_spi_probe(struct spi_device *spi)
 	if (!chip)
 		return -EINVAL;
 
+	/* Save chip id to do special initialization when probing. */
+	dev->chip_id = chip->chip_id;
 	if (chip->chip_id == KSZ88X3_CHIP_ID)
 		regmap_config = ksz8863_regmap_config;
 	else if (chip->chip_id == KSZ8795_CHIP_ID ||
@@ -203,6 +205,10 @@ static const struct of_device_id ksz_dt_ids[] = {
 		.compatible = "microchip,lan9374",
 		.data = &ksz_switch_chips[LAN9374]
 	},
+	{
+		.compatible = "microchip,lan9646",
+		.data = &ksz_switch_chips[LAN9646]
+	},
 	{},
 };
 MODULE_DEVICE_TABLE(of, ksz_dt_ids);
@@ -228,6 +234,7 @@ static const struct spi_device_id ksz_spi_ids[] = {
 	{ "lan9372" },
 	{ "lan9373" },
 	{ "lan9374" },
+	{ "lan9646" },
 	{ },
 };
 MODULE_DEVICE_TABLE(spi, ksz_spi_ids);
diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h
index 2ee1a679e592..0e0e8fe6975f 100644
--- a/include/linux/platform_data/microchip-ksz.h
+++ b/include/linux/platform_data/microchip-ksz.h
@@ -42,6 +42,7 @@ enum ksz_chip_id {
 	LAN9372_CHIP_ID = 0x00937200,
 	LAN9373_CHIP_ID = 0x00937300,
 	LAN9374_CHIP_ID = 0x00937400,
+	LAN9646_CHIP_ID = 0x00964600,
 };
 
 struct ksz_platform_data {
-- 
cgit v1.2.3


From e311b04db66aaed1819bdd479d42c4f338f105b9 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Tue, 12 Nov 2024 12:56:45 +0000
Subject: soundwire: Update the includes on the sdw.h header

There are quite a few things used in the sdw.h header that it relies on
the consumer to include. If something is used directly in the header it
should be included by the header. Update the includes to cover the
missing items, or add forward declarations for things that are only used
as pointers. Whilst making the change also alphabetise the list of
includes.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20241112125646.590240-1-ckeepax@opensource.cirrus.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 73f655334fe9..1fd4b126287f 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -4,12 +4,19 @@
 #ifndef __SOUNDWIRE_H
 #define __SOUNDWIRE_H
 
+#include <linux/bitfield.h>
 #include <linux/bug.h>
-#include <linux/lockdep_types.h>
+#include <linux/completion.h>
+#include <linux/device.h>
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
+#include <linux/lockdep_types.h>
 #include <linux/mod_devicetable.h>
-#include <linux/bitfield.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+
+struct dentry;
+struct fwnode_handle;
 
 struct sdw_bus;
 struct sdw_slave;
-- 
cgit v1.2.3


From dd690b31de0ed46adc5856698880560b900386ba Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Tue, 12 Nov 2024 12:56:46 +0000
Subject: soundwire: Minor formatting fixups in sdw.h header

Fixup some minor formatting and whitespace in the sdw.h header file.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20241112125646.590240-2-ckeepax@opensource.cirrus.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 40 +++++++++++++++++-----------------------
 1 file changed, 17 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 1fd4b126287f..784656f740f6 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -616,7 +616,6 @@ struct sdw_slave_ops {
 	int (*clk_stop)(struct sdw_slave *slave,
 			enum sdw_clk_stop_mode mode,
 			enum sdw_clk_stop_type type);
-
 };
 
 /**
@@ -690,8 +689,7 @@ struct sdw_master_device {
 	container_of(d, struct sdw_master_device, dev)
 
 struct sdw_driver {
-	int (*probe)(struct sdw_slave *sdw,
-			const struct sdw_device_id *id);
+	int (*probe)(struct sdw_slave *sdw, const struct sdw_device_id *id);
 	int (*remove)(struct sdw_slave *sdw);
 	void (*shutdown)(struct sdw_slave *sdw);
 
@@ -710,7 +708,7 @@ struct sdw_driver {
 	SDW_SLAVE_ENTRY_EXT((_mfg_id), (_part_id), 0, 0, (_drv_data))
 
 int sdw_handle_slave_status(struct sdw_bus *bus,
-			enum sdw_slave_status status[]);
+			    enum sdw_slave_status status[]);
 
 /*
  * SDW master structures and APIs
@@ -792,15 +790,14 @@ struct sdw_enable_ch {
  */
 struct sdw_master_port_ops {
 	int (*dpn_set_port_params)(struct sdw_bus *bus,
-			struct sdw_port_params *port_params,
-			unsigned int bank);
+				   struct sdw_port_params *port_params,
+				   unsigned int bank);
 	int (*dpn_set_port_transport_params)(struct sdw_bus *bus,
-			struct sdw_transport_params *transport_params,
-			enum sdw_reg_bank bank);
-	int (*dpn_port_prep)(struct sdw_bus *bus,
-			struct sdw_prepare_ch *prepare_ch);
+					     struct sdw_transport_params *transport_params,
+					     enum sdw_reg_bank bank);
+	int (*dpn_port_prep)(struct sdw_bus *bus, struct sdw_prepare_ch *prepare_ch);
 	int (*dpn_port_enable_ch)(struct sdw_bus *bus,
-			struct sdw_enable_ch *enable_ch, unsigned int bank);
+				  struct sdw_enable_ch *enable_ch, unsigned int bank);
 };
 
 struct sdw_msg;
@@ -835,14 +832,11 @@ struct sdw_defer {
  */
 struct sdw_master_ops {
 	int (*read_prop)(struct sdw_bus *bus);
-	u64 (*override_adr)
-			(struct sdw_bus *bus, u64 addr);
-	enum sdw_command_response (*xfer_msg)
-			(struct sdw_bus *bus, struct sdw_msg *msg);
-	enum sdw_command_response (*xfer_msg_defer)
-			(struct sdw_bus *bus);
+	u64 (*override_adr)(struct sdw_bus *bus, u64 addr);
+	enum sdw_command_response (*xfer_msg)(struct sdw_bus *bus, struct sdw_msg *msg);
+	enum sdw_command_response (*xfer_msg_defer)(struct sdw_bus *bus);
 	int (*set_bus_conf)(struct sdw_bus *bus,
-			struct sdw_bus_params *params);
+			    struct sdw_bus_params *params);
 	int (*pre_bank_switch)(struct sdw_bus *bus);
 	int (*post_bank_switch)(struct sdw_bus *bus);
 	u32 (*read_ping_status)(struct sdw_bus *bus);
@@ -1019,12 +1013,12 @@ void sdw_release_stream(struct sdw_stream_runtime *stream);
 int sdw_compute_params(struct sdw_bus *bus);
 
 int sdw_stream_add_master(struct sdw_bus *bus,
-		struct sdw_stream_config *stream_config,
-		const struct sdw_port_config *port_config,
-		unsigned int num_ports,
-		struct sdw_stream_runtime *stream);
+			  struct sdw_stream_config *stream_config,
+			  const struct sdw_port_config *port_config,
+			  unsigned int num_ports,
+			  struct sdw_stream_runtime *stream);
 int sdw_stream_remove_master(struct sdw_bus *bus,
-		struct sdw_stream_runtime *stream);
+			     struct sdw_stream_runtime *stream);
 int sdw_startup_stream(void *sdw_substream);
 int sdw_prepare_stream(struct sdw_stream_runtime *stream);
 int sdw_enable_stream(struct sdw_stream_runtime *stream);
-- 
cgit v1.2.3


From 04782e63917dbcb60932fe93df52c4a4e3859d07 Mon Sep 17 00:00:00 2001
From: Colton Lewis <coltonlewis@google.com>
Date: Wed, 13 Nov 2024 19:01:52 +0000
Subject: perf/core: Hoist perf_instruction_pointer() and perf_misc_flags()

For clarity, rename the arch-specific definitions of these functions
to perf_arch_* to denote they are arch-specifc. Define the
generic-named functions in one place where they can call the
arch-specific ones as needed.

Signed-off-by: Colton Lewis <coltonlewis@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Acked-by: Thomas Richter <tmricht@linux.ibm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Acked-by: Kan Liang <kan.liang@linux.intel.com>
Link: https://lore.kernel.org/r/20241113190156.2145593-3-coltonlewis@google.com
---
 arch/arm64/include/asm/perf_event.h          |  6 +++---
 arch/arm64/kernel/perf_callchain.c           |  4 ++--
 arch/powerpc/include/asm/perf_event_server.h |  6 +++---
 arch/powerpc/perf/core-book3s.c              |  4 ++--
 arch/s390/include/asm/perf_event.h           |  6 +++---
 arch/s390/kernel/perf_event.c                |  4 ++--
 arch/x86/events/core.c                       |  4 ++--
 arch/x86/include/asm/perf_event.h            | 10 +++++-----
 include/linux/perf_event.h                   |  9 ++++++---
 kernel/events/core.c                         | 10 ++++++++++
 10 files changed, 38 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index eb7071c9eb34..31a5584ed423 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -11,9 +11,9 @@
 
 #ifdef CONFIG_PERF_EVENTS
 struct pt_regs;
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-#define perf_misc_flags(regs)	perf_misc_flags(regs)
+extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_arch_misc_flags(struct pt_regs *regs);
+#define perf_arch_misc_flags(regs)	perf_misc_flags(regs)
 #define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs
 #endif
 
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index e8ed5673f481..01a9d08fc009 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -39,7 +39,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 	arch_stack_walk(callchain_trace, entry, current, regs);
 }
 
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
+unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
 {
 	if (perf_guest_state())
 		return perf_guest_get_ip();
@@ -47,7 +47,7 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs)
 	return instruction_pointer(regs);
 }
 
-unsigned long perf_misc_flags(struct pt_regs *regs)
+unsigned long perf_arch_misc_flags(struct pt_regs *regs)
 {
 	unsigned int guest_state = perf_guest_state();
 	int misc = 0;
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 5995614e9062..af0f46e2373b 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -102,8 +102,8 @@ struct power_pmu {
 int __init register_power_pmu(struct power_pmu *pmu);
 
 struct pt_regs;
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_arch_misc_flags(struct pt_regs *regs);
+extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
 extern unsigned long int read_bhrb(int n);
 
 /*
@@ -111,7 +111,7 @@ extern unsigned long int read_bhrb(int n);
  * if we have hardware PMU support.
  */
 #ifdef CONFIG_PPC_PERF_CTRS
-#define perf_misc_flags(regs)	perf_misc_flags(regs)
+#define perf_arch_misc_flags(regs)	perf_arch_misc_flags(regs)
 #endif
 
 /*
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 42867469752d..dc01aa604cc1 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2332,7 +2332,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
  * Called from generic code to get the misc flags (i.e. processor mode)
  * for an event_id.
  */
-unsigned long perf_misc_flags(struct pt_regs *regs)
+unsigned long perf_arch_misc_flags(struct pt_regs *regs)
 {
 	u32 flags = perf_get_misc_flags(regs);
 
@@ -2346,7 +2346,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
  * Called from generic code to get the instruction pointer
  * for an event_id.
  */
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
+unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
 {
 	unsigned long siar = mfspr(SPRN_SIAR);
 
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index 66200d4a2134..8895e7d9bcf9 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -37,9 +37,9 @@ extern ssize_t cpumf_events_sysfs_show(struct device *dev,
 
 /* Perf callbacks */
 struct pt_regs;
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-#define perf_misc_flags(regs) perf_misc_flags(regs)
+extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_arch_misc_flags(struct pt_regs *regs);
+#define perf_arch_misc_flags(regs) perf_arch_misc_flags(regs)
 #define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs
 
 /* Perf pt_regs extension for sample-data-entry indicators */
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index 5fff629b1a89..f9000ab49f4a 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -57,7 +57,7 @@ static unsigned long instruction_pointer_guest(struct pt_regs *regs)
 	return sie_block(regs)->gpsw.addr;
 }
 
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
+unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
 {
 	return is_in_guest(regs) ? instruction_pointer_guest(regs)
 				 : instruction_pointer(regs);
@@ -84,7 +84,7 @@ static unsigned long perf_misc_flags_sf(struct pt_regs *regs)
 	return flags;
 }
 
-unsigned long perf_misc_flags(struct pt_regs *regs)
+unsigned long perf_arch_misc_flags(struct pt_regs *regs)
 {
 	/* Check if the cpum_sf PMU has created the pt_regs structure.
 	 * In this case, perf misc flags can be easily extracted.  Otherwise,
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 65ab6460aed4..d19e939f3998 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -3003,7 +3003,7 @@ static unsigned long code_segment_base(struct pt_regs *regs)
 	return 0;
 }
 
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
+unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
 {
 	if (perf_guest_state())
 		return perf_guest_get_ip();
@@ -3011,7 +3011,7 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs)
 	return regs->ip + code_segment_base(regs);
 }
 
-unsigned long perf_misc_flags(struct pt_regs *regs)
+unsigned long perf_arch_misc_flags(struct pt_regs *regs)
 {
 	unsigned int guest_state = perf_guest_state();
 	int misc = 0;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 91b73571412f..feb87bf3d2e9 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -536,15 +536,15 @@ struct x86_perf_regs {
 	u64		*xmm_regs;
 };
 
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-#define perf_misc_flags(regs)	perf_misc_flags(regs)
+extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_arch_misc_flags(struct pt_regs *regs);
+#define perf_arch_misc_flags(regs)	perf_arch_misc_flags(regs)
 
 #include <asm/stacktrace.h>
 
 /*
- * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
- * and the comment with PERF_EFLAGS_EXACT.
+ * We abuse bit 3 from flags to pass exact information, see
+ * perf_arch_misc_flags() and the comment with PERF_EFLAGS_EXACT.
  */
 #define perf_arch_fetch_caller_regs(regs, __ip)		{	\
 	(regs)->ip = (__ip);					\
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 91b310052a7c..3b4bf5e329f6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1676,10 +1676,13 @@ extern void perf_tp_event(u16 event_type, u64 count, void *record,
 			  struct task_struct *task);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
-#ifndef perf_misc_flags
-# define perf_misc_flags(regs) \
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+
+#ifndef perf_arch_misc_flags
+# define perf_arch_misc_flags(regs) \
 		(user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
-# define perf_instruction_pointer(regs)	instruction_pointer(regs)
+# define perf_arch_instruction_pointer(regs)	instruction_pointer(regs)
 #endif
 #ifndef perf_arch_bpf_user_pt_regs
 # define perf_arch_bpf_user_pt_regs(regs) regs
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0e9cfe6f3535..6050ce033ead 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7026,6 +7026,16 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
 #endif
 
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+	return perf_arch_misc_flags(regs);
+}
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+	return perf_arch_instruction_pointer(regs);
+}
+
 static void
 perf_output_sample_regs(struct perf_output_handle *handle,
 			struct pt_regs *regs, u64 mask)
-- 
cgit v1.2.3


From 2c47e7a74f445426d156278e339b7abb259e50de Mon Sep 17 00:00:00 2001
From: Colton Lewis <coltonlewis@google.com>
Date: Wed, 13 Nov 2024 19:01:55 +0000
Subject: perf/core: Correct perf sampling with guest VMs

Previously any PMU overflow interrupt that fired while a VCPU was
loaded was recorded as a guest event whether it truly was or not. This
resulted in nonsense perf recordings that did not honor
perf_event_attr.exclude_guest and recorded guest IPs where it should
have recorded host IPs.

Rework the sampling logic to only record guest samples for events with
exclude_guest = 0. This way any host-only events with exclude_guest
set will never see unexpected guest samples. The behaviour of events
with exclude_guest = 0 is unchanged.

Note that events configured to sample both host and guest may still
misattribute a PMI that arrived in the host as a guest event depending
on KVM arch and vendor behavior.

Signed-off-by: Colton Lewis <coltonlewis@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20241113190156.2145593-6-coltonlewis@google.com
---
 arch/arm64/include/asm/perf_event.h |  4 ----
 arch/arm64/kernel/perf_callchain.c  | 28 ----------------------------
 arch/x86/events/core.c              |  3 ---
 include/linux/perf_event.h          | 21 +++++++++++++++++++--
 kernel/events/core.c                | 21 +++++++++++++++++----
 5 files changed, 36 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index 31a5584ed423..ee45b4e77347 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -10,10 +10,6 @@
 #include <asm/ptrace.h>
 
 #ifdef CONFIG_PERF_EVENTS
-struct pt_regs;
-extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
-extern unsigned long perf_arch_misc_flags(struct pt_regs *regs);
-#define perf_arch_misc_flags(regs)	perf_misc_flags(regs)
 #define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs
 #endif
 
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 01a9d08fc009..9b7f26b128b5 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -38,31 +38,3 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 
 	arch_stack_walk(callchain_trace, entry, current, regs);
 }
-
-unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
-{
-	if (perf_guest_state())
-		return perf_guest_get_ip();
-
-	return instruction_pointer(regs);
-}
-
-unsigned long perf_arch_misc_flags(struct pt_regs *regs)
-{
-	unsigned int guest_state = perf_guest_state();
-	int misc = 0;
-
-	if (guest_state) {
-		if (guest_state & PERF_GUEST_USER)
-			misc |= PERF_RECORD_MISC_GUEST_USER;
-		else
-			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
-	} else {
-		if (user_mode(regs))
-			misc |= PERF_RECORD_MISC_USER;
-		else
-			misc |= PERF_RECORD_MISC_KERNEL;
-	}
-
-	return misc;
-}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index bfc0a35fe399..c75c482d4c52 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -3005,9 +3005,6 @@ static unsigned long code_segment_base(struct pt_regs *regs)
 
 unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
 {
-	if (perf_guest_state())
-		return perf_guest_get_ip();
-
 	return regs->ip + code_segment_base(regs);
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3b4bf5e329f6..cb99ec8c9e96 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1676,8 +1676,9 @@ extern void perf_tp_event(u16 event_type, u64 count, void *record,
 			  struct task_struct *task);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs);
+extern unsigned long perf_instruction_pointer(struct perf_event *event,
+					      struct pt_regs *regs);
 
 #ifndef perf_arch_misc_flags
 # define perf_arch_misc_flags(regs) \
@@ -1688,6 +1689,22 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
 # define perf_arch_bpf_user_pt_regs(regs) regs
 #endif
 
+#ifndef perf_arch_guest_misc_flags
+static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
+{
+	unsigned long guest_state = perf_guest_state();
+
+	if (!(guest_state & PERF_GUEST_ACTIVE))
+		return 0;
+
+	if (guest_state & PERF_GUEST_USER)
+		return PERF_RECORD_MISC_GUEST_USER;
+	else
+		return PERF_RECORD_MISC_GUEST_KERNEL;
+}
+# define perf_arch_guest_misc_flags(regs)	perf_arch_guest_misc_flags(regs)
+#endif
+
 static inline bool has_branch_stack(struct perf_event *event)
 {
 	return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6050ce033ead..1869164a4e99 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7026,13 +7026,26 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
 #endif
 
-unsigned long perf_misc_flags(struct pt_regs *regs)
+static bool should_sample_guest(struct perf_event *event)
 {
+	return !event->attr.exclude_guest && perf_guest_state();
+}
+
+unsigned long perf_misc_flags(struct perf_event *event,
+			      struct pt_regs *regs)
+{
+	if (should_sample_guest(event))
+		return perf_arch_guest_misc_flags(regs);
+
 	return perf_arch_misc_flags(regs);
 }
 
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
+unsigned long perf_instruction_pointer(struct perf_event *event,
+				       struct pt_regs *regs)
 {
+	if (should_sample_guest(event))
+		return perf_guest_get_ip();
+
 	return perf_arch_instruction_pointer(regs);
 }
 
@@ -7853,7 +7866,7 @@ void perf_prepare_sample(struct perf_sample_data *data,
 	__perf_event_header__init_id(data, event, filtered_sample_type);
 
 	if (filtered_sample_type & PERF_SAMPLE_IP) {
-		data->ip = perf_instruction_pointer(regs);
+		data->ip = perf_instruction_pointer(event, regs);
 		data->sample_flags |= PERF_SAMPLE_IP;
 	}
 
@@ -8017,7 +8030,7 @@ void perf_prepare_header(struct perf_event_header *header,
 {
 	header->type = PERF_RECORD_SAMPLE;
 	header->size = perf_sample_data_size(data, event);
-	header->misc = perf_misc_flags(regs);
+	header->misc = perf_misc_flags(event, regs);
 
 	/*
 	 * If you're adding more sample types here, you likely need to do
-- 
cgit v1.2.3


From 9fed2c0f2f0771b990d068ef0a2b32e770ae6d48 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 13 Nov 2024 09:17:51 -0500
Subject: fs: reduce pointer chasing in is_mgtime() test

The is_mgtime test checks whether the FS_MGTIME flag is set in the
fstype. To get there from the inode though, we have to dereference 3
pointers.

Add a new IOP_MGTIME flag, and have inode_init_always() set that flag
when the fstype flag is set. Then, make is_mgtime test for IOP_MGTIME
instead.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20241113-mgtime-v1-1-84e256980e11@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c         | 2 ++
 include/linux/fs.h | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 3b66082afe22..bb0117e679de 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -241,6 +241,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_opflags = 0;
 	if (sb->s_xattr)
 		inode->i_opflags |= IOP_XATTR;
+	if (sb->s_type->fs_flags & FS_MGTIME)
+		inode->i_opflags |= IOP_MGTIME;
 	i_uid_write(inode, 0);
 	i_gid_write(inode, 0);
 	atomic_set(&inode->i_writecount, 0);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b1a3bd07711b..a9c6c8cbda50 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -623,6 +623,7 @@ is_uncached_acl(struct posix_acl *acl)
 #define IOP_NOFOLLOW	0x0004
 #define IOP_XATTR	0x0008
 #define IOP_DEFAULT_READLINK	0x0010
+#define IOP_MGTIME	0x0020
 
 /*
  * Keep mostly read-only and often accessed (especially for
@@ -2581,7 +2582,7 @@ struct file_system_type {
  */
 static inline bool is_mgtime(const struct inode *inode)
 {
-	return inode->i_sb->s_type->fs_flags & FS_MGTIME;
+	return inode->i_opflags & IOP_MGTIME;
 }
 
 extern struct dentry *mount_bdev(struct file_system_type *fs_type,
-- 
cgit v1.2.3


From 0c32840763b1579c923b4216c18bb756ca4ba473 Mon Sep 17 00:00:00 2001
From: "Michael J. Ruhl" <michael.j.ruhl@intel.com>
Date: Thu, 14 Nov 2024 08:03:57 -0500
Subject: platform/x86/intel/pmt: allow user offset for PMT callbacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Usage of the telem sysfs file allows for partial reads at an offset.
The current callback method returns the buffer starting from offset 0
only.

Include the requested offset in the callback and update the necessary
address calculations with the offset.

Note: offset addition is moved from the caller to the local usage. For
non-callback usage this is unchanged behavior.

Fixes: e92affc74cd8 ("platform/x86/intel/vsec: Add PMT read callbacks")
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Link: https://lore.kernel.org/r/20241114130358.2467787-2-michael.j.ruhl@intel.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/intel/pmt/class.c     | 8 +++++---
 drivers/platform/x86/intel/pmt/class.h     | 2 +-
 drivers/platform/x86/intel/pmt/telemetry.c | 2 +-
 include/linux/intel_vsec.h                 | 3 ++-
 4 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/intel/pmt/class.c b/drivers/platform/x86/intel/pmt/class.c
index 7680474c4f96..375695cc0d60 100644
--- a/drivers/platform/x86/intel/pmt/class.c
+++ b/drivers/platform/x86/intel/pmt/class.c
@@ -59,10 +59,12 @@ pmt_memcpy64_fromio(void *to, const u64 __iomem *from, size_t count)
 }
 
 int pmt_telem_read_mmio(struct pci_dev *pdev, struct pmt_callbacks *cb, u32 guid, void *buf,
-			void __iomem *addr, u32 count)
+			void __iomem *addr, loff_t off, u32 count)
 {
 	if (cb && cb->read_telem)
-		return cb->read_telem(pdev, guid, buf, count);
+		return cb->read_telem(pdev, guid, buf, off, count);
+
+	addr += off;
 
 	if (guid == GUID_SPR_PUNIT)
 		/* PUNIT on SPR only supports aligned 64-bit read */
@@ -96,7 +98,7 @@ intel_pmt_read(struct file *filp, struct kobject *kobj,
 		count = entry->size - off;
 
 	count = pmt_telem_read_mmio(entry->ep->pcidev, entry->cb, entry->header.guid, buf,
-				    entry->base + off, count);
+				    entry->base, off, count);
 
 	return count;
 }
diff --git a/drivers/platform/x86/intel/pmt/class.h b/drivers/platform/x86/intel/pmt/class.h
index a267ac964423..b2006d57779d 100644
--- a/drivers/platform/x86/intel/pmt/class.h
+++ b/drivers/platform/x86/intel/pmt/class.h
@@ -62,7 +62,7 @@ struct intel_pmt_namespace {
 };
 
 int pmt_telem_read_mmio(struct pci_dev *pdev, struct pmt_callbacks *cb, u32 guid, void *buf,
-			void __iomem *addr, u32 count);
+			void __iomem *addr, loff_t off, u32 count);
 bool intel_pmt_is_early_client_hw(struct device *dev);
 int intel_pmt_dev_create(struct intel_pmt_entry *entry,
 			 struct intel_pmt_namespace *ns,
diff --git a/drivers/platform/x86/intel/pmt/telemetry.c b/drivers/platform/x86/intel/pmt/telemetry.c
index c9feac859e57..0cea617c6c2e 100644
--- a/drivers/platform/x86/intel/pmt/telemetry.c
+++ b/drivers/platform/x86/intel/pmt/telemetry.c
@@ -219,7 +219,7 @@ int pmt_telem_read(struct telem_endpoint *ep, u32 id, u64 *data, u32 count)
 	if (offset + NUM_BYTES_QWORD(count) > size)
 		return -EINVAL;
 
-	pmt_telem_read_mmio(ep->pcidev, ep->cb, ep->header.guid, data, ep->base + offset,
+	pmt_telem_read_mmio(ep->pcidev, ep->cb, ep->header.guid, data, ep->base, offset,
 			    NUM_BYTES_QWORD(count));
 
 	return ep->present ? 0 : -EPIPE;
diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h
index 11ee185566c3..b94beab64610 100644
--- a/include/linux/intel_vsec.h
+++ b/include/linux/intel_vsec.h
@@ -74,10 +74,11 @@ enum intel_vsec_quirks {
  * @pdev:  PCI device reference for the callback's use
  * @guid:  ID of data to acccss
  * @data:  buffer for the data to be copied
+ * @off:   offset into the requested buffer
  * @count: size of buffer
  */
 struct pmt_callbacks {
-	int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, u32 count);
+	int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, loff_t off, u32 count);
 };
 
 /**
-- 
cgit v1.2.3


From 0b3144da31f855fce652303f588416a60991bdef Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 13 Nov 2024 07:49:22 +0100
Subject: USB: make single lock for all usb dynamic id lists

There are a number of places where we accidentally pass in a constant
structure to later cast it off to a dynamic one, and then attempt to
grab a lock on it, which is not a good idea.  To help resolve this, move
the dynamic id lock out of the dynamic id structure for the driver and
into one single lock for all USB dynamic ids.  As this lock should never
have any real contention (it's only every accessed when a device is
added or removed, which is always serialized) there should not be any
difference except for some memory savings.

Note, this just converts the existing use of the dynamic id lock to the
new static lock, there is one place that is accessing the dynamic id
list without grabbing the lock, that will be fixed up in a follow-on
change.

Cc: Johan Hovold <johan@kernel.org>
Cc: Herve Codina <herve.codina@bootlin.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Grant Grundler <grundler@chromium.org>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Yajun Deng <yajun.deng@linux.dev>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: linux-usb@vger.kernel.org
Link: https://lore.kernel.org/r/2024111322-kindly-finalist-d247@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/common/common.c     |  3 +++
 drivers/usb/core/driver.c       | 15 +++++----------
 drivers/usb/serial/bus.c        |  4 +---
 drivers/usb/serial/usb-serial.c |  4 +---
 include/linux/usb.h             |  2 +-
 5 files changed, 11 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/common/common.c b/drivers/usb/common/common.c
index b7bea1015d7c..871cf199b6bf 100644
--- a/drivers/usb/common/common.c
+++ b/drivers/usb/common/common.c
@@ -415,6 +415,9 @@ EXPORT_SYMBOL_GPL(usb_of_get_companion_dev);
 struct dentry *usb_debug_root;
 EXPORT_SYMBOL_GPL(usb_debug_root);
 
+DEFINE_MUTEX(usb_dynids_lock);
+EXPORT_SYMBOL_GPL(usb_dynids_lock);
+
 static int __init usb_common_init(void)
 {
 	usb_debug_root = debugfs_create_dir("usb", NULL);
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 0c3f12daac79..bc3c00580238 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -95,9 +95,9 @@ ssize_t usb_store_new_id(struct usb_dynids *dynids,
 		}
 	}
 
-	spin_lock(&dynids->lock);
+	mutex_lock(&usb_dynids_lock);
 	list_add_tail(&dynid->node, &dynids->list);
-	spin_unlock(&dynids->lock);
+	mutex_unlock(&usb_dynids_lock);
 
 	retval = driver_attach(driver);
 
@@ -160,7 +160,7 @@ static ssize_t remove_id_store(struct device_driver *driver, const char *buf,
 	if (fields < 2)
 		return -EINVAL;
 
-	spin_lock(&usb_driver->dynids.lock);
+	guard(mutex)(&usb_dynids_lock);
 	list_for_each_entry_safe(dynid, n, &usb_driver->dynids.list, node) {
 		struct usb_device_id *id = &dynid->id;
 
@@ -171,7 +171,6 @@ static ssize_t remove_id_store(struct device_driver *driver, const char *buf,
 			break;
 		}
 	}
-	spin_unlock(&usb_driver->dynids.lock);
 	return count;
 }
 
@@ -220,12 +219,11 @@ static void usb_free_dynids(struct usb_driver *usb_drv)
 {
 	struct usb_dynid *dynid, *n;
 
-	spin_lock(&usb_drv->dynids.lock);
+	guard(mutex)(&usb_dynids_lock);
 	list_for_each_entry_safe(dynid, n, &usb_drv->dynids.list, node) {
 		list_del(&dynid->node);
 		kfree(dynid);
 	}
-	spin_unlock(&usb_drv->dynids.lock);
 }
 
 static const struct usb_device_id *usb_match_dynamic_id(struct usb_interface *intf,
@@ -233,14 +231,12 @@ static const struct usb_device_id *usb_match_dynamic_id(struct usb_interface *in
 {
 	struct usb_dynid *dynid;
 
-	spin_lock(&drv->dynids.lock);
+	guard(mutex)(&usb_dynids_lock);
 	list_for_each_entry(dynid, &drv->dynids.list, node) {
 		if (usb_match_one_id(intf, &dynid->id)) {
-			spin_unlock(&drv->dynids.lock);
 			return &dynid->id;
 		}
 	}
-	spin_unlock(&drv->dynids.lock);
 	return NULL;
 }
 
@@ -1076,7 +1072,6 @@ int usb_register_driver(struct usb_driver *new_driver, struct module *owner,
 	new_driver->driver.owner = owner;
 	new_driver->driver.mod_name = mod_name;
 	new_driver->driver.dev_groups = new_driver->dev_groups;
-	spin_lock_init(&new_driver->dynids.lock);
 	INIT_LIST_HEAD(&new_driver->dynids.list);
 
 	retval = driver_register(&new_driver->driver);
diff --git a/drivers/usb/serial/bus.c b/drivers/usb/serial/bus.c
index d200e2c29a8f..2fea1b1db4a2 100644
--- a/drivers/usb/serial/bus.c
+++ b/drivers/usb/serial/bus.c
@@ -136,12 +136,11 @@ static void free_dynids(struct usb_serial_driver *drv)
 {
 	struct usb_dynid *dynid, *n;
 
-	spin_lock(&drv->dynids.lock);
+	guard(mutex)(&usb_dynids_lock);
 	list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
 		list_del(&dynid->node);
 		kfree(dynid);
 	}
-	spin_unlock(&drv->dynids.lock);
 }
 
 const struct bus_type usb_serial_bus_type = {
@@ -157,7 +156,6 @@ int usb_serial_bus_register(struct usb_serial_driver *driver)
 	int retval;
 
 	driver->driver.bus = &usb_serial_bus_type;
-	spin_lock_init(&driver->dynids.lock);
 	INIT_LIST_HEAD(&driver->dynids.list);
 
 	retval = driver_register(&driver->driver);
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index df6a2ae0bf42..7266558d823a 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -706,14 +706,12 @@ static const struct usb_device_id *match_dynamic_id(struct usb_interface *intf,
 {
 	struct usb_dynid *dynid;
 
-	spin_lock(&drv->dynids.lock);
+	guard(mutex)(&usb_dynids_lock);
 	list_for_each_entry(dynid, &drv->dynids.list, node) {
 		if (usb_match_one_id(intf, &dynid->id)) {
-			spin_unlock(&drv->dynids.lock);
 			return &dynid->id;
 		}
 	}
-	spin_unlock(&drv->dynids.lock);
 	return NULL;
 }
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 672d8fc2abdb..b66b1af3e439 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1129,8 +1129,8 @@ static inline int usb_make_path(struct usb_device *dev, char *buf, size_t size)
 /* ----------------------------------------------------------------------- */
 
 /* Stuff for dynamic usb ids */
+extern struct mutex usb_dynids_lock;
 struct usb_dynids {
-	spinlock_t lock;
 	struct list_head list;
 };
 
-- 
cgit v1.2.3


From 2f3aab7aecb827ba93c6222646eb0faa8228d590 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 13 Nov 2024 15:04:40 +0100
Subject: USB: make to_usb_driver() use container_of_const()

Turns out that we have some const pointers being passed to
to_usb_driver() but were not catching this.  Change the macro to
properly propagate the const-ness of the pointer so that we will notice
when we try to write to memory that we shouldn't be writing to.

This requires fixing up the usb_match_dynamic_id() function as well,
because it can handle a const * to struct usb_driver.

Cc: Johan Hovold <johan@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Grant Grundler <grundler@chromium.org>
Cc: Yajun Deng <yajun.deng@linux.dev>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: linux-usb@vger.kernel.org
Link: https://lore.kernel.org/r/2024111339-shaky-goldsmith-b233@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/driver.c | 4 ++--
 include/linux/usb.h       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 9ea955a3d115..bc5c561bdbd5 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -228,7 +228,7 @@ static void usb_free_dynids(struct usb_driver *usb_drv)
 }
 
 static const struct usb_device_id *usb_match_dynamic_id(struct usb_interface *intf,
-							struct usb_driver *drv)
+							const struct usb_driver *drv)
 {
 	struct usb_dynid *dynid;
 
@@ -890,7 +890,7 @@ static int usb_device_match(struct device *dev, const struct device_driver *drv)
 
 	} else if (is_usb_interface(dev)) {
 		struct usb_interface *intf;
-		struct usb_driver *usb_drv;
+		const struct usb_driver *usb_drv;
 		const struct usb_device_id *id;
 
 		/* device drivers never match interfaces */
diff --git a/include/linux/usb.h b/include/linux/usb.h
index b66b1af3e439..7a9e96f9d886 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1243,7 +1243,7 @@ struct usb_driver {
 	unsigned int disable_hub_initiated_lpm:1;
 	unsigned int soft_unbind:1;
 };
-#define	to_usb_driver(d) container_of(d, struct usb_driver, driver)
+#define	to_usb_driver(d) container_of_const(d, struct usb_driver, driver)
 
 /**
  * struct usb_device_driver - identifies USB device driver to usbcore
-- 
cgit v1.2.3


From d6fa15bbcf9604e3c14816410550d2cf22b955e4 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 13 Nov 2024 15:04:41 +0100
Subject: USB: make to_usb_device_driver() use container_of_const()

Turns out that we have some const pointers being passed to
to_usb_device_driver() but were not catching this.  Change the macro to
properly propagate the const-ness of the pointer so that we will notice
when we try to write to memory that we shouldn't be writing to.

This requires fixing up the usb_driver_applicable() function as well,
because it can handle a const * to struct usb_driver.

Cc: Johan Hovold <johan@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Grant Grundler <grundler@chromium.org>
Cc: Yajun Deng <yajun.deng@linux.dev>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: linux-usb@vger.kernel.org
Link: https://lore.kernel.org/r/2024111342-lagoon-reapprove-5e49@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/driver.c | 4 ++--
 drivers/usb/core/usb.h    | 2 +-
 include/linux/usb.h       | 3 +--
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index bc5c561bdbd5..f203fdbfb6f6 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -850,7 +850,7 @@ const struct usb_device_id *usb_device_match_id(struct usb_device *udev,
 EXPORT_SYMBOL_GPL(usb_device_match_id);
 
 bool usb_driver_applicable(struct usb_device *udev,
-			   struct usb_device_driver *udrv)
+			   const struct usb_device_driver *udrv)
 {
 	if (udrv->id_table && udrv->match)
 		return usb_device_match_id(udev, udrv->id_table) != NULL &&
@@ -870,7 +870,7 @@ static int usb_device_match(struct device *dev, const struct device_driver *drv)
 	/* devices and interfaces are handled separately */
 	if (is_usb_device(dev)) {
 		struct usb_device *udev;
-		struct usb_device_driver *udrv;
+		const struct usb_device_driver *udrv;
 
 		/* interface drivers never match devices */
 		if (!is_usb_device_driver(drv))
diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
index b8324ea05b20..a9b37aeb515b 100644
--- a/drivers/usb/core/usb.h
+++ b/drivers/usb/core/usb.h
@@ -75,7 +75,7 @@ extern int usb_match_device(struct usb_device *dev,
 extern const struct usb_device_id *usb_device_match_id(struct usb_device *udev,
 				const struct usb_device_id *id);
 extern bool usb_driver_applicable(struct usb_device *udev,
-				  struct usb_device_driver *udrv);
+				  const struct usb_device_driver *udrv);
 extern void usb_forced_unbind_intf(struct usb_interface *intf);
 extern void usb_unbind_and_rebind_marked_interfaces(struct usb_device *udev);
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 7a9e96f9d886..cfa8005e24f9 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1294,8 +1294,7 @@ struct usb_device_driver {
 	unsigned int supports_autosuspend:1;
 	unsigned int generic_subclass:1;
 };
-#define	to_usb_device_driver(d) container_of(d, struct usb_device_driver, \
-		driver)
+#define	to_usb_device_driver(d) container_of_const(d, struct usb_device_driver, driver)
 
 /**
  * struct usb_class_driver - identifies a USB driver that wants to use the USB major number
-- 
cgit v1.2.3


From d96c77bd4eeba469bddbbb14323d2191684da82a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 8 Nov 2024 04:56:31 -0500
Subject: KVM: x86: switch hugepage recovery thread to vhost_task

kvm_vm_create_worker_thread() is meant to be used for kthreads that
can consume significant amounts of CPU time on behalf of a VM or in
response to how the VM behaves (for example how it accesses its memory).
Therefore it wants to charge the CPU time consumed by that work to
the VM's container.

However, because of these threads, cgroups which have kvm instances
inside never complete freezing.  This can be trivially reproduced:

  root@test ~# mkdir /sys/fs/cgroup/test
  root@test ~# echo $$ > /sys/fs/cgroup/test/cgroup.procs
  root@test ~# qemu-system-x86_64 -nographic -enable-kvm

and in another terminal:

  root@test ~# echo 1 > /sys/fs/cgroup/test/cgroup.freeze
  root@test ~# cat /sys/fs/cgroup/test/cgroup.events
  populated 1
  frozen 0

The cgroup freezing happens in the signal delivery path but
kvm_nx_huge_page_recovery_worker, while joining non-root cgroups, never
calls into the signal delivery path and thus never gets frozen. Because
the cgroup freezer determines whether a given cgroup is frozen by
comparing the number of frozen threads to the total number of threads
in the cgroup, the cgroup never becomes frozen and users waiting for
the state transition may hang indefinitely.

Since the worker kthread is tied to a user process, it's better if
it behaves similarly to user tasks as much as possible, including
being able to send SIGSTOP and SIGCONT.  In fact, vhost_task is all
that kvm_vm_create_worker_thread() wanted to be and more: not only it
inherits the userspace process's cgroups, it has other niceties like
being parented properly in the process tree.  Use it instead of the
homegrown alternative.

Incidentally, the new code is also better behaved when you flip recovery
back and forth to disabled and back to enabled.  If your recovery period
is 1 minute, it will run the next recovery after 1 minute independent
of how many times you flipped the parameter.

(Commit message based on emails from Tejun).

Reported-by: Tejun Heo <tj@kernel.org>
Reported-by: Luca Boccassi <bluca@debian.org>
Acked-by: Tejun Heo <tj@kernel.org>
Tested-by: Luca Boccassi <bluca@debian.org>
Cc: stable@vger.kernel.org
Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   4 +-
 arch/x86/kvm/Kconfig            |   1 +
 arch/x86/kvm/mmu/mmu.c          |  68 ++++++++++++--------------
 include/linux/kvm_host.h        |   6 ---
 virt/kvm/kvm_main.c             | 103 ----------------------------------------
 5 files changed, 35 insertions(+), 147 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3e8afc82ae2f..e159e44a6a1b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -26,6 +26,7 @@
 #include <linux/irqbypass.h>
 #include <linux/hyperv.h>
 #include <linux/kfifo.h>
+#include <linux/sched/vhost_task.h>
 
 #include <asm/apic.h>
 #include <asm/pvclock-abi.h>
@@ -1442,7 +1443,8 @@ struct kvm_arch {
 	bool sgx_provisioning_allowed;
 
 	struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
-	struct task_struct *nx_huge_page_recovery_thread;
+	struct vhost_task *nx_huge_page_recovery_thread;
+	u64 nx_huge_page_last;
 
 #ifdef CONFIG_X86_64
 	/* The number of TDP MMU pages across all roots. */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1ed1e4f5d51c..d93af5390341 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -30,6 +30,7 @@ config KVM_X86
 	select HAVE_KVM_IRQ_BYPASS
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_READONLY_MEM
+	select VHOST_TASK
 	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d7b391fe2c23..22e7ad235123 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7162,7 +7162,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
 			kvm_mmu_zap_all_fast(kvm);
 			mutex_unlock(&kvm->slots_lock);
 
-			wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
 		}
 		mutex_unlock(&kvm_lock);
 	}
@@ -7291,7 +7291,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
 		mutex_lock(&kvm_lock);
 
 		list_for_each_entry(kvm, &vm_list, vm_list)
-			wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
 
 		mutex_unlock(&kvm_lock);
 	}
@@ -7394,62 +7394,56 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
 	srcu_read_unlock(&kvm->srcu, rcu_idx);
 }
 
-static long get_nx_huge_page_recovery_timeout(u64 start_time)
+static void kvm_nx_huge_page_recovery_worker_kill(void *data)
 {
-	bool enabled;
-	uint period;
-
-	enabled = calc_nx_huge_pages_recovery_period(&period);
-
-	return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
-		       : MAX_SCHEDULE_TIMEOUT;
 }
 
-static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
+static bool kvm_nx_huge_page_recovery_worker(void *data)
 {
-	u64 start_time;
+	struct kvm *kvm = data;
+	bool enabled;
+	uint period;
 	long remaining_time;
 
-	while (true) {
-		start_time = get_jiffies_64();
-		remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		while (!kthread_should_stop() && remaining_time > 0) {
-			schedule_timeout(remaining_time);
-			remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-			set_current_state(TASK_INTERRUPTIBLE);
-		}
-
-		set_current_state(TASK_RUNNING);
-
-		if (kthread_should_stop())
-			return 0;
+	enabled = calc_nx_huge_pages_recovery_period(&period);
+	if (!enabled)
+		return false;
 
-		kvm_recover_nx_huge_pages(kvm);
+	remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period)
+		- get_jiffies_64();
+	if (remaining_time > 0) {
+		schedule_timeout(remaining_time);
+		/* check for signals and come back */
+		return true;
 	}
+
+	__set_current_state(TASK_RUNNING);
+	kvm_recover_nx_huge_pages(kvm);
+	kvm->arch.nx_huge_page_last = get_jiffies_64();
+	return true;
 }
 
 int kvm_mmu_post_init_vm(struct kvm *kvm)
 {
-	int err;
-
 	if (nx_hugepage_mitigation_hard_disabled)
 		return 0;
 
-	err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
-					  "kvm-nx-lpage-recovery",
-					  &kvm->arch.nx_huge_page_recovery_thread);
-	if (!err)
-		kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
+	kvm->arch.nx_huge_page_last = get_jiffies_64();
+	kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
+		kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
+		kvm, "kvm-nx-lpage-recovery");
 
-	return err;
+	if (!kvm->arch.nx_huge_page_recovery_thread)
+		return -ENOMEM;
+
+	vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
+	return 0;
 }
 
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 {
 	if (kvm->arch.nx_huge_page_recovery_thread)
-		kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
+		vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
 }
 
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 03e4d26e3bcc..401439bb21e3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2425,12 +2425,6 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */
 
-typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-				uintptr_t data, const char *name,
-				struct task_struct **thread_ptr);
-
 #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
 static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 27186b06518a..de2c11dae231 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6426,106 +6426,3 @@ void kvm_exit(void)
 	kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
-
-struct kvm_vm_worker_thread_context {
-	struct kvm *kvm;
-	struct task_struct *parent;
-	struct completion init_done;
-	kvm_vm_thread_fn_t thread_fn;
-	uintptr_t data;
-	int err;
-};
-
-static int kvm_vm_worker_thread(void *context)
-{
-	/*
-	 * The init_context is allocated on the stack of the parent thread, so
-	 * we have to locally copy anything that is needed beyond initialization
-	 */
-	struct kvm_vm_worker_thread_context *init_context = context;
-	struct task_struct *parent;
-	struct kvm *kvm = init_context->kvm;
-	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
-	uintptr_t data = init_context->data;
-	int err;
-
-	err = kthread_park(current);
-	/* kthread_park(current) is never supposed to return an error */
-	WARN_ON(err != 0);
-	if (err)
-		goto init_complete;
-
-	err = cgroup_attach_task_all(init_context->parent, current);
-	if (err) {
-		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
-			__func__, err);
-		goto init_complete;
-	}
-
-	set_user_nice(current, task_nice(init_context->parent));
-
-init_complete:
-	init_context->err = err;
-	complete(&init_context->init_done);
-	init_context = NULL;
-
-	if (err)
-		goto out;
-
-	/* Wait to be woken up by the spawner before proceeding. */
-	kthread_parkme();
-
-	if (!kthread_should_stop())
-		err = thread_fn(kvm, data);
-
-out:
-	/*
-	 * Move kthread back to its original cgroup to prevent it lingering in
-	 * the cgroup of the VM process, after the latter finishes its
-	 * execution.
-	 *
-	 * kthread_stop() waits on the 'exited' completion condition which is
-	 * set in exit_mm(), via mm_release(), in do_exit(). However, the
-	 * kthread is removed from the cgroup in the cgroup_exit() which is
-	 * called after the exit_mm(). This causes the kthread_stop() to return
-	 * before the kthread actually quits the cgroup.
-	 */
-	rcu_read_lock();
-	parent = rcu_dereference(current->real_parent);
-	get_task_struct(parent);
-	rcu_read_unlock();
-	cgroup_attach_task_all(parent, current);
-	put_task_struct(parent);
-
-	return err;
-}
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-				uintptr_t data, const char *name,
-				struct task_struct **thread_ptr)
-{
-	struct kvm_vm_worker_thread_context init_context = {};
-	struct task_struct *thread;
-
-	*thread_ptr = NULL;
-	init_context.kvm = kvm;
-	init_context.parent = current;
-	init_context.thread_fn = thread_fn;
-	init_context.data = data;
-	init_completion(&init_context.init_done);
-
-	thread = kthread_run(kvm_vm_worker_thread, &init_context,
-			     "%s-%d", name, task_pid_nr(current));
-	if (IS_ERR(thread))
-		return PTR_ERR(thread);
-
-	/* kthread_run is never supposed to return NULL */
-	WARN_ON(thread == NULL);
-
-	wait_for_completion(&init_context.init_done);
-
-	if (!init_context.err)
-		*thread_ptr = thread;
-
-	return init_context.err;
-}
-- 
cgit v1.2.3


From 4eb5e9c6c4cdfca9fb0577f62580db46f5acd1a5 Mon Sep 17 00:00:00 2001
From: Théo Lebrun <theo.lebrun@bootlin.com>
Date: Wed, 6 Nov 2024 17:03:54 +0100
Subject: clk: fixed-factor: add clk_hw_register_fixed_factor_index() function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add non-devres version of clk_hw_register_fixed_factor(), with parent
targeted using its index.

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://lore.kernel.org/r/20241106-mbly-clk-v2-3-84cfefb3f485@bootlin.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-fixed-factor.c | 11 +++++++++++
 include/linux/clk-provider.h   |  3 +++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-fixed-factor.c b/drivers/clk/clk-fixed-factor.c
index 8fba63fc70c5..e62ae8794d44 100644
--- a/drivers/clk/clk-fixed-factor.c
+++ b/drivers/clk/clk-fixed-factor.c
@@ -241,6 +241,17 @@ struct clk_hw *clk_hw_register_fixed_factor_with_accuracy_fwname(struct device *
 }
 EXPORT_SYMBOL_GPL(clk_hw_register_fixed_factor_with_accuracy_fwname);
 
+struct clk_hw *clk_hw_register_fixed_factor_index(struct device *dev,
+		const char *name, unsigned int index, unsigned long flags,
+		unsigned int mult, unsigned int div)
+{
+	const struct clk_parent_data pdata = { .index = index };
+
+	return __clk_hw_register_fixed_factor(dev, NULL, name, NULL, NULL, &pdata,
+					      flags, mult, div, 0, 0, false);
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_fixed_factor_index);
+
 struct clk *clk_register_fixed_factor(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		unsigned int mult, unsigned int div)
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index dbe793964c24..52b46c63a2bd 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -1142,6 +1142,9 @@ struct clk_hw *clk_hw_register_fixed_factor_with_accuracy_fwname(struct device *
 		struct device_node *np, const char *name, const char *fw_name,
 		unsigned long flags, unsigned int mult, unsigned int div,
 		unsigned long acc);
+struct clk_hw *clk_hw_register_fixed_factor_index(struct device *dev,
+		const char *name, unsigned int index, unsigned long flags,
+		unsigned int mult, unsigned int div);
 void clk_hw_unregister_fixed_factor(struct clk_hw *hw);
 struct clk_hw *devm_clk_hw_register_fixed_factor(struct device *dev,
 		const char *name, const char *parent_name, unsigned long flags,
-- 
cgit v1.2.3


From 721aa69e708b7432af83c4bb00a30e2b7c27da28 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 8 Nov 2024 07:54:47 +0100
Subject: net: phy: convert eee_broken_modes to a linkmode bitmap

eee_broken_modes has a eee_cap1 register layout currently. This doen't
allow to flag e.g. 2.5Gbps or 5Gbps BaseT EEE as broken. To overcome
this limitation switch eee_broken_modes to a linkmode bitmap.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/dfe0c9ff-84b0-4328-86d7-e917ebc084a1@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/micrel.c   |  2 +-
 drivers/net/phy/phy-c45.c  | 12 +++++-------
 drivers/net/phy/phy-core.c | 21 +++++++++------------
 include/linux/phy.h        |  9 ++++-----
 4 files changed, 19 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 43c82a87bc3a..3ef508840674 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -2004,7 +2004,7 @@ static int ksz9477_config_init(struct phy_device *phydev)
 	 * in this switch shall be regarded as broken.
 	 */
 	if (phydev->dev_flags & MICREL_NO_EEE)
-		phydev->eee_broken_modes = -1;
+		linkmode_fill(phydev->eee_broken_modes);
 
 	return kszphy_config_init(phydev);
 }
diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 7b0ffa3515aa..da5c15310d8c 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -683,15 +683,13 @@ EXPORT_SYMBOL_GPL(genphy_c45_read_mdix);
 static int genphy_c45_write_eee_adv(struct phy_device *phydev,
 				    unsigned long *adv)
 {
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(tmp);
 	int val, changed = 0;
 
-	if (linkmode_intersects(phydev->supported_eee, PHY_EEE_CAP1_FEATURES)) {
-		val = linkmode_to_mii_eee_cap1_t(adv);
+	linkmode_andnot(tmp, adv, phydev->eee_broken_modes);
 
-		/* In eee_broken_modes are stored MDIO_AN_EEE_ADV specific raw
-		 * register values.
-		 */
-		val &= ~phydev->eee_broken_modes;
+	if (linkmode_intersects(phydev->supported_eee, PHY_EEE_CAP1_FEATURES)) {
+		val = linkmode_to_mii_eee_cap1_t(tmp);
 
 		/* IEEE 802.3-2018 45.2.7.13 EEE advertisement 1
 		 * (Register 7.60)
@@ -709,7 +707,7 @@ static int genphy_c45_write_eee_adv(struct phy_device *phydev,
 	}
 
 	if (linkmode_intersects(phydev->supported_eee, PHY_EEE_CAP2_FEATURES)) {
-		val = linkmode_to_mii_eee_cap2_t(adv);
+		val = linkmode_to_mii_eee_cap2_t(tmp);
 
 		/* IEEE 802.3-2022 45.2.7.16 EEE advertisement 2
 		 * (Register 7.62)
diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index 4e8db12d6092..6bf3ec985f3d 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -388,28 +388,25 @@ void of_set_phy_supported(struct phy_device *phydev)
 void of_set_phy_eee_broken(struct phy_device *phydev)
 {
 	struct device_node *node = phydev->mdio.dev.of_node;
-	u32 broken = 0;
+	unsigned long *modes = phydev->eee_broken_modes;
 
-	if (!IS_ENABLED(CONFIG_OF_MDIO))
+	if (!IS_ENABLED(CONFIG_OF_MDIO) || !node)
 		return;
 
-	if (!node)
-		return;
+	linkmode_zero(modes);
 
 	if (of_property_read_bool(node, "eee-broken-100tx"))
-		broken |= MDIO_EEE_100TX;
+		linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, modes);
 	if (of_property_read_bool(node, "eee-broken-1000t"))
-		broken |= MDIO_EEE_1000T;
+		linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, modes);
 	if (of_property_read_bool(node, "eee-broken-10gt"))
-		broken |= MDIO_EEE_10GT;
+		linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseT_Full_BIT, modes);
 	if (of_property_read_bool(node, "eee-broken-1000kx"))
-		broken |= MDIO_EEE_1000KX;
+		linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, modes);
 	if (of_property_read_bool(node, "eee-broken-10gkx4"))
-		broken |= MDIO_EEE_10GKX4;
+		linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, modes);
 	if (of_property_read_bool(node, "eee-broken-10gkr"))
-		broken |= MDIO_EEE_10GKR;
-
-	phydev->eee_broken_modes = broken;
+		linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, modes);
 }
 
 /**
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 1e4127c495c0..a6622f873d03 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -721,16 +721,15 @@ struct phy_device {
 	/* used for eee validation and configuration*/
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee);
+	/* Energy efficient ethernet modes which should be prohibited */
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(eee_broken_modes);
 	bool eee_enabled;
+	bool enable_tx_lpi;
+	struct eee_config eee_cfg;
 
 	/* Host supported PHY interface types. Should be ignored if empty. */
 	DECLARE_PHY_INTERFACE_MASK(host_interfaces);
 
-	/* Energy efficient ethernet modes which should be prohibited */
-	u32 eee_broken_modes;
-	bool enable_tx_lpi;
-	struct eee_config eee_cfg;
-
 #ifdef CONFIG_LED_TRIGGER_PHY
 	struct phy_led_trigger *phy_led_triggers;
 	unsigned int phy_num_led_triggers;
-- 
cgit v1.2.3


From ed623fb8e38e2a241da12864778ec9c9cf930c65 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 8 Nov 2024 08:07:10 +0100
Subject: net: phy: add phy_set_eee_broken

Add an accessor for eee_broken_modes, so that drivers
don't have to deal with phylib internals.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/0f8ee279-d40d-4489-a3b0-d993472d744a@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index a6622f873d03..b8346db42727 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1263,6 +1263,16 @@ void of_set_phy_eee_broken(struct phy_device *phydev);
 void of_set_phy_timing_role(struct phy_device *phydev);
 int phy_speed_down_core(struct phy_device *phydev);
 
+/**
+ * phy_set_eee_broken - Mark an EEE mode as broken so that it isn't advertised.
+ * @phydev: The phy_device struct
+ * @link_mode: The broken EEE mode
+ */
+static inline void phy_set_eee_broken(struct phy_device *phydev, u32 link_mode)
+{
+	linkmode_set_bit(link_mode, phydev->eee_broken_modes);
+}
+
 /**
  * phy_is_started - Convenience function to check whether PHY is started
  * @phydev: The phy_device struct
-- 
cgit v1.2.3


From e7cb7cf43afb96a4fd8f68c2dfb9b2fbdd444654 Mon Sep 17 00:00:00 2001
From: Alistair Francis <alistair.francis@wdc.com>
Date: Tue, 12 Nov 2024 20:54:29 +1000
Subject: include: mdio: Remove mdio45_ethtool_gset()

mdio45_ethtool_gset() is never called, so let's remove it.

Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
Link: https://patch.msgid.link/20241112105430.438491-1-alistair@alistair23.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mdio.h | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index efeca5bd7600..c63f43645d50 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -173,22 +173,6 @@ mdio45_ethtool_ksettings_get_npage(const struct mdio_if_info *mdio,
 				   struct ethtool_link_ksettings *cmd,
 				   u32 npage_adv, u32 npage_lpa);
 
-/**
- * mdio45_ethtool_gset - get settings for ETHTOOL_GSET
- * @mdio: MDIO interface
- * @ecmd: Ethtool request structure
- *
- * Since the CSRs for auto-negotiation using next pages are not fully
- * standardised, this function does not attempt to decode them.  Use
- * mdio45_ethtool_gset_npage() to specify advertisement bits from next
- * pages.
- */
-static inline void mdio45_ethtool_gset(const struct mdio_if_info *mdio,
-				       struct ethtool_cmd *ecmd)
-{
-	mdio45_ethtool_gset_npage(mdio, ecmd, 0, 0);
-}
-
 /**
  * mdio45_ethtool_ksettings_get - get settings for ETHTOOL_GLINKSETTINGS
  * @mdio: MDIO interface
-- 
cgit v1.2.3


From 575092a7f0ce4c6d5907cd908d28c0f69690a136 Mon Sep 17 00:00:00 2001
From: Alistair Francis <alistair.francis@wdc.com>
Date: Tue, 12 Nov 2024 20:54:30 +1000
Subject: mdio: Remove mdio45_ethtool_gset_npage()

The mdio45_ethtool_gset_npage() function isn't called, so let's remove
it.

Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
Link: https://patch.msgid.link/20241112105430.438491-2-alistair@alistair23.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/mdio.c   | 172 ---------------------------------------------------
 include/linux/mdio.h |   3 -
 2 files changed, 175 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mdio.c b/drivers/net/mdio.c
index e08c90ac0c6e..f67a4d4005e7 100644
--- a/drivers/net/mdio.c
+++ b/drivers/net/mdio.c
@@ -166,178 +166,6 @@ static u32 mdio45_get_an(const struct mdio_if_info *mdio, u16 addr)
 	return result;
 }
 
-/**
- * mdio45_ethtool_gset_npage - get settings for ETHTOOL_GSET
- * @mdio: MDIO interface
- * @ecmd: Ethtool request structure
- * @npage_adv: Modes currently advertised on next pages
- * @npage_lpa: Modes advertised by link partner on next pages
- *
- * The @ecmd parameter is expected to have been cleared before calling
- * mdio45_ethtool_gset_npage().
- *
- * Since the CSRs for auto-negotiation using next pages are not fully
- * standardised, this function does not attempt to decode them.  The
- * caller must pass them in.
- */
-void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio,
-			       struct ethtool_cmd *ecmd,
-			       u32 npage_adv, u32 npage_lpa)
-{
-	int reg;
-	u32 speed;
-
-	BUILD_BUG_ON(MDIO_SUPPORTS_C22 != ETH_MDIO_SUPPORTS_C22);
-	BUILD_BUG_ON(MDIO_SUPPORTS_C45 != ETH_MDIO_SUPPORTS_C45);
-
-	ecmd->transceiver = XCVR_INTERNAL;
-	ecmd->phy_address = mdio->prtad;
-	ecmd->mdio_support =
-		mdio->mode_support & (MDIO_SUPPORTS_C45 | MDIO_SUPPORTS_C22);
-
-	reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
-			      MDIO_CTRL2);
-	switch (reg & MDIO_PMA_CTRL2_TYPE) {
-	case MDIO_PMA_CTRL2_10GBT:
-	case MDIO_PMA_CTRL2_1000BT:
-	case MDIO_PMA_CTRL2_100BTX:
-	case MDIO_PMA_CTRL2_10BT:
-		ecmd->port = PORT_TP;
-		ecmd->supported = SUPPORTED_TP;
-		reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
-				      MDIO_SPEED);
-		if (reg & MDIO_SPEED_10G)
-			ecmd->supported |= SUPPORTED_10000baseT_Full;
-		if (reg & MDIO_PMA_SPEED_1000)
-			ecmd->supported |= (SUPPORTED_1000baseT_Full |
-					    SUPPORTED_1000baseT_Half);
-		if (reg & MDIO_PMA_SPEED_100)
-			ecmd->supported |= (SUPPORTED_100baseT_Full |
-					    SUPPORTED_100baseT_Half);
-		if (reg & MDIO_PMA_SPEED_10)
-			ecmd->supported |= (SUPPORTED_10baseT_Full |
-					    SUPPORTED_10baseT_Half);
-		ecmd->advertising = ADVERTISED_TP;
-		break;
-
-	case MDIO_PMA_CTRL2_10GBCX4:
-		ecmd->port = PORT_OTHER;
-		ecmd->supported = 0;
-		ecmd->advertising = 0;
-		break;
-
-	case MDIO_PMA_CTRL2_10GBKX4:
-	case MDIO_PMA_CTRL2_10GBKR:
-	case MDIO_PMA_CTRL2_1000BKX:
-		ecmd->port = PORT_OTHER;
-		ecmd->supported = SUPPORTED_Backplane;
-		reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
-				      MDIO_PMA_EXTABLE);
-		if (reg & MDIO_PMA_EXTABLE_10GBKX4)
-			ecmd->supported |= SUPPORTED_10000baseKX4_Full;
-		if (reg & MDIO_PMA_EXTABLE_10GBKR)
-			ecmd->supported |= SUPPORTED_10000baseKR_Full;
-		if (reg & MDIO_PMA_EXTABLE_1000BKX)
-			ecmd->supported |= SUPPORTED_1000baseKX_Full;
-		reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
-				      MDIO_PMA_10GBR_FECABLE);
-		if (reg & MDIO_PMA_10GBR_FECABLE_ABLE)
-			ecmd->supported |= SUPPORTED_10000baseR_FEC;
-		ecmd->advertising = ADVERTISED_Backplane;
-		break;
-
-	/* All the other defined modes are flavours of optical */
-	default:
-		ecmd->port = PORT_FIBRE;
-		ecmd->supported = SUPPORTED_FIBRE;
-		ecmd->advertising = ADVERTISED_FIBRE;
-		break;
-	}
-
-	if (mdio->mmds & MDIO_DEVS_AN) {
-		ecmd->supported |= SUPPORTED_Autoneg;
-		reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_AN,
-				      MDIO_CTRL1);
-		if (reg & MDIO_AN_CTRL1_ENABLE) {
-			ecmd->autoneg = AUTONEG_ENABLE;
-			ecmd->advertising |=
-				ADVERTISED_Autoneg |
-				mdio45_get_an(mdio, MDIO_AN_ADVERTISE) |
-				npage_adv;
-		} else {
-			ecmd->autoneg = AUTONEG_DISABLE;
-		}
-	} else {
-		ecmd->autoneg = AUTONEG_DISABLE;
-	}
-
-	if (ecmd->autoneg) {
-		u32 modes = 0;
-		int an_stat = mdio->mdio_read(mdio->dev, mdio->prtad,
-					      MDIO_MMD_AN, MDIO_STAT1);
-
-		/* If AN is complete and successful, report best common
-		 * mode, otherwise report best advertised mode. */
-		if (an_stat & MDIO_AN_STAT1_COMPLETE) {
-			ecmd->lp_advertising =
-				mdio45_get_an(mdio, MDIO_AN_LPA) | npage_lpa;
-			if (an_stat & MDIO_AN_STAT1_LPABLE)
-				ecmd->lp_advertising |= ADVERTISED_Autoneg;
-			modes = ecmd->advertising & ecmd->lp_advertising;
-		}
-		if ((modes & ~ADVERTISED_Autoneg) == 0)
-			modes = ecmd->advertising;
-
-		if (modes & (ADVERTISED_10000baseT_Full |
-			     ADVERTISED_10000baseKX4_Full |
-			     ADVERTISED_10000baseKR_Full)) {
-			speed = SPEED_10000;
-			ecmd->duplex = DUPLEX_FULL;
-		} else if (modes & (ADVERTISED_1000baseT_Full |
-				    ADVERTISED_1000baseT_Half |
-				    ADVERTISED_1000baseKX_Full)) {
-			speed = SPEED_1000;
-			ecmd->duplex = !(modes & ADVERTISED_1000baseT_Half);
-		} else if (modes & (ADVERTISED_100baseT_Full |
-				    ADVERTISED_100baseT_Half)) {
-			speed = SPEED_100;
-			ecmd->duplex = !!(modes & ADVERTISED_100baseT_Full);
-		} else {
-			speed = SPEED_10;
-			ecmd->duplex = !!(modes & ADVERTISED_10baseT_Full);
-		}
-	} else {
-		/* Report forced settings */
-		reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
-				      MDIO_CTRL1);
-		speed = (((reg & MDIO_PMA_CTRL1_SPEED1000) ? 100 : 1)
-			 * ((reg & MDIO_PMA_CTRL1_SPEED100) ? 100 : 10));
-		ecmd->duplex = (reg & MDIO_CTRL1_FULLDPLX ||
-				speed == SPEED_10000);
-	}
-
-	ethtool_cmd_speed_set(ecmd, speed);
-
-	/* 10GBASE-T MDI/MDI-X */
-	if (ecmd->port == PORT_TP
-	    && (ethtool_cmd_speed(ecmd) == SPEED_10000)) {
-		switch (mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
-					MDIO_PMA_10GBT_SWAPPOL)) {
-		case MDIO_PMA_10GBT_SWAPPOL_ABNX | MDIO_PMA_10GBT_SWAPPOL_CDNX:
-			ecmd->eth_tp_mdix = ETH_TP_MDI;
-			break;
-		case 0:
-			ecmd->eth_tp_mdix = ETH_TP_MDI_X;
-			break;
-		default:
-			/* It's complicated... */
-			ecmd->eth_tp_mdix = ETH_TP_MDI_INVALID;
-			break;
-		}
-	}
-}
-EXPORT_SYMBOL(mdio45_ethtool_gset_npage);
-
 /**
  * mdio45_ethtool_ksettings_get_npage - get settings for ETHTOOL_GLINKSETTINGS
  * @mdio: MDIO interface
diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index c63f43645d50..3c3deac57894 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -165,9 +165,6 @@ extern int mdio_set_flag(const struct mdio_if_info *mdio,
 			 bool sense);
 extern int mdio45_links_ok(const struct mdio_if_info *mdio, u32 mmds);
 extern int mdio45_nway_restart(const struct mdio_if_info *mdio);
-extern void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio,
-				      struct ethtool_cmd *ecmd,
-				      u32 npage_adv, u32 npage_lpa);
 extern void
 mdio45_ethtool_ksettings_get_npage(const struct mdio_if_info *mdio,
 				   struct ethtool_link_ksettings *cmd,
-- 
cgit v1.2.3


From 9e43ad7a1edef268acac603e1975c8f50a20d02f Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree.xilinx@gmail.com>
Date: Wed, 13 Nov 2024 12:13:09 +0000
Subject: net: ethtool: only allow set_rxnfc with rss + ring_cookie if driver
 opts in

Ethtool ntuple filters with FLOW_RSS were originally defined as adding
 the base queue ID (ring_cookie) to the value from the indirection table,
 so that the same table could distribute over more than one set of queues
 when used by different filters.
However, some drivers / hardware ignore the ring_cookie, and simply use
 the indirection table entries as queue IDs directly.  Thus, for drivers
 which have not opted in by setting ethtool_ops.cap_rss_rxnfc_adds to
 declare that they support the original (addition) semantics, reject in
 ethtool_set_rxnfc any filter which combines FLOW_RSS and a nonzero ring.
(For a ring_cookie of zero, both behaviours are equivalent.)
Set the cap bit in sfc, as it is known to support this feature.

Signed-off-by: Edward Cree <ecree.xilinx@gmail.com>
Reviewed-by: Martin Habets <habetsm.xilinx@gmail.com>
Link: https://patch.msgid.link/cc3da0844083b0e301a33092a6299e4042b65221.1731499022.git.ecree.xilinx@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/sfc/ef100_ethtool.c | 1 +
 drivers/net/ethernet/sfc/ethtool.c       | 1 +
 include/linux/ethtool.h                  | 4 ++++
 net/ethtool/ioctl.c                      | 5 +++++
 4 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/sfc/ef100_ethtool.c b/drivers/net/ethernet/sfc/ef100_ethtool.c
index 5c2551369812..6c3b74000d3b 100644
--- a/drivers/net/ethernet/sfc/ef100_ethtool.c
+++ b/drivers/net/ethernet/sfc/ef100_ethtool.c
@@ -59,6 +59,7 @@ const struct ethtool_ops ef100_ethtool_ops = {
 	.get_rxfh_indir_size	= efx_ethtool_get_rxfh_indir_size,
 	.get_rxfh_key_size	= efx_ethtool_get_rxfh_key_size,
 	.rxfh_per_ctx_key	= true,
+	.cap_rss_rxnfc_adds	= true,
 	.rxfh_priv_size		= sizeof(struct efx_rss_context_priv),
 	.get_rxfh		= efx_ethtool_get_rxfh,
 	.set_rxfh		= efx_ethtool_set_rxfh,
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index bb1930818beb..83d715544f7f 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -263,6 +263,7 @@ const struct ethtool_ops efx_ethtool_ops = {
 	.get_rxfh_indir_size	= efx_ethtool_get_rxfh_indir_size,
 	.get_rxfh_key_size	= efx_ethtool_get_rxfh_key_size,
 	.rxfh_per_ctx_key	= true,
+	.cap_rss_rxnfc_adds	= true,
 	.rxfh_priv_size		= sizeof(struct efx_rss_context_priv),
 	.get_rxfh		= efx_ethtool_get_rxfh,
 	.set_rxfh		= efx_ethtool_set_rxfh,
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 1199e308c8dd..299280c94d07 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -734,6 +734,9 @@ struct kernel_ethtool_ts_info {
  * @rxfh_per_ctx_key: device supports setting different RSS key for each
  *	additional context. Netlink API should report hfunc, key, and input_xfrm
  *	for every context, not just context 0.
+ * @cap_rss_rxnfc_adds: device supports nonzero ring_cookie in filters with
+ *	%FLOW_RSS flag; the queue ID from the filter is added to the value from
+ *	the indirection table to determine the delivery queue.
  * @rxfh_indir_space: max size of RSS indirection tables, if indirection table
  *	size as returned by @get_rxfh_indir_size may change during lifetime
  *	of the device. Leave as 0 if the table size is constant.
@@ -956,6 +959,7 @@ struct ethtool_ops {
 	u32     cap_rss_ctx_supported:1;
 	u32	cap_rss_sym_xor_supported:1;
 	u32	rxfh_per_ctx_key:1;
+	u32	cap_rss_rxnfc_adds:1;
 	u32	rxfh_indir_space;
 	u16	rxfh_key_space;
 	u16	rxfh_priv_size;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 7da94e26ced6..d86399bcf223 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -992,6 +992,11 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
 	if (rc)
 		return rc;
 
+	/* Nonzero ring with RSS only makes sense if NIC adds them together */
+	if (info.flow_type & FLOW_RSS && !ops->cap_rss_rxnfc_adds &&
+	    ethtool_get_flow_spec_ring(info.fs.ring_cookie))
+		return -EINVAL;
+
 	if (ops->get_rxfh) {
 		struct ethtool_rxfh_param rxfh = {};
 
-- 
cgit v1.2.3


From fd7b4f9f46d46acbc7af3a439bb0d869efdc5c58 Mon Sep 17 00:00:00 2001
From: Qun-Wei Lin <qun-wei.lin@mediatek.com>
Date: Wed, 13 Nov 2024 12:25:43 +0800
Subject: sched/task_stack: fix object_is_on_stack() for KASAN tagged pointers

When CONFIG_KASAN_SW_TAGS and CONFIG_KASAN_STACK are enabled, the
object_is_on_stack() function may produce incorrect results due to the
presence of tags in the obj pointer, while the stack pointer does not have
tags.  This discrepancy can lead to incorrect stack object detection and
subsequently trigger warnings if CONFIG_DEBUG_OBJECTS is also enabled.

Example of the warning:

ODEBUG: object 3eff800082ea7bb0 is NOT on stack ffff800082ea0000, but annotated.
------------[ cut here ]------------
WARNING: CPU: 0 PID: 1 at lib/debugobjects.c:557 __debug_object_init+0x330/0x364
Modules linked in:
CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0-rc5 #4
Hardware name: linux,dummy-virt (DT)
pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : __debug_object_init+0x330/0x364
lr : __debug_object_init+0x330/0x364
sp : ffff800082ea7b40
x29: ffff800082ea7b40 x28: 98ff0000c0164518 x27: 98ff0000c0164534
x26: ffff800082d93ec8 x25: 0000000000000001 x24: 1cff0000c00172a0
x23: 0000000000000000 x22: ffff800082d93ed0 x21: ffff800081a24418
x20: 3eff800082ea7bb0 x19: efff800000000000 x18: 0000000000000000
x17: 00000000000000ff x16: 0000000000000047 x15: 206b63617473206e
x14: 0000000000000018 x13: ffff800082ea7780 x12: 0ffff800082ea78e
x11: 0ffff800082ea790 x10: 0ffff800082ea79d x9 : 34d77febe173e800
x8 : 34d77febe173e800 x7 : 0000000000000001 x6 : 0000000000000001
x5 : feff800082ea74b8 x4 : ffff800082870a90 x3 : ffff80008018d3c4
x2 : 0000000000000001 x1 : ffff800082858810 x0 : 0000000000000050
Call trace:
 __debug_object_init+0x330/0x364
 debug_object_init_on_stack+0x30/0x3c
 schedule_hrtimeout_range_clock+0xac/0x26c
 schedule_hrtimeout+0x1c/0x30
 wait_task_inactive+0x1d4/0x25c
 kthread_bind_mask+0x28/0x98
 init_rescuer+0x1e8/0x280
 workqueue_init+0x1a0/0x3cc
 kernel_init_freeable+0x118/0x200
 kernel_init+0x28/0x1f0
 ret_from_fork+0x10/0x20
---[ end trace 0000000000000000 ]---
ODEBUG: object 3eff800082ea7bb0 is NOT on stack ffff800082ea0000, but annotated.
------------[ cut here ]------------

Link: https://lkml.kernel.org/r/20241113042544.19095-1-qun-wei.lin@mediatek.com
Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
Cc: Andrew Yang <andrew.yang@mediatek.com>
Cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Cc: Casper Li <casper.li@mediatek.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/task_stack.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index bf10bdb487dd..6c2fef89a4fd 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -9,6 +9,7 @@
 #include <linux/sched.h>
 #include <linux/magic.h>
 #include <linux/refcount.h>
+#include <linux/kasan.h>
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 
@@ -89,6 +90,7 @@ static inline int object_is_on_stack(const void *obj)
 {
 	void *stack = task_stack_page(current);
 
+	obj = kasan_reset_tag(obj);
 	return (obj >= stack) && (obj < (stack + THREAD_SIZE));
 }
 
-- 
cgit v1.2.3


From 05d4532b60e3e6e2a094ec56a88d1def50bd2430 Mon Sep 17 00:00:00 2001
From: Joshua Hahn <joshua.hahnjy@gmail.com>
Date: Fri, 1 Nov 2024 13:44:02 -0700
Subject: memcg/hugetlb: add hugeTLB counters to memcg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces a new counter to memory.stat that tracks hugeTLB
usage, only if hugeTLB accounting is done to memory.current.  This feature
is enabled the same way hugeTLB accounting is enabled, via the
memory_hugetlb_accounting mount flag for cgroupsv2.

1. Why is this patch necessary?
Currently, memcg hugeTLB accounting is an opt-in feature [1] that adds
hugeTLB usage to memory.current.  However, the metric is not reported in
memory.stat.  Given that users often interpret memory.stat as a breakdown
of the value reported in memory.current, the disparity between the two
reports can be confusing.  This patch solves this problem by including the
metric in memory.stat as well, but only if it is also reported in
memory.current (it would also be confusing if the value was reported in
memory.stat, but not in memory.current)

Aside from the consistency between the two files, we also see benefits in
observability.  Userspace might be interested in the hugeTLB footprint of
cgroups for many reasons.  For instance, system admins might want to
verify that hugeTLB usage is distributed as expected across tasks: i.e.
memory-intensive tasks are using more hugeTLB pages than tasks that don't
consume a lot of memory, or are seen to fault frequently.  Note that this
is separate from wanting to inspect the distribution for limiting purposes
(in which case, hugeTLB controller makes more sense).

2. We already have a hugeTLB controller. Why not use that?
It is true that hugeTLB tracks the exact value that we want.  In fact, by
enabling the hugeTLB controller, we get all of the observability benefits
that I mentioned above, and users can check the total hugeTLB usage,
verify if it is distributed as expected, etc.

With this said, there are 2 problems:
(a) They are still not reported in memory.stat, which means the
    disparity between the memcg reports are still there.
(b) We cannot reasonably expect users to enable the hugeTLB controller
    just for the sake of hugeTLB usage reporting, especially since
    they don't have any use for hugeTLB usage enforcing [2].

3. Implementation Details:
In the alloc / free hugetlb functions, we call lruvec_stat_mod_folio
regardless of whether memcg accounts hugetlb.  mem_cgroup_commit_charge
which is called from alloc_hugetlb_folio will set memcg for the folio only
if the CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING cgroup mount option is used, so
lruvec_stat_mod_folio accounts per-memcg hugetlb counters only if the
feature is enabled.  Regardless of whether memcg accounts for hugetlb, the
newly added global counter is updated and shown in /proc/vmstat.

The global counter is added because vmstats is the preferred framework for
cgroup stats.  It makes stat items consistent between global and cgroups.
It also provides a per-node breakdown, which is useful.  Because it does
not use cgroup-specific hooks, we also keep generic MM code separate from
memcg code.

[1] https://lore.kernel.org/all/20231006184629.155543-1-nphamcs@gmail.com/
[2] Of course, we can't make a new patch for every feature that can be
    duplicated. However, since the existing solution of enabling the
    hugeTLB controller is an imperfect solution that still leaves a
    discrepancy between memory.stat and memory.curent, I think that it
    is reasonable to isolate the feature in this case.

Link: https://lkml.kernel.org/r/20241101204402.1885383-1-joshua.hahnjy@gmail.com
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Suggested-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Shakeel Butt <shakeel.butt@linux.dev>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Chris Down <chris@chrisdown.name>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst |  5 +++++
 include/linux/mmzone.h                  |  3 +++
 mm/hugetlb.c                            |  2 ++
 mm/memcontrol.c                         | 11 +++++++++++
 mm/vmstat.c                             |  3 +++
 5 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 6d02168d78be..02dc54fe1f53 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1655,6 +1655,11 @@ The following nested keys are defined.
 	  pgdemote_khugepaged
 		Number of pages demoted by khugepaged.
 
+	  hugetlb
+		Amount of memory used by hugetlb pages. This metric only shows
+		up if hugetlb usage is accounted for in memory.current (i.e.
+		cgroup is mounted with the memory_hugetlb_accounting option).
+
   memory.numa_stat
 	A read-only nested-keyed file which exists on non-root cgroups.
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5e8f567753bd..b36124145a16 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -220,6 +220,9 @@ enum node_stat_item {
 	PGDEMOTE_KSWAPD,
 	PGDEMOTE_DIRECT,
 	PGDEMOTE_KHUGEPAGED,
+#ifdef CONFIG_HUGETLB_PAGE
+	NR_HUGETLB,
+#endif
 	NR_VM_NODE_STAT_ITEMS
 };
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2c8c5da0f5d3..ea2ed8e301ef 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1925,6 +1925,7 @@ void free_huge_folio(struct folio *folio)
 				     pages_per_huge_page(h), folio);
 	hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
 					  pages_per_huge_page(h), folio);
+	lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h));
 	mem_cgroup_uncharge(folio);
 	if (restore_reserve)
 		h->resv_huge_pages++;
@@ -3093,6 +3094,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 
 	if (!memcg_charge_ret)
 		mem_cgroup_commit_charge(folio, memcg);
+	lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
 	mem_cgroup_put(memcg);
 
 	return folio;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ed2dd88437ca..7b3503d12aaf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -315,6 +315,9 @@ static const unsigned int memcg_node_stat_items[] = {
 	PGDEMOTE_KSWAPD,
 	PGDEMOTE_DIRECT,
 	PGDEMOTE_KHUGEPAGED,
+#ifdef CONFIG_HUGETLB_PAGE
+	NR_HUGETLB,
+#endif
 };
 
 static const unsigned int memcg_stat_items[] = {
@@ -1366,6 +1369,9 @@ static const struct memory_stat memory_stats[] = {
 	{ "unevictable",		NR_UNEVICTABLE			},
 	{ "slab_reclaimable",		NR_SLAB_RECLAIMABLE_B		},
 	{ "slab_unreclaimable",		NR_SLAB_UNRECLAIMABLE_B		},
+#ifdef CONFIG_HUGETLB_PAGE
+	{ "hugetlb",			NR_HUGETLB			},
+#endif
 
 	/* The memory events */
 	{ "workingset_refault_anon",	WORKINGSET_REFAULT_ANON		},
@@ -1461,6 +1467,11 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
 		u64 size;
 
+#ifdef CONFIG_HUGETLB_PAGE
+		if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
+		    !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+			continue;
+#endif
 		size = memcg_page_state_output(memcg, memory_stats[i].idx);
 		seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 11a37c528395..4d016314a56c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1273,6 +1273,9 @@ const char * const vmstat_text[] = {
 	"pgdemote_kswapd",
 	"pgdemote_direct",
 	"pgdemote_khugepaged",
+#ifdef CONFIG_HUGETLB_PAGE
+	"nr_hugetlb",
+#endif
 	/* system-wide enum vm_stat_item counters */
 	"nr_dirty_threshold",
 	"nr_dirty_background_threshold",
-- 
cgit v1.2.3


From 4a530a7c751d27f9dbd70b7fc45670cd11713b13 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 11 Oct 2024 11:00:21 +0200
Subject: fs: prepare for "explicit connectable" file handles

We would like to use the high 16bit of the handle_type field to encode
file handle traits, such as "connectable".

In preparation for this change, make sure that filesystems do not return
a handle_type value with upper bits set and that the open_by_handle_at(2)
syscall rejects these handle types.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20241011090023.655623-2-amir73il@gmail.com
Fixes: 570df4e9c23f ("ceph: snapshot nfs re-export")
Acked-by:
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/exportfs/expfs.c      | 17 +++++++++++++++--
 fs/fhandle.c             |  7 +++++++
 include/linux/exportfs.h | 11 +++++++++++
 3 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4f2dd4ab4486..0c899cfba578 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -382,14 +382,24 @@ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
 			     int *max_len, struct inode *parent, int flags)
 {
 	const struct export_operations *nop = inode->i_sb->s_export_op;
+	enum fid_type type;
 
 	if (!exportfs_can_encode_fh(nop, flags))
 		return -EOPNOTSUPP;
 
 	if (!nop && (flags & EXPORT_FH_FID))
-		return exportfs_encode_ino64_fid(inode, fid, max_len);
+		type = exportfs_encode_ino64_fid(inode, fid, max_len);
+	else
+		type = nop->encode_fh(inode, fid->raw, max_len, parent);
+
+	if (type > 0 && FILEID_USER_FLAGS(type)) {
+		pr_warn_once("%s: unexpected fh type value 0x%x from fstype %s.\n",
+			     __func__, type, inode->i_sb->s_type->name);
+		return -EINVAL;
+	}
+
+	return type;
 
-	return nop->encode_fh(inode, fid->raw, max_len, parent);
 }
 EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh);
 
@@ -436,6 +446,9 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
 	char nbuf[NAME_MAX+1];
 	int err;
 
+	if (fileid_type < 0 || FILEID_USER_FLAGS(fileid_type))
+		return ERR_PTR(-EINVAL);
+
 	/*
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 82df28d45cd7..218511f38cbb 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -307,6 +307,11 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
 		retval = -EINVAL;
 		goto out_path;
 	}
+	if (f_handle.handle_type < 0 ||
+	    FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) {
+		retval = -EINVAL;
+		goto out_path;
+	}
 	handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
 			 GFP_KERNEL);
 	if (!handle) {
@@ -322,6 +327,8 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
 		goto out_handle;
 	}
 
+	/* Filesystem code should not be exposed to user flags */
+	handle->handle_type &= ~FILEID_USER_FLAGS_MASK;
 	retval = do_handle_to_path(handle, path, &ctx);
 
 out_handle:
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 893a1d21dc1c..5e14d4500a75 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -160,6 +160,17 @@ struct fid {
 #define EXPORT_FH_FID		0x2 /* File handle may be non-decodeable */
 #define EXPORT_FH_DIR_ONLY	0x4 /* Only decode file handle for a directory */
 
+/*
+ * Filesystems use only lower 8 bits of file_handle type for fid_type.
+ * name_to_handle_at() uses upper 16 bits of type as user flags to be
+ * interpreted by open_by_handle_at().
+ */
+#define FILEID_USER_FLAGS_MASK	0xffff0000
+#define FILEID_USER_FLAGS(type) ((type) & FILEID_USER_FLAGS_MASK)
+
+/* Flags supported in encoded handle_type that is exported to user */
+#define FILEID_VALID_USER_FLAGS	(0)
+
 /**
  * struct export_operations - for nfsd to communicate with file systems
  * @encode_fh:      encode a file handle fragment from a dentry
-- 
cgit v1.2.3


From c374196b2b9f4b803fccd59ed82f0712041e21e1 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 11 Oct 2024 11:00:22 +0200
Subject: fs: name_to_handle_at() support for "explicit connectable" file
 handles

nfsd encodes "connectable" file handles for the subtree_check feature,
which can be resolved to an open file with a connected path.
So far, userspace nfs server could not make use of this functionality.

Introduce a new flag AT_HANDLE_CONNECTABLE to name_to_handle_at(2).
When used, the encoded file handle is "explicitly connectable".

The "explicitly connectable" file handle sets bits in the high 16bit of
the handle_type field, so open_by_handle_at(2) will know that it needs
to open a file with a connected path.

old kernels will now recognize the handle_type with high bits set,
so "explicitly connectable" file handles cannot be decoded by
open_by_handle_at(2) on old kernels.

The flag AT_HANDLE_CONNECTABLE is not allowed together with either
AT_HANDLE_FID or AT_EMPTY_PATH.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20241011090023.655623-3-amir73il@gmail.com
Fixes: 570df4e9c23f ("ceph: snapshot nfs re-export")
Acked-by:
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fhandle.c               | 48 +++++++++++++++++++++++++++++++++++++++++-----
 include/linux/exportfs.h   |  2 ++
 include/uapi/linux/fcntl.h |  1 +
 3 files changed, 46 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fhandle.c b/fs/fhandle.c
index 218511f38cbb..8339a1041025 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -31,6 +31,14 @@ static long do_sys_name_to_handle(const struct path *path,
 	if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags))
 		return -EOPNOTSUPP;
 
+	/*
+	 * A request to encode a connectable handle for a disconnected dentry
+	 * is unexpected since AT_EMPTY_PATH is not allowed.
+	 */
+	if (fh_flags & EXPORT_FH_CONNECTABLE &&
+	    WARN_ON(path->dentry->d_flags & DCACHE_DISCONNECTED))
+		return -EINVAL;
+
 	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
 		return -EFAULT;
 
@@ -45,7 +53,7 @@ static long do_sys_name_to_handle(const struct path *path,
 	/* convert handle size to multiple of sizeof(u32) */
 	handle_dwords = f_handle.handle_bytes >> 2;
 
-	/* we ask for a non connectable maybe decodeable file handle */
+	/* Encode a possibly decodeable/connectable file handle */
 	retval = exportfs_encode_fh(path->dentry,
 				    (struct fid *)handle->f_handle,
 				    &handle_dwords, fh_flags);
@@ -67,8 +75,23 @@ static long do_sys_name_to_handle(const struct path *path,
 		 * non variable part of the file_handle
 		 */
 		handle_bytes = 0;
-	} else
+	} else {
+		/*
+		 * When asked to encode a connectable file handle, encode this
+		 * property in the file handle itself, so that we later know
+		 * how to decode it.
+		 * For sanity, also encode in the file handle if the encoded
+		 * object is a directory and verify this during decode, because
+		 * decoding directory file handles is quite different than
+		 * decoding connectable non-directory file handles.
+		 */
+		if (fh_flags & EXPORT_FH_CONNECTABLE) {
+			handle->handle_type |= FILEID_IS_CONNECTABLE;
+			if (d_is_dir(path->dentry))
+				fh_flags |= FILEID_IS_DIR;
+		}
 		retval = 0;
+	}
 	/* copy the mount id */
 	if (unique_mntid) {
 		if (put_user(real_mount(path->mnt)->mnt_id_unique,
@@ -109,15 +132,30 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
 {
 	struct path path;
 	int lookup_flags;
-	int fh_flags;
+	int fh_flags = 0;
 	int err;
 
 	if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID |
-		     AT_HANDLE_MNT_ID_UNIQUE))
+		     AT_HANDLE_MNT_ID_UNIQUE | AT_HANDLE_CONNECTABLE))
+		return -EINVAL;
+
+	/*
+	 * AT_HANDLE_FID means there is no intention to decode file handle
+	 * AT_HANDLE_CONNECTABLE means there is an intention to decode a
+	 * connected fd (with known path), so these flags are conflicting.
+	 * AT_EMPTY_PATH could be used along with a dfd that refers to a
+	 * disconnected non-directory, which cannot be used to encode a
+	 * connectable file handle, because its parent is unknown.
+	 */
+	if (flag & AT_HANDLE_CONNECTABLE &&
+	    flag & (AT_HANDLE_FID | AT_EMPTY_PATH))
 		return -EINVAL;
+	else if (flag & AT_HANDLE_FID)
+		fh_flags |= EXPORT_FH_FID;
+	else if (flag & AT_HANDLE_CONNECTABLE)
+		fh_flags |= EXPORT_FH_CONNECTABLE;
 
 	lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
-	fh_flags = (flag & AT_HANDLE_FID) ? EXPORT_FH_FID : 0;
 	if (flag & AT_EMPTY_PATH)
 		lookup_flags |= LOOKUP_EMPTY;
 	err = user_path_at(dfd, name, lookup_flags, &path);
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 5e14d4500a75..4ee42b2cf4ab 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -169,6 +169,8 @@ struct fid {
 #define FILEID_USER_FLAGS(type) ((type) & FILEID_USER_FLAGS_MASK)
 
 /* Flags supported in encoded handle_type that is exported to user */
+#define FILEID_IS_CONNECTABLE	0x10000
+#define FILEID_IS_DIR		0x20000
 #define FILEID_VALID_USER_FLAGS	(0)
 
 /**
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 87e2dec79fea..56ff2100e021 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -153,6 +153,7 @@
 					   object identity and may not be
 					   usable with open_by_handle_at(2). */
 #define AT_HANDLE_MNT_ID_UNIQUE	0x001	/* Return the u64 unique mount ID. */
+#define AT_HANDLE_CONNECTABLE	0x002	/* Request a connectable file handle */
 
 #if defined(__KERNEL__)
 #define AT_GETATTR_NOSEC	0x80000000
-- 
cgit v1.2.3


From a20853ab8296d4a8754482cb5e9adde8ab426a25 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 11 Oct 2024 11:00:23 +0200
Subject: fs: open_by_handle_at() support for decoding "explicit connectable"
 file handles

Teach open_by_handle_at(2) about the type format of "explicit connectable"
file handles that were created using the AT_HANDLE_CONNECTABLE flag to
name_to_handle_at(2).

When decoding an "explicit connectable" file handles, name_to_handle_at(2)
should fail if it cannot open a "connected" fd with known path, which is
accessible (to capable user) from mount fd path.

Note that this does not check if the path is accessible to the calling
user, just that it is accessible wrt the mount namesapce, so if there
is no "connected" alias, or if parts of the path are hidden in the
mount namespace, open_by_handle_at(2) will return -ESTALE.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20241011090023.655623-4-amir73il@gmail.com
Fixes: 570df4e9c23f ("ceph: snapshot nfs re-export")
Acked-by:
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fhandle.c             | 20 +++++++++++++++++++-
 include/linux/exportfs.h |  2 +-
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fhandle.c b/fs/fhandle.c
index 8339a1041025..75cfd190cd69 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -246,7 +246,13 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
 
 	if (!(ctx->flags & HANDLE_CHECK_SUBTREE) || d == root)
 		retval = 1;
-	WARN_ON_ONCE(d != root && d != root->d_sb->s_root);
+	/*
+	 * exportfs_decode_fh_raw() does not call acceptable() callback with
+	 * a disconnected directory dentry, so we should have reached either
+	 * mount fd directory or sb root.
+	 */
+	if (ctx->fh_flags & EXPORT_FH_DIR_ONLY)
+		WARN_ON_ONCE(d != root && d != root->d_sb->s_root);
 	dput(d);
 	return retval;
 }
@@ -350,6 +356,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
 		retval = -EINVAL;
 		goto out_path;
 	}
+
 	handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
 			 GFP_KERNEL);
 	if (!handle) {
@@ -365,6 +372,17 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
 		goto out_handle;
 	}
 
+	/*
+	 * If handle was encoded with AT_HANDLE_CONNECTABLE, verify that we
+	 * are decoding an fd with connected path, which is accessible from
+	 * the mount fd path.
+	 */
+	if (f_handle.handle_type & FILEID_IS_CONNECTABLE) {
+		ctx.fh_flags |= EXPORT_FH_CONNECTABLE;
+		ctx.flags |= HANDLE_CHECK_SUBTREE;
+	}
+	if (f_handle.handle_type & FILEID_IS_DIR)
+		ctx.fh_flags |= EXPORT_FH_DIR_ONLY;
 	/* Filesystem code should not be exposed to user flags */
 	handle->handle_type &= ~FILEID_USER_FLAGS_MASK;
 	retval = do_handle_to_path(handle, path, &ctx);
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 4ee42b2cf4ab..fcab6ab1d38a 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -171,7 +171,7 @@ struct fid {
 /* Flags supported in encoded handle_type that is exported to user */
 #define FILEID_IS_CONNECTABLE	0x10000
 #define FILEID_IS_DIR		0x20000
-#define FILEID_VALID_USER_FLAGS	(0)
+#define FILEID_VALID_USER_FLAGS	(FILEID_IS_CONNECTABLE | FILEID_IS_DIR)
 
 /**
  * struct export_operations - for nfsd to communicate with file systems
-- 
cgit v1.2.3


From 957860cbc1dc89f79f2acc193470224e350dfd03 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 15 Nov 2024 07:14:03 -0700
Subject: block: make struct rq_list available for !CONFIG_BLOCK

A previous commit changed how requests are linked in the plug structure,
but unlike the previous method, it uses a new type for it rather than
struct request. The latter is available even for !CONFIG_BLOCK, while
struct rq_list is now. Move it outside CONFIG_BLOCK.

Reported-by: Nathan Chancellor <nathan@kernel.org>
Fixes: a3396b99990d ("block: add a rq_list type")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 00212e96261a..a1fd0ddce5cf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1006,12 +1006,12 @@ extern void blk_put_queue(struct request_queue *);
 
 void blk_mark_disk_dead(struct gendisk *disk);
 
-#ifdef CONFIG_BLOCK
 struct rq_list {
 	struct request *head;
 	struct request *tail;
 };
 
+#ifdef CONFIG_BLOCK
 /*
  * blk_plug permits building a queue of related requests by holding the I/O
  * fragments for a short period. This allows merging of sequential requests
-- 
cgit v1.2.3


From 7eb4e1dd71009472215bc1f8226ee9a041da8d37 Mon Sep 17 00:00:00 2001
From: Nicolas Saenz Julienne <nsaenz@amazon.com>
Date: Tue, 12 Nov 2024 18:52:16 +0000
Subject: x86/efi: Drop support for the EFI_PROPERTIES_TABLE

Drop support for the EFI_PROPERTIES_TABLE. It was a failed, short-lived
experiment that broke the boot both on Linux and Windows, and was
replaced by the EFI_MEMORY_ATTRIBUTES_TABLE shortly after.

Suggested-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Nicolas Saenz Julienne <nsaenz@amazon.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/x86/platform/efi/efi.c    | 19 -------------------
 arch/x86/platform/efi/efi_64.c | 42 ------------------------------------------
 include/linux/efi.h            | 17 +++--------------
 3 files changed, 3 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 88a96816de9a..375ebd78296a 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -54,14 +54,12 @@
 #include <asm/uv/uv.h>
 
 static unsigned long efi_systab_phys __initdata;
-static unsigned long prop_phys = EFI_INVALID_TABLE_ADDR;
 static unsigned long uga_phys = EFI_INVALID_TABLE_ADDR;
 static unsigned long efi_runtime, efi_nr_tables;
 
 unsigned long efi_fw_vendor, efi_config_table;
 
 static const efi_config_table_type_t arch_tables[] __initconst = {
-	{EFI_PROPERTIES_TABLE_GUID,	&prop_phys,		"PROP"		},
 	{UGA_IO_PROTOCOL_GUID,		&uga_phys,		"UGA"		},
 #ifdef CONFIG_X86_UV
 	{UV_SYSTEM_TABLE_GUID,		&uv_systab_phys,	"UVsystab"	},
@@ -82,7 +80,6 @@ static const unsigned long * const efi_tables[] = {
 	&efi_runtime,
 	&efi_config_table,
 	&efi.esrt,
-	&prop_phys,
 	&efi_mem_attr_table,
 #ifdef CONFIG_EFI_RCI2_TABLE
 	&rci2_table_phys,
@@ -502,22 +499,6 @@ void __init efi_init(void)
 		return;
 	}
 
-	/* Parse the EFI Properties table if it exists */
-	if (prop_phys != EFI_INVALID_TABLE_ADDR) {
-		efi_properties_table_t *tbl;
-
-		tbl = early_memremap_ro(prop_phys, sizeof(*tbl));
-		if (tbl == NULL) {
-			pr_err("Could not map Properties table!\n");
-		} else {
-			if (tbl->memory_protection_attribute &
-			    EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA)
-				set_bit(EFI_NX_PE_DATA, &efi.flags);
-
-			early_memunmap(tbl, sizeof(*tbl));
-		}
-	}
-
 	set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
 	efi_clean_memmap();
 
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 91d31ac422d6..ac57259a432b 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -412,51 +412,9 @@ static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *m
 
 void __init efi_runtime_update_mappings(void)
 {
-	efi_memory_desc_t *md;
-
-	/*
-	 * Use the EFI Memory Attribute Table for mapping permissions if it
-	 * exists, since it is intended to supersede EFI_PROPERTIES_TABLE.
-	 */
 	if (efi_enabled(EFI_MEM_ATTR)) {
 		efi_disable_ibt_for_runtime = false;
 		efi_memattr_apply_permissions(NULL, efi_update_mem_attr);
-		return;
-	}
-
-	/*
-	 * EFI_MEMORY_ATTRIBUTES_TABLE is intended to replace
-	 * EFI_PROPERTIES_TABLE. So, use EFI_PROPERTIES_TABLE to update
-	 * permissions only if EFI_MEMORY_ATTRIBUTES_TABLE is not
-	 * published by the firmware. Even if we find a buggy implementation of
-	 * EFI_MEMORY_ATTRIBUTES_TABLE, don't fall back to
-	 * EFI_PROPERTIES_TABLE, because of the same reason.
-	 */
-
-	if (!efi_enabled(EFI_NX_PE_DATA))
-		return;
-
-	for_each_efi_memory_desc(md) {
-		unsigned long pf = 0;
-
-		if (!(md->attribute & EFI_MEMORY_RUNTIME))
-			continue;
-
-		if (!(md->attribute & EFI_MEMORY_WB))
-			pf |= _PAGE_PCD;
-
-		if ((md->attribute & EFI_MEMORY_XP) ||
-			(md->type == EFI_RUNTIME_SERVICES_DATA))
-			pf |= _PAGE_NX;
-
-		if (!(md->attribute & EFI_MEMORY_RO) &&
-			(md->type != EFI_RUNTIME_SERVICES_CODE))
-			pf |= _PAGE_RW;
-
-		if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
-			pf |= _PAGE_ENC;
-
-		efi_update_mappings(md, pf);
 	}
 }
 
diff --git a/include/linux/efi.h b/include/linux/efi.h
index e28d88066033..e5815867aba9 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -379,7 +379,6 @@ void efi_native_runtime_setup(void);
 #define EFI_SYSTEM_RESOURCE_TABLE_GUID		EFI_GUID(0xb122a263, 0x3661, 0x4f68,  0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80)
 #define EFI_FILE_SYSTEM_GUID			EFI_GUID(0x964e5b22, 0x6459, 0x11d2,  0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
 #define DEVICE_TREE_GUID			EFI_GUID(0xb1b621d5, 0xf19c, 0x41a5,  0x83, 0x0b, 0xd9, 0x15, 0x2c, 0x69, 0xaa, 0xe0)
-#define EFI_PROPERTIES_TABLE_GUID		EFI_GUID(0x880aaca3, 0x4adc, 0x4a04,  0x90, 0x79, 0xb7, 0x47, 0x34, 0x08, 0x25, 0xe5)
 #define EFI_RNG_PROTOCOL_GUID			EFI_GUID(0x3152bca5, 0xeade, 0x433d,  0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44)
 #define EFI_RNG_ALGORITHM_RAW			EFI_GUID(0xe43176d7, 0xb6e8, 0x4827,  0xb7, 0x84, 0x7f, 0xfd, 0xc4, 0xb6, 0x85, 0x61)
 #define EFI_MEMORY_ATTRIBUTES_TABLE_GUID	EFI_GUID(0xdcfa911d, 0x26eb, 0x469f,  0xa2, 0x20, 0x38, 0xb7, 0xdc, 0x46, 0x12, 0x20)
@@ -580,15 +579,6 @@ struct efi_mem_range {
 	u64 attribute;
 };
 
-typedef struct {
-	u32 version;
-	u32 length;
-	u64 memory_protection_attribute;
-} efi_properties_table_t;
-
-#define EFI_PROPERTIES_TABLE_VERSION	0x00010000
-#define EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA	0x1
-
 typedef struct {
 	u16 version;
 	u16 length;
@@ -871,10 +861,9 @@ static inline int efi_range_is_wc(unsigned long start, unsigned long len)
 #define EFI_PARAVIRT		6	/* Access is via a paravirt interface */
 #define EFI_ARCH_1		7	/* First arch-specific bit */
 #define EFI_DBG			8	/* Print additional debug info at runtime */
-#define EFI_NX_PE_DATA		9	/* Can runtime data regions be mapped non-executable? */
-#define EFI_MEM_ATTR		10	/* Did firmware publish an EFI_MEMORY_ATTRIBUTES table? */
-#define EFI_MEM_NO_SOFT_RESERVE	11	/* Is the kernel configured to ignore soft reservations? */
-#define EFI_PRESERVE_BS_REGIONS	12	/* Are EFI boot-services memory segments available? */
+#define EFI_MEM_ATTR		9	/* Did firmware publish an EFI_MEMORY_ATTRIBUTES table? */
+#define EFI_MEM_NO_SOFT_RESERVE	10	/* Is the kernel configured to ignore soft reservations? */
+#define EFI_PRESERVE_BS_REGIONS	11	/* Are EFI boot-services memory segments available? */
 
 #ifdef CONFIG_EFI
 /*
-- 
cgit v1.2.3


From 83e041522eb9c45479f4490b212687cf1e7e9999 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:40 +0000
Subject: io_uring: temporarily disable registered waits

Disable wait argument registration as it'll be replaced with a more
generic feature. We'll still need IORING_ENTER_EXT_ARG_REG parsing
in a few commits so leave it be.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/70b1d1d218c41ba77a76d1789c8641dab0b0563e.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 10 ------
 include/uapi/linux/io_uring.h  |  3 --
 io_uring/io_uring.c            | 10 ------
 io_uring/register.c            | 82 ------------------------------------------
 io_uring/register.h            |  1 -
 5 files changed, 106 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 072e65e93105..52a5da99a205 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -330,14 +330,6 @@ struct io_ring_ctx {
 		atomic_t		cq_wait_nr;
 		atomic_t		cq_timeouts;
 		struct wait_queue_head	cq_wait;
-
-		/*
-		 * If registered with IORING_REGISTER_CQWAIT_REG, a single
-		 * page holds N entries, mapped in cq_wait_arg. cq_wait_index
-		 * is the maximum allowable index.
-		 */
-		struct io_uring_reg_wait	*cq_wait_arg;
-		unsigned char			cq_wait_index;
 	} ____cacheline_aligned_in_smp;
 
 	/* timeouts */
@@ -431,8 +423,6 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
-
-	struct page			**cq_wait_page;
 };
 
 struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5d08435b95a8..132f5db3d4e8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -627,9 +627,6 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
-	/* register fixed io_uring_reg_wait arguments */
-	IORING_REGISTER_CQWAIT_REG		= 34,
-
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 464a70bde7e6..286b7bb73978 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2709,7 +2709,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
-	io_unregister_cqwait_reg(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
@@ -3195,15 +3194,6 @@ void __io_uring_cancel(bool cancel_all)
 static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
 			const struct io_uring_getevents_arg __user *uarg)
 {
-	struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg);
-
-	if (arg) {
-		unsigned int index = (unsigned int) (uintptr_t) uarg;
-
-		if (index <= ctx->cq_wait_index)
-			return arg + index;
-	}
-
 	return ERR_PTR(-EFAULT);
 }
 
diff --git a/io_uring/register.c b/io_uring/register.c
index 45edfc57963a..3c5a3cfb186b 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,82 +570,6 @@ out:
 	return ret;
 }
 
-void io_unregister_cqwait_reg(struct io_ring_ctx *ctx)
-{
-	unsigned short npages = 1;
-
-	if (!ctx->cq_wait_page)
-		return;
-
-	io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true);
-	ctx->cq_wait_arg = NULL;
-	if (ctx->user)
-		__io_unaccount_mem(ctx->user, 1);
-}
-
-/*
- * Register a page holding N entries of struct io_uring_reg_wait, which can
- * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set.
- * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing
- * in a pointer for a struct io_uring_getevents_arg, an index into this
- * registered array is passed, avoiding two (arg + timeout) copies per
- * invocation.
- */
-static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg)
-{
-	struct io_uring_cqwait_reg_arg arg;
-	struct io_uring_reg_wait *reg;
-	struct page **pages;
-	unsigned long len;
-	int nr_pages, poff;
-	int ret;
-
-	if (ctx->cq_wait_page || ctx->cq_wait_arg)
-		return -EBUSY;
-	if (copy_from_user(&arg, uarg, sizeof(arg)))
-		return -EFAULT;
-	if (!arg.nr_entries || arg.flags)
-		return -EINVAL;
-	if (arg.struct_size != sizeof(*reg))
-		return -EINVAL;
-	if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len))
-		return -EOVERFLOW;
-	if (len > PAGE_SIZE)
-		return -EINVAL;
-	/* offset + len must fit within a page, and must be reg_wait aligned */
-	poff = arg.user_addr & ~PAGE_MASK;
-	if (len + poff > PAGE_SIZE)
-		return -EINVAL;
-	if (poff % arg.struct_size)
-		return -EINVAL;
-
-	pages = io_pin_pages(arg.user_addr, len, &nr_pages);
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
-	ret = -EINVAL;
-	if (nr_pages != 1)
-		goto out_free;
-	if (ctx->user) {
-		ret = __io_account_mem(ctx->user, 1);
-		if (ret)
-			goto out_free;
-	}
-
-	reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
-	if (reg) {
-		ctx->cq_wait_index = arg.nr_entries - 1;
-		WRITE_ONCE(ctx->cq_wait_page, pages);
-		WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff);
-		return 0;
-	}
-	ret = -ENOMEM;
-	if (ctx->user)
-		__io_unaccount_mem(ctx->user, 1);
-out_free:
-	io_pages_free(&pages, nr_pages);
-	return ret;
-}
-
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -840,12 +764,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
-	case IORING_REGISTER_CQWAIT_REG:
-		ret = -EINVAL;
-		if (!arg || nr_args != 1)
-			break;
-		ret = io_register_cqwait_reg(ctx, arg);
-		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/register.h b/io_uring/register.h
index 3e935e8fa4b2..a5f39d5ef9e0 100644
--- a/io_uring/register.h
+++ b/io_uring/register.h
@@ -5,6 +5,5 @@
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
 struct file *io_uring_register_get_file(unsigned int fd, bool registered);
-void io_unregister_cqwait_reg(struct io_ring_ctx *ctx);
 
 #endif
-- 
cgit v1.2.3


From dfbbfbf191878e8dd422768ce009858d8b5b761e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:41 +0000
Subject: io_uring: introduce concept of memory regions

We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)

Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.

Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  6 ++++
 include/uapi/linux/io_uring.h  | 14 +++++++++
 io_uring/memmap.c              | 67 ++++++++++++++++++++++++++++++++++++++++++
 io_uring/memmap.h              | 14 +++++++++
 4 files changed, 101 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 52a5da99a205..1d3a37234ace 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -75,6 +75,12 @@ struct io_hash_table {
 	unsigned		hash_bits;
 };
 
+struct io_mapped_region {
+	struct page		**pages;
+	void			*vmap_ptr;
+	size_t			nr_pages;
+};
+
 /*
  * Arbitrary limit, can be raised if need be
  */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 132f5db3d4e8..5cbfd330c688 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -647,6 +647,20 @@ struct io_uring_files_update {
 	__aligned_u64 /* __s32 * */ fds;
 };
 
+enum {
+	/* initialise with user provided memory pointed by user_addr */
+	IORING_MEM_REGION_TYPE_USER		= 1,
+};
+
+struct io_uring_region_desc {
+	__u64 user_addr;
+	__u64 size;
+	__u32 flags;
+	__u32 id;
+	__u64 mmap_offset;
+	__u64 __resv[4];
+};
+
 /*
  * Register a fully sparse file space, rather than pass in an array of all
  * -1 file descriptors.
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 6ab59c60dfd0..bbd9569a0120 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -12,6 +12,7 @@
 
 #include "memmap.h"
 #include "kbuf.h"
+#include "rsrc.h"
 
 static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
 				   size_t size, gfp_t gfp)
@@ -194,6 +195,72 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 	return ERR_PTR(-ENOMEM);
 }
 
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
+{
+	if (mr->pages) {
+		unpin_user_pages(mr->pages, mr->nr_pages);
+		kvfree(mr->pages);
+	}
+	if (mr->vmap_ptr)
+		vunmap(mr->vmap_ptr);
+	if (mr->nr_pages && ctx->user)
+		__io_unaccount_mem(ctx->user, mr->nr_pages);
+
+	memset(mr, 0, sizeof(*mr));
+}
+
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+		     struct io_uring_region_desc *reg)
+{
+	int pages_accounted = 0;
+	struct page **pages;
+	int nr_pages, ret;
+	void *vptr;
+	u64 end;
+
+	if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
+		return -EFAULT;
+	if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
+		return -EINVAL;
+	if (reg->flags != IORING_MEM_REGION_TYPE_USER)
+		return -EINVAL;
+	if (!reg->user_addr)
+		return -EFAULT;
+	if (!reg->size || reg->mmap_offset || reg->id)
+		return -EINVAL;
+	if ((reg->size >> PAGE_SHIFT) > INT_MAX)
+		return E2BIG;
+	if ((reg->user_addr | reg->size) & ~PAGE_MASK)
+		return -EINVAL;
+	if (check_add_overflow(reg->user_addr, reg->size, &end))
+		return -EOVERFLOW;
+
+	pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	if (ctx->user) {
+		ret = __io_account_mem(ctx->user, nr_pages);
+		if (ret)
+			goto out_free;
+		pages_accounted = nr_pages;
+	}
+
+	vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!vptr)
+		goto out_free;
+
+	mr->pages = pages;
+	mr->vmap_ptr = vptr;
+	mr->nr_pages = nr_pages;
+	return 0;
+out_free:
+	if (pages_accounted)
+		__io_unaccount_mem(ctx->user, pages_accounted);
+	io_pages_free(&pages, nr_pages);
+	return ret;
+}
+
 static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
 					    size_t sz)
 {
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index 5cec5b7ac49a..f361a635b6c7 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -22,4 +22,18 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
 					 unsigned long flags);
 int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
 
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+		     struct io_uring_region_desc *reg);
+
+static inline void *io_region_get_ptr(struct io_mapped_region *mr)
+{
+	return mr->vmap_ptr;
+}
+
+static inline bool io_region_is_set(struct io_mapped_region *mr)
+{
+	return !!mr->nr_pages;
+}
+
 #endif
-- 
cgit v1.2.3


From 93238e66185524aad925acefb2312203b9e26d63 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:42 +0000
Subject: io_uring: add memory region registration

Regions will serve multiple purposes. First, with it we can decouple
ring/etc. object creation from registration / mapping of the memory they
will be placed in. We already have hacks that allow to put both SQ and
CQ into the same huge page, in the future we should be able to:

region = create_region(io_ring);
create_pbuf_ring(io_uring, region, offset=0);
create_pbuf_ring(io_uring, region, offset=N);

The second use case is efficiently passing parameters. The following
patch enables back on top of regions IORING_ENTER_EXT_ARG_REG, which
optimises wait arguments. It'll also be useful for request arguments
replacing iovecs, msghdr, etc. pointers. Eventually it would also be
handy for BPF as well if it comes to fruition.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0798cf3a14fad19cfc96fc9feca5f3e11481691d.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +++
 include/uapi/linux/io_uring.h  |  8 ++++++++
 io_uring/io_uring.c            |  1 +
 io_uring/register.c            | 37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 1d3a37234ace..e1d69123e164 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -429,6 +429,9 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
+
+	/* used for optimised request parameter and wait argument passing  */
+	struct io_mapped_region		param_region;
 };
 
 struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5cbfd330c688..1ee35890125b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -627,6 +627,8 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
+	IORING_REGISTER_MEM_REGION		= 34,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -661,6 +663,12 @@ struct io_uring_region_desc {
 	__u64 __resv[4];
 };
 
+struct io_uring_mem_region_reg {
+	__u64 region_uptr; /* struct io_uring_region_desc * */
+	__u64 flags;
+	__u64 __resv[2];
+};
+
 /*
  * Register a fully sparse file space, rather than pass in an array of all
  * -1 file descriptors.
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 286b7bb73978..c640b8a4ceee 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2709,6 +2709,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
+	io_free_region(ctx, &ctx->param_region);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
diff --git a/io_uring/register.c b/io_uring/register.c
index 3c5a3cfb186b..2cbac3d9b288 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,6 +570,37 @@ out:
 	return ret;
 }
 
+static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
+{
+	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
+	struct io_uring_mem_region_reg reg;
+	struct io_uring_region_desc __user *rd_uptr;
+	struct io_uring_region_desc rd;
+	int ret;
+
+	if (io_region_is_set(&ctx->param_region))
+		return -EBUSY;
+	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
+		return -EFAULT;
+	rd_uptr = u64_to_user_ptr(reg.region_uptr);
+	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
+		return -EFAULT;
+
+	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+		return -EINVAL;
+	if (reg.flags)
+		return -EINVAL;
+
+	ret = io_create_region(ctx, &ctx->param_region, &rd);
+	if (ret)
+		return ret;
+	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
+		io_free_region(ctx, &ctx->param_region);
+		return -EFAULT;
+	}
+	return 0;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -764,6 +795,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
+	case IORING_REGISTER_MEM_REGION:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_mem_region(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
cgit v1.2.3


From 96a30e469ca1d2b8cc7811b40911f8614b558241 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Nov 2024 16:13:03 -0800
Subject: bpf: use common instruction history across all states

Instead of allocating and copying instruction history each time we
enqueue child verifier state, switch to a model where we use one common
dynamically sized array of instruction history entries across all states.

The key observation for proving this is correct is that instruction
history is only relevant while state is active, which means it either is
a current state (and thus we are actively modifying instruction history
and no other state can interfere with us) or we are checkpointed state
with some children still active (either enqueued or being current).

In the latter case our portion of instruction history is finalized and
won't change or grow, so as long as we keep it immutable until the state
is finalized, we are good.

Now, when state is finalized and is put into state hash for potentially
future pruning lookups, instruction history is not used anymore. This is
because instruction history is only used by precision marking logic, and
we never modify precision markings for finalized states.

So, instead of each state having its own small instruction history, we
keep a global dynamically-sized instruction history, where each state in
current DFS path from root to active state remembers its portion of
instruction history. Current state can append to this history, but
cannot modify any of its parent histories.

Async callback state enqueueing, while logically detached from parent
state, still is part of verification backtracking tree, so has to follow
the same schema as normal state checkpoints.

Because the insn_hist array can be grown through realloc, states don't
keep pointers, they instead maintain two indices, [start, end), into
global instruction history array. End is exclusive index, so
`start == end` means there is no relevant instruction history.

This eliminates a lot of allocations and minimizes overall memory usage.

For instance, running a worst-case test from [0] (but without the
heuristics-based fix [1]), it took 12.5 minutes until we get -ENOMEM.
With the changes in this patch the whole test succeeds in 10 minutes
(very slow, so heuristics from [1] is important, of course).

To further validate correctness, veristat-based comparison was performed for
Meta production BPF objects and BPF selftests objects. In both cases there
were no differences *at all* in terms of verdict or instruction and state
counts, providing a good confidence in the change.

Having this low-memory-overhead solution of keeping dynamic
per-instruction history cheaply opens up some new possibilities, like
keeping extra information for literally every single validated
instruction. This will be used for simplifying precision backpropagation
logic in follow up patches.

  [0] https://lore.kernel.org/bpf/20241029172641.1042523-2-eddyz87@gmail.com/
  [1] https://lore.kernel.org/bpf/20241029172641.1042523-1-eddyz87@gmail.com/

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241115001303.277272-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  19 ++++----
 kernel/bpf/verifier.c        | 107 +++++++++++++++++++++----------------------
 2 files changed, 63 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 6b7c91629176..f4290c179bee 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -334,7 +334,7 @@ struct bpf_func_state {
 
 #define MAX_CALL_FRAMES 8
 
-/* instruction history flags, used in bpf_jmp_history_entry.flags field */
+/* instruction history flags, used in bpf_insn_hist_entry.flags field */
 enum {
 	/* instruction references stack slot through PTR_TO_STACK register;
 	 * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8)
@@ -352,7 +352,7 @@ enum {
 static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES);
 static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8);
 
-struct bpf_jmp_history_entry {
+struct bpf_insn_hist_entry {
 	u32 idx;
 	/* insn idx can't be bigger than 1 million */
 	u32 prev_idx : 22;
@@ -442,13 +442,14 @@ struct bpf_verifier_state {
 	 * See get_loop_entry() for more information.
 	 */
 	struct bpf_verifier_state *loop_entry;
-	/* jmp history recorded from first to last.
-	 * backtracking is using it to go from last to first.
-	 * For most states jmp_history_cnt is [0-3].
+	/* Sub-range of env->insn_hist[] corresponding to this state's
+	 * instruction history.
+	 * Backtracking is using it to go from last to first.
+	 * For most states instruction history is short, 0-3 instructions.
 	 * For loops can go up to ~40.
 	 */
-	struct bpf_jmp_history_entry *jmp_history;
-	u32 jmp_history_cnt;
+	u32 insn_hist_start;
+	u32 insn_hist_end;
 	u32 dfs_depth;
 	u32 callback_unroll_depth;
 	u32 may_goto_depth;
@@ -738,7 +739,9 @@ struct bpf_verifier_env {
 		int cur_stack;
 	} cfg;
 	struct backtrack_state bt;
-	struct bpf_jmp_history_entry *cur_hist_ent;
+	struct bpf_insn_hist_entry *insn_hist;
+	struct bpf_insn_hist_entry *cur_hist_ent;
+	u32 insn_hist_cap;
 	u32 pass_cnt; /* number of times do_check() was called */
 	u32 subprog_cnt;
 	/* number of instructions analyzed by the verifier */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 09f7fa635f67..1c4ebb326785 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1452,13 +1452,6 @@ static void free_func_state(struct bpf_func_state *state)
 	kfree(state);
 }
 
-static void clear_jmp_history(struct bpf_verifier_state *state)
-{
-	kfree(state->jmp_history);
-	state->jmp_history = NULL;
-	state->jmp_history_cnt = 0;
-}
-
 static void free_verifier_state(struct bpf_verifier_state *state,
 				bool free_self)
 {
@@ -1468,7 +1461,6 @@ static void free_verifier_state(struct bpf_verifier_state *state,
 		free_func_state(state->frame[i]);
 		state->frame[i] = NULL;
 	}
-	clear_jmp_history(state);
 	if (free_self)
 		kfree(state);
 }
@@ -1494,13 +1486,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	struct bpf_func_state *dst;
 	int i, err;
 
-	dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
-					  src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
-					  GFP_USER);
-	if (!dst_state->jmp_history)
-		return -ENOMEM;
-	dst_state->jmp_history_cnt = src->jmp_history_cnt;
-
 	/* if dst has more stack frames then src frame, free them, this is also
 	 * necessary in case of exceptional exits using bpf_throw.
 	 */
@@ -1517,6 +1502,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->parent = src->parent;
 	dst_state->first_insn_idx = src->first_insn_idx;
 	dst_state->last_insn_idx = src->last_insn_idx;
+	dst_state->insn_hist_start = src->insn_hist_start;
+	dst_state->insn_hist_end = src->insn_hist_end;
 	dst_state->dfs_depth = src->dfs_depth;
 	dst_state->callback_unroll_depth = src->callback_unroll_depth;
 	dst_state->used_as_loop_entry = src->used_as_loop_entry;
@@ -2569,9 +2556,14 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
 	 * The caller state doesn't matter.
 	 * This is async callback. It starts in a fresh stack.
 	 * Initialize it similar to do_check_common().
+	 * But we do need to make sure to not clobber insn_hist, so we keep
+	 * chaining insn_hist_start/insn_hist_end indices as for a normal
+	 * child state.
 	 */
 	elem->st.branches = 1;
 	elem->st.in_sleepable = is_sleepable;
+	elem->st.insn_hist_start = env->cur_state->insn_hist_end;
+	elem->st.insn_hist_end = elem->st.insn_hist_start;
 	frame = kzalloc(sizeof(*frame), GFP_KERNEL);
 	if (!frame)
 		goto err;
@@ -3551,11 +3543,10 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s)
 }
 
 /* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
-			    int insn_flags, u64 linked_regs)
+static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+			     int insn_flags, u64 linked_regs)
 {
-	u32 cnt = cur->jmp_history_cnt;
-	struct bpf_jmp_history_entry *p;
+	struct bpf_insn_hist_entry *p;
 	size_t alloc_size;
 
 	/* combine instruction flags if we already recorded this instruction */
@@ -3575,29 +3566,32 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st
 		return 0;
 	}
 
-	cnt++;
-	alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
-	p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
-	if (!p)
-		return -ENOMEM;
-	cur->jmp_history = p;
+	if (cur->insn_hist_end + 1 > env->insn_hist_cap) {
+		alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p));
+		p = kvrealloc(env->insn_hist, alloc_size, GFP_USER);
+		if (!p)
+			return -ENOMEM;
+		env->insn_hist = p;
+		env->insn_hist_cap = alloc_size / sizeof(*p);
+	}
 
-	p = &cur->jmp_history[cnt - 1];
+	p = &env->insn_hist[cur->insn_hist_end];
 	p->idx = env->insn_idx;
 	p->prev_idx = env->prev_insn_idx;
 	p->flags = insn_flags;
 	p->linked_regs = linked_regs;
-	cur->jmp_history_cnt = cnt;
+
+	cur->insn_hist_end++;
 	env->cur_hist_ent = p;
 
 	return 0;
 }
 
-static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
-						        u32 hist_end, int insn_idx)
+static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env,
+						       u32 hist_start, u32 hist_end, int insn_idx)
 {
-	if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
-		return &st->jmp_history[hist_end - 1];
+	if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx)
+		return &env->insn_hist[hist_end - 1];
 	return NULL;
 }
 
@@ -3614,25 +3608,26 @@ static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_stat
  * history entry recording a jump from last instruction of parent state and
  * first instruction of given state.
  */
-static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
-			     u32 *history)
+static int get_prev_insn_idx(const struct bpf_verifier_env *env,
+			     struct bpf_verifier_state *st,
+			     int insn_idx, u32 hist_start, u32 *hist_endp)
 {
-	u32 cnt = *history;
+	u32 hist_end = *hist_endp;
+	u32 cnt = hist_end - hist_start;
 
-	if (i == st->first_insn_idx) {
+	if (insn_idx == st->first_insn_idx) {
 		if (cnt == 0)
 			return -ENOENT;
-		if (cnt == 1 && st->jmp_history[0].idx == i)
+		if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx)
 			return -ENOENT;
 	}
 
-	if (cnt && st->jmp_history[cnt - 1].idx == i) {
-		i = st->jmp_history[cnt - 1].prev_idx;
-		(*history)--;
+	if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) {
+		(*hist_endp)--;
+		return env->insn_hist[hist_end - 1].prev_idx;
 	} else {
-		i--;
+		return insn_idx - 1;
 	}
-	return i;
 }
 
 static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
@@ -3804,7 +3799,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
 /* If any register R in hist->linked_regs is marked as precise in bt,
  * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
  */
-static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
+static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist)
 {
 	struct linked_regs linked_regs;
 	bool some_precise = false;
@@ -3849,7 +3844,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
  *   - *was* processed previously during backtracking.
  */
 static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
-			  struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
+			  struct bpf_insn_hist_entry *hist, struct backtrack_state *bt)
 {
 	const struct bpf_insn_cbs cbs = {
 		.cb_call	= disasm_kfunc_name,
@@ -4268,7 +4263,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
  * SCALARS, as well as any other registers and slots that contribute to
  * a tracked state of given registers/stack slots, depending on specific BPF
  * assembly instructions (see backtrack_insns() for exact instruction handling
- * logic). This backtracking relies on recorded jmp_history and is able to
+ * logic). This backtracking relies on recorded insn_hist and is able to
  * traverse entire chain of parent states. This process ends only when all the
  * necessary registers/slots and their transitive dependencies are marked as
  * precise.
@@ -4385,8 +4380,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 
 	for (;;) {
 		DECLARE_BITMAP(mask, 64);
-		u32 history = st->jmp_history_cnt;
-		struct bpf_jmp_history_entry *hist;
+		u32 hist_start = st->insn_hist_start;
+		u32 hist_end = st->insn_hist_end;
+		struct bpf_insn_hist_entry *hist;
 
 		if (env->log.level & BPF_LOG_LEVEL2) {
 			verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
@@ -4425,7 +4421,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 				err = 0;
 				skip_first = false;
 			} else {
-				hist = get_jmp_hist_entry(st, history, i);
+				hist = get_insn_hist_entry(env, hist_start, hist_end, i);
 				err = backtrack_insn(env, i, subseq_idx, hist, bt);
 			}
 			if (err == -ENOTSUPP) {
@@ -4442,7 +4438,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 				 */
 				return 0;
 			subseq_idx = i;
-			i = get_prev_insn_idx(st, i, &history);
+			i = get_prev_insn_idx(env, st, i, hist_start, &hist_end);
 			if (i == -ENOENT)
 				break;
 			if (i >= env->prog->len) {
@@ -4808,7 +4804,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	}
 
 	if (insn_flags)
-		return push_jmp_history(env, env->cur_state, insn_flags, 0);
+		return push_insn_history(env, env->cur_state, insn_flags, 0);
 	return 0;
 }
 
@@ -5115,7 +5111,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 		insn_flags = 0; /* we are not restoring spilled register */
 	}
 	if (insn_flags)
-		return push_jmp_history(env, env->cur_state, insn_flags, 0);
+		return push_insn_history(env, env->cur_state, insn_flags, 0);
 	return 0;
 }
 
@@ -15740,7 +15736,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
 		collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
 	if (linked_regs.cnt > 1) {
-		err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+		err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
 		if (err)
 			return err;
 	}
@@ -18129,7 +18125,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 
 	force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
 			  /* Avoid accumulating infinitely long jmp history */
-			  cur->jmp_history_cnt > 40;
+			  cur->insn_hist_end - cur->insn_hist_start > 40;
 
 	/* bpf progs typically have pruning point every 4 instructions
 	 * http://vger.kernel.org/bpfconf2019.html#session-1
@@ -18327,7 +18323,7 @@ hit:
 			 * the current state.
 			 */
 			if (is_jmp_point(env, env->insn_idx))
-				err = err ? : push_jmp_history(env, cur, 0, 0);
+				err = err ? : push_insn_history(env, cur, 0, 0);
 			err = err ? : propagate_precision(env, &sl->state);
 			if (err)
 				return err;
@@ -18426,8 +18422,8 @@ next:
 
 	cur->parent = new;
 	cur->first_insn_idx = insn_idx;
+	cur->insn_hist_start = cur->insn_hist_end;
 	cur->dfs_depth = new->dfs_depth + 1;
-	clear_jmp_history(cur);
 	new_sl->next = *explored_state(env, insn_idx);
 	*explored_state(env, insn_idx) = new_sl;
 	/* connect new state to parentage chain. Current frame needs all
@@ -18595,7 +18591,7 @@ static int do_check(struct bpf_verifier_env *env)
 		}
 
 		if (is_jmp_point(env, env->insn_idx)) {
-			err = push_jmp_history(env, state, 0, 0);
+			err = push_insn_history(env, state, 0, 0);
 			if (err)
 				return err;
 		}
@@ -22789,6 +22785,7 @@ err_unlock:
 	if (!is_priv)
 		mutex_unlock(&bpf_verifier_lock);
 	vfree(env->insn_aux_data);
+	kvfree(env->insn_hist);
 err_free_env:
 	kvfree(env);
 	return ret;
-- 
cgit v1.2.3


From 906c508afdca3487c4273bfeda8abedc8e21047b Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Fri, 15 Nov 2024 17:42:48 +0100
Subject: sysfs: attribute_group: allow registration of const bin_attribute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To be able to constify instances of struct bin_attribute it has to be
possible to add them to string attribute_group.
The current type of the bin_attrs member however is not compatible with
that.
Introduce a union that allows registration of both const and non-const
attributes to enable a piecewise transition.
As both union member types are compatible no logic needs to be adapted.

Technically it is now possible register a const struct
bin_attribute and receive it as mutable pointer in the callbacks.
This is a soundness issue.
But this same soundness issue already exists today in
sysfs_create_bin_file().
Also the struct definition and callback implementation are always
closely linked and are meant to be moved to const in lockstep.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20241115-b4-sysfs-const-bin_attr-group-v1-1-2c9bb12dfc48@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index d713a6445a62..0f2fcd244523 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -106,7 +106,10 @@ struct attribute_group {
 					    const struct bin_attribute *,
 					    int);
 	struct attribute	**attrs;
-	struct bin_attribute	**bin_attrs;
+	union {
+		struct bin_attribute		**bin_attrs;
+		const struct bin_attribute	*const *bin_attrs_new;
+	};
 };
 
 #define SYSFS_PREALLOC		010000
-- 
cgit v1.2.3


From d617b3147d54c42351eac63b5398d4ddf4f4011b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:43 +0000
Subject: io_uring: restore back registered wait arguments

Now we've got a more generic region registration API, place
IORING_ENTER_EXT_ARG_REG and re-enable it.

First, the user has to register a region with the
IORING_MEM_REGION_REG_WAIT_ARG flag set. It can only be done for a
ring in a disabled state, aka IORING_SETUP_R_DISABLED, to avoid races
with already running waiters. With that we should have stable constant
values for ctx->cq_wait_{size,arg} in io_get_ext_arg_reg() and hence no
READ_ONCE required.

The other API difference is that we're now passing byte offsets instead
of indexes. The user _must_ align all offsets / pointers to the native
word size, failing to do so might but not necessarily has to lead to a
failure usually returned as -EFAULT. liburing will be hiding this
details from users.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/81822c1b4ffbe8ad391b4f9ad1564def0d26d990.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +++
 include/uapi/linux/io_uring.h  |  5 +++++
 io_uring/io_uring.c            | 14 +++++++++++++-
 io_uring/register.c            | 16 +++++++++++++++-
 4 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e1d69123e164..aa5f5ea98076 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -324,6 +324,9 @@ struct io_ring_ctx {
 		unsigned		cq_entries;
 		struct io_ev_fd	__rcu	*io_ev_fd;
 		unsigned		cq_extra;
+
+		void			*cq_wait_arg;
+		size_t			cq_wait_size;
 	} ____cacheline_aligned_in_smp;
 
 	/*
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1ee35890125b..4418d0192959 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -663,6 +663,11 @@ struct io_uring_region_desc {
 	__u64 __resv[4];
 };
 
+enum {
+	/* expose the region as registered wait arguments */
+	IORING_MEM_REGION_REG_WAIT_ARG		= 1,
+};
+
 struct io_uring_mem_region_reg {
 	__u64 region_uptr; /* struct io_uring_region_desc * */
 	__u64 flags;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c640b8a4ceee..da8fd460977b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3195,7 +3195,19 @@ void __io_uring_cancel(bool cancel_all)
 static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
 			const struct io_uring_getevents_arg __user *uarg)
 {
-	return ERR_PTR(-EFAULT);
+	unsigned long size = sizeof(struct io_uring_reg_wait);
+	unsigned long offset = (uintptr_t)uarg;
+	unsigned long end;
+
+	if (unlikely(offset % sizeof(long)))
+		return ERR_PTR(-EFAULT);
+
+	/* also protects from NULL ->cq_wait_arg as the size would be 0 */
+	if (unlikely(check_add_overflow(offset, size, &end) ||
+		     end > ctx->cq_wait_size))
+		return ERR_PTR(-EFAULT);
+
+	return ctx->cq_wait_arg + offset;
 }
 
 static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
diff --git a/io_uring/register.c b/io_uring/register.c
index 2cbac3d9b288..1a60f4916649 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -588,7 +588,16 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 
 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
 		return -EINVAL;
-	if (reg.flags)
+	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
+		return -EINVAL;
+
+	/*
+	 * This ensures there are no waiters. Waiters are unlocked and it's
+	 * hard to synchronise with them, especially if we need to initialise
+	 * the region.
+	 */
+	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
+	    !(ctx->flags & IORING_SETUP_R_DISABLED))
 		return -EINVAL;
 
 	ret = io_create_region(ctx, &ctx->param_region, &rd);
@@ -598,6 +607,11 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 		io_free_region(ctx, &ctx->param_region);
 		return -EFAULT;
 	}
+
+	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
+		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
+		ctx->cq_wait_size = rd.size;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From 9407f5c3ec10c155aae61fc4496a7c17a96907b4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 11 Nov 2024 16:28:51 -0800
Subject: srcu: Unconditionally record srcu_read_lock_lite() in
 ->srcu_reader_flavor

Currently, srcu_read_lock_lite() uses the SRCU_READ_FLAVOR_LITE bit in
->srcu_reader_flavor to communicate to the grace-period processing in
srcu_readers_active_idx_check() that the smp_mb() must be replaced by a
synchronize_rcu().  Unfortunately, ->srcu_reader_flavor is not updated
unless the kernel is built with CONFIG_PROVE_RCU=y.  Therefore in all
kernels built with CONFIG_PROVE_RCU=n, srcu_readers_active_idx_check()
incorrectly uses smp_mb() instead of synchronize_rcu() for srcu_struct
structures whose readers use srcu_read_lock_lite().

This commit therefore causes Tree SRCU srcu_read_lock_lite()
to unconditionally update ->srcu_reader_flavor so that
srcu_readers_active_idx_check() can make the correct choice.

Reported-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Closes: https://lore.kernel.org/all/d07e8f4a-d5ff-4c8e-8e61-50db285c57e9@amd.com/
Fixes: c0f08d6b5a61 ("srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite()")
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcu.h     |  8 +-------
 include/linux/srcutiny.h |  3 +++
 include/linux/srcutree.h | 21 +++++++++++++++++++++
 kernel/rcu/srcutree.c    |  6 ++----
 4 files changed, 27 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 56f83237de4d..08339eb8a01c 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -183,12 +183,6 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
 
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
-#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU)
-void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);
-#else
-#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0)
-#endif
-
 
 /**
  * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
@@ -277,7 +271,7 @@ static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp)
 {
 	int retval;
 
-	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE);
+	srcu_check_read_flavor_lite(ssp);
 	retval = __srcu_read_lock_lite(ssp);
 	rcu_try_lock_acquire(&ssp->dep_map);
 	return retval;
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 4d96bbdb45f0..1321da803274 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -81,6 +81,9 @@ static inline void srcu_barrier(struct srcu_struct *ssp)
 	synchronize_srcu(ssp);
 }
 
+#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0)
+#define srcu_check_read_flavor_lite(ssp) do { } while (0)
+
 /* Defined here to avoid size increase for non-torture kernels. */
 static inline void srcu_torture_stats_print(struct srcu_struct *ssp,
 					    char *tt, char *tf)
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 778eb61542e1..490aeecc6bb4 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -248,4 +248,25 @@ static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite().");
 }
 
+void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);
+
+// Record _lite() usage even for CONFIG_PROVE_RCU=n kernels.
+static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp)
+{
+	struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+
+	if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE))
+		return;
+
+	// Note that the cmpxchg() in srcu_check_read_flavor() is fully ordered.
+	__srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE);
+}
+
+// Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels.
+static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
+{
+	if (IS_ENABLED(CONFIG_PROVE_RCU))
+		__srcu_check_read_flavor(ssp, read_flavor);
+}
+
 #endif
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index a381b553cdca..5e2e53464794 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -712,11 +712,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
 }
 EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
 
-#ifdef CONFIG_PROVE_RCU
 /*
  * Check for consistent reader flavor.
  */
-void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
+void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
 {
 	int old_read_flavor;
 	struct srcu_data *sdp;
@@ -734,8 +733,7 @@ void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
 	}
 	WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor);
 }
-EXPORT_SYMBOL_GPL(srcu_check_read_flavor);
-#endif /* CONFIG_PROVE_RCU */
+EXPORT_SYMBOL_GPL(__srcu_check_read_flavor);
 
 /*
  * Counts the new reader in the appropriate per-CPU element of the
-- 
cgit v1.2.3


From 41ffcd95015f18178ddc53fed919c2842d52fe38 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 14 Nov 2024 10:33:27 +0000
Subject: net: phy: fix phylib's dual eee_enabled

phylib has two eee_enabled members. Some parts of the code are using
phydev->eee_enabled, other parts are using phydev->eee_cfg.eee_enabled.
This leads to incorrect behaviour as their state goes out of sync.
ethtool --show-eee shows incorrect information, and --set-eee sometimes
doesn't take effect.

Fix this by only having one eee_enabled member - that in eee_cfg.

Fixes: 49168d1980e2 ("net: phy: Add phy_support_eee() indicating MAC support EEE")
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/E1tBXAF-00341F-EQ@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy-c45.c    | 4 +---
 drivers/net/phy/phy_device.c | 4 ++--
 include/linux/phy.h          | 2 --
 3 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 5695935fdce9..ac987e5e82dc 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -942,7 +942,7 @@ EXPORT_SYMBOL_GPL(genphy_c45_read_eee_abilities);
  */
 int genphy_c45_an_config_eee_aneg(struct phy_device *phydev)
 {
-	if (!phydev->eee_enabled) {
+	if (!phydev->eee_cfg.eee_enabled) {
 		__ETHTOOL_DECLARE_LINK_MODE_MASK(adv) = {};
 
 		return genphy_c45_write_eee_adv(phydev, adv);
@@ -1575,8 +1575,6 @@ int genphy_c45_ethtool_set_eee(struct phy_device *phydev,
 		linkmode_copy(phydev->advertising_eee, adv);
 	}
 
-	phydev->eee_enabled = data->eee_enabled;
-
 	ret = genphy_c45_an_config_eee_aneg(phydev);
 	if (ret > 0) {
 		ret = phy_restart_aneg(phydev);
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 499797646580..5dfa2aa53c90 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -3595,12 +3595,12 @@ static int phy_probe(struct device *dev)
 	/* There is no "enabled" flag. If PHY is advertising, assume it is
 	 * kind of enabled.
 	 */
-	phydev->eee_enabled = !linkmode_empty(phydev->advertising_eee);
+	phydev->eee_cfg.eee_enabled = !linkmode_empty(phydev->advertising_eee);
 
 	/* Some PHYs may advertise, by default, not support EEE modes. So,
 	 * we need to clean them.
 	 */
-	if (phydev->eee_enabled)
+	if (phydev->eee_cfg.eee_enabled)
 		linkmode_and(phydev->advertising_eee, phydev->supported_eee,
 			     phydev->advertising_eee);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index a98bc91a0cde..44890cdf40a2 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -601,7 +601,6 @@ struct macsec_ops;
  * @adv_old: Saved advertised while power saving for WoL
  * @supported_eee: supported PHY EEE linkmodes
  * @advertising_eee: Currently advertised EEE linkmodes
- * @eee_enabled: Flag indicating whether the EEE feature is enabled
  * @enable_tx_lpi: When True, MAC should transmit LPI to PHY
  * @eee_cfg: User configuration of EEE
  * @lp_advertising: Current link partner advertised linkmodes
@@ -721,7 +720,6 @@ struct phy_device {
 	/* used for eee validation and configuration*/
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee);
-	bool eee_enabled;
 
 	/* Host supported PHY interface types. Should be ignored if empty. */
 	DECLARE_PHY_INTERFACE_MASK(host_interfaces);
-- 
cgit v1.2.3


From 221a9c1df790fa711d65daf5ba05d0addc279153 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 14 Nov 2024 03:00:11 -0800
Subject: net: netpoll: Individualize the skb pool

The current implementation of the netpoll system uses a global skb
pool, which can lead to inefficient memory usage and
waste when targets are disabled or no longer in use.

This can result in a significant amount of memory being unnecessarily
allocated and retained, potentially causing performance issues and
limiting the availability of resources for other system components.

Modify the netpoll system to assign a skb pool to each target instead of
using a global one.

This approach allows for more fine-grained control over memory
allocation and deallocation, ensuring that resources are only allocated
and retained as needed.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20241114-skb_buffers_v2-v3-1-9be9f52a8b69@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netpoll.h |  1 +
 net/core/netpoll.c      | 31 +++++++++++++------------------
 2 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index cd4e28db0cbd..77635b885c18 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -32,6 +32,7 @@ struct netpoll {
 	bool ipv6;
 	u16 local_port, remote_port;
 	u8 remote_mac[ETH_ALEN];
+	struct sk_buff_head skb_pool;
 };
 
 struct netpoll_info {
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 94b7f07a952f..719c9aae845f 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -45,9 +45,6 @@
 
 #define MAX_UDP_CHUNK 1460
 #define MAX_SKBS 32
-
-static struct sk_buff_head skb_pool;
-
 #define USEC_PER_POLL	50
 
 #define MAX_SKB_SIZE							\
@@ -234,20 +231,23 @@ void netpoll_poll_enable(struct net_device *dev)
 		up(&ni->dev_lock);
 }
 
-static void refill_skbs(void)
+static void refill_skbs(struct netpoll *np)
 {
+	struct sk_buff_head *skb_pool;
 	struct sk_buff *skb;
 	unsigned long flags;
 
-	spin_lock_irqsave(&skb_pool.lock, flags);
-	while (skb_pool.qlen < MAX_SKBS) {
+	skb_pool = &np->skb_pool;
+
+	spin_lock_irqsave(&skb_pool->lock, flags);
+	while (skb_pool->qlen < MAX_SKBS) {
 		skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
 		if (!skb)
 			break;
 
-		__skb_queue_tail(&skb_pool, skb);
+		__skb_queue_tail(skb_pool, skb);
 	}
-	spin_unlock_irqrestore(&skb_pool.lock, flags);
+	spin_unlock_irqrestore(&skb_pool->lock, flags);
 }
 
 static void zap_completion_queue(void)
@@ -284,12 +284,12 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
 	struct sk_buff *skb;
 
 	zap_completion_queue();
-	refill_skbs();
+	refill_skbs(np);
 repeat:
 
 	skb = alloc_skb(len, GFP_ATOMIC);
 	if (!skb)
-		skb = skb_dequeue(&skb_pool);
+		skb = skb_dequeue(&np->skb_pool);
 
 	if (!skb) {
 		if (++count < 10) {
@@ -673,6 +673,8 @@ int netpoll_setup(struct netpoll *np)
 	struct in_device *in_dev;
 	int err;
 
+	skb_queue_head_init(&np->skb_pool);
+
 	rtnl_lock();
 	if (np->dev_name[0]) {
 		struct net *net = current->nsproxy->net_ns;
@@ -773,7 +775,7 @@ put_noaddr:
 	}
 
 	/* fill up the skb queue */
-	refill_skbs();
+	refill_skbs(np);
 
 	err = __netpoll_setup(np, ndev);
 	if (err)
@@ -792,13 +794,6 @@ unlock:
 }
 EXPORT_SYMBOL(netpoll_setup);
 
-static int __init netpoll_init(void)
-{
-	skb_queue_head_init(&skb_pool);
-	return 0;
-}
-core_initcall(netpoll_init);
-
 static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
 {
 	struct netpoll_info *npinfo =
-- 
cgit v1.2.3


From 4b42fbc6bd8f73d9ded535d8c61ccaa837ff3bd4 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 14 Nov 2024 15:09:53 +0100
Subject: ndo_fdb_add: Add a parameter to report whether notification was sent

Currently when FDB entries are added to or deleted from a VXLAN netdevice,
the VXLAN driver emits one notification, including the VXLAN-specific
attributes. The core however always sends a notification as well, a generic
one. Thus two notifications are unnecessarily sent for these operations. A
similar situation comes up with bridge driver, which also emits
notifications on its own:

 # ip link add name vx type vxlan id 1000 dstport 4789
 # bridge monitor fdb &
 [1] 1981693
 # bridge fdb add de:ad:be:ef:13:37 dev vx self dst 192.0.2.1
 de:ad:be:ef:13:37 dev vx dst 192.0.2.1 self permanent
 de:ad:be:ef:13:37 dev vx self permanent

In order to prevent this duplicity, add a paremeter to ndo_fdb_add,
bool *notified. The flag is primed to false, and if the callee sends a
notification on its own, it sets it to true, thus informing the core that
it should not generate another notification.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/cbf6ae8195e85cbf922f8058ce4eba770f3b71ed.1731589511.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c      |  3 ++-
 drivers/net/ethernet/intel/ice/ice_main.c        |  4 +++-
 drivers/net/ethernet/intel/igb/igb_main.c        |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  2 +-
 drivers/net/ethernet/mscc/ocelot_net.c           |  2 +-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  2 +-
 drivers/net/macvlan.c                            |  2 +-
 drivers/net/vxlan/vxlan_core.c                   |  5 ++++-
 include/linux/netdevice.h                        |  5 ++++-
 net/bridge/br_fdb.c                              | 12 +++++++-----
 net/bridge/br_private.h                          |  2 +-
 net/core/rtnetlink.c                             |  9 ++++++---
 12 files changed, 32 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 55fb362eb508..ab5febf83ec3 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -13095,12 +13095,13 @@ static int i40e_get_phys_port_id(struct net_device *netdev,
  * @addr: the MAC address entry being added
  * @vid: VLAN ID
  * @flags: instructions from stack about fdb operation
+ * @notified: whether notification was emitted
  * @extack: netlink extended ack, unused currently
  */
 static int i40e_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			    struct net_device *dev,
 			    const unsigned char *addr, u16 vid,
-			    u16 flags,
+			    u16 flags, bool *notified,
 			    struct netlink_ext_ack *extack)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index a6f586f9bfd1..c875036f654b 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -6125,12 +6125,14 @@ ice_set_tx_maxrate(struct net_device *netdev, int queue_index, u32 maxrate)
  * @addr: the MAC address entry being added
  * @vid: VLAN ID
  * @flags: instructions from stack about fdb operation
+ * @notified: whether notification was emitted
  * @extack: netlink extended ack
  */
 static int
 ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[],
 	    struct net_device *dev, const unsigned char *addr, u16 vid,
-	    u16 flags, struct netlink_ext_ack __always_unused *extack)
+	    u16 flags, bool *notified,
+	    struct netlink_ext_ack __always_unused *extack)
 {
 	int err;
 
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index f1d088168723..f0528bd13184 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2486,7 +2486,7 @@ static int igb_set_features(struct net_device *netdev,
 static int igb_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			   struct net_device *dev,
 			   const unsigned char *addr, u16 vid,
-			   u16 flags,
+			   u16 flags, bool *notified,
 			   struct netlink_ext_ack *extack)
 {
 	/* guarantee we can provide a unique filter for the unicast address */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 8b8404d8c946..adc9392463ce 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9954,7 +9954,7 @@ static int ixgbe_set_features(struct net_device *netdev,
 static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			     struct net_device *dev,
 			     const unsigned char *addr, u16 vid,
-			     u16 flags,
+			     u16 flags, bool *notified,
 			     struct netlink_ext_ack *extack)
 {
 	/* guarantee we can provide a unique filter for the unicast address */
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 7c9540a71725..4f15ba2c5525 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -730,7 +730,7 @@ static void ocelot_get_stats64(struct net_device *dev,
 static int ocelot_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			       struct net_device *dev,
 			       const unsigned char *addr,
-			       u16 vid, u16 flags,
+			       u16 vid, u16 flags, bool *notified,
 			       struct netlink_ext_ack *extack)
 {
 	struct ocelot_port_private *priv = netdev_priv(dev);
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index b3588a1ebc25..2484cebd97d4 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -394,7 +394,7 @@ static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 static int qlcnic_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			struct net_device *netdev,
 			const unsigned char *addr, u16 vid, u16 flags,
-			struct netlink_ext_ack *extack)
+			bool *notified, struct netlink_ext_ack *extack)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
 	int err = 0;
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index edbd5afcec41..dfb462e63248 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1024,7 +1024,7 @@ static int macvlan_vlan_rx_kill_vid(struct net_device *dev,
 static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			   struct net_device *dev,
 			   const unsigned char *addr, u16 vid,
-			   u16 flags,
+			   u16 flags, bool *notified,
 			   struct netlink_ext_ack *extack)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 42b07bc2b107..22f17c5c7549 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -1241,7 +1241,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
 static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			 struct net_device *dev,
 			 const unsigned char *addr, u16 vid, u16 flags,
-			 struct netlink_ext_ack *extack)
+			 bool *notified, struct netlink_ext_ack *extack)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	/* struct net *net = dev_net(vxlan->dev); */
@@ -1277,6 +1277,9 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			       nhid, true, extack);
 	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 
+	if (!err)
+		*notified = true;
+
 	return err;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0aae346d919e..6a7fd191e1ee 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1248,8 +1248,10 @@ struct netdev_net_notifier {
  * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
  *		      struct net_device *dev,
  *		      const unsigned char *addr, u16 vid, u16 flags,
- *		      struct netlink_ext_ack *extack);
+ *		      bool *notified, struct netlink_ext_ack *extack);
  *	Adds an FDB entry to dev for addr.
+ *	Callee shall set *notified to true if it sent any appropriate
+ *	notification(s). Otherwise core will send a generic one.
  * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
  *		      struct net_device *dev,
  *		      const unsigned char *addr, u16 vid)
@@ -1525,6 +1527,7 @@ struct net_device_ops {
 					       const unsigned char *addr,
 					       u16 vid,
 					       u16 flags,
+					       bool *notified,
 					       struct netlink_ext_ack *extack);
 	int			(*ndo_fdb_del)(struct ndmsg *ndm,
 					       struct nlattr *tb[],
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 77f110035df1..5f29958f3ddd 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -1152,7 +1152,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
 			struct net_bridge_port *p, const unsigned char *addr,
 			u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[],
-			struct netlink_ext_ack *extack)
+			bool *notified, struct netlink_ext_ack *extack)
 {
 	int err = 0;
 
@@ -1183,6 +1183,8 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
 		spin_unlock_bh(&br->hash_lock);
 	}
 
+	if (!err)
+		*notified = true;
 	return err;
 }
 
@@ -1195,7 +1197,7 @@ static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = {
 int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	       struct net_device *dev,
 	       const unsigned char *addr, u16 vid, u16 nlh_flags,
-	       struct netlink_ext_ack *extack)
+	       bool *notified, struct netlink_ext_ack *extack)
 {
 	struct nlattr *nfea_tb[NFEA_MAX + 1], *attr;
 	struct net_bridge_vlan_group *vg;
@@ -1258,10 +1260,10 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 
 		/* VID was specified, so use it. */
 		err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb,
-				   extack);
+				   notified, extack);
 	} else {
 		err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb,
-				   extack);
+				   notified, extack);
 		if (err || !vg || !vg->num_vlans)
 			goto out;
 
@@ -1273,7 +1275,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			if (!br_vlan_should_use(v))
 				continue;
 			err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid,
-					   nfea_tb, extack);
+					   nfea_tb, notified, extack);
 			if (err)
 				goto out;
 		}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 041f6e571a20..ebfc59049ec1 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -858,7 +858,7 @@ int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
 		       struct netlink_ext_ack *extack);
 int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
 	       const unsigned char *addr, u16 vid, u16 nlh_flags,
-	       struct netlink_ext_ack *extack);
+	       bool *notified, struct netlink_ext_ack *extack);
 int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 		struct net_device *dev, struct net_device *fdev, int *idx);
 int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 327fa4957929..f31b2436cde5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4578,9 +4578,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 	    netif_is_bridge_port(dev)) {
 		struct net_device *br_dev = netdev_master_upper_dev_get(dev);
 		const struct net_device_ops *ops = br_dev->netdev_ops;
+		bool notified = false;
 
 		err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
-				       nlh->nlmsg_flags, extack);
+				       nlh->nlmsg_flags, &notified, extack);
 		if (err)
 			goto out;
 		else
@@ -4589,16 +4590,18 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	/* Embedded bridge, macvlan, and any other device support */
 	if ((ndm->ndm_flags & NTF_SELF)) {
+		bool notified = false;
+
 		if (dev->netdev_ops->ndo_fdb_add)
 			err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
 							   vid,
 							   nlh->nlmsg_flags,
-							   extack);
+							   &notified, extack);
 		else
 			err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
 					       nlh->nlmsg_flags);
 
-		if (!err) {
+		if (!err && !notified) {
 			rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH,
 					ndm->ndm_state);
 			ndm->ndm_flags &= ~NTF_SELF;
-- 
cgit v1.2.3


From 42575ad5aab932273475d1ec3e7881cb5a05420e Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 14 Nov 2024 15:09:54 +0100
Subject: ndo_fdb_del: Add a parameter to report whether notification was sent

In a similar fashion to ndo_fdb_add, which was covered in the previous
patch, add the bool *notified argument to ndo_fdb_del. Callees that send a
notification on their own set the flag to true.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/06b1acf4953ef0a5ed153ef1f32d7292044f2be6.1731589511.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/intel/ice/ice_main.c        |  4 +++-
 drivers/net/ethernet/mscc/ocelot_net.c           |  2 +-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  2 +-
 drivers/net/macvlan.c                            |  2 +-
 drivers/net/vxlan/vxlan_core.c                   |  5 ++++-
 include/linux/netdevice.h                        |  9 +++++++--
 net/bridge/br_fdb.c                              | 15 ++++++++-------
 net/bridge/br_private.h                          |  2 +-
 net/core/rtnetlink.c                             | 11 ++++++++---
 9 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index c875036f654b..b79848fe2a9e 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -6166,12 +6166,14 @@ ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[],
  * @dev: the net device pointer
  * @addr: the MAC address entry being added
  * @vid: VLAN ID
+ * @notified: whether notification was emitted
  * @extack: netlink extended ack
  */
 static int
 ice_fdb_del(struct ndmsg *ndm, __always_unused struct nlattr *tb[],
 	    struct net_device *dev, const unsigned char *addr,
-	    __always_unused u16 vid, struct netlink_ext_ack *extack)
+	    __always_unused u16 vid, bool *notified,
+	    struct netlink_ext_ack *extack)
 {
 	int err;
 
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 4f15ba2c5525..558e03301aa8 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -744,7 +744,7 @@ static int ocelot_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 static int ocelot_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 			       struct net_device *dev,
 			       const unsigned char *addr, u16 vid,
-			       struct netlink_ext_ack *extack)
+			       bool *notified, struct netlink_ext_ack *extack)
 {
 	struct ocelot_port_private *priv = netdev_priv(dev);
 	struct ocelot_port *ocelot_port = &priv->port;
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index 2484cebd97d4..eb69121df726 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -367,7 +367,7 @@ static int qlcnic_set_mac(struct net_device *netdev, void *p)
 
 static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 			struct net_device *netdev,
-			const unsigned char *addr, u16 vid,
+			const unsigned char *addr, u16 vid, bool *notified,
 			struct netlink_ext_ack *extack)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index dfb462e63248..fed4fe2a4748 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1049,7 +1049,7 @@ static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 
 static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 			   struct net_device *dev,
-			   const unsigned char *addr, u16 vid,
+			   const unsigned char *addr, u16 vid, bool *notified,
 			   struct netlink_ext_ack *extack)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 22f17c5c7549..9ea63059d52d 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -1319,7 +1319,7 @@ out:
 /* Delete entry (via netlink) */
 static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 			    struct net_device *dev,
-			    const unsigned char *addr, u16 vid,
+			    const unsigned char *addr, u16 vid, bool *notified,
 			    struct netlink_ext_ack *extack)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
@@ -1341,6 +1341,9 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 				 true);
 	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 
+	if (!err)
+		*notified = true;
+
 	return err;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6a7fd191e1ee..ecc686409161 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1254,8 +1254,11 @@ struct netdev_net_notifier {
  *	notification(s). Otherwise core will send a generic one.
  * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
  *		      struct net_device *dev,
- *		      const unsigned char *addr, u16 vid)
+ *		      const unsigned char *addr, u16 vid
+ *		      bool *notified, struct netlink_ext_ack *extack);
  *	Deletes the FDB entry from dev corresponding to addr.
+ *	Callee shall set *notified to true if it sent any appropriate
+ *	notification(s). Otherwise core will send a generic one.
  * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev,
  *			   struct netlink_ext_ack *extack);
  * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
@@ -1533,7 +1536,9 @@ struct net_device_ops {
 					       struct nlattr *tb[],
 					       struct net_device *dev,
 					       const unsigned char *addr,
-					       u16 vid, struct netlink_ext_ack *extack);
+					       u16 vid,
+					       bool *notified,
+					       struct netlink_ext_ack *extack);
 	int			(*ndo_fdb_del_bulk)(struct nlmsghdr *nlh,
 						    struct net_device *dev,
 						    struct netlink_ext_ack *extack);
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 5f29958f3ddd..82bac2426631 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -1287,7 +1287,7 @@ out:
 
 static int fdb_delete_by_addr_and_port(struct net_bridge *br,
 				       const struct net_bridge_port *p,
-				       const u8 *addr, u16 vlan)
+				       const u8 *addr, u16 vlan, bool *notified)
 {
 	struct net_bridge_fdb_entry *fdb;
 
@@ -1296,18 +1296,19 @@ static int fdb_delete_by_addr_and_port(struct net_bridge *br,
 		return -ENOENT;
 
 	fdb_delete(br, fdb, true);
+	*notified = true;
 
 	return 0;
 }
 
 static int __br_fdb_delete(struct net_bridge *br,
 			   const struct net_bridge_port *p,
-			   const unsigned char *addr, u16 vid)
+			   const unsigned char *addr, u16 vid, bool *notified)
 {
 	int err;
 
 	spin_lock_bh(&br->hash_lock);
-	err = fdb_delete_by_addr_and_port(br, p, addr, vid);
+	err = fdb_delete_by_addr_and_port(br, p, addr, vid, notified);
 	spin_unlock_bh(&br->hash_lock);
 
 	return err;
@@ -1316,7 +1317,7 @@ static int __br_fdb_delete(struct net_bridge *br,
 /* Remove neighbor entry with RTM_DELNEIGH */
 int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 		  struct net_device *dev,
-		  const unsigned char *addr, u16 vid,
+		  const unsigned char *addr, u16 vid, bool *notified,
 		  struct netlink_ext_ack *extack)
 {
 	struct net_bridge_vlan_group *vg;
@@ -1339,19 +1340,19 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	}
 
 	if (vid) {
-		err = __br_fdb_delete(br, p, addr, vid);
+		err = __br_fdb_delete(br, p, addr, vid, notified);
 	} else {
 		struct net_bridge_vlan *v;
 
 		err = -ENOENT;
-		err &= __br_fdb_delete(br, p, addr, 0);
+		err &= __br_fdb_delete(br, p, addr, 0, notified);
 		if (!vg || !vg->num_vlans)
 			return err;
 
 		list_for_each_entry(v, &vg->vlan_list, vlist) {
 			if (!br_vlan_should_use(v))
 				continue;
-			err &= __br_fdb_delete(br, p, addr, v->vid);
+			err &= __br_fdb_delete(br, p, addr, v->vid, notified);
 		}
 	}
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index ebfc59049ec1..9853cfbb9d14 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -853,7 +853,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 
 int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 		  struct net_device *dev, const unsigned char *addr, u16 vid,
-		  struct netlink_ext_ack *extack);
+		  bool *notified, struct netlink_ext_ack *extack);
 int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
 		       struct netlink_ext_ack *extack);
 int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f31b2436cde5..dd142f444659 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4701,11 +4701,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
 	    netif_is_bridge_port(dev)) {
 		struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+		bool notified = false;
 
 		ops = br_dev->netdev_ops;
 		if (!del_bulk) {
 			if (ops->ndo_fdb_del)
-				err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
+				err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
+						       &notified, extack);
 		} else {
 			if (ops->ndo_fdb_del_bulk)
 				err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
@@ -4719,10 +4721,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	/* Embedded bridge, macvlan, and any other device support */
 	if (ndm->ndm_flags & NTF_SELF) {
+		bool notified = false;
+
 		ops = dev->netdev_ops;
 		if (!del_bulk) {
 			if (ops->ndo_fdb_del)
-				err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
+				err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
+						       &notified, extack);
 			else
 				err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
 		} else {
@@ -4733,7 +4738,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 		}
 
 		if (!err) {
-			if (!del_bulk)
+			if (!del_bulk && !notified)
 				rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
 						ndm->ndm_state);
 			ndm->ndm_flags &= ~NTF_SELF;
-- 
cgit v1.2.3


From 3ef66af31feaf5ff5dd73e63b1327789822ed476 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Tue, 12 Nov 2024 09:29:20 +0800
Subject: virtio_ring: introduce add api for premapped

Two APIs are introduced to submit premapped per-buffers.

int virtqueue_add_inbuf_premapped(struct virtqueue *vq,
                                 struct scatterlist *sg, unsigned int num,
                                 void *data,
                                 void *ctx,
                                 gfp_t gfp);

int virtqueue_add_outbuf_premapped(struct virtqueue *vq,
                                  struct scatterlist *sg, unsigned int num,
                                  void *data,
                                  gfp_t gfp);

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://patch.msgid.link/20241112012928.102478-6-xuanzhuo@linux.alibaba.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/virtio/virtio_ring.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/virtio.h       | 11 ++++++++++
 2 files changed, 59 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index fefa85a5e6b6..0842d27886e5 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2276,6 +2276,29 @@ int virtqueue_add_outbuf(struct virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
 
+/**
+ * virtqueue_add_outbuf_premapped - expose output buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: scatterlist (must be well-formed and terminated!)
+ * @num: the number of entries in @sg readable by other side
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Return:
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
+ */
+int virtqueue_add_outbuf_premapped(struct virtqueue *vq,
+				   struct scatterlist *sg, unsigned int num,
+				   void *data,
+				   gfp_t gfp)
+{
+	return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, true, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_outbuf_premapped);
+
 /**
  * virtqueue_add_inbuf - expose input buffers to other end
  * @vq: the struct virtqueue we're talking about.
@@ -2322,6 +2345,31 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
 
+/**
+ * virtqueue_add_inbuf_premapped - expose input buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: scatterlist (must be well-formed and terminated!)
+ * @num: the number of entries in @sg writable by other side
+ * @data: the token identifying the buffer.
+ * @ctx: extra context for the token
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Return:
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
+ */
+int virtqueue_add_inbuf_premapped(struct virtqueue *vq,
+				  struct scatterlist *sg, unsigned int num,
+				  void *data,
+				  void *ctx,
+				  gfp_t gfp)
+{
+	return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, true, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_premapped);
+
 /**
  * virtqueue_dma_dev - get the dma dev
  * @_vq: the struct virtqueue we're talking about.
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 306137a15d07..13b3f55abca3 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -56,6 +56,17 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
 			    void *ctx,
 			    gfp_t gfp);
 
+int virtqueue_add_inbuf_premapped(struct virtqueue *vq,
+				  struct scatterlist *sg, unsigned int num,
+				  void *data,
+				  void *ctx,
+				  gfp_t gfp);
+
+int virtqueue_add_outbuf_premapped(struct virtqueue *vq,
+				   struct scatterlist *sg, unsigned int num,
+				   void *data,
+				   gfp_t gfp);
+
 int virtqueue_add_sgs(struct virtqueue *vq,
 		      struct scatterlist *sgs[],
 		      unsigned int out_sgs,
-- 
cgit v1.2.3


From 880ebcbe06636c42cfb328039581e177c6cd6ba5 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Tue, 12 Nov 2024 09:29:22 +0800
Subject: virtio_ring: remove API virtqueue_set_dma_premapped

Now, this API is useless. remove it.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://patch.msgid.link/20241112012928.102478-8-xuanzhuo@linux.alibaba.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/virtio_net.c     | 13 ------------
 drivers/virtio/virtio_ring.c | 48 --------------------------------------------
 include/linux/virtio.h       |  2 --
 3 files changed, 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8aca4a3fc7e8..183ad5e6bef0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -6168,15 +6168,6 @@ err_ctrl:
 	return -ENOMEM;
 }
 
-static void virtnet_rq_set_premapped(struct virtnet_info *vi)
-{
-	int i;
-
-	for (i = 0; i < vi->max_queue_pairs; i++)
-		/* error should never happen */
-		BUG_ON(virtqueue_set_dma_premapped(vi->rq[i].vq));
-}
-
 static int init_vqs(struct virtnet_info *vi)
 {
 	int ret;
@@ -6190,10 +6181,6 @@ static int init_vqs(struct virtnet_info *vi)
 	if (ret)
 		goto err_free;
 
-	/* disable for big mode */
-	if (!vi->big_packets || vi->mergeable_rx_bufs)
-		virtnet_rq_set_premapped(vi);
-
 	cpus_read_lock();
 	virtnet_set_affinity(vi);
 	cpus_read_unlock();
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 0842d27886e5..8167be01b400 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -180,9 +180,6 @@ struct vring_virtqueue {
 	/* Host publishes avail event idx */
 	bool event;
 
-	/* Do DMA mapping by driver */
-	bool premapped;
-
 	/* Head of free buffer list. */
 	unsigned int free_head;
 	/* Number we've added since last sync. */
@@ -2098,7 +2095,6 @@ static struct virtqueue *vring_create_virtqueue_packed(
 	vq->packed_ring = true;
 	vq->dma_dev = dma_dev;
 	vq->use_dma_api = vring_use_dma_api(vdev);
-	vq->premapped = false;
 
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
 		!context;
@@ -2691,7 +2687,6 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
 #endif
 	vq->dma_dev = dma_dev;
 	vq->use_dma_api = vring_use_dma_api(vdev);
-	vq->premapped = false;
 
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
 		!context;
@@ -2818,49 +2813,6 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num,
 }
 EXPORT_SYMBOL_GPL(virtqueue_resize);
 
-/**
- * virtqueue_set_dma_premapped - set the vring premapped mode
- * @_vq: the struct virtqueue we're talking about.
- *
- * Enable the premapped mode of the vq.
- *
- * The vring in premapped mode does not do dma internally, so the driver must
- * do dma mapping in advance. The driver must pass the dma_address through
- * dma_address of scatterlist. When the driver got a used buffer from
- * the vring, it has to unmap the dma address.
- *
- * This function must be called immediately after creating the vq, or after vq
- * reset, and before adding any buffers to it.
- *
- * Caller must ensure we don't call this with other virtqueue operations
- * at the same time (except where noted).
- *
- * Returns zero or a negative error.
- * 0: success.
- * -EINVAL: too late to enable premapped mode, the vq already contains buffers.
- */
-int virtqueue_set_dma_premapped(struct virtqueue *_vq)
-{
-	struct vring_virtqueue *vq = to_vvq(_vq);
-	u32 num;
-
-	START_USE(vq);
-
-	num = vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num;
-
-	if (num != vq->vq.num_free) {
-		END_USE(vq);
-		return -EINVAL;
-	}
-
-	vq->premapped = true;
-
-	END_USE(vq);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(virtqueue_set_dma_premapped);
-
 /**
  * virtqueue_reset - detach and recycle all unused buffers
  * @_vq: the struct virtqueue we're talking about.
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 13b3f55abca3..338e0f5efb4b 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -93,8 +93,6 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
 
 unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq);
 
-int virtqueue_set_dma_premapped(struct virtqueue *_vq);
-
 bool virtqueue_poll(struct virtqueue *vq, unsigned);
 
 bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
-- 
cgit v1.2.3


From 665745f274870c921020f610e2c99a3b1613519b Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 18 Oct 2024 17:47:52 +0300
Subject: PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This mostly reverts the commit b4c7d2076b4e ("PCI/LINK: Remove bandwidth
notification"). An upcoming commit extends this driver building PCIe
bandwidth controller on top of it.

PCIe bandwidth notifications were first added in the commit e8303bb7a75c
("PCI/LINK: Report degraded links via link bandwidth notification") but
later had to be removed. The significant changes compared with the old
bandwidth notification driver include:

1) Don't print the notifications into kernel log, just keep the Link
   Speed cached in struct pci_bus updated. While somewhat unfortunate,
   the log spam was the source of complaints that eventually lead to
   the removal of the bandwidth notifications driver (see the links
   below for further information).

2) Besides the Link Bandwidth Management Interrupt, also enable Link
   Autonomous Bandwidth Interrupt to cover the other source of bandwidth
   changes.

3) Handle Link Speed updates robustly. Refresh the cached Link Speed
   when enabling Bandwidth Notification Interrupts, and solve the race
   between Link Speed read and LBMS/LABS update in
   pcie_bwnotif_irq_thread().

4) Use concurrency safe LNKCTL RMW operations.

5) The driver is now called PCIe bwctrl (bandwidth controller) instead
   of just bandwidth notifications because of increased scope and
   functionality within the driver.

6) Coexist with the Target Link Speed quirk in pcie_failed_link_retrain().
   Provide LBMS counting API for it.

7) Tweaks to variable/functions names for consistency and length reasons.

Bandwidth Notifications enable the cur_bus_speed in the struct pci_bus to
keep track PCIe Link Speed changes.

[bhelgaas: This is based on previous work by Alexandru Gagniuc
<mr.nuke.me@gmail.com>; see e8303bb7a75c ("PCI/LINK: Report degraded links
via link bandwidth notification")]

Link: https://lore.kernel.org/r/20241018144755.7875-7-ilpo.jarvinen@linux.intel.com
Link: https://lore.kernel.org/all/20190429185611.121751-1-helgaas@kernel.org/
Link: https://lore.kernel.org/linux-pci/20190501142942.26972-1-keith.busch@intel.com/
Link: https://lore.kernel.org/linux-pci/20200115221008.GA191037@google.com/
Suggested-by: Lukas Wunner <lukas@wunner.de> # Building bwctrl on top of bwnotif
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
[bhelgaas: squash fix to drop IRQF_ONESHOT and convert to hardirq handler:
https://lore.kernel.org/r/20241115165717.15233-1-ilpo.jarvinen@linux.intel.com]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Stefan Wahren <wahrenst@gmx.net>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 MAINTAINERS                       |   6 ++
 drivers/pci/hotplug/pciehp_ctrl.c |   5 +
 drivers/pci/pci.c                 |   2 +-
 drivers/pci/pci.h                 |  11 +++
 drivers/pci/pcie/Makefile         |   2 +-
 drivers/pci/pcie/bwctrl.c         | 186 ++++++++++++++++++++++++++++++++++++++
 drivers/pci/pcie/portdrv.c        |   9 +-
 drivers/pci/pcie/portdrv.h        |   6 +-
 drivers/pci/quirks.c              |   9 +-
 include/linux/pci.h               |   2 +
 10 files changed, 229 insertions(+), 9 deletions(-)
 create mode 100644 drivers/pci/pcie/bwctrl.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index c27f3190737f..8c555b3325d6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17933,6 +17933,12 @@ F:	include/linux/of_pci.h
 F:	include/linux/pci*
 F:	include/uapi/linux/pci*
 
+PCIE BANDWIDTH CONTROLLER
+M:	Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
+L:	linux-pci@vger.kernel.org
+S:	Supported
+F:	drivers/pci/pcie/bwctrl.c
+
 PCIE DRIVER FOR AMAZON ANNAPURNA LABS
 M:	Jonathan Chocron <jonnyc@amazon.com>
 L:	linux-pci@vger.kernel.org
diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
index dcdbfcf404dd..d603a7aa7483 100644
--- a/drivers/pci/hotplug/pciehp_ctrl.c
+++ b/drivers/pci/hotplug/pciehp_ctrl.c
@@ -19,6 +19,8 @@
 #include <linux/types.h>
 #include <linux/pm_runtime.h>
 #include <linux/pci.h>
+
+#include "../pci.h"
 #include "pciehp.h"
 
 /* The following routines constitute the bulk of the
@@ -127,6 +129,9 @@ static void remove_board(struct controller *ctrl, bool safe_removal)
 
 	pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
 			      INDICATOR_NOOP);
+
+	/* Don't carry LBMS indications across */
+	pcie_reset_lbms_count(ctrl->pcie->port);
 }
 
 static int pciehp_enable_slot(struct controller *ctrl);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 3d67e8b50ba2..f85f380cdb9b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4740,7 +4740,7 @@ int pcie_retrain_link(struct pci_dev *pdev, bool use_lt)
 	 * to track link speed or width changes made by hardware itself
 	 * in attempt to correct unreliable link operation.
 	 */
-	pcie_capability_write_word(pdev, PCI_EXP_LNKSTA, PCI_EXP_LNKSTA_LBMS);
+	pcie_reset_lbms_count(pdev);
 	return rc;
 }
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 8137ad2fead8..8ec189c8f3f5 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -698,6 +698,17 @@ static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { }
 static inline void pcie_ecrc_get_policy(char *str) { }
 #endif
 
+#ifdef CONFIG_PCIEPORTBUS
+void pcie_reset_lbms_count(struct pci_dev *port);
+int pcie_lbms_count(struct pci_dev *port, unsigned long *val);
+#else
+static inline void pcie_reset_lbms_count(struct pci_dev *port) {}
+static inline int pcie_lbms_count(struct pci_dev *port, unsigned long *val)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 struct pci_dev_reset_methods {
 	u16 vendor;
 	u16 device;
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 6461aa93fe76..53ccab62314d 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -4,7 +4,7 @@
 
 pcieportdrv-y			:= portdrv.o rcec.o
 
-obj-$(CONFIG_PCIEPORTBUS)	+= pcieportdrv.o
+obj-$(CONFIG_PCIEPORTBUS)	+= pcieportdrv.o bwctrl.o
 
 obj-y				+= aspm.o
 obj-$(CONFIG_PCIEAER)		+= aer.o err.o
diff --git a/drivers/pci/pcie/bwctrl.c b/drivers/pci/pcie/bwctrl.c
new file mode 100644
index 000000000000..b3da24900271
--- /dev/null
+++ b/drivers/pci/pcie/bwctrl.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PCIe bandwidth controller
+ *
+ * Author: Alexandru Gagniuc <mr.nuke.me@gmail.com>
+ *
+ * Copyright (C) 2019 Dell Inc
+ * Copyright (C) 2023-2024 Intel Corporation
+ *
+ * This service port driver hooks into the Bandwidth Notification interrupt
+ * watching for changes or links becoming degraded in operation. It updates
+ * the cached Current Link Speed that is exposed to user space through sysfs.
+ */
+
+#define dev_fmt(fmt) "bwctrl: " fmt
+
+#include <linux/atomic.h>
+#include <linux/cleanup.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "../pci.h"
+#include "portdrv.h"
+
+/**
+ * struct pcie_bwctrl_data - PCIe bandwidth controller
+ * @lbms_count:		Count for LBMS (since last reset)
+ */
+struct pcie_bwctrl_data {
+	atomic_t lbms_count;
+};
+
+/* Prevents port removal during LBMS count accessors */
+static DECLARE_RWSEM(pcie_bwctrl_lbms_rwsem);
+
+static void pcie_bwnotif_enable(struct pcie_device *srv)
+{
+	struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
+	struct pci_dev *port = srv->port;
+	u16 link_status;
+	int ret;
+
+	/* Count LBMS seen so far as one */
+	ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
+	if (ret == PCIBIOS_SUCCESSFUL && link_status & PCI_EXP_LNKSTA_LBMS)
+		atomic_inc(&data->lbms_count);
+
+	pcie_capability_set_word(port, PCI_EXP_LNKCTL,
+				 PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
+	pcie_capability_write_word(port, PCI_EXP_LNKSTA,
+				   PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS);
+
+	/*
+	 * Update after enabling notifications & clearing status bits ensures
+	 * link speed is up to date.
+	 */
+	pcie_update_link_speed(port->subordinate);
+}
+
+static void pcie_bwnotif_disable(struct pci_dev *port)
+{
+	pcie_capability_clear_word(port, PCI_EXP_LNKCTL,
+				   PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
+}
+
+static irqreturn_t pcie_bwnotif_irq(int irq, void *context)
+{
+	struct pcie_device *srv = context;
+	struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
+	struct pci_dev *port = srv->port;
+	u16 link_status, events;
+	int ret;
+
+	ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
+	if (ret != PCIBIOS_SUCCESSFUL)
+		return IRQ_NONE;
+
+	events = link_status & (PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS);
+	if (!events)
+		return IRQ_NONE;
+
+	if (events & PCI_EXP_LNKSTA_LBMS)
+		atomic_inc(&data->lbms_count);
+
+	pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
+
+	/*
+	 * Interrupts will not be triggered from any further Link Speed
+	 * change until LBMS is cleared by the write. Therefore, re-read the
+	 * speed (inside pcie_update_link_speed()) after LBMS has been
+	 * cleared to avoid missing link speed changes.
+	 */
+	pcie_update_link_speed(port->subordinate);
+
+	return IRQ_HANDLED;
+}
+
+void pcie_reset_lbms_count(struct pci_dev *port)
+{
+	struct pcie_bwctrl_data *data;
+
+	guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
+	data = port->link_bwctrl;
+	if (data)
+		atomic_set(&data->lbms_count, 0);
+	else
+		pcie_capability_write_word(port, PCI_EXP_LNKSTA,
+					   PCI_EXP_LNKSTA_LBMS);
+}
+
+int pcie_lbms_count(struct pci_dev *port, unsigned long *val)
+{
+	struct pcie_bwctrl_data *data;
+
+	guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
+	data = port->link_bwctrl;
+	if (!data)
+		return -ENOTTY;
+
+	*val = atomic_read(&data->lbms_count);
+
+	return 0;
+}
+
+static int pcie_bwnotif_probe(struct pcie_device *srv)
+{
+	struct pci_dev *port = srv->port;
+	int ret;
+
+	struct pcie_bwctrl_data *data = devm_kzalloc(&srv->device,
+						     sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	ret = devm_request_irq(&srv->device, srv->irq, pcie_bwnotif_irq,
+			       IRQF_SHARED, "PCIe bwctrl", srv);
+	if (ret)
+		return ret;
+
+	scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
+		port->link_bwctrl = no_free_ptr(data);
+		pcie_bwnotif_enable(srv);
+	}
+
+	pci_dbg(port, "enabled with IRQ %d\n", srv->irq);
+
+	return 0;
+}
+
+static void pcie_bwnotif_remove(struct pcie_device *srv)
+{
+	pcie_bwnotif_disable(srv->port);
+	scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem)
+		srv->port->link_bwctrl = NULL;
+}
+
+static int pcie_bwnotif_suspend(struct pcie_device *srv)
+{
+	pcie_bwnotif_disable(srv->port);
+	return 0;
+}
+
+static int pcie_bwnotif_resume(struct pcie_device *srv)
+{
+	pcie_bwnotif_enable(srv);
+	return 0;
+}
+
+static struct pcie_port_service_driver pcie_bwctrl_driver = {
+	.name		= "pcie_bwctrl",
+	.port_type	= PCIE_ANY_PORT,
+	.service	= PCIE_PORT_SERVICE_BWCTRL,
+	.probe		= pcie_bwnotif_probe,
+	.suspend	= pcie_bwnotif_suspend,
+	.resume		= pcie_bwnotif_resume,
+	.remove		= pcie_bwnotif_remove,
+};
+
+int __init pcie_bwctrl_init(void)
+{
+	return pcie_port_service_register(&pcie_bwctrl_driver);
+}
diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c
index 6af5e0425872..5e10306b6308 100644
--- a/drivers/pci/pcie/portdrv.c
+++ b/drivers/pci/pcie/portdrv.c
@@ -68,7 +68,7 @@ static int pcie_message_numbers(struct pci_dev *dev, int mask,
 	 */
 
 	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
-		    PCIE_PORT_SERVICE_BWNOTIF)) {
+		    PCIE_PORT_SERVICE_BWCTRL)) {
 		pcie_capability_read_word(dev, PCI_EXP_FLAGS, &reg16);
 		*pme = FIELD_GET(PCI_EXP_FLAGS_IRQ, reg16);
 		nvec = *pme + 1;
@@ -150,11 +150,11 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
 
 	/* PME, hotplug and bandwidth notification share an MSI/MSI-X vector */
 	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
-		    PCIE_PORT_SERVICE_BWNOTIF)) {
+		    PCIE_PORT_SERVICE_BWCTRL)) {
 		pcie_irq = pci_irq_vector(dev, pme);
 		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
 		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
-		irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
+		irqs[PCIE_PORT_SERVICE_BWCTRL_SHIFT] = pcie_irq;
 	}
 
 	if (mask & PCIE_PORT_SERVICE_AER)
@@ -271,7 +271,7 @@ static int get_port_device_capability(struct pci_dev *dev)
 
 		pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &linkcap);
 		if (linkcap & PCI_EXP_LNKCAP_LBNC)
-			services |= PCIE_PORT_SERVICE_BWNOTIF;
+			services |= PCIE_PORT_SERVICE_BWCTRL;
 	}
 
 	return services;
@@ -828,6 +828,7 @@ static void __init pcie_init_services(void)
 	pcie_aer_init();
 	pcie_pme_init();
 	pcie_dpc_init();
+	pcie_bwctrl_init();
 	pcie_hp_init();
 }
 
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 12c89ea0313b..bd29d1cc7b8b 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -20,8 +20,8 @@
 #define PCIE_PORT_SERVICE_HP		(1 << PCIE_PORT_SERVICE_HP_SHIFT)
 #define PCIE_PORT_SERVICE_DPC_SHIFT	3	/* Downstream Port Containment */
 #define PCIE_PORT_SERVICE_DPC		(1 << PCIE_PORT_SERVICE_DPC_SHIFT)
-#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT	4	/* Bandwidth notification */
-#define PCIE_PORT_SERVICE_BWNOTIF	(1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
+#define PCIE_PORT_SERVICE_BWCTRL_SHIFT	4	/* Bandwidth Controller (notifications) */
+#define PCIE_PORT_SERVICE_BWCTRL	(1 << PCIE_PORT_SERVICE_BWCTRL_SHIFT)
 
 #define PCIE_PORT_DEVICE_MAXSERVICES   5
 
@@ -51,6 +51,8 @@ int pcie_dpc_init(void);
 static inline int pcie_dpc_init(void) { return 0; }
 #endif
 
+int pcie_bwctrl_init(void);
+
 /* Port Type */
 #define PCIE_ANY_PORT			(~0)
 
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index a560ea403b8e..e6d502dca939 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -35,7 +35,14 @@
 
 static bool pcie_lbms_seen(struct pci_dev *dev, u16 lnksta)
 {
-	return lnksta & PCI_EXP_LNKSTA_LBMS;
+	unsigned long count;
+	int ret;
+
+	ret = pcie_lbms_count(dev, &count);
+	if (ret < 0)
+		return lnksta & PCI_EXP_LNKSTA_LBMS;
+
+	return count > 0;
 }
 
 /*
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 99c6fa30d25b..579b28833429 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -313,6 +313,7 @@ struct pci_vpd {
 };
 
 struct irq_affinity;
+struct pcie_bwctrl_data;
 struct pcie_link_state;
 struct pci_sriov;
 struct pci_p2pdma;
@@ -502,6 +503,7 @@ struct pci_dev {
 	unsigned int	dpc_rp_extensions:1;
 	u8		dpc_rp_log_size;
 #endif
+	struct pcie_bwctrl_data		*link_bwctrl;
 #ifdef CONFIG_PCI_ATS
 	union {
 		struct pci_sriov	*sriov;		/* PF: SR-IOV info */
-- 
cgit v1.2.3


From de9a6c8d5dbfedb5eb3722c822da0490f6a59a45 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 18 Oct 2024 17:47:53 +0300
Subject: PCI/bwctrl: Add pcie_set_target_speed() to set PCIe Link Speed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, PCIe Link Speeds are adjusted by custom code rather than in a
common function provided in PCI core. The PCIe bandwidth controller
(bwctrl) introduces an in-kernel API, pcie_set_target_speed(), to set PCIe
Link Speed.

Convert Target Speed quirk to use the new API. The Target Speed quirk runs
very early when bwctrl is not yet probed for a Port and can also run later
when bwctrl is already setup for the Port, which requires the per port
mutex (set_speed_mutex) to be only taken if the bwctrl setup is already
complete.

The new API is also intended to be used in an upcoming commit that adds a
thermal cooling device to throttle PCIe bandwidth when thermal thresholds
are reached.

The PCIe bandwidth control procedure is as follows. The highest speed
supported by the Port and the PCIe device which is not higher than the
requested speed is selected and written into the Target Link Speed in the
Link Control 2 Register. Then bandwidth controller retrains the PCIe Link.

Bandwidth Notifications enable the cur_bus_speed in the struct pci_bus to
keep track PCIe Link Speed changes. While Bandwidth Notifications should
also be generated when bandwidth controller alters the PCIe Link Speed, a
few platforms do not deliver LMBS interrupt after Link Training as
expected. Thus, after changing the Link Speed, bandwidth controller makes
additional read for the Link Status Register to ensure cur_bus_speed is
consistent with the new PCIe Link Speed.

Link: https://lore.kernel.org/r/20241018144755.7875-8-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
[bhelgaas: squash devm_mutex_init() error checking from
https://lore.kernel.org/r/20241030163139.2111689-1-andriy.shevchenko@linux.intel.com,
drop export of pcie_set_target_speed()]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/pci/pci.h         |  20 ++++++
 drivers/pci/pcie/bwctrl.c | 180 ++++++++++++++++++++++++++++++++++++++++++++--
 drivers/pci/quirks.c      |  17 ++---
 include/linux/pci.h       |  10 +++
 4 files changed, 208 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 8ec189c8f3f5..7573f81f58c4 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -331,6 +331,17 @@ void pci_disable_bridge_window(struct pci_dev *dev);
 struct pci_bus *pci_bus_get(struct pci_bus *bus);
 void pci_bus_put(struct pci_bus *bus);
 
+#define PCIE_LNKCAP_SLS2SPEED(lnkcap)					\
+({									\
+	((lnkcap) == PCI_EXP_LNKCAP_SLS_64_0GB ? PCIE_SPEED_64_0GT :	\
+	 (lnkcap) == PCI_EXP_LNKCAP_SLS_32_0GB ? PCIE_SPEED_32_0GT :	\
+	 (lnkcap) == PCI_EXP_LNKCAP_SLS_16_0GB ? PCIE_SPEED_16_0GT :	\
+	 (lnkcap) == PCI_EXP_LNKCAP_SLS_8_0GB ? PCIE_SPEED_8_0GT :	\
+	 (lnkcap) == PCI_EXP_LNKCAP_SLS_5_0GB ? PCIE_SPEED_5_0GT :	\
+	 (lnkcap) == PCI_EXP_LNKCAP_SLS_2_5GB ? PCIE_SPEED_2_5GT :	\
+	 PCI_SPEED_UNKNOWN);						\
+})
+
 /* PCIe link information from Link Capabilities 2 */
 #define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \
 	((lnkcap2) & PCI_EXP_LNKCAP2_SLS_64_0GB ? PCIE_SPEED_64_0GT : \
@@ -341,6 +352,15 @@ void pci_bus_put(struct pci_bus *bus);
 	 (lnkcap2) & PCI_EXP_LNKCAP2_SLS_2_5GB ? PCIE_SPEED_2_5GT : \
 	 PCI_SPEED_UNKNOWN)
 
+#define PCIE_LNKCTL2_TLS2SPEED(lnkctl2) \
+	((lnkctl2) == PCI_EXP_LNKCTL2_TLS_64_0GT ? PCIE_SPEED_64_0GT : \
+	 (lnkctl2) == PCI_EXP_LNKCTL2_TLS_32_0GT ? PCIE_SPEED_32_0GT : \
+	 (lnkctl2) == PCI_EXP_LNKCTL2_TLS_16_0GT ? PCIE_SPEED_16_0GT : \
+	 (lnkctl2) == PCI_EXP_LNKCTL2_TLS_8_0GT ? PCIE_SPEED_8_0GT : \
+	 (lnkctl2) == PCI_EXP_LNKCTL2_TLS_5_0GT ? PCIE_SPEED_5_0GT : \
+	 (lnkctl2) == PCI_EXP_LNKCTL2_TLS_2_5GT ? PCIE_SPEED_2_5GT : \
+	 PCI_SPEED_UNKNOWN)
+
 /* PCIe speed to Mb/s reduced by encoding overhead */
 #define PCIE_SPEED2MBS_ENC(speed) \
 	((speed) == PCIE_SPEED_64_0GT ? 64000*1/1 : \
diff --git a/drivers/pci/pcie/bwctrl.c b/drivers/pci/pcie/bwctrl.c
index b3da24900271..3cd3e2d066c9 100644
--- a/drivers/pci/pcie/bwctrl.c
+++ b/drivers/pci/pcie/bwctrl.c
@@ -7,6 +7,11 @@
  * Copyright (C) 2019 Dell Inc
  * Copyright (C) 2023-2024 Intel Corporation
  *
+ * The PCIe bandwidth controller provides a way to alter PCIe Link Speeds
+ * and notify the operating system when the Link Width or Speed changes. The
+ * notification capability is required for all Root Ports and Downstream
+ * Ports supporting Link Width wider than x1 and/or multiple Link Speeds.
+ *
  * This service port driver hooks into the Bandwidth Notification interrupt
  * watching for changes or links becoming degraded in operation. It updates
  * the cached Current Link Speed that is exposed to user space through sysfs.
@@ -15,9 +20,12 @@
 #define dev_fmt(fmt) "bwctrl: " fmt
 
 #include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/bits.h>
 #include <linux/cleanup.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
+#include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/rwsem.h>
 #include <linux/slab.h>
@@ -28,14 +36,166 @@
 
 /**
  * struct pcie_bwctrl_data - PCIe bandwidth controller
+ * @set_speed_mutex:	Serializes link speed changes
  * @lbms_count:		Count for LBMS (since last reset)
  */
 struct pcie_bwctrl_data {
+	struct mutex set_speed_mutex;
 	atomic_t lbms_count;
 };
 
-/* Prevents port removal during LBMS count accessors */
+/*
+ * Prevent port removal during LBMS count accessors and Link Speed changes.
+ *
+ * These have to be differentiated because pcie_bwctrl_change_speed() calls
+ * pcie_retrain_link() which uses LBMS count reset accessor on success
+ * (using just one rwsem triggers "possible recursive locking detected"
+ * warning).
+ */
 static DECLARE_RWSEM(pcie_bwctrl_lbms_rwsem);
+static DECLARE_RWSEM(pcie_bwctrl_setspeed_rwsem);
+
+static bool pcie_valid_speed(enum pci_bus_speed speed)
+{
+	return (speed >= PCIE_SPEED_2_5GT) && (speed <= PCIE_SPEED_64_0GT);
+}
+
+static u16 pci_bus_speed2lnkctl2(enum pci_bus_speed speed)
+{
+	static const u8 speed_conv[] = {
+		[PCIE_SPEED_2_5GT] = PCI_EXP_LNKCTL2_TLS_2_5GT,
+		[PCIE_SPEED_5_0GT] = PCI_EXP_LNKCTL2_TLS_5_0GT,
+		[PCIE_SPEED_8_0GT] = PCI_EXP_LNKCTL2_TLS_8_0GT,
+		[PCIE_SPEED_16_0GT] = PCI_EXP_LNKCTL2_TLS_16_0GT,
+		[PCIE_SPEED_32_0GT] = PCI_EXP_LNKCTL2_TLS_32_0GT,
+		[PCIE_SPEED_64_0GT] = PCI_EXP_LNKCTL2_TLS_64_0GT,
+	};
+
+	if (WARN_ON_ONCE(!pcie_valid_speed(speed)))
+		return 0;
+
+	return speed_conv[speed];
+}
+
+static inline u16 pcie_supported_speeds2target_speed(u8 supported_speeds)
+{
+	return __fls(supported_speeds);
+}
+
+/**
+ * pcie_bwctrl_select_speed - Select Target Link Speed
+ * @port:	PCIe Port
+ * @speed_req:	Requested PCIe Link Speed
+ *
+ * Select Target Link Speed by take into account Supported Link Speeds of
+ * both the Root Port and the Endpoint.
+ *
+ * Return: Target Link Speed (1=2.5GT/s, 2=5GT/s, 3=8GT/s, etc.)
+ */
+static u16 pcie_bwctrl_select_speed(struct pci_dev *port, enum pci_bus_speed speed_req)
+{
+	struct pci_bus *bus = port->subordinate;
+	u8 desired_speeds, supported_speeds;
+	struct pci_dev *dev;
+
+	desired_speeds = GENMASK(pci_bus_speed2lnkctl2(speed_req),
+				 __fls(PCI_EXP_LNKCAP2_SLS_2_5GB));
+
+	supported_speeds = port->supported_speeds;
+	if (bus) {
+		down_read(&pci_bus_sem);
+		dev = list_first_entry_or_null(&bus->devices, struct pci_dev, bus_list);
+		if (dev)
+			supported_speeds &= dev->supported_speeds;
+		up_read(&pci_bus_sem);
+	}
+	if (!supported_speeds)
+		return PCI_EXP_LNKCAP2_SLS_2_5GB;
+
+	return pcie_supported_speeds2target_speed(supported_speeds & desired_speeds);
+}
+
+static int pcie_bwctrl_change_speed(struct pci_dev *port, u16 target_speed, bool use_lt)
+{
+	int ret;
+
+	ret = pcie_capability_clear_and_set_word(port, PCI_EXP_LNKCTL2,
+						 PCI_EXP_LNKCTL2_TLS, target_speed);
+	if (ret != PCIBIOS_SUCCESSFUL)
+		return pcibios_err_to_errno(ret);
+
+	ret = pcie_retrain_link(port, use_lt);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Ensure link speed updates also with platforms that have problems
+	 * with notifications.
+	 */
+	if (port->subordinate)
+		pcie_update_link_speed(port->subordinate);
+
+	return 0;
+}
+
+/**
+ * pcie_set_target_speed - Set downstream Link Speed for PCIe Port
+ * @port:	PCIe Port
+ * @speed_req:	Requested PCIe Link Speed
+ * @use_lt:	Wait for the LT or DLLLA bit to detect the end of link training
+ *
+ * Attempt to set PCIe Port Link Speed to @speed_req. @speed_req may be
+ * adjusted downwards to the best speed supported by both the Port and PCIe
+ * Device underneath it.
+ *
+ * Return:
+ * * 0		- on success
+ * * -EINVAL	- @speed_req is not a PCIe Link Speed
+ * * -ENODEV	- @port is not controllable
+ * * -ETIMEDOUT	- changing Link Speed took too long
+ * * -EAGAIN	- Link Speed was changed but @speed_req was not achieved
+ */
+int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
+			  bool use_lt)
+{
+	struct pci_bus *bus = port->subordinate;
+	u16 target_speed;
+	int ret;
+
+	if (WARN_ON_ONCE(!pcie_valid_speed(speed_req)))
+		return -EINVAL;
+
+	if (bus && bus->cur_bus_speed == speed_req)
+		return 0;
+
+	target_speed = pcie_bwctrl_select_speed(port, speed_req);
+
+	scoped_guard(rwsem_read, &pcie_bwctrl_setspeed_rwsem) {
+		struct pcie_bwctrl_data *data = port->link_bwctrl;
+
+		/*
+		 * port->link_bwctrl is NULL during initial scan when called
+		 * e.g. from the Target Speed quirk.
+		 */
+		if (data)
+			mutex_lock(&data->set_speed_mutex);
+
+		ret = pcie_bwctrl_change_speed(port, target_speed, use_lt);
+
+		if (data)
+			mutex_unlock(&data->set_speed_mutex);
+	}
+
+	/*
+	 * Despite setting higher speed into the Target Link Speed, empty
+	 * bus won't train to 5GT+ speeds.
+	 */
+	if (!ret && bus && bus->cur_bus_speed != speed_req &&
+	    !list_empty(&bus->devices))
+		ret = -EAGAIN;
+
+	return ret;
+}
 
 static void pcie_bwnotif_enable(struct pcie_device *srv)
 {
@@ -136,14 +296,20 @@ static int pcie_bwnotif_probe(struct pcie_device *srv)
 	if (!data)
 		return -ENOMEM;
 
+	ret = devm_mutex_init(&srv->device, &data->set_speed_mutex);
+	if (ret)
+		return ret;
+
 	ret = devm_request_irq(&srv->device, srv->irq, pcie_bwnotif_irq,
 			       IRQF_SHARED, "PCIe bwctrl", srv);
 	if (ret)
 		return ret;
 
-	scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
-		port->link_bwctrl = no_free_ptr(data);
-		pcie_bwnotif_enable(srv);
+	scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem) {
+		scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
+			port->link_bwctrl = no_free_ptr(data);
+			pcie_bwnotif_enable(srv);
+		}
 	}
 
 	pci_dbg(port, "enabled with IRQ %d\n", srv->irq);
@@ -154,8 +320,10 @@ static int pcie_bwnotif_probe(struct pcie_device *srv)
 static void pcie_bwnotif_remove(struct pcie_device *srv)
 {
 	pcie_bwnotif_disable(srv->port);
-	scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem)
-		srv->port->link_bwctrl = NULL;
+
+	scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem)
+		scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem)
+			srv->port->link_bwctrl = NULL;
 }
 
 static int pcie_bwnotif_suspend(struct pcie_device *srv)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index e6d502dca939..dcf1c86a5488 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -113,16 +113,11 @@ int pcie_failed_link_retrain(struct pci_dev *dev)
 
 		pci_info(dev, "broken device, retraining non-functional downstream link at 2.5GT/s\n");
 
-		lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS;
-		lnkctl2 |= PCI_EXP_LNKCTL2_TLS_2_5GT;
-		pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
-
-		ret = pcie_retrain_link(dev, false);
+		ret = pcie_set_target_speed(dev, PCIE_SPEED_2_5GT, false);
 		if (ret) {
 			pci_info(dev, "retraining failed\n");
-			pcie_capability_write_word(dev, PCI_EXP_LNKCTL2,
-						   oldlnkctl2);
-			pcie_retrain_link(dev, true);
+			pcie_set_target_speed(dev, PCIE_LNKCTL2_TLS2SPEED(oldlnkctl2),
+					      true);
 			return ret;
 		}
 
@@ -136,11 +131,7 @@ int pcie_failed_link_retrain(struct pci_dev *dev)
 
 		pci_info(dev, "removing 2.5GT/s downstream link speed restriction\n");
 		pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
-		lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS;
-		lnkctl2 |= lnkcap & PCI_EXP_LNKCAP_SLS;
-		pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
-
-		ret = pcie_retrain_link(dev, false);
+		ret = pcie_set_target_speed(dev, PCIE_LNKCAP_SLS2SPEED(lnkcap), false);
 		if (ret) {
 			pci_info(dev, "retraining failed\n");
 			return ret;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 579b28833429..4f02d9f91399 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1797,9 +1797,19 @@ static inline int pci_irqd_intx_xlate(struct irq_domain *d,
 #ifdef CONFIG_PCIEPORTBUS
 extern bool pcie_ports_disabled;
 extern bool pcie_ports_native;
+
+int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
+			  bool use_lt);
 #else
 #define pcie_ports_disabled	true
 #define pcie_ports_native	false
+
+static inline int pcie_set_target_speed(struct pci_dev *port,
+					enum pci_bus_speed speed_req,
+					bool use_lt)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 #define PCIE_LINK_STATE_L0S		(BIT(0) | BIT(1)) /* Upstr/dwnstr L0s */
-- 
cgit v1.2.3


From d278b098282d1327f6e1be82aacb18457a4d244d Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 18 Oct 2024 17:47:54 +0300
Subject: thermal: Add PCIe cooling driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a thermal cooling driver to provide path to access PCIe bandwidth
controller using the usual thermal interfaces.

A cooling device is instantiated for controllable PCIe Ports from the
bwctrl service driver.

If registering the cooling device fails, allow bwctrl's probe to succeed
regardless. As cdev in that case contains IS_ERR() pseudo "pointer", clean
that up inside the probe function so the remove side doesn't need to
suddenly make an odd looking IS_ERR() check.

The thermal side state 0 means no throttling, i.e., maximum supported PCIe
Link Speed.

Link: https://lore.kernel.org/r/20241018144755.7875-9-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
[bhelgaas: dropped data->cdev test per
https://lore.kernel.org/r/ZzRm1SJTwEMRsAr8@wunner.de]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org> # From the cooling device interface perspective
---
 MAINTAINERS                    |  2 ++
 drivers/pci/pcie/bwctrl.c      | 12 +++++++
 drivers/thermal/Kconfig        |  9 +++++
 drivers/thermal/Makefile       |  2 ++
 drivers/thermal/pcie_cooling.c | 80 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci-bwctrl.h     | 28 +++++++++++++++
 6 files changed, 133 insertions(+)
 create mode 100644 drivers/thermal/pcie_cooling.c
 create mode 100644 include/linux/pci-bwctrl.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 8c555b3325d6..393ed7ce5ea1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17938,6 +17938,8 @@ M:	Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
 L:	linux-pci@vger.kernel.org
 S:	Supported
 F:	drivers/pci/pcie/bwctrl.c
+F:	drivers/thermal/pcie_cooling.c
+F:	include/linux/pci-bwctrl.h
 
 PCIE DRIVER FOR AMAZON ANNAPURNA LABS
 M:	Jonathan Chocron <jonnyc@amazon.com>
diff --git a/drivers/pci/pcie/bwctrl.c b/drivers/pci/pcie/bwctrl.c
index 3cd3e2d066c9..b59cacc740fa 100644
--- a/drivers/pci/pcie/bwctrl.c
+++ b/drivers/pci/pcie/bwctrl.c
@@ -27,6 +27,7 @@
 #include <linux/interrupt.h>
 #include <linux/mutex.h>
 #include <linux/pci.h>
+#include <linux/pci-bwctrl.h>
 #include <linux/rwsem.h>
 #include <linux/slab.h>
 #include <linux/types.h>
@@ -38,10 +39,12 @@
  * struct pcie_bwctrl_data - PCIe bandwidth controller
  * @set_speed_mutex:	Serializes link speed changes
  * @lbms_count:		Count for LBMS (since last reset)
+ * @cdev:		Thermal cooling device associated with the port
  */
 struct pcie_bwctrl_data {
 	struct mutex set_speed_mutex;
 	atomic_t lbms_count;
+	struct thermal_cooling_device *cdev;
 };
 
 /*
@@ -314,11 +317,20 @@ static int pcie_bwnotif_probe(struct pcie_device *srv)
 
 	pci_dbg(port, "enabled with IRQ %d\n", srv->irq);
 
+	/* Don't fail on errors. Don't leave IS_ERR() "pointer" into ->cdev */
+	port->link_bwctrl->cdev = pcie_cooling_device_register(port);
+	if (IS_ERR(port->link_bwctrl->cdev))
+		port->link_bwctrl->cdev = NULL;
+
 	return 0;
 }
 
 static void pcie_bwnotif_remove(struct pcie_device *srv)
 {
+	struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
+
+	pcie_cooling_device_unregister(data->cdev);
+
 	pcie_bwnotif_disable(srv->port);
 
 	scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem)
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 61e7ae524b1f..d3f9686e26e7 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -220,6 +220,15 @@ config DEVFREQ_THERMAL
 
 	  If you want this support, you should say Y here.
 
+config PCIE_THERMAL
+	bool "PCIe cooling support"
+	depends on PCIEPORTBUS
+	help
+	  This implements PCIe cooling mechanism through bandwidth reduction
+	  for PCIe devices.
+
+	  If you want this support, you should say Y here.
+
 config THERMAL_EMULATION
 	bool "Thermal emulation mode support"
 	help
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 41c4d56beb40..210c16c91461 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -31,6 +31,8 @@ thermal_sys-$(CONFIG_CPU_IDLE_THERMAL)	+= cpuidle_cooling.o
 # devfreq cooling
 thermal_sys-$(CONFIG_DEVFREQ_THERMAL) += devfreq_cooling.o
 
+thermal_sys-$(CONFIG_PCIE_THERMAL) += pcie_cooling.o
+
 obj-$(CONFIG_K3_THERMAL)	+= k3_bandgap.o k3_j72xx_bandgap.o
 # platform thermal drivers
 obj-y				+= broadcom/
diff --git a/drivers/thermal/pcie_cooling.c b/drivers/thermal/pcie_cooling.c
new file mode 100644
index 000000000000..a876d64f1582
--- /dev/null
+++ b/drivers/thermal/pcie_cooling.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PCIe cooling device
+ *
+ * Copyright (C) 2023-2024 Intel Corporation
+ */
+
+#include <linux/build_bug.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pci-bwctrl.h>
+#include <linux/slab.h>
+#include <linux/sprintf.h>
+#include <linux/thermal.h>
+
+#define COOLING_DEV_TYPE_PREFIX		"PCIe_Port_Link_Speed_"
+
+static int pcie_cooling_get_max_level(struct thermal_cooling_device *cdev, unsigned long *state)
+{
+	struct pci_dev *port = cdev->devdata;
+
+	/* cooling state 0 is same as the maximum PCIe speed */
+	*state = port->subordinate->max_bus_speed - PCIE_SPEED_2_5GT;
+
+	return 0;
+}
+
+static int pcie_cooling_get_cur_level(struct thermal_cooling_device *cdev, unsigned long *state)
+{
+	struct pci_dev *port = cdev->devdata;
+
+	/* cooling state 0 is same as the maximum PCIe speed */
+	*state = cdev->max_state - (port->subordinate->cur_bus_speed - PCIE_SPEED_2_5GT);
+
+	return 0;
+}
+
+static int pcie_cooling_set_cur_level(struct thermal_cooling_device *cdev, unsigned long state)
+{
+	struct pci_dev *port = cdev->devdata;
+	enum pci_bus_speed speed;
+
+	/* cooling state 0 is same as the maximum PCIe speed */
+	speed = (cdev->max_state - state) + PCIE_SPEED_2_5GT;
+
+	return pcie_set_target_speed(port, speed, true);
+}
+
+static struct thermal_cooling_device_ops pcie_cooling_ops = {
+	.get_max_state = pcie_cooling_get_max_level,
+	.get_cur_state = pcie_cooling_get_cur_level,
+	.set_cur_state = pcie_cooling_set_cur_level,
+};
+
+struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port)
+{
+	char *name __free(kfree) =
+		kasprintf(GFP_KERNEL, COOLING_DEV_TYPE_PREFIX "%s", pci_name(port));
+	if (!name)
+		return ERR_PTR(-ENOMEM);
+
+	return thermal_cooling_device_register(name, port, &pcie_cooling_ops);
+}
+
+void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev)
+{
+	thermal_cooling_device_unregister(cdev);
+}
+
+/* For bus_speed <-> state arithmetic */
+static_assert(PCIE_SPEED_2_5GT + 1 == PCIE_SPEED_5_0GT);
+static_assert(PCIE_SPEED_5_0GT + 1 == PCIE_SPEED_8_0GT);
+static_assert(PCIE_SPEED_8_0GT + 1 == PCIE_SPEED_16_0GT);
+static_assert(PCIE_SPEED_16_0GT + 1 == PCIE_SPEED_32_0GT);
+static_assert(PCIE_SPEED_32_0GT + 1 == PCIE_SPEED_64_0GT);
+
+MODULE_AUTHOR("Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>");
+MODULE_DESCRIPTION("PCIe cooling driver");
diff --git a/include/linux/pci-bwctrl.h b/include/linux/pci-bwctrl.h
new file mode 100644
index 000000000000..cee07127455b
--- /dev/null
+++ b/include/linux/pci-bwctrl.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * PCIe bandwidth controller
+ *
+ * Copyright (C) 2023-2024 Intel Corporation
+ */
+
+#ifndef LINUX_PCI_BWCTRL_H
+#define LINUX_PCI_BWCTRL_H
+
+#include <linux/pci.h>
+
+struct thermal_cooling_device;
+
+#ifdef CONFIG_PCIE_THERMAL
+struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port);
+void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev);
+#else
+static inline struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port)
+{
+	return NULL;
+}
+static inline void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev)
+{
+}
+#endif
+
+#endif
-- 
cgit v1.2.3


From d7a516c6eeae29144649f1c3f586fa580ec9e040 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Fri, 15 Nov 2024 21:46:02 +0100
Subject: compiler.h: Fix undefined BUILD_BUG_ON_ZERO()

<linux/compiler.h> defines __must_be_array() and __must_be_cstr() and
both expand to BUILD_BUG_ON_ZERO(), but <linux/build_bug.h> defines
BUILD_BUG_ON_ZERO(). Including <linux/build_bug.h> in
<linux/compiler.h> would create a cyclic dependency as
<linux/build_bug.h> already includes <linux/compiler.h>.

Fix that by defining __BUILD_BUG_ON_ZERO_MSG() in <linux/compiler.h>
and using that for __must_be_array() and __must_be_cstr().

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Link: https://lore.kernel.org/r/20241115204602.249590-1-philipp.reisner@linbit.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/compiler.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 4d4e23b6e3e7..469a64dd6495 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -239,11 +239,18 @@ static inline void *offset_to_ptr(const int *off)
 
 #endif /* __ASSEMBLY__ */
 
+#ifdef __CHECKER__
+#define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0)
+#else /* __CHECKER__ */
+#define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);}))
+#endif /* __CHECKER__ */
+
 /* &a[0] degrades to a pointer: a different type from an array */
-#define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
+#define __must_be_array(a)	__BUILD_BUG_ON_ZERO_MSG(__same_type((a), &(a)[0]), "must be array")
 
 /* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */
-#define __must_be_cstr(p)	BUILD_BUG_ON_ZERO(__annotated(p, nonstring))
+#define __must_be_cstr(p) \
+	__BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)")
 
 /*
  * This returns a constant expression while determining if an argument is
-- 
cgit v1.2.3


From dab78a1745ab3c6001e1e4d50a9d09efef8e260d Mon Sep 17 00:00:00 2001
From: Philo Lu <lulie@linux.alibaba.com>
Date: Thu, 14 Nov 2024 18:52:05 +0800
Subject: net/udp: Add 4-tuple hash list basis

Add a new hash list, hash4, in udp table. It will be used to implement
4-tuple hash for connected udp sockets. This patch adds the hlist to
table, and implements helpers and the initialization. 4-tuple hash is
implemented in the following patch.

hash4 uses hlist_nulls to avoid moving wrongly onto another hlist due to
concurrent rehash, because rehash() can happen with lookup().

Co-developed-by: Cambda Zhu <cambda@linux.alibaba.com>
Signed-off-by: Cambda Zhu <cambda@linux.alibaba.com>
Co-developed-by: Fred Chen <fred.cc@alibaba-inc.com>
Signed-off-by: Fred Chen <fred.cc@alibaba-inc.com>
Co-developed-by: Yubing Qiu <yubing.qiuyubing@alibaba-inc.com>
Signed-off-by: Yubing Qiu <yubing.qiuyubing@alibaba-inc.com>
Signed-off-by: Philo Lu <lulie@linux.alibaba.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h | 11 +++++++
 include/net/udp.h   | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv4/udp.c      |  6 ++--
 3 files changed, 97 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 3eb3f2b9a2a0..0807e21cfec9 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -56,6 +56,12 @@ struct udp_sock {
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 
+#if !IS_ENABLED(CONFIG_BASE_SMALL)
+	/* For UDP 4-tuple hash */
+	__u16 udp_lrpa_hash;
+	struct hlist_nulls_node udp_lrpa_node;
+#endif
+
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
@@ -206,6 +212,11 @@ static inline void udp_allow_gso(struct sock *sk)
 #define udp_portaddr_for_each_entry_rcu(__sk, list) \
 	hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)
 
+#if !IS_ENABLED(CONFIG_BASE_SMALL)
+#define udp_lrpa_for_each_entry_rcu(__up, node, list) \
+	hlist_nulls_for_each_entry_rcu(__up, node, list, udp_lrpa_node)
+#endif
+
 #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE)
 
 #endif	/* _LINUX_UDP_H */
diff --git a/include/net/udp.h b/include/net/udp.h
index 62a7207e65f2..edb669967130 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -50,14 +50,21 @@ struct udp_skb_cb {
 #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
 
 /**
- *	struct udp_hslot - UDP hash slot used by udp_table.hash
+ *	struct udp_hslot - UDP hash slot used by udp_table.hash/hash4
  *
  *	@head:	head of list of sockets
+ *	@nulls_head:	head of list of sockets, only used by hash4
  *	@count:	number of sockets in 'head' list
  *	@lock:	spinlock protecting changes to head/count
  */
 struct udp_hslot {
-	struct hlist_head	head;
+	union {
+		struct hlist_head	head;
+		/* hash4 uses hlist_nulls to avoid moving wrongly onto another
+		 * hlist, because rehash() can happen with lookup().
+		 */
+		struct hlist_nulls_head	nulls_head;
+	};
 	int			count;
 	spinlock_t		lock;
 } __aligned(2 * sizeof(long));
@@ -82,12 +89,17 @@ struct udp_hslot_main {
  *
  *	@hash:	hash table, sockets are hashed on (local port)
  *	@hash2:	hash table, sockets are hashed on (local port, local address)
+ *	@hash4:	hash table, connected sockets are hashed on
+ *		(local port, local address, remote port, remote address)
  *	@mask:	number of slots in hash tables, minus 1
  *	@log:	log2(number of slots in hash table)
  */
 struct udp_table {
 	struct udp_hslot	*hash;
 	struct udp_hslot_main	*hash2;
+#if !IS_ENABLED(CONFIG_BASE_SMALL)
+	struct udp_hslot	*hash4;
+#endif
 	unsigned int		mask;
 	unsigned int		log;
 };
@@ -114,13 +126,80 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 static inline void udp_table_hash4_init(struct udp_table *table)
 {
 }
+
+static inline struct udp_hslot *udp_hashslot4(struct udp_table *table,
+					      unsigned int hash)
+{
+	BUILD_BUG();
+	return NULL;
+}
+
+static inline bool udp_hashed4(const struct sock *sk)
+{
+	return false;
+}
+
+static inline unsigned int udp_hash4_slot_size(void)
+{
+	return 0;
+}
+
+static inline bool udp_has_hash4(const struct udp_hslot *hslot2)
+{
+	return false;
+}
+
+static inline void udp_hash4_inc(struct udp_hslot *hslot2)
+{
+}
+
+static inline void udp_hash4_dec(struct udp_hslot *hslot2)
+{
+}
 #else /* !CONFIG_BASE_SMALL */
 
 /* Must be called with table->hash2 initialized */
 static inline void udp_table_hash4_init(struct udp_table *table)
 {
-	for (int i = 0; i <= table->mask; i++)
+	table->hash4 = (void *)(table->hash2 + (table->mask + 1));
+	for (int i = 0; i <= table->mask; i++) {
 		table->hash2[i].hash4_cnt = 0;
+
+		INIT_HLIST_NULLS_HEAD(&table->hash4[i].nulls_head, i);
+		table->hash4[i].count = 0;
+		spin_lock_init(&table->hash4[i].lock);
+	}
+}
+
+static inline struct udp_hslot *udp_hashslot4(struct udp_table *table,
+					      unsigned int hash)
+{
+	return &table->hash4[hash & table->mask];
+}
+
+static inline bool udp_hashed4(const struct sock *sk)
+{
+	return !hlist_nulls_unhashed(&udp_sk(sk)->udp_lrpa_node);
+}
+
+static inline unsigned int udp_hash4_slot_size(void)
+{
+	return sizeof(struct udp_hslot);
+}
+
+static inline bool udp_has_hash4(const struct udp_hslot *hslot2)
+{
+	return UDP_HSLOT_MAIN(hslot2)->hash4_cnt;
+}
+
+static inline void udp_hash4_inc(struct udp_hslot *hslot2)
+{
+	UDP_HSLOT_MAIN(hslot2)->hash4_cnt++;
+}
+
+static inline void udp_hash4_dec(struct udp_hslot *hslot2)
+{
+	UDP_HSLOT_MAIN(hslot2)->hash4_cnt--;
 }
 #endif /* CONFIG_BASE_SMALL */
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2fdac5fae2a8..0bc0881d6569 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3427,7 +3427,8 @@ void __init udp_table_init(struct udp_table *table, const char *name)
 {
 	unsigned int i, slot_size;
 
-	slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main);
+	slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) +
+		    udp_hash4_slot_size();
 	table->hash = alloc_large_system_hash(name,
 					      slot_size,
 					      uhash_entries,
@@ -3482,7 +3483,8 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent
 	if (!udptable)
 		goto out;
 
-	slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main);
+	slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) +
+		    udp_hash4_slot_size();
 	udptable->hash = vmalloc_huge(hash_entries * slot_size,
 				      GFP_KERNEL_ACCOUNT);
 	if (!udptable->hash)
-- 
cgit v1.2.3


From a06e4a93067cd8f55a74638d45146ddde76574f2 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@linux-m68k.org>
Date: Wed, 13 Nov 2024 09:32:15 +1100
Subject: rtc: m48t59: Use platform_data struct for year offset value

Instead of hard-coded values and ifdefs, store the year offset in the
platform_data struct.

Tested-by: Daniel Palmer <daniel@0x0f.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Finn Thain <fthain@linux-m68k.org>
Tested-by: Andreas Larsson <andreas@gaisler.com>
Acked-by: Andreas Larsson <andreas@gaisler.com>
Link: https://lore.kernel.org/r/665c3526184a8d0c4a6373297d8e7d9a12591d8b.1731450735.git.fthain@linux-m68k.org
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 arch/sparc/kernel/time_32.c |  1 +
 arch/sparc/kernel/time_64.c |  1 +
 drivers/rtc/rtc-m48t59.c    | 26 ++++----------------------
 include/linux/rtc/m48t59.h  |  3 +++
 4 files changed, 9 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c
index 08bbdc458596..578fd0d49f30 100644
--- a/arch/sparc/kernel/time_32.c
+++ b/arch/sparc/kernel/time_32.c
@@ -255,6 +255,7 @@ static void mostek_write_byte(struct device *dev, u32 ofs, u8 val)
 static struct m48t59_plat_data m48t59_data = {
 	.read_byte = mostek_read_byte,
 	.write_byte = mostek_write_byte,
+	.yy_offset = 68,
 };
 
 /* resource is set at runtime */
diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index 60f1c8cc5363..b32f27f929d1 100644
--- a/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@ -544,6 +544,7 @@ static void mostek_write_byte(struct device *dev, u32 ofs, u8 val)
 static struct m48t59_plat_data m48t59_data = {
 	.read_byte	= mostek_read_byte,
 	.write_byte	= mostek_write_byte,
+	.yy_offset	= 68,
 };
 
 static struct platform_device m48t59_rtc = {
diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c
index 5d30ce8e13ca..4e608bc8bbd3 100644
--- a/drivers/rtc/rtc-m48t59.c
+++ b/drivers/rtc/rtc-m48t59.c
@@ -71,7 +71,7 @@ static int m48t59_rtc_read_time(struct device *dev, struct rtc_time *tm)
 	/* Issue the READ command */
 	M48T59_SET_BITS(M48T59_CNTL_READ, M48T59_CNTL);
 
-	tm->tm_year	= bcd2bin(M48T59_READ(M48T59_YEAR));
+	tm->tm_year	= bcd2bin(M48T59_READ(M48T59_YEAR)) + pdata->yy_offset;
 	/* tm_mon is 0-11 */
 	tm->tm_mon	= bcd2bin(M48T59_READ(M48T59_MONTH)) - 1;
 	tm->tm_mday	= bcd2bin(M48T59_READ(M48T59_MDAY));
@@ -82,10 +82,6 @@ static int m48t59_rtc_read_time(struct device *dev, struct rtc_time *tm)
 		dev_dbg(dev, "Century bit is enabled\n");
 		tm->tm_year += 100;	/* one century */
 	}
-#ifdef CONFIG_SPARC
-	/* Sun SPARC machines count years since 1968 */
-	tm->tm_year += 68;
-#endif
 
 	tm->tm_wday	= bcd2bin(val & 0x07);
 	tm->tm_hour	= bcd2bin(M48T59_READ(M48T59_HOUR) & 0x3F);
@@ -106,12 +102,7 @@ static int m48t59_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	struct m48t59_private *m48t59 = dev_get_drvdata(dev);
 	unsigned long flags;
 	u8 val = 0;
-	int year = tm->tm_year;
-
-#ifdef CONFIG_SPARC
-	/* Sun SPARC machines count years since 1968 */
-	year -= 68;
-#endif
+	int year = tm->tm_year - pdata->yy_offset;
 
 	dev_dbg(dev, "RTC set time %04d-%02d-%02d %02d/%02d/%02d\n",
 		year + 1900, tm->tm_mon, tm->tm_mday,
@@ -162,11 +153,7 @@ static int m48t59_rtc_readalarm(struct device *dev, struct rtc_wkalrm *alrm)
 	/* Issue the READ command */
 	M48T59_SET_BITS(M48T59_CNTL_READ, M48T59_CNTL);
 
-	tm->tm_year = bcd2bin(M48T59_READ(M48T59_YEAR));
-#ifdef CONFIG_SPARC
-	/* Sun SPARC machines count years since 1968 */
-	tm->tm_year += 68;
-#endif
+	tm->tm_year = bcd2bin(M48T59_READ(M48T59_YEAR)) + pdata->yy_offset;
 	/* tm_mon is 0-11 */
 	tm->tm_mon = bcd2bin(M48T59_READ(M48T59_MONTH)) - 1;
 
@@ -197,12 +184,7 @@ static int m48t59_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 	struct rtc_time *tm = &alrm->time;
 	u8 mday, hour, min, sec;
 	unsigned long flags;
-	int year = tm->tm_year;
-
-#ifdef CONFIG_SPARC
-	/* Sun SPARC machines count years since 1968 */
-	year -= 68;
-#endif
+	int year = tm->tm_year - pdata->yy_offset;
 
 	/* If no irq, we don't support ALARM */
 	if (m48t59->irq == NO_IRQ)
diff --git a/include/linux/rtc/m48t59.h b/include/linux/rtc/m48t59.h
index 9465d5405fe2..373ba77071c6 100644
--- a/include/linux/rtc/m48t59.h
+++ b/include/linux/rtc/m48t59.h
@@ -56,6 +56,9 @@ struct m48t59_plat_data {
 	void __iomem *ioaddr;
 	/* offset to RTC registers, automatically set according to the type */
 	unsigned int offset;
+
+	/* YY digits (in RTC) are offset, i.e. year is 1900 + yy_offset + YY */
+	int yy_offset;
 };
 
 #endif /* _LINUX_RTC_M48T59_H_ */
-- 
cgit v1.2.3


From 03854920c39c62b88c0b540c92cf35746d059af2 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sun, 6 Oct 2024 02:19:52 +0100
Subject: libceph: Remove unused ceph_pagelist functions

ceph_pagelist_truncate() and ceph_pagelist_set_cursor() have been unused
since commit
39be95e9c8c0 ("ceph: ceph_pagelist_append might sleep while atomic")

Remove them.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/pagelist.h | 12 ------------
 net/ceph/pagelist.c           | 38 --------------------------------------
 2 files changed, 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h
index 5dead8486fd8..879bec0863aa 100644
--- a/include/linux/ceph/pagelist.h
+++ b/include/linux/ceph/pagelist.h
@@ -17,12 +17,6 @@ struct ceph_pagelist {
 	refcount_t refcnt;
 };
 
-struct ceph_pagelist_cursor {
-	struct ceph_pagelist *pl;   /* pagelist, for error checking */
-	struct list_head *page_lru; /* page in list */
-	size_t room;		    /* room remaining to reset to */
-};
-
 struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags);
 
 extern void ceph_pagelist_release(struct ceph_pagelist *pl);
@@ -33,12 +27,6 @@ extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space);
 
 extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl);
 
-extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
-				     struct ceph_pagelist_cursor *c);
-
-extern int ceph_pagelist_truncate(struct ceph_pagelist *pl,
-				  struct ceph_pagelist_cursor *c);
-
 static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
 {
 	__le64 ev = cpu_to_le64(v);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index 74622b278d57..5a9c4be5f222 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -131,41 +131,3 @@ int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
 	return 0;
 }
 EXPORT_SYMBOL(ceph_pagelist_free_reserve);
-
-/* Create a truncation point. */
-void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
-			      struct ceph_pagelist_cursor *c)
-{
-	c->pl = pl;
-	c->page_lru = pl->head.prev;
-	c->room = pl->room;
-}
-EXPORT_SYMBOL(ceph_pagelist_set_cursor);
-
-/* Truncate a pagelist to the given point. Move extra pages to reserve.
- * This won't sleep.
- * Returns: 0 on success,
- *          -EINVAL if the pagelist doesn't match the trunc point pagelist
- */
-int ceph_pagelist_truncate(struct ceph_pagelist *pl,
-			   struct ceph_pagelist_cursor *c)
-{
-	struct page *page;
-
-	if (pl != c->pl)
-		return -EINVAL;
-	ceph_pagelist_unmap_tail(pl);
-	while (pl->head.prev != c->page_lru) {
-		page = list_entry(pl->head.prev, struct page, lru);
-		/* move from pagelist to reserve */
-		list_move_tail(&page->lru, &pl->free_list);
-		++pl->num_pages_free;
-	}
-	pl->room = c->room;
-	if (!list_empty(&pl->head)) {
-		page = list_entry(pl->head.prev, struct page, lru);
-		pl->mapped_tail = kmap(page);
-	}
-	return 0;
-}
-EXPORT_SYMBOL(ceph_pagelist_truncate);
-- 
cgit v1.2.3


From ee1eb8ccaab8cc2ef4bda8e11a40409ee20f6405 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sun, 6 Oct 2024 02:19:53 +0100
Subject: libceph: Remove unused pagevec functions

ceph_copy_user_to_page_vector() has been unused since 2013's commit
e8344e668915 ("ceph: Implement writev/pwritev for sync operation.")

ceph_copy_to_page_vector() has been unused since 2012's commit
913d2fdcf605 ("rbd: always pass ops array to rbd_req_sync_op()")

Remove them.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h |  6 -----
 net/ceph/pagevec.c           | 52 --------------------------------------------
 2 files changed, 58 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 15fb566d3f46..733e7f93db66 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -317,12 +317,6 @@ extern void ceph_release_page_vector(struct page **pages, int num_pages);
 extern void ceph_put_page_vector(struct page **pages, int num_pages,
 				 bool dirty);
 extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
-extern int ceph_copy_user_to_page_vector(struct page **pages,
-					 const void __user *data,
-					 loff_t off, size_t len);
-extern void ceph_copy_to_page_vector(struct page **pages,
-				    const void *data,
-				    loff_t off, size_t len);
 extern void ceph_copy_from_page_vector(struct page **pages,
 				    void *data,
 				    loff_t off, size_t len);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 64305e7056a1..4509757d8b3b 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -55,58 +55,6 @@ struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
 }
 EXPORT_SYMBOL(ceph_alloc_page_vector);
 
-/*
- * copy user data into a page vector
- */
-int ceph_copy_user_to_page_vector(struct page **pages,
-					 const void __user *data,
-					 loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, PAGE_SIZE-po, left);
-		bad = copy_from_user(page_address(pages[i]) + po, data, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		po += l - bad;
-		if (po == PAGE_SIZE) {
-			po = 0;
-			i++;
-		}
-	}
-	return len;
-}
-EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
-
-void ceph_copy_to_page_vector(struct page **pages,
-				    const void *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	size_t po = off & ~PAGE_MASK;
-	size_t left = len;
-
-	while (left > 0) {
-		size_t l = min_t(size_t, PAGE_SIZE-po, left);
-
-		memcpy(page_address(pages[i]) + po, data, l);
-		data += l;
-		left -= l;
-		po += l;
-		if (po == PAGE_SIZE) {
-			po = 0;
-			i++;
-		}
-	}
-}
-EXPORT_SYMBOL(ceph_copy_to_page_vector);
-
 void ceph_copy_from_page_vector(struct page **pages,
 				    void *data,
 				    loff_t off, size_t len)
-- 
cgit v1.2.3


From 32844fd72b879d02f1f6b4394025349d31a09fd3 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sun, 6 Oct 2024 02:19:54 +0100
Subject: libceph: Remove unused ceph_osdc_watch_check

ceph_osdc_watch_check() has been unused since it was added in commit
b07d3c4bd727 ("libceph: support for checking on status of watch")

Remove it.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |  2 --
 net/ceph/osd_client.c           | 34 ----------------------------------
 2 files changed, 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index d7941478158c..d55b30057a45 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -626,8 +626,6 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
 		     u32 timeout,
 		     struct page ***preply_pages,
 		     size_t *preply_len);
-int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
-			  struct ceph_osd_linger_request *lreq);
 int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
 			    struct ceph_object_id *oid,
 			    struct ceph_object_locator *oloc,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 9d078b37fe0b..9b1168eb77ab 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -4999,40 +4999,6 @@ out_put_lreq:
 }
 EXPORT_SYMBOL(ceph_osdc_notify);
 
-/*
- * Return the number of milliseconds since the watch was last
- * confirmed, or an error.  If there is an error, the watch is no
- * longer valid, and should be destroyed with ceph_osdc_unwatch().
- */
-int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
-			  struct ceph_osd_linger_request *lreq)
-{
-	unsigned long stamp, age;
-	int ret;
-
-	down_read(&osdc->lock);
-	mutex_lock(&lreq->lock);
-	stamp = lreq->watch_valid_thru;
-	if (!list_empty(&lreq->pending_lworks)) {
-		struct linger_work *lwork =
-		    list_first_entry(&lreq->pending_lworks,
-				     struct linger_work,
-				     pending_item);
-
-		if (time_before(lwork->queued_stamp, stamp))
-			stamp = lwork->queued_stamp;
-	}
-	age = jiffies - stamp;
-	dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
-	     lreq, lreq->linger_id, age, lreq->last_error);
-	/* we are truncating to msecs, so return a safe upper bound */
-	ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
-
-	mutex_unlock(&lreq->lock);
-	up_read(&osdc->lock);
-	return ret;
-}
-
 static int decode_watcher(void **p, void *end, struct ceph_watch_item *item)
 {
 	u8 struct_v;
-- 
cgit v1.2.3


From 979c6342f9c0a48696a6420f14f9dd409591657f Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 15 Nov 2024 13:41:21 -0800
Subject: nvme-pci: add support for sgl metadata

Supporting this mode allows creating and merging multi-segment metadata
requests that wouldn't be possible otherwise. It also allows directly
using user space requests that straddle physically discontiguous pages.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/nvme.h |   7 +++
 drivers/nvme/host/pci.c  | 144 ++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/nvme.h     |   1 +
 3 files changed, 137 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 900719c4c70c..5ef284a376cc 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -1126,6 +1126,13 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
 	return ctrl->sgls & ((1 << 0) | (1 << 1));
 }
 
+static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->ops->flags & NVME_F_FABRICS)
+		return true;
+	return ctrl->sgls & NVME_CTRL_SGLS_MSDS;
+}
+
 #ifdef CONFIG_NVME_HOST_AUTH
 int __init nvme_init_auth(void);
 void __exit nvme_exit_auth(void);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5f2e3ad2cc52..c6c3ae3a7c43 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -43,6 +43,7 @@
  */
 #define NVME_MAX_KB_SZ	8192
 #define NVME_MAX_SEGS	128
+#define NVME_MAX_META_SEGS 15
 #define NVME_MAX_NR_ALLOCATIONS	5
 
 static int use_threaded_interrupts;
@@ -144,6 +145,7 @@ struct nvme_dev {
 	struct sg_table *hmb_sgt;
 
 	mempool_t *iod_mempool;
+	mempool_t *iod_meta_mempool;
 
 	/* shadow doorbell buffer support: */
 	__le32 *dbbuf_dbs;
@@ -239,6 +241,8 @@ struct nvme_iod {
 	dma_addr_t first_dma;
 	dma_addr_t meta_dma;
 	struct sg_table sgt;
+	struct sg_table meta_sgt;
+	union nvme_descriptor meta_list;
 	union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
 };
 
@@ -506,6 +510,14 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
 	spin_unlock(&nvmeq->sq_lock);
 }
 
+static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev,
+					      struct request *req)
+{
+	if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+		return false;
+	return req->nr_integrity_segments > 1;
+}
+
 static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
 				     int nseg)
 {
@@ -518,6 +530,8 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
 		return false;
 	if (!nvmeq->qid)
 		return false;
+	if (nvme_pci_metadata_use_sgls(dev, req))
+		return true;
 	if (!sgl_threshold || avg_seg_size < sgl_threshold)
 		return false;
 	return true;
@@ -780,7 +794,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 		struct bio_vec bv = req_bvec(req);
 
 		if (!is_pci_p2pdma_page(bv.bv_page)) {
-			if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
+			if (!nvme_pci_metadata_use_sgls(dev, req) &&
+			    (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
 			     bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
 				return nvme_setup_prp_simple(dev, req,
 							     &cmnd->rw, &bv);
@@ -824,11 +839,69 @@ out_free_sg:
 	return ret;
 }
 
-static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
-		struct nvme_command *cmnd)
+static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
+					     struct request *req)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct nvme_rw_command *cmnd = &iod->cmd.rw;
+	struct nvme_sgl_desc *sg_list;
+	struct scatterlist *sgl, *sg;
+	unsigned int entries;
+	dma_addr_t sgl_dma;
+	int rc, i;
+
+	iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
+	if (!iod->meta_sgt.sgl)
+		return BLK_STS_RESOURCE;
+
+	sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
+	iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
+							   iod->meta_sgt.sgl);
+	if (!iod->meta_sgt.orig_nents)
+		goto out_free_sg;
+
+	rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
+			     DMA_ATTR_NO_WARN);
+	if (rc)
+		goto out_free_sg;
+
+	sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
+	if (!sg_list)
+		goto out_unmap_sg;
+
+	entries = iod->meta_sgt.nents;
+	iod->meta_list.sg_list = sg_list;
+	iod->meta_dma = sgl_dma;
+
+	cmnd->flags = NVME_CMD_SGL_METASEG;
+	cmnd->metadata = cpu_to_le64(sgl_dma);
+
+	sgl = iod->meta_sgt.sgl;
+	if (entries == 1) {
+		nvme_pci_sgl_set_data(sg_list, sgl);
+		return BLK_STS_OK;
+	}
+
+	sgl_dma += sizeof(*sg_list);
+	nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
+	for_each_sg(sgl, sg, entries, i)
+		nvme_pci_sgl_set_data(&sg_list[i + 1], sg);
+
+	return BLK_STS_OK;
+
+out_unmap_sg:
+	dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
+out_free_sg:
+	mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
+	return BLK_STS_RESOURCE;
+}
+
+static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
+					     struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct bio_vec bv = rq_integrity_vec(req);
+	struct nvme_command *cmnd = &iod->cmd;
 
 	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
 	if (dma_mapping_error(dev->dev, iod->meta_dma))
@@ -837,6 +910,13 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
 	return BLK_STS_OK;
 }
 
+static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
+{
+	if (nvme_pci_metadata_use_sgls(dev, req))
+		return nvme_pci_setup_meta_sgls(dev, req);
+	return nvme_pci_setup_meta_mptr(dev, req);
+}
+
 static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -845,6 +925,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 	iod->aborted = false;
 	iod->nr_allocations = -1;
 	iod->sgt.nents = 0;
+	iod->meta_sgt.nents = 0;
 
 	ret = nvme_setup_cmd(req->q->queuedata, req);
 	if (ret)
@@ -857,7 +938,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 	}
 
 	if (blk_integrity_rq(req)) {
-		ret = nvme_map_metadata(dev, req, &iod->cmd);
+		ret = nvme_map_metadata(dev, req);
 		if (ret)
 			goto out_unmap_data;
 	}
@@ -955,17 +1036,31 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
 	*rqlist = requeue_list;
 }
 
+static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
+						struct request *req)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+	if (!iod->meta_sgt.nents) {
+		dma_unmap_page(dev->dev, iod->meta_dma,
+			       rq_integrity_vec(req).bv_len,
+			       rq_dma_dir(req));
+		return;
+	}
+
+	dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
+		      iod->meta_dma);
+	dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
+	mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
+}
+
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 	struct nvme_dev *dev = nvmeq->dev;
 
-	if (blk_integrity_rq(req)) {
-	        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
-		dma_unmap_page(dev->dev, iod->meta_dma,
-			       rq_integrity_vec(req).bv_len, rq_dma_dir(req));
-	}
+	if (blk_integrity_rq(req))
+		nvme_unmap_metadata(dev, req);
 
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
@@ -2761,6 +2856,7 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
 
 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 {
+	size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
 	size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
 
 	dev->iod_mempool = mempool_create_node(1,
@@ -2769,7 +2865,18 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 			dev_to_node(dev->dev));
 	if (!dev->iod_mempool)
 		return -ENOMEM;
+
+	dev->iod_meta_mempool = mempool_create_node(1,
+			mempool_kmalloc, mempool_kfree,
+			(void *)meta_size, GFP_KERNEL,
+			dev_to_node(dev->dev));
+	if (!dev->iod_meta_mempool)
+		goto free;
+
 	return 0;
+free:
+	mempool_destroy(dev->iod_mempool);
+	return -ENOMEM;
 }
 
 static void nvme_free_tagset(struct nvme_dev *dev)
@@ -2834,6 +2941,11 @@ static void nvme_reset_work(struct work_struct *work)
 	if (result)
 		goto out;
 
+	if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+		dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
+	else
+		dev->ctrl.max_integrity_segments = 1;
+
 	nvme_dbbuf_dma_alloc(dev);
 
 	result = nvme_setup_host_mem(dev);
@@ -3101,11 +3213,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
 	dev->ctrl.max_hw_sectors = min_t(u32,
 		NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9);
 	dev->ctrl.max_segments = NVME_MAX_SEGS;
-
-	/*
-	 * There is no support for SGLs for metadata (yet), so we are limited to
-	 * a single integrity segment for the separate metadata pointer.
-	 */
 	dev->ctrl.max_integrity_segments = 1;
 	return dev;
 
@@ -3168,6 +3275,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result)
 		goto out_disable;
 
+	if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+		dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
+	else
+		dev->ctrl.max_integrity_segments = 1;
+
 	nvme_dbbuf_dma_alloc(dev);
 
 	result = nvme_setup_host_mem(dev);
@@ -3210,6 +3322,7 @@ out_disable:
 	nvme_free_queues(dev, 0);
 out_release_iod_mempool:
 	mempool_destroy(dev->iod_mempool);
+	mempool_destroy(dev->iod_meta_mempool);
 out_release_prp_pools:
 	nvme_release_prp_pools(dev);
 out_dev_unmap:
@@ -3275,6 +3388,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dbbuf_dma_free(dev);
 	nvme_free_queues(dev, 0);
 	mempool_destroy(dev->iod_mempool);
+	mempool_destroy(dev->iod_meta_mempool);
 	nvme_release_prp_pools(dev);
 	nvme_dev_unmap(dev);
 	nvme_uninit_ctrl(&dev->ctrl);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 0a6e22038ce3..5873ce859cc8 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -389,6 +389,7 @@ enum {
 	NVME_CTRL_CTRATT_PREDICTABLE_LAT	= 1 << 5,
 	NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY	= 1 << 7,
 	NVME_CTRL_CTRATT_UUID_LIST		= 1 << 9,
+	NVME_CTRL_SGLS_MSDS                     = 1 << 19,
 };
 
 struct nvme_lbaf {
-- 
cgit v1.2.3


From 6399a0db8cd61eedbfb4b7809a4f4699157a9bf8 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Wed, 13 Nov 2024 13:57:04 -0800
Subject: nvme: define the remaining used sgls constants

This provides a little more context when reading the code than hardcoded
magic numbers.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/nvme.h        | 3 ++-
 drivers/nvme/host/rdma.c        | 4 ++--
 drivers/nvme/target/admin-cmd.c | 7 ++++---
 include/linux/nvme.h            | 4 ++++
 4 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 5ef284a376cc..611b02c8a8b3 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -1123,7 +1123,8 @@ static inline void nvme_start_request(struct request *rq)
 
 static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
 {
-	return ctrl->sgls & ((1 << 0) | (1 << 1));
+	return ctrl->sgls & (NVME_CTRL_SGLS_BYTE_ALIGNED |
+			     NVME_CTRL_SGLS_DWORD_ALIGNED);
 }
 
 static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 24a2759798d0..baf7d2490152 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1019,7 +1019,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
 		goto destroy_admin;
 	}
 
-	if (!(ctrl->ctrl.sgls & (1 << 2))) {
+	if (!(ctrl->ctrl.sgls & NVME_CTRL_SGLS_KSDBDS)) {
 		ret = -EOPNOTSUPP;
 		dev_err(ctrl->ctrl.device,
 			"Mandatory keyed sgls are not supported!\n");
@@ -1051,7 +1051,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
 		ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
 	}
 
-	if (ctrl->ctrl.sgls & (1 << 20))
+	if (ctrl->ctrl.sgls & NVME_CTRL_SGLS_SAOS)
 		ctrl->use_inline_data = true;
 
 	if (ctrl->ctrl.queue_count > 1) {
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 934b401fbc2f..4fa8496a4d96 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -601,11 +601,12 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 	id->awun = 0;
 	id->awupf = 0;
 
-	id->sgls = cpu_to_le32(1 << 0);	/* we always support SGLs */
+	/* we always support SGLs */
+	id->sgls = cpu_to_le32(NVME_CTRL_SGLS_BYTE_ALIGNED);
 	if (ctrl->ops->flags & NVMF_KEYED_SGLS)
-		id->sgls |= cpu_to_le32(1 << 2);
+		id->sgls |= cpu_to_le32(NVME_CTRL_SGLS_KSDBDS);
 	if (req->port->inline_data_size)
-		id->sgls |= cpu_to_le32(1 << 20);
+		id->sgls |= cpu_to_le32(NVME_CTRL_SGLS_SAOS);
 
 	strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn));
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 5873ce859cc8..2baf9a80b470 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -389,7 +389,11 @@ enum {
 	NVME_CTRL_CTRATT_PREDICTABLE_LAT	= 1 << 5,
 	NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY	= 1 << 7,
 	NVME_CTRL_CTRATT_UUID_LIST		= 1 << 9,
+	NVME_CTRL_SGLS_BYTE_ALIGNED             = 1,
+	NVME_CTRL_SGLS_DWORD_ALIGNED            = 2,
+	NVME_CTRL_SGLS_KSDBDS			= 1 << 2,
 	NVME_CTRL_SGLS_MSDS                     = 1 << 19,
+	NVME_CTRL_SGLS_SAOS                     = 1 << 20,
 };
 
 struct nvme_lbaf {
-- 
cgit v1.2.3


From 28b6acd75e3cefbe746ec7402c7ff4fdb114f327 Mon Sep 17 00:00:00 2001
From: Rick Wertenbroek <rick.wertenbroek@gmail.com>
Date: Thu, 14 Nov 2024 17:10:32 +0100
Subject: PCI: endpoint: Fix pci_epc_map map_size kerneldoc string
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Because some endpoint controllers have requirements on the alignment of
the controller physical memory address that must be used to map a RC PCI
address region, the map PCI start address is not necessarily the desired
PCI base address to be mapped. This can result in map_pci_addr being
lower than pci_addr as documented. This results in map_size covering the
range map_pci_addr..pci_addr+pci_size.

The old text had the pci_addr twice instead of map_pci_addr..pci_addr,
so replace the erroneous kerneldoc string to reflect the actual range.

Link: https://lore.kernel.org/r/20241114161032.3046202-1-rick.wertenbroek@gmail.com
Signed-off-by: Rick Wertenbroek <rick.wertenbroek@gmail.com>
Signed-off-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
---
 include/linux/pci-epc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index de8cc3658220..e818e3fdcded 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -40,7 +40,7 @@ pci_epc_interface_string(enum pci_epc_interface_type type)
  * @map_pci_addr: RC PCI address used as the first address mapped (may be lower
  *                than @pci_addr)
  * @map_size: size of the controller memory needed for mapping the RC PCI address
- *            range @pci_addr..@pci_addr+@pci_size
+ *            range @map_pci_addr..@pci_addr+@pci_size
  * @phys_base: base physical address of the allocated EPC memory for mapping the
  *             RC PCI address range
  * @phys_addr: physical address at which @pci_addr is mapped
-- 
cgit v1.2.3


From 2f746e40e9baae878f8193e09d8f6d601dce42bb Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2024 09:36:27 -0400
Subject: lockd: Remove unused typedef

Clean up: Looks like the last usage of this typedef was removed by
commit 026fec7e7c47 ("sunrpc: properly type pc_decode callbacks") in
2017.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/lockd/xdr.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index 80cca9426761..17d53165d9f2 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -73,8 +73,6 @@ struct nlm_args {
 	u32			fsm_mode;
 };
 
-typedef struct nlm_args nlm_args;
-
 /*
  * Generic lockd result
  */
-- 
cgit v1.2.3


From 8994a512e2598c0bf1b6e9ece1e8292976b0dd65 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2024 09:36:30 -0400
Subject: lockd: Remove unused parameter to nlmsvc_testlock()

The nlm_cookie parameter has been unused since commit 09802fd2a8ca
("lockd: rip out deferred lock handling from testlock codepath").

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/svc4proc.c         | 3 ++-
 fs/lockd/svclock.c          | 2 +-
 fs/lockd/svcproc.c          | 3 ++-
 include/linux/lockd/lockd.h | 6 +++---
 4 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 1d0400d94b3d..2cb603013111 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -108,7 +108,8 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 
 	test_owner = argp->lock.fl.c.flc_owner;
 	/* Now check for conflicting locks */
-	resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
+	resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock,
+				       &resp->lock);
 	if (resp->status == nlm_drop_reply)
 		rc = rpc_drop_reply;
 	else
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 1f2149db10f2..33e1fd159934 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -609,7 +609,7 @@ out:
 __be32
 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 		struct nlm_host *host, struct nlm_lock *lock,
-		struct nlm_lock *conflock, struct nlm_cookie *cookie)
+		struct nlm_lock *conflock)
 {
 	int			error;
 	int			mode;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index d959a5dbe0ae..f53d5177f267 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -130,7 +130,8 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	test_owner = argp->lock.fl.c.flc_owner;
 
 	/* Now check for conflicting locks */
-	resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
+	resp->status = cast_status(nlmsvc_testlock(rqstp, file, host,
+						   &argp->lock, &resp->lock));
 	if (resp->status == nlm_drop_reply)
 		rc = rpc_drop_reply;
 	else
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 61c4b9c41904..c8f0f9458f2c 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -278,9 +278,9 @@ __be32		  nlmsvc_lock(struct svc_rqst *, struct nlm_file *,
 			      struct nlm_host *, struct nlm_lock *, int,
 			      struct nlm_cookie *, int);
 __be32		  nlmsvc_unlock(struct net *net, struct nlm_file *, struct nlm_lock *);
-__be32		  nlmsvc_testlock(struct svc_rqst *, struct nlm_file *,
-			struct nlm_host *, struct nlm_lock *,
-			struct nlm_lock *, struct nlm_cookie *);
+__be32		  nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
+			struct nlm_host *host, struct nlm_lock *lock,
+			struct nlm_lock *conflock);
 __be32		  nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *);
 void		  nlmsvc_retry_blocked(struct svc_rqst *rqstp);
 void		  nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *,
-- 
cgit v1.2.3


From c840b8e1f039e90f97ca55525667eb961422f86c Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Wed, 13 Nov 2024 22:59:38 -0500
Subject: nfs_common: must not hold RCU while calling nfsd_file_put_local

Move holding the RCU from nfs_to_nfsd_file_put_local to
nfs_to_nfsd_net_put.  It is the call to nfs_to->nfsd_serv_put that
requires the RCU anyway (the puts for nfsd_file and netns were
combined to avoid an extra indirect reference but that
micro-optimization isn't possible now).

This fixes xfstests generic/013 and it triggering:

"Voluntary context switch within RCU read-side critical section!"

[  143.545738] Call Trace:
[  143.546206]  <TASK>
[  143.546625]  ? show_regs+0x6d/0x80
[  143.547267]  ? __warn+0x91/0x140
[  143.547951]  ? rcu_note_context_switch+0x496/0x5d0
[  143.548856]  ? report_bug+0x193/0x1a0
[  143.549557]  ? handle_bug+0x63/0xa0
[  143.550214]  ? exc_invalid_op+0x1d/0x80
[  143.550938]  ? asm_exc_invalid_op+0x1f/0x30
[  143.551736]  ? rcu_note_context_switch+0x496/0x5d0
[  143.552634]  ? wakeup_preempt+0x62/0x70
[  143.553358]  __schedule+0xaa/0x1380
[  143.554025]  ? _raw_spin_unlock_irqrestore+0x12/0x40
[  143.554958]  ? try_to_wake_up+0x1fe/0x6b0
[  143.555715]  ? wake_up_process+0x19/0x20
[  143.556452]  schedule+0x2e/0x120
[  143.557066]  schedule_preempt_disabled+0x19/0x30
[  143.557933]  rwsem_down_read_slowpath+0x24d/0x4a0
[  143.558818]  ? xfs_efi_item_format+0x50/0xc0 [xfs]
[  143.559894]  down_read+0x4e/0xb0
[  143.560519]  xlog_cil_commit+0x1b2/0xbc0 [xfs]
[  143.561460]  ? _raw_spin_unlock+0x12/0x30
[  143.562212]  ? xfs_inode_item_precommit+0xc7/0x220 [xfs]
[  143.563309]  ? xfs_trans_run_precommits+0x69/0xd0 [xfs]
[  143.564394]  __xfs_trans_commit+0xb5/0x330 [xfs]
[  143.565367]  xfs_trans_roll+0x48/0xc0 [xfs]
[  143.566262]  xfs_defer_trans_roll+0x57/0x100 [xfs]
[  143.567278]  xfs_defer_finish_noroll+0x27a/0x490 [xfs]
[  143.568342]  xfs_defer_finish+0x1a/0x80 [xfs]
[  143.569267]  xfs_bunmapi_range+0x4d/0xb0 [xfs]
[  143.570208]  xfs_itruncate_extents_flags+0x13d/0x230 [xfs]
[  143.571353]  xfs_free_eofblocks+0x12e/0x190 [xfs]
[  143.572359]  xfs_file_release+0x12d/0x140 [xfs]
[  143.573324]  __fput+0xe8/0x2d0
[  143.573922]  __fput_sync+0x1d/0x30
[  143.574574]  nfsd_filp_close+0x33/0x60 [nfsd]
[  143.575430]  nfsd_file_free+0x96/0x150 [nfsd]
[  143.576274]  nfsd_file_put+0xf7/0x1a0 [nfsd]
[  143.577104]  nfsd_file_put_local+0x18/0x30 [nfsd]
[  143.578070]  nfs_close_local_fh+0x101/0x110 [nfs_localio]
[  143.579079]  __put_nfs_open_context+0xc9/0x180 [nfs]
[  143.580031]  nfs_file_clear_open_context+0x4a/0x60 [nfs]
[  143.581038]  nfs_file_release+0x3e/0x60 [nfs]
[  143.581879]  __fput+0xe8/0x2d0
[  143.582464]  __fput_sync+0x1d/0x30
[  143.583108]  __x64_sys_close+0x41/0x80
[  143.583823]  x64_sys_call+0x189a/0x20d0
[  143.584552]  do_syscall_64+0x64/0x170
[  143.585240]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  143.586185] RIP: 0033:0x7f3c5153efd7

Fixes: 65f2a5c36635 ("nfs_common: fix race in NFS calls to nfsd_file_put_local() and nfsd_serv_put()")
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs_common/nfslocalio.c |  8 +++-----
 fs/nfsd/filecache.c        | 14 +++++++-------
 fs/nfsd/filecache.h        |  2 +-
 include/linux/nfslocalio.h | 18 +++++++++++++++---
 4 files changed, 26 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c
index 09404d142d1a..a74ec08f6c96 100644
--- a/fs/nfs_common/nfslocalio.c
+++ b/fs/nfs_common/nfslocalio.c
@@ -155,11 +155,9 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid,
 	/* We have an implied reference to net thanks to nfsd_serv_try_get */
 	localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt,
 					     cred, nfs_fh, fmode);
-	if (IS_ERR(localio)) {
-		rcu_read_lock();
-		nfs_to->nfsd_serv_put(net);
-		rcu_read_unlock();
-	}
+	if (IS_ERR(localio))
+		nfs_to_nfsd_net_put(net);
+
 	return localio;
 }
 EXPORT_SYMBOL_GPL(nfs_open_local_fh);
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index c16671135d17..9a62b4da89bb 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -391,19 +391,19 @@ nfsd_file_put(struct nfsd_file *nf)
 }
 
 /**
- * nfsd_file_put_local - put the reference to nfsd_file and local nfsd_serv
- * @nf: nfsd_file of which to put the references
+ * nfsd_file_put_local - put nfsd_file reference and arm nfsd_serv_put in caller
+ * @nf: nfsd_file of which to put the reference
  *
- * First put the reference of the nfsd_file and then put the
- * reference to the associated nn->nfsd_serv.
+ * First save the associated net to return to caller, then put
+ * the reference of the nfsd_file.
  */
-void
-nfsd_file_put_local(struct nfsd_file *nf) __must_hold(rcu)
+struct net *
+nfsd_file_put_local(struct nfsd_file *nf)
 {
 	struct net *net = nf->nf_net;
 
 	nfsd_file_put(nf);
-	nfsd_serv_put(net);
+	return net;
 }
 
 /**
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index cadf3c2689c4..d5db6b34ba30 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -55,7 +55,7 @@ void nfsd_file_cache_shutdown(void);
 int nfsd_file_cache_start_net(struct net *net);
 void nfsd_file_cache_shutdown_net(struct net *net);
 void nfsd_file_put(struct nfsd_file *nf);
-void nfsd_file_put_local(struct nfsd_file *nf);
+struct net *nfsd_file_put_local(struct nfsd_file *nf);
 struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
 struct file *nfsd_file_file(struct nfsd_file *nf);
 void nfsd_file_close_inode_sync(struct inode *inode);
diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h
index 3982fea79919..9202f4b24343 100644
--- a/include/linux/nfslocalio.h
+++ b/include/linux/nfslocalio.h
@@ -55,7 +55,7 @@ struct nfsd_localio_operations {
 						const struct cred *,
 						const struct nfs_fh *,
 						const fmode_t);
-	void (*nfsd_file_put_local)(struct nfsd_file *);
+	struct net *(*nfsd_file_put_local)(struct nfsd_file *);
 	struct file *(*nfsd_file_file)(struct nfsd_file *);
 } ____cacheline_aligned;
 
@@ -66,7 +66,7 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *,
 		   struct rpc_clnt *, const struct cred *,
 		   const struct nfs_fh *, const fmode_t);
 
-static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio)
+static inline void nfs_to_nfsd_net_put(struct net *net)
 {
 	/*
 	 * Once reference to nfsd_serv is dropped, NFSD could be
@@ -74,10 +74,22 @@ static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio)
 	 * by always taking RCU.
 	 */
 	rcu_read_lock();
-	nfs_to->nfsd_file_put_local(localio);
+	nfs_to->nfsd_serv_put(net);
 	rcu_read_unlock();
 }
 
+static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio)
+{
+	/*
+	 * Must not hold RCU otherwise nfsd_file_put() can easily trigger:
+	 * "Voluntary context switch within RCU read-side critical section!"
+	 * by scheduling deep in underlying filesystem (e.g. XFS).
+	 */
+	struct net *net = nfs_to->nfsd_file_put_local(localio);
+
+	nfs_to_nfsd_net_put(net);
+}
+
 #else   /* CONFIG_NFS_LOCALIO */
 static inline void nfsd_localio_ops_init(void)
 {
-- 
cgit v1.2.3


From 1cfb5e57886aa69a992ff0ebd32e4651eb0fc995 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Fri, 15 Nov 2024 12:43:03 -0800
Subject: Revert "net: ethtool: Avoid thousands of
 -Wflex-array-member-not-at-end warnings"

This reverts commit 3bd9b9abdf1563a22041b7255baea6d449902f1a. We cannot
use the new tagged struct group because it throws C++ errors even under
"extern C".

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20241115204308.3821419-1-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c   |  6 +++---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c  |  4 ++--
 drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c |  2 +-
 drivers/net/ethernet/cisco/enic/enic_ethtool.c      |  2 +-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c     |  4 ++--
 include/linux/ethtool.h                             |  2 +-
 net/ethtool/ioctl.c                                 |  2 +-
 net/ethtool/linkinfo.c                              |  8 ++++----
 net/ethtool/linkmodes.c                             | 18 +++++++-----------
 9 files changed, 22 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index cfd2c65b1c90..061a40b1974b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -2780,7 +2780,7 @@ u32 bnxt_fw_to_ethtool_speed(u16 fw_link_speed)
 static void bnxt_get_default_speeds(struct ethtool_link_ksettings *lk_ksettings,
 				    struct bnxt_link_info *link_info)
 {
-	struct ethtool_link_settings_hdr *base = &lk_ksettings->base;
+	struct ethtool_link_settings *base = &lk_ksettings->base;
 
 	if (link_info->link_state == BNXT_LINK_STATE_UP) {
 		base->speed = bnxt_fw_to_ethtool_speed(link_info->link_speed);
@@ -2799,7 +2799,7 @@ static void bnxt_get_default_speeds(struct ethtool_link_ksettings *lk_ksettings,
 static int bnxt_get_link_ksettings(struct net_device *dev,
 				   struct ethtool_link_ksettings *lk_ksettings)
 {
-	struct ethtool_link_settings_hdr *base = &lk_ksettings->base;
+	struct ethtool_link_settings *base = &lk_ksettings->base;
 	enum ethtool_link_mode_bit_indices link_mode;
 	struct bnxt *bp = netdev_priv(dev);
 	struct bnxt_link_info *link_info;
@@ -3022,9 +3022,9 @@ u16 bnxt_get_fw_auto_link_speeds(const unsigned long *mode)
 static int bnxt_set_link_ksettings(struct net_device *dev,
 			   const struct ethtool_link_ksettings *lk_ksettings)
 {
-	const struct ethtool_link_settings_hdr *base = &lk_ksettings->base;
 	struct bnxt *bp = netdev_priv(dev);
 	struct bnxt_link_info *link_info = &bp->link_info;
+	const struct ethtool_link_settings *base = &lk_ksettings->base;
 	bool set_pause = false;
 	u32 speed, lanes = 0;
 	int rc = 0;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
index 45d28a65347e..7f3f5afa864f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
@@ -662,8 +662,8 @@ static unsigned int lmm_to_fw_caps(const unsigned long *link_mode_mask)
 static int get_link_ksettings(struct net_device *dev,
 			      struct ethtool_link_ksettings *link_ksettings)
 {
-	struct ethtool_link_settings_hdr *base = &link_ksettings->base;
 	struct port_info *pi = netdev_priv(dev);
+	struct ethtool_link_settings *base = &link_ksettings->base;
 
 	/* For the nonce, the Firmware doesn't send up Port State changes
 	 * when the Virtual Interface attached to the Port is down.  So
@@ -717,9 +717,9 @@ static int get_link_ksettings(struct net_device *dev,
 static int set_link_ksettings(struct net_device *dev,
 			    const struct ethtool_link_ksettings *link_ksettings)
 {
-	const struct ethtool_link_settings_hdr *base = &link_ksettings->base;
 	struct port_info *pi = netdev_priv(dev);
 	struct link_config *lc = &pi->link_cfg;
+	const struct ethtool_link_settings *base = &link_ksettings->base;
 	struct link_config old_lc;
 	unsigned int fw_caps;
 	int ret = 0;
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index 61d08547e3f9..2fbe0f059a0b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -1436,8 +1436,8 @@ static void fw_caps_to_lmm(enum fw_port_type port_type,
 static int cxgb4vf_get_link_ksettings(struct net_device *dev,
 				  struct ethtool_link_ksettings *link_ksettings)
 {
-	struct ethtool_link_settings_hdr *base = &link_ksettings->base;
 	struct port_info *pi = netdev_priv(dev);
+	struct ethtool_link_settings *base = &link_ksettings->base;
 
 	/* For the nonce, the Firmware doesn't send up Port State changes
 	 * when the Virtual Interface attached to the Port is down.  So
diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
index 95b071153fed..d607b4f0542c 100644
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -129,8 +129,8 @@ static void enic_intr_coal_set_rx(struct enic *enic, u32 timer)
 static int enic_get_ksettings(struct net_device *netdev,
 			      struct ethtool_link_ksettings *ecmd)
 {
-	struct ethtool_link_settings_hdr *base = &ecmd->base;
 	struct enic *enic = netdev_priv(netdev);
+	struct ethtool_link_settings *base = &ecmd->base;
 
 	ethtool_link_ksettings_add_link_mode(ecmd, supported,
 					     10000baseT_Full);
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index c553da16d4b1..e50e1df0a433 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -504,7 +504,7 @@ static int qede_get_link_ksettings(struct net_device *dev,
 				   struct ethtool_link_ksettings *cmd)
 {
 	typeof(cmd->link_modes) *link_modes = &cmd->link_modes;
-	struct ethtool_link_settings_hdr *base = &cmd->base;
+	struct ethtool_link_settings *base = &cmd->base;
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qed_link_output current_link;
 
@@ -537,7 +537,7 @@ static int qede_get_link_ksettings(struct net_device *dev,
 static int qede_set_link_ksettings(struct net_device *dev,
 				   const struct ethtool_link_ksettings *cmd)
 {
-	const struct ethtool_link_settings_hdr *base = &cmd->base;
+	const struct ethtool_link_settings *base = &cmd->base;
 	const struct ethtool_forced_speed_map *map;
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qed_link_output current_link;
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 299280c94d07..b8b935b52603 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -211,7 +211,7 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id);
  * fields, but they are allowed to overwrite them (will be ignored).
  */
 struct ethtool_link_ksettings {
-	struct ethtool_link_settings_hdr base;
+	struct ethtool_link_settings base;
 	struct {
 		__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
 		__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index d86399bcf223..61df8ce44379 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -425,7 +425,7 @@ convert_link_ksettings_to_legacy_settings(
 
 /* layout of the struct passed from/to userland */
 struct ethtool_link_usettings {
-	struct ethtool_link_settings_hdr base;
+	struct ethtool_link_settings base;
 	struct {
 		__u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32];
 		__u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c
index 2d5bc57160be..30b8ce275159 100644
--- a/net/ethtool/linkinfo.c
+++ b/net/ethtool/linkinfo.c
@@ -8,9 +8,9 @@ struct linkinfo_req_info {
 };
 
 struct linkinfo_reply_data {
-	struct ethnl_reply_data			base;
-	struct ethtool_link_ksettings		ksettings;
-	struct ethtool_link_settings_hdr	*lsettings;
+	struct ethnl_reply_data		base;
+	struct ethtool_link_ksettings	ksettings;
+	struct ethtool_link_settings	*lsettings;
 };
 
 #define LINKINFO_REPDATA(__reply_base) \
@@ -98,7 +98,7 @@ static int
 ethnl_set_linkinfo(struct ethnl_req_info *req_info, struct genl_info *info)
 {
 	struct ethtool_link_ksettings ksettings = {};
-	struct ethtool_link_settings_hdr *lsettings;
+	struct ethtool_link_settings *lsettings;
 	struct net_device *dev = req_info->dev;
 	struct nlattr **tb = info->attrs;
 	bool mod = false;
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
index 17e49cf89f03..259cd9ef1f2a 100644
--- a/net/ethtool/linkmodes.c
+++ b/net/ethtool/linkmodes.c
@@ -11,10 +11,10 @@ struct linkmodes_req_info {
 };
 
 struct linkmodes_reply_data {
-	struct ethnl_reply_data			base;
-	struct ethtool_link_ksettings		ksettings;
-	struct ethtool_link_settings_hdr	*lsettings;
-	bool					peer_empty;
+	struct ethnl_reply_data		base;
+	struct ethtool_link_ksettings	ksettings;
+	struct ethtool_link_settings	*lsettings;
+	bool				peer_empty;
 };
 
 #define LINKMODES_REPDATA(__reply_base) \
@@ -62,12 +62,10 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base,
 {
 	const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base);
 	const struct ethtool_link_ksettings *ksettings = &data->ksettings;
+	const struct ethtool_link_settings *lsettings = &ksettings->base;
 	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
-	const struct ethtool_link_settings_hdr *lsettings;
 	int len, ret;
 
-	lsettings = &ksettings->base;
-
 	len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */
 		+ nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */
 		+ nla_total_size(sizeof(u32)) /* LINKMODES_LANES */
@@ -105,12 +103,10 @@ static int linkmodes_fill_reply(struct sk_buff *skb,
 {
 	const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base);
 	const struct ethtool_link_ksettings *ksettings = &data->ksettings;
+	const struct ethtool_link_settings *lsettings = &ksettings->base;
 	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
-	const struct ethtool_link_settings_hdr *lsettings;
 	int ret;
 
-	lsettings = &ksettings->base;
-
 	if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, lsettings->autoneg))
 		return -EMSGSIZE;
 
@@ -241,7 +237,7 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb,
 				  struct ethtool_link_ksettings *ksettings,
 				  bool *mod, const struct net_device *dev)
 {
-	struct ethtool_link_settings_hdr *lsettings = &ksettings->base;
+	struct ethtool_link_settings *lsettings = &ksettings->base;
 	bool req_speed, req_lanes, req_duplex;
 	const struct nlattr *master_slave_cfg, *lanes_cfg;
 	int ret;
-- 
cgit v1.2.3


From a57d5a72f8dec7db8a79d0016fb0a3bdecc82b56 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 18 Nov 2024 03:15:18 -0800
Subject: netpoll: Use rcu_access_pointer() in netpoll_poll_lock

The ndev->npinfo pointer in netpoll_poll_lock() is RCU-protected but is
being accessed directly for a NULL check. While no RCU read lock is held
in this context, we should still use proper RCU primitives for
consistency and correctness.

Replace the direct NULL check with rcu_access_pointer(), which is the
appropriate primitive when only checking for NULL without dereferencing
the pointer. This function provides the necessary ordering guarantees
without requiring RCU read-side protection.

Fixes: bea3348eef27 ("[NET]: Make NAPI polling independent of struct net_device objects.")
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/20241118-netpoll_rcu-v1-2-a1888dcb4a02@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netpoll.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index cd4e28db0cbd..959a4daacea1 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -72,7 +72,7 @@ static inline void *netpoll_poll_lock(struct napi_struct *napi)
 {
 	struct net_device *dev = napi->dev;
 
-	if (dev && dev->npinfo) {
+	if (dev && rcu_access_pointer(dev->npinfo)) {
 		int owner = smp_processor_id();
 
 		while (cmpxchg(&napi->poll_owner, -1, owner) != -1)
-- 
cgit v1.2.3


From 50f42c489528566ea2d8a9561ee54748b0441d51 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly <pdonnell@redhat.com>
Date: Wed, 26 Jun 2024 09:49:10 -0400
Subject: ceph: correct ceph_mds_cap_item field name

The issue_seq is sent with bulk cap releases, not the current sequence
number. See also ceph.git commit 655cddb7c9f3 ("include/ceph_fs: correct
ceph_mds_cap_item field name").

Link: https://tracker.ceph.com/issues/66704
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c         | 2 +-
 include/linux/ceph/ceph_fs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index aaa864c2e26d..df0587e61e79 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2354,7 +2354,7 @@ again:
 		item->ino = cpu_to_le64(cap->cap_ino);
 		item->cap_id = cpu_to_le64(cap->cap_id);
 		item->migrate_seq = cpu_to_le32(cap->mseq);
-		item->seq = cpu_to_le32(cap->issue_seq);
+		item->issue_seq = cpu_to_le32(cap->issue_seq);
 		msg->front.iov_len += sizeof(*item);
 
 		ceph_put_cap(mdsc, cap);
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index ee1d0e5f9789..4ff3ad5e9210 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -822,7 +822,7 @@ struct ceph_mds_cap_release {
 struct ceph_mds_cap_item {
 	__le64 ino;
 	__le64 cap_id;
-	__le32 migrate_seq, seq;
+	__le32 migrate_seq, issue_seq;
 } __attribute__ ((packed));
 
 #define CEPH_MDS_LEASE_REVOKE           1  /*    mds  -> client */
-- 
cgit v1.2.3


From 8b41ac43c7bb902786eae09cc23bcc9964ef2386 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly <pdonnell@redhat.com>
Date: Wed, 26 Jun 2024 12:57:27 -0400
Subject: ceph: correct ceph_mds_cap_peer field name

The peer seq is used as the issue_seq. Use that name for consistency.
See also ceph.git commit 1da6ef237fc7 ("include/ceph_fs: correct
ceph_mds_cap_peer field name").

Link: https://tracker.ceph.com/issues/66704
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c               | 18 +++++++++---------
 include/linux/ceph/ceph_fs.h |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index da3146a0a287..40fd25b97506 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4072,17 +4072,17 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 t_cap_id;
 	unsigned mseq = le32_to_cpu(ex->migrate_seq);
-	unsigned t_seq, t_mseq;
+	unsigned t_issue_seq, t_mseq;
 	int target, issued;
 	int mds = session->s_mds;
 
 	if (ph) {
 		t_cap_id = le64_to_cpu(ph->cap_id);
-		t_seq = le32_to_cpu(ph->seq);
+		t_issue_seq = le32_to_cpu(ph->issue_seq);
 		t_mseq = le32_to_cpu(ph->mseq);
 		target = le32_to_cpu(ph->mds);
 	} else {
-		t_cap_id = t_seq = t_mseq = 0;
+		t_cap_id = t_issue_seq = t_mseq = 0;
 		target = -1;
 	}
 
@@ -4120,12 +4120,12 @@ retry:
 	if (tcap) {
 		/* already have caps from the target */
 		if (tcap->cap_id == t_cap_id &&
-		    ceph_seq_cmp(tcap->seq, t_seq) < 0) {
+		    ceph_seq_cmp(tcap->seq, t_issue_seq) < 0) {
 			doutc(cl, " updating import cap %p mds%d\n", tcap,
 			      target);
 			tcap->cap_id = t_cap_id;
-			tcap->seq = t_seq - 1;
-			tcap->issue_seq = t_seq - 1;
+			tcap->seq = t_issue_seq - 1;
+			tcap->issue_seq = t_issue_seq - 1;
 			tcap->issued |= issued;
 			tcap->implemented |= issued;
 			if (cap == ci->i_auth_cap) {
@@ -4140,7 +4140,7 @@ retry:
 		int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
 		tcap = new_cap;
 		ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
-			     t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
+			     t_issue_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
 
 		if (!list_empty(&ci->i_cap_flush_list) &&
 		    ci->i_auth_cap == tcap) {
@@ -4254,14 +4254,14 @@ retry:
 		doutc(cl, " remove export cap %p mds%d flags %d\n",
 		      ocap, peer, ph->flags);
 		if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
-		    (ocap->seq != le32_to_cpu(ph->seq) ||
+		    (ocap->seq != le32_to_cpu(ph->issue_seq) ||
 		     ocap->mseq != le32_to_cpu(ph->mseq))) {
 			pr_err_ratelimited_client(cl, "mismatched seq/mseq: "
 					"%p %llx.%llx mds%d seq %d mseq %d"
 					" importer mds%d has peer seq %d mseq %d\n",
 					inode, ceph_vinop(inode), peer,
 					ocap->seq, ocap->mseq, mds,
-					le32_to_cpu(ph->seq),
+					le32_to_cpu(ph->issue_seq),
 					le32_to_cpu(ph->mseq));
 		}
 		ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 4ff3ad5e9210..2d7d86f0290d 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -808,7 +808,7 @@ struct ceph_mds_caps {
 
 struct ceph_mds_cap_peer {
 	__le64 cap_id;
-	__le32 seq;
+	__le32 issue_seq;
 	__le32 mseq;
 	__le32 mds;
 	__u8   flags;
-- 
cgit v1.2.3


From 46fd48ab3ea3eb3bb215684bd66ea3d260b091a9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Nov 2024 08:26:02 +0100
Subject: block: return unsigned int from bdev_io_min

The underlying limit is defined as an unsigned int, so return that from
bdev_io_min as well.

Fixes: ac481c20ef8f ("block: Topology ioctls")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Link: https://lore.kernel.org/r/20241119072602.1059488-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a1fd0ddce5cf..195db38fda16 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1261,7 +1261,7 @@ static inline unsigned int queue_io_min(const struct request_queue *q)
 	return q->limits.io_min;
 }
 
-static inline int bdev_io_min(struct block_device *bdev)
+static inline unsigned int bdev_io_min(struct block_device *bdev)
 {
 	return queue_io_min(bdev_get_queue(bdev));
 }
-- 
cgit v1.2.3


From f06e108a3dc53c0f5234d18de0bd224753db5019 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Farr <kernel@jfarr.cc>
Date: Tue, 29 Oct 2024 15:00:36 +0100
Subject: Compiler Attributes: disable __counted_by for clang < 19.1.3

This patch disables __counted_by for clang versions < 19.1.3 because
of the two issues listed below. It does this by introducing
CONFIG_CC_HAS_COUNTED_BY.

1. clang < 19.1.2 has a bug that can lead to __bdos returning 0:
https://github.com/llvm/llvm-project/pull/110497

2. clang < 19.1.3 has a bug that can lead to __bdos being off by 4:
https://github.com/llvm/llvm-project/pull/112636

Fixes: c8248faf3ca2 ("Compiler Attributes: counted_by: Adjust name and identifier expansion")
Cc: stable@vger.kernel.org # 6.6.x: 16c31dd7fdf6: Compiler Attributes: counted_by: bump min gcc version
Cc: stable@vger.kernel.org # 6.6.x: 2993eb7a8d34: Compiler Attributes: counted_by: fixup clang URL
Cc: stable@vger.kernel.org # 6.6.x: 231dc3f0c936: lkdtm/bugs: Improve warning message for compilers without counted_by support
Cc: stable@vger.kernel.org # 6.6.x
Reported-by: Nathan Chancellor <nathan@kernel.org>
Closes: https://lore.kernel.org/all/20240913164630.GA4091534@thelio-3990X/
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202409260949.a1254989-oliver.sang@intel.com
Link: https://lore.kernel.org/all/Zw8iawAF5W2uzGuh@archlinux/T/#m204c09f63c076586a02d194b87dffc7e81b8de7b
Suggested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Jan Hendrik Farr <kernel@jfarr.cc>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Thorsten Blum <thorsten.blum@linux.dev>
Link: https://lore.kernel.org/r/20241029140036.577804-2-kernel@jfarr.cc
Signed-off-by: Kees Cook <kees@kernel.org>
---
 drivers/misc/lkdtm/bugs.c           |  2 +-
 include/linux/compiler_attributes.h | 13 -------------
 include/linux/compiler_types.h      | 19 +++++++++++++++++++
 init/Kconfig                        |  9 +++++++++
 lib/overflow_kunit.c                |  2 +-
 5 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index 62ba01525479..376047beea3d 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -445,7 +445,7 @@ static void lkdtm_FAM_BOUNDS(void)
 
 	pr_err("FAIL: survived access of invalid flexible array member index!\n");
 
-	if (!__has_attribute(__counted_by__))
+	if (!IS_ENABLED(CONFIG_CC_HAS_COUNTED_BY))
 		pr_warn("This is expected since this %s was built with a compiler that does not support __counted_by\n",
 			lkdtm_kernel_info);
 	else if (IS_ENABLED(CONFIG_UBSAN_BOUNDS))
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 32284cd26d52..c16d4199bf92 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -94,19 +94,6 @@
 # define __copy(symbol)
 #endif
 
-/*
- * Optional: only supported since gcc >= 15
- * Optional: only supported since clang >= 18
- *
- *   gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
- * clang: https://github.com/llvm/llvm-project/pull/76348
- */
-#if __has_attribute(__counted_by__)
-# define __counted_by(member)		__attribute__((__counted_by__(member)))
-#else
-# define __counted_by(member)
-#endif
-
 /*
  * Optional: not supported by gcc
  * Optional: only supported since clang >= 14.0
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 1a957ea2f4fe..639be0f30b45 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -323,6 +323,25 @@ struct ftrace_likely_data {
 #define __no_sanitize_or_inline __always_inline
 #endif
 
+/*
+ * Optional: only supported since gcc >= 15
+ * Optional: only supported since clang >= 18
+ *
+ *   gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
+ * clang: https://github.com/llvm/llvm-project/pull/76348
+ *
+ * __bdos on clang < 19.1.2 can erroneously return 0:
+ * https://github.com/llvm/llvm-project/pull/110497
+ *
+ * __bdos on clang < 19.1.3 can be off by 4:
+ * https://github.com/llvm/llvm-project/pull/112636
+ */
+#ifdef CONFIG_CC_HAS_COUNTED_BY
+# define __counted_by(member)		__attribute__((__counted_by__(member)))
+#else
+# define __counted_by(member)
+#endif
+
 /*
  * Apply __counted_by() when the Endianness matches to increase test coverage.
  */
diff --git a/init/Kconfig b/init/Kconfig
index 530a382ee0fe..92f106cf5572 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -116,6 +116,15 @@ config CC_HAS_ASM_INLINE
 config CC_HAS_NO_PROFILE_FN_ATTR
 	def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror)
 
+config CC_HAS_COUNTED_BY
+	# TODO: when gcc 15 is released remove the build test and add
+	# a gcc version check
+	def_bool $(success,echo 'struct flex { int count; int array[] __attribute__((__counted_by__(count))); };' | $(CC) $(CLANG_FLAGS) -x c - -c -o /dev/null -Werror)
+	# clang needs to be at least 19.1.3 to avoid __bdos miscalculations
+	# https://github.com/llvm/llvm-project/pull/110497
+	# https://github.com/llvm/llvm-project/pull/112636
+	depends on !(CC_IS_CLANG && CLANG_VERSION < 190103)
+
 config PAHOLE_VERSION
 	int
 	default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE))
diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c
index 2abc78367dd1..5222c6393f11 100644
--- a/lib/overflow_kunit.c
+++ b/lib/overflow_kunit.c
@@ -1187,7 +1187,7 @@ static void DEFINE_FLEX_test(struct kunit *test)
 {
 	/* Using _RAW_ on a __counted_by struct will initialize "counter" to zero */
 	DEFINE_RAW_FLEX(struct foo, two_but_zero, array, 2);
-#if __has_attribute(__counted_by__)
+#ifdef CONFIG_CC_HAS_COUNTED_BY
 	int expected_raw_size = sizeof(struct foo);
 #else
 	int expected_raw_size = sizeof(struct foo) + 2 * sizeof(s16);
-- 
cgit v1.2.3


From d7f36dc446e894e0f57b5f05c5628f03c5f9e2d2 Mon Sep 17 00:00:00 2001
From: John Garry <john.g.garry@oracle.com>
Date: Mon, 18 Nov 2024 10:50:15 +0000
Subject: block: Support atomic writes limits for stacked devices

Allow stacked devices to support atomic writes by aggregating the minimum
capability of all bottom devices.

Flag BLK_FEAT_ATOMIC_WRITES_STACKED is set for stacked devices which
have been enabled to support atomic writes.

Some things to note on the implementation:
- For simplicity, all bottom devices must have same atomic write boundary
  value (if any)
- The atomic write boundary must be a power-of-2 already, but this
  restriction could be relaxed. Furthermore, it is now required that the
  chunk sectors for a top device must be aligned with this boundary.
- If a bottom device atomic write unit min/max are not aligned with the
  top device chunk sectors, the top device atomic write unit min/max are
  reduced to a value which works for the chunk sectors.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20241118105018.1870052-3-john.g.garry@oracle.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-settings.c   | 115 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |   4 ++
 2 files changed, 119 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 03c67620f98a..8f09e33f41f6 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -501,6 +501,119 @@ static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lb
 	return sectors;
 }
 
+/* Check if second and later bottom devices are compliant */
+static bool blk_stack_atomic_writes_tail(struct queue_limits *t,
+				struct queue_limits *b)
+{
+	/* We're not going to support different boundary sizes.. yet */
+	if (t->atomic_write_hw_boundary != b->atomic_write_hw_boundary)
+		return false;
+
+	/* Can't support this */
+	if (t->atomic_write_hw_unit_min > b->atomic_write_hw_unit_max)
+		return false;
+
+	/* Or this */
+	if (t->atomic_write_hw_unit_max < b->atomic_write_hw_unit_min)
+		return false;
+
+	t->atomic_write_hw_max = min(t->atomic_write_hw_max,
+				b->atomic_write_hw_max);
+	t->atomic_write_hw_unit_min = max(t->atomic_write_hw_unit_min,
+				b->atomic_write_hw_unit_min);
+	t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max,
+				b->atomic_write_hw_unit_max);
+	return true;
+}
+
+/* Check for valid boundary of first bottom device */
+static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t,
+				struct queue_limits *b)
+{
+	/*
+	 * Ensure atomic write boundary is aligned with chunk sectors. Stacked
+	 * devices store chunk sectors in t->io_min.
+	 */
+	if (b->atomic_write_hw_boundary > t->io_min &&
+	    b->atomic_write_hw_boundary % t->io_min)
+		return false;
+	if (t->io_min > b->atomic_write_hw_boundary &&
+	    t->io_min % b->atomic_write_hw_boundary)
+		return false;
+
+	t->atomic_write_hw_boundary = b->atomic_write_hw_boundary;
+	return true;
+}
+
+
+/* Check stacking of first bottom device */
+static bool blk_stack_atomic_writes_head(struct queue_limits *t,
+				struct queue_limits *b)
+{
+	if (b->atomic_write_hw_boundary &&
+	    !blk_stack_atomic_writes_boundary_head(t, b))
+		return false;
+
+	if (t->io_min <= SECTOR_SIZE) {
+		/* No chunk sectors, so use bottom device values directly */
+		t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
+		t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
+		t->atomic_write_hw_max = b->atomic_write_hw_max;
+		return true;
+	}
+
+	/*
+	 * Find values for limits which work for chunk size.
+	 * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk
+	 * size (t->io_min), as chunk size is not restricted to a power-of-2.
+	 * So we need to find highest power-of-2 which works for the chunk
+	 * size.
+	 * As an example scenario, we could have b->unit_max = 16K and
+	 * t->io_min = 24K. For this case, reduce t->unit_max to a value
+	 * aligned with both limits, i.e. 8K in this example.
+	 */
+	t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
+	while (t->io_min % t->atomic_write_hw_unit_max)
+		t->atomic_write_hw_unit_max /= 2;
+
+	t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min,
+					  t->atomic_write_hw_unit_max);
+	t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min);
+
+	return true;
+}
+
+static void blk_stack_atomic_writes_limits(struct queue_limits *t,
+				struct queue_limits *b)
+{
+	if (!(t->features & BLK_FEAT_ATOMIC_WRITES_STACKED))
+		goto unsupported;
+
+	if (!b->atomic_write_unit_min)
+		goto unsupported;
+
+	/*
+	 * If atomic_write_hw_max is set, we have already stacked 1x bottom
+	 * device, so check for compliance.
+	 */
+	if (t->atomic_write_hw_max) {
+		if (!blk_stack_atomic_writes_tail(t, b))
+			goto unsupported;
+		return;
+	}
+
+	if (!blk_stack_atomic_writes_head(t, b))
+		goto unsupported;
+	return;
+
+unsupported:
+	t->atomic_write_hw_max = 0;
+	t->atomic_write_hw_unit_max = 0;
+	t->atomic_write_hw_unit_min = 0;
+	t->atomic_write_hw_boundary = 0;
+	t->features &= ~BLK_FEAT_ATOMIC_WRITES_STACKED;
+}
+
 /**
  * blk_stack_limits - adjust queue_limits for stacked devices
  * @t:	the stacking driver limits (top device)
@@ -661,6 +774,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		t->zone_write_granularity = 0;
 		t->max_zone_append_sectors = 0;
 	}
+	blk_stack_atomic_writes_limits(t, b);
+
 	return ret;
 }
 EXPORT_SYMBOL(blk_stack_limits);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 195db38fda16..31867de88213 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -333,6 +333,10 @@ typedef unsigned int __bitwise blk_features_t;
 #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \
 	((__force blk_features_t)(1u << 15))
 
+/* stacked device can/does support atomic writes */
+#define BLK_FEAT_ATOMIC_WRITES_STACKED \
+	((__force blk_features_t)(1u << 16))
+
 /*
  * Flags automatically inherited when stacking limits.
  */
-- 
cgit v1.2.3


From 9cf9f2e70bea4e66a2c8b8c4743489beb21258a8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 15 Nov 2024 22:00:25 +0100
Subject: cpuidle: Change :enter_dead() driver callback return type to void

After a previous change, cpuidle_play_dead(), which is the only caller
of idle state :enter_dead() callbacks, ignores their return values, so
they may as well be void.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Link: https://patch.msgid.link/2285569.iZASKD2KPV@rjwysocki.net
---
 drivers/acpi/processor_idle.c | 7 ++-----
 include/linux/cpuidle.h       | 2 +-
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 831fa4a12159..ce728cf7e301 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -578,7 +578,7 @@ static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx)
  * @dev: the target CPU
  * @index: the index of suggested state
  */
-static int acpi_idle_play_dead(struct cpuidle_device *dev, int index)
+static void acpi_idle_play_dead(struct cpuidle_device *dev, int index)
 {
 	struct acpi_processor_cx *cx = per_cpu(acpi_cstate[index], dev->cpu);
 
@@ -591,11 +591,8 @@ static int acpi_idle_play_dead(struct cpuidle_device *dev, int index)
 		else if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) {
 			io_idle(cx->address);
 		} else
-			return -ENODEV;
+			return;
 	}
-
-	/* Never reached */
-	return 0;
 }
 
 static __always_inline bool acpi_idle_fallback_to_c1(struct acpi_processor *pr)
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 3183aeb7f5b4..a9ee4fe55dcf 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -61,7 +61,7 @@ struct cpuidle_state {
 			struct cpuidle_driver *drv,
 			int index);
 
-	int (*enter_dead) (struct cpuidle_device *dev, int index);
+	void (*enter_dead) (struct cpuidle_device *dev, int index);
 
 	/*
 	 * CPUs execute ->enter_s2idle with the local tick or entire timekeeping
-- 
cgit v1.2.3


From 5a9d1b83e5334915c651604648c20a9fc64d47a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Nov 2024 17:09:18 +0100
Subject: block: return unsigned int from bdev_io_opt

The underlying limit is defined as an unsigned int, so return that from
bdev_io_opt as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20241119160932.1327864-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 31867de88213..97c89c0c1398 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1275,7 +1275,7 @@ static inline unsigned int queue_io_opt(const struct request_queue *q)
 	return q->limits.io_opt;
 }
 
-static inline int bdev_io_opt(struct block_device *bdev)
+static inline unsigned int bdev_io_opt(struct block_device *bdev)
 {
 	return queue_io_opt(bdev_get_queue(bdev));
 }
-- 
cgit v1.2.3


From ed5db174cf39374215934f21b04639a7a1513023 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Nov 2024 17:09:19 +0100
Subject: block: return unsigned int from queue_dma_alignment

The underlying limit is defined as an unsigned int, so return that from
queue_dma_alignment as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20241119160932.1327864-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 97c89c0c1398..d8de261c2b99 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1421,7 +1421,7 @@ static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
 	return is_seq;
 }
 
-static inline int queue_dma_alignment(const struct request_queue *q)
+static inline unsigned int queue_dma_alignment(const struct request_queue *q)
 {
 	return q->limits.dma_alignment;
 }
-- 
cgit v1.2.3


From e769489a54401d0c89555f7ad8672038b5c2b767 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Nov 2024 17:09:20 +0100
Subject: block: return unsigned int from blk_lim_dma_alignment_and_pad

The underlying limits are defined as unsigned int, so return that from
blk_lim_dma_alignment_and_pad as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20241119160932.1327864-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d8de261c2b99..d0d8190429c8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1462,7 +1462,8 @@ static inline bool bdev_iter_is_aligned(struct block_device *bdev,
 				   bdev_logical_block_size(bdev) - 1);
 }
 
-static inline int blk_lim_dma_alignment_and_pad(struct queue_limits *lim)
+static inline unsigned int
+blk_lim_dma_alignment_and_pad(struct queue_limits *lim)
 {
 	return lim->dma_alignment | lim->dma_pad_mask;
 }
-- 
cgit v1.2.3


From da77d9b23700708d0d22a4407d32a8755a3596e8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Nov 2024 17:09:21 +0100
Subject: block: return bool from blk_rq_aligned

blk_rq_aligned returns a boolean condition, don't mascquerade it as int.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20241119160932.1327864-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d0d8190429c8..5270117d54ac 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1468,7 +1468,7 @@ blk_lim_dma_alignment_and_pad(struct queue_limits *lim)
 	return lim->dma_alignment | lim->dma_pad_mask;
 }
 
-static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
+static inline bool blk_rq_aligned(struct request_queue *q, unsigned long addr,
 				 unsigned int len)
 {
 	unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits);
-- 
cgit v1.2.3


From e888810bc4f471f85989a0991aff28d2ac9f783b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Nov 2024 17:09:22 +0100
Subject: block: remove a duplicate definition for bdev_read_only

bdev_read_only is already defined as an inline function in blkdev.h.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20241119160932.1327864-6-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5270117d54ac..8b4e4692e7fb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1586,7 +1586,6 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
 	return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev);
 }
 
-int bdev_read_only(struct block_device *bdev);
 int set_blocksize(struct file *file, int size);
 
 int lookup_bdev(const char *pathname, dev_t *dev);
-- 
cgit v1.2.3


From 766a71ef65bb217ed8bf1c068ac14c7d3c15d487 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Nov 2024 17:09:23 +0100
Subject: block: return bool from get_disk_ro and bdev_read_only

get_disk_ro and bdev_read_only return boolean conditions,
don't masquerade them as int.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20241119160932.1327864-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8b4e4692e7fb..08a727b40816 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -779,13 +779,13 @@ static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag)
 	atomic_andnot(flag, &bdev->__bd_flags);
 }
 
-static inline int get_disk_ro(struct gendisk *disk)
+static inline bool get_disk_ro(struct gendisk *disk)
 {
 	return bdev_test_flag(disk->part0, BD_READ_ONLY) ||
 		test_bit(GD_READ_ONLY, &disk->state);
 }
 
-static inline int bdev_read_only(struct block_device *bdev)
+static inline bool bdev_read_only(struct block_device *bdev)
 {
 	return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk);
 }
-- 
cgit v1.2.3


From f46b9cdb22f7a167c36b6bcddaef7e8aee2598fa Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Wed, 20 Nov 2024 14:14:52 -0800
Subject: io_uring: limit local tw done

Instead of eagerly running all available local tw, limit the amount of
local tw done to the max of IO_LOCAL_TW_DEFAULT_MAX (20) or wait_nr. The
value of 20 is chosen as a reasonable heuristic to allow enough work
batching but also keep latency down.

Add a retry_llist that maintains a list of local tw that couldn't be
done in time. No synchronisation is needed since it is only modified
within the task context.

Signed-off-by: David Wei <dw@davidwei.uk>
Link: https://lore.kernel.org/r/20241120221452.3762588-3-dw@davidwei.uk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 +
 io_uring/io_uring.c            | 43 +++++++++++++++++++++++++++++++-----------
 io_uring/io_uring.h            |  2 +-
 3 files changed, 34 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index aa5f5ea98076..3e934feb3187 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -335,6 +335,7 @@ struct io_ring_ctx {
 	 */
 	struct {
 		struct llist_head	work_llist;
+		struct llist_head	retry_llist;
 		unsigned long		check_cq;
 		atomic_t		cq_wait_nr;
 		atomic_t		cq_timeouts;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 55e3618b726d..bfa93888f862 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -122,6 +122,7 @@
 
 #define IO_COMPL_BATCH			32
 #define IO_REQ_ALLOC_BATCH		8
+#define IO_LOCAL_TW_DEFAULT_MAX		20
 
 struct io_defer_entry {
 	struct list_head	list;
@@ -1256,6 +1257,8 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 	struct llist_node *node = llist_del_all(&ctx->work_llist);
 
 	__io_fallback_tw(node, false);
+	node = llist_del_all(&ctx->retry_llist);
+	__io_fallback_tw(node, false);
 }
 
 static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
@@ -1270,37 +1273,55 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
 	return false;
 }
 
+static int __io_run_local_work_loop(struct llist_node **node,
+				    struct io_tw_state *ts,
+				    int events)
+{
+	while (*node) {
+		struct llist_node *next = (*node)->next;
+		struct io_kiocb *req = container_of(*node, struct io_kiocb,
+						    io_task_work.node);
+		INDIRECT_CALL_2(req->io_task_work.func,
+				io_poll_task_func, io_req_rw_complete,
+				req, ts);
+		*node = next;
+		if (--events <= 0)
+			break;
+	}
+
+	return events;
+}
+
 static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
 			       int min_events)
 {
 	struct llist_node *node;
 	unsigned int loops = 0;
-	int ret = 0;
+	int ret, limit;
 
 	if (WARN_ON_ONCE(ctx->submitter_task != current))
 		return -EEXIST;
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
 		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+	limit = max(IO_LOCAL_TW_DEFAULT_MAX, min_events);
 again:
+	ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, limit);
+	if (ctx->retry_llist.first)
+		goto retry_done;
+
 	/*
 	 * llists are in reverse order, flip it back the right way before
 	 * running the pending items.
 	 */
 	node = llist_reverse_order(llist_del_all(&ctx->work_llist));
-	while (node) {
-		struct llist_node *next = node->next;
-		struct io_kiocb *req = container_of(node, struct io_kiocb,
-						    io_task_work.node);
-		INDIRECT_CALL_2(req->io_task_work.func,
-				io_poll_task_func, io_req_rw_complete,
-				req, ts);
-		ret++;
-		node = next;
-	}
+	ret = __io_run_local_work_loop(&node, ts, ret);
+	ctx->retry_llist.first = node;
 	loops++;
 
+	ret = limit - ret;
 	if (io_run_local_work_continue(ctx, ret, min_events))
 		goto again;
+retry_done:
 	io_submit_flush_completions(ctx);
 	if (io_run_local_work_continue(ctx, ret, min_events))
 		goto again;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 69eb3b23a5a0..12abee607e4a 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -349,7 +349,7 @@ static inline int io_run_task_work(void)
 
 static inline bool io_local_work_pending(struct io_ring_ctx *ctx)
 {
-	return !llist_empty(&ctx->work_llist);
+	return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist);
 }
 
 static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
-- 
cgit v1.2.3


From b88cbaaa6fa109c2eb455d8fe2a318de0d197ea2 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 15 Nov 2024 15:44:27 -0600
Subject: PCI/pwrctrl: Rename pwrctl files to pwrctrl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To slightly reduce confusion between "pwrctl" (the power controller and
power sequencing framework) and "bwctrl" (the bandwidth controller),
rename "pwrctl" to "pwrctrl" so they use the same "ctrl" suffix.

Rename drivers/pci/pwrctl/ to drivers/pci/pwrctrl/, including the related
MAINTAINERS, include file (include/linux/pci-pwrctl.h), Makefile, and
Kconfig changes.

This is the minimal rename of files only.  A subsequent commit will rename
functions and data structures.

Link: https://lore.kernel.org/r/20241115214428.2061153-2-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Acked-by: Krzysztof Wilczyński <kw@linux.com>
---
 MAINTAINERS                              |   4 +-
 drivers/pci/Kconfig                      |   2 +-
 drivers/pci/Makefile                     |   2 +-
 drivers/pci/pwrctl/Kconfig               |  12 ---
 drivers/pci/pwrctl/Makefile              |   6 --
 drivers/pci/pwrctl/core.c                | 147 -------------------------------
 drivers/pci/pwrctl/pci-pwrctl-pwrseq.c   |  94 --------------------
 drivers/pci/pwrctrl/Kconfig              |  12 +++
 drivers/pci/pwrctrl/Makefile             |   6 ++
 drivers/pci/pwrctrl/core.c               | 147 +++++++++++++++++++++++++++++++
 drivers/pci/pwrctrl/pci-pwrctrl-pwrseq.c |  94 ++++++++++++++++++++
 include/linux/pci-pwrctl.h               |  54 ------------
 include/linux/pci-pwrctrl.h              |  54 ++++++++++++
 13 files changed, 317 insertions(+), 317 deletions(-)
 delete mode 100644 drivers/pci/pwrctl/Kconfig
 delete mode 100644 drivers/pci/pwrctl/Makefile
 delete mode 100644 drivers/pci/pwrctl/core.c
 delete mode 100644 drivers/pci/pwrctl/pci-pwrctl-pwrseq.c
 create mode 100644 drivers/pci/pwrctrl/Kconfig
 create mode 100644 drivers/pci/pwrctrl/Makefile
 create mode 100644 drivers/pci/pwrctrl/core.c
 create mode 100644 drivers/pci/pwrctrl/pci-pwrctrl-pwrseq.c
 delete mode 100644 include/linux/pci-pwrctl.h
 create mode 100644 include/linux/pci-pwrctrl.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index c27f3190737f..1a6096135424 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17910,8 +17910,8 @@ M:	Bartosz Golaszewski <brgl@bgdev.pl>
 L:	linux-pci@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git
-F:	drivers/pci/pwrctl/*
-F:	include/linux/pci-pwrctl.h
+F:	drivers/pci/pwrctrl/*
+F:	include/linux/pci-pwrctrl.h
 
 PCI SUBSYSTEM
 M:	Bjorn Helgaas <bhelgaas@google.com>
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 0d94e4a967d8..e1c025698a28 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -305,6 +305,6 @@ source "drivers/pci/hotplug/Kconfig"
 source "drivers/pci/controller/Kconfig"
 source "drivers/pci/endpoint/Kconfig"
 source "drivers/pci/switch/Kconfig"
-source "drivers/pci/pwrctl/Kconfig"
+source "drivers/pci/pwrctrl/Kconfig"
 
 endif
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 374c5c06d92f..39a07890abd1 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_PCI)		+= access.o bus.o probe.o host-bridge.o \
 
 obj-$(CONFIG_PCI)		+= msi/
 obj-$(CONFIG_PCI)		+= pcie/
-obj-$(CONFIG_PCI)		+= pwrctl/
+obj-$(CONFIG_PCI)		+= pwrctrl/
 
 ifdef CONFIG_PCI
 obj-$(CONFIG_PROC_FS)		+= proc.o
diff --git a/drivers/pci/pwrctl/Kconfig b/drivers/pci/pwrctl/Kconfig
deleted file mode 100644
index 54589bb2403b..000000000000
--- a/drivers/pci/pwrctl/Kconfig
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config HAVE_PWRCTL
-	bool
-
-config PCI_PWRCTL
-	tristate
-
-config PCI_PWRCTL_PWRSEQ
-	tristate
-	select POWER_SEQUENCING
-	select PCI_PWRCTL
diff --git a/drivers/pci/pwrctl/Makefile b/drivers/pci/pwrctl/Makefile
deleted file mode 100644
index d308aae4800c..000000000000
--- a/drivers/pci/pwrctl/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_PCI_PWRCTL)		+= pci-pwrctl-core.o
-pci-pwrctl-core-y			:= core.o
-
-obj-$(CONFIG_PCI_PWRCTL_PWRSEQ)		+= pci-pwrctl-pwrseq.o
diff --git a/drivers/pci/pwrctl/core.c b/drivers/pci/pwrctl/core.c
deleted file mode 100644
index bb5a23712434..000000000000
--- a/drivers/pci/pwrctl/core.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2024 Linaro Ltd.
- */
-
-#include <linux/device.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/pci-pwrctl.h>
-#include <linux/property.h>
-#include <linux/slab.h>
-
-static int pci_pwrctl_notify(struct notifier_block *nb, unsigned long action,
-			     void *data)
-{
-	struct pci_pwrctl *pwrctl = container_of(nb, struct pci_pwrctl, nb);
-	struct device *dev = data;
-
-	if (dev_fwnode(dev) != dev_fwnode(pwrctl->dev))
-		return NOTIFY_DONE;
-
-	switch (action) {
-	case BUS_NOTIFY_ADD_DEVICE:
-		/*
-		 * We will have two struct device objects bound to two different
-		 * drivers on different buses but consuming the same DT node. We
-		 * must not bind the pins twice in this case but only once for
-		 * the first device to be added.
-		 *
-		 * If we got here then the PCI device is the second after the
-		 * power control platform device. Mark its OF node as reused.
-		 */
-		dev->of_node_reused = true;
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-static void rescan_work_func(struct work_struct *work)
-{
-	struct pci_pwrctl *pwrctl = container_of(work, struct pci_pwrctl, work);
-
-	pci_lock_rescan_remove();
-	pci_rescan_bus(to_pci_dev(pwrctl->dev->parent)->bus);
-	pci_unlock_rescan_remove();
-}
-
-/**
- * pci_pwrctl_init() - Initialize the PCI power control context struct
- *
- * @pwrctl: PCI power control data
- * @dev: Parent device
- */
-void pci_pwrctl_init(struct pci_pwrctl *pwrctl, struct device *dev)
-{
-	pwrctl->dev = dev;
-	INIT_WORK(&pwrctl->work, rescan_work_func);
-}
-EXPORT_SYMBOL_GPL(pci_pwrctl_init);
-
-/**
- * pci_pwrctl_device_set_ready() - Notify the pwrctl subsystem that the PCI
- * device is powered-up and ready to be detected.
- *
- * @pwrctl: PCI power control data.
- *
- * Returns:
- * 0 on success, negative error number on error.
- *
- * Note:
- * This function returning 0 doesn't mean the device was detected. It means,
- * that the bus rescan was successfully started. The device will get bound to
- * its PCI driver asynchronously.
- */
-int pci_pwrctl_device_set_ready(struct pci_pwrctl *pwrctl)
-{
-	int ret;
-
-	if (!pwrctl->dev)
-		return -ENODEV;
-
-	pwrctl->nb.notifier_call = pci_pwrctl_notify;
-	ret = bus_register_notifier(&pci_bus_type, &pwrctl->nb);
-	if (ret)
-		return ret;
-
-	schedule_work(&pwrctl->work);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pci_pwrctl_device_set_ready);
-
-/**
- * pci_pwrctl_device_unset_ready() - Notify the pwrctl subsystem that the PCI
- * device is about to be powered-down.
- *
- * @pwrctl: PCI power control data.
- */
-void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl)
-{
-	/*
-	 * We don't have to delete the link here. Typically, this function
-	 * is only called when the power control device is being detached. If
-	 * it is being detached then the child PCI device must have already
-	 * been unbound too or the device core wouldn't let us unbind.
-	 */
-	bus_unregister_notifier(&pci_bus_type, &pwrctl->nb);
-}
-EXPORT_SYMBOL_GPL(pci_pwrctl_device_unset_ready);
-
-static void devm_pci_pwrctl_device_unset_ready(void *data)
-{
-	struct pci_pwrctl *pwrctl = data;
-
-	pci_pwrctl_device_unset_ready(pwrctl);
-}
-
-/**
- * devm_pci_pwrctl_device_set_ready - Managed variant of
- * pci_pwrctl_device_set_ready().
- *
- * @dev: Device managing this pwrctl provider.
- * @pwrctl: PCI power control data.
- *
- * Returns:
- * 0 on success, negative error number on error.
- */
-int devm_pci_pwrctl_device_set_ready(struct device *dev,
-				     struct pci_pwrctl *pwrctl)
-{
-	int ret;
-
-	ret = pci_pwrctl_device_set_ready(pwrctl);
-	if (ret)
-		return ret;
-
-	return devm_add_action_or_reset(dev,
-					devm_pci_pwrctl_device_unset_ready,
-					pwrctl);
-}
-EXPORT_SYMBOL_GPL(devm_pci_pwrctl_device_set_ready);
-
-MODULE_AUTHOR("Bartosz Golaszewski <bartosz.golaszewski@linaro.org>");
-MODULE_DESCRIPTION("PCI Device Power Control core driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/pci/pwrctl/pci-pwrctl-pwrseq.c b/drivers/pci/pwrctl/pci-pwrctl-pwrseq.c
deleted file mode 100644
index a23a4312574b..000000000000
--- a/drivers/pci/pwrctl/pci-pwrctl-pwrseq.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2024 Linaro Ltd.
- */
-
-#include <linux/device.h>
-#include <linux/mod_devicetable.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/pci-pwrctl.h>
-#include <linux/platform_device.h>
-#include <linux/pwrseq/consumer.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-
-struct pci_pwrctl_pwrseq_data {
-	struct pci_pwrctl ctx;
-	struct pwrseq_desc *pwrseq;
-};
-
-static void devm_pci_pwrctl_pwrseq_power_off(void *data)
-{
-	struct pwrseq_desc *pwrseq = data;
-
-	pwrseq_power_off(pwrseq);
-}
-
-static int pci_pwrctl_pwrseq_probe(struct platform_device *pdev)
-{
-	struct pci_pwrctl_pwrseq_data *data;
-	struct device *dev = &pdev->dev;
-	int ret;
-
-	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	data->pwrseq = devm_pwrseq_get(dev, of_device_get_match_data(dev));
-	if (IS_ERR(data->pwrseq))
-		return dev_err_probe(dev, PTR_ERR(data->pwrseq),
-				     "Failed to get the power sequencer\n");
-
-	ret = pwrseq_power_on(data->pwrseq);
-	if (ret)
-		return dev_err_probe(dev, ret,
-				     "Failed to power-on the device\n");
-
-	ret = devm_add_action_or_reset(dev, devm_pci_pwrctl_pwrseq_power_off,
-				       data->pwrseq);
-	if (ret)
-		return ret;
-
-	pci_pwrctl_init(&data->ctx, dev);
-
-	ret = devm_pci_pwrctl_device_set_ready(dev, &data->ctx);
-	if (ret)
-		return dev_err_probe(dev, ret,
-				     "Failed to register the pwrctl wrapper\n");
-
-	return 0;
-}
-
-static const struct of_device_id pci_pwrctl_pwrseq_of_match[] = {
-	{
-		/* ATH11K in QCA6390 package. */
-		.compatible = "pci17cb,1101",
-		.data = "wlan",
-	},
-	{
-		/* ATH11K in WCN6855 package. */
-		.compatible = "pci17cb,1103",
-		.data = "wlan",
-	},
-	{
-		/* ATH12K in WCN7850 package. */
-		.compatible = "pci17cb,1107",
-		.data = "wlan",
-	},
-	{ }
-};
-MODULE_DEVICE_TABLE(of, pci_pwrctl_pwrseq_of_match);
-
-static struct platform_driver pci_pwrctl_pwrseq_driver = {
-	.driver = {
-		.name = "pci-pwrctl-pwrseq",
-		.of_match_table = pci_pwrctl_pwrseq_of_match,
-	},
-	.probe = pci_pwrctl_pwrseq_probe,
-};
-module_platform_driver(pci_pwrctl_pwrseq_driver);
-
-MODULE_AUTHOR("Bartosz Golaszewski <bartosz.golaszewski@linaro.org>");
-MODULE_DESCRIPTION("Generic PCI Power Control module for power sequenced devices");
-MODULE_LICENSE("GPL");
diff --git a/drivers/pci/pwrctrl/Kconfig b/drivers/pci/pwrctrl/Kconfig
new file mode 100644
index 000000000000..54589bb2403b
--- /dev/null
+++ b/drivers/pci/pwrctrl/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config HAVE_PWRCTL
+	bool
+
+config PCI_PWRCTL
+	tristate
+
+config PCI_PWRCTL_PWRSEQ
+	tristate
+	select POWER_SEQUENCING
+	select PCI_PWRCTL
diff --git a/drivers/pci/pwrctrl/Makefile b/drivers/pci/pwrctrl/Makefile
new file mode 100644
index 000000000000..75c7ce531c7e
--- /dev/null
+++ b/drivers/pci/pwrctrl/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_PCI_PWRCTL)		+= pci-pwrctrl-core.o
+pci-pwrctrl-core-y			:= core.o
+
+obj-$(CONFIG_PCI_PWRCTL_PWRSEQ)		+= pci-pwrctrl-pwrseq.o
diff --git a/drivers/pci/pwrctrl/core.c b/drivers/pci/pwrctrl/core.c
new file mode 100644
index 000000000000..88f06e9271ba
--- /dev/null
+++ b/drivers/pci/pwrctrl/core.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 Linaro Ltd.
+ */
+
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/pci-pwrctrl.h>
+#include <linux/property.h>
+#include <linux/slab.h>
+
+static int pci_pwrctl_notify(struct notifier_block *nb, unsigned long action,
+			     void *data)
+{
+	struct pci_pwrctl *pwrctl = container_of(nb, struct pci_pwrctl, nb);
+	struct device *dev = data;
+
+	if (dev_fwnode(dev) != dev_fwnode(pwrctl->dev))
+		return NOTIFY_DONE;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		/*
+		 * We will have two struct device objects bound to two different
+		 * drivers on different buses but consuming the same DT node. We
+		 * must not bind the pins twice in this case but only once for
+		 * the first device to be added.
+		 *
+		 * If we got here then the PCI device is the second after the
+		 * power control platform device. Mark its OF node as reused.
+		 */
+		dev->of_node_reused = true;
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static void rescan_work_func(struct work_struct *work)
+{
+	struct pci_pwrctl *pwrctl = container_of(work, struct pci_pwrctl, work);
+
+	pci_lock_rescan_remove();
+	pci_rescan_bus(to_pci_dev(pwrctl->dev->parent)->bus);
+	pci_unlock_rescan_remove();
+}
+
+/**
+ * pci_pwrctl_init() - Initialize the PCI power control context struct
+ *
+ * @pwrctl: PCI power control data
+ * @dev: Parent device
+ */
+void pci_pwrctl_init(struct pci_pwrctl *pwrctl, struct device *dev)
+{
+	pwrctl->dev = dev;
+	INIT_WORK(&pwrctl->work, rescan_work_func);
+}
+EXPORT_SYMBOL_GPL(pci_pwrctl_init);
+
+/**
+ * pci_pwrctl_device_set_ready() - Notify the pwrctl subsystem that the PCI
+ * device is powered-up and ready to be detected.
+ *
+ * @pwrctl: PCI power control data.
+ *
+ * Returns:
+ * 0 on success, negative error number on error.
+ *
+ * Note:
+ * This function returning 0 doesn't mean the device was detected. It means,
+ * that the bus rescan was successfully started. The device will get bound to
+ * its PCI driver asynchronously.
+ */
+int pci_pwrctl_device_set_ready(struct pci_pwrctl *pwrctl)
+{
+	int ret;
+
+	if (!pwrctl->dev)
+		return -ENODEV;
+
+	pwrctl->nb.notifier_call = pci_pwrctl_notify;
+	ret = bus_register_notifier(&pci_bus_type, &pwrctl->nb);
+	if (ret)
+		return ret;
+
+	schedule_work(&pwrctl->work);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_pwrctl_device_set_ready);
+
+/**
+ * pci_pwrctl_device_unset_ready() - Notify the pwrctl subsystem that the PCI
+ * device is about to be powered-down.
+ *
+ * @pwrctl: PCI power control data.
+ */
+void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl)
+{
+	/*
+	 * We don't have to delete the link here. Typically, this function
+	 * is only called when the power control device is being detached. If
+	 * it is being detached then the child PCI device must have already
+	 * been unbound too or the device core wouldn't let us unbind.
+	 */
+	bus_unregister_notifier(&pci_bus_type, &pwrctl->nb);
+}
+EXPORT_SYMBOL_GPL(pci_pwrctl_device_unset_ready);
+
+static void devm_pci_pwrctl_device_unset_ready(void *data)
+{
+	struct pci_pwrctl *pwrctl = data;
+
+	pci_pwrctl_device_unset_ready(pwrctl);
+}
+
+/**
+ * devm_pci_pwrctl_device_set_ready - Managed variant of
+ * pci_pwrctl_device_set_ready().
+ *
+ * @dev: Device managing this pwrctl provider.
+ * @pwrctl: PCI power control data.
+ *
+ * Returns:
+ * 0 on success, negative error number on error.
+ */
+int devm_pci_pwrctl_device_set_ready(struct device *dev,
+				     struct pci_pwrctl *pwrctl)
+{
+	int ret;
+
+	ret = pci_pwrctl_device_set_ready(pwrctl);
+	if (ret)
+		return ret;
+
+	return devm_add_action_or_reset(dev,
+					devm_pci_pwrctl_device_unset_ready,
+					pwrctl);
+}
+EXPORT_SYMBOL_GPL(devm_pci_pwrctl_device_set_ready);
+
+MODULE_AUTHOR("Bartosz Golaszewski <bartosz.golaszewski@linaro.org>");
+MODULE_DESCRIPTION("PCI Device Power Control core driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/pci/pwrctrl/pci-pwrctrl-pwrseq.c b/drivers/pci/pwrctrl/pci-pwrctrl-pwrseq.c
new file mode 100644
index 000000000000..cc19ad61dd6e
--- /dev/null
+++ b/drivers/pci/pwrctrl/pci-pwrctrl-pwrseq.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 Linaro Ltd.
+ */
+
+#include <linux/device.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/pci-pwrctrl.h>
+#include <linux/platform_device.h>
+#include <linux/pwrseq/consumer.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+struct pci_pwrctl_pwrseq_data {
+	struct pci_pwrctl ctx;
+	struct pwrseq_desc *pwrseq;
+};
+
+static void devm_pci_pwrctl_pwrseq_power_off(void *data)
+{
+	struct pwrseq_desc *pwrseq = data;
+
+	pwrseq_power_off(pwrseq);
+}
+
+static int pci_pwrctl_pwrseq_probe(struct platform_device *pdev)
+{
+	struct pci_pwrctl_pwrseq_data *data;
+	struct device *dev = &pdev->dev;
+	int ret;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->pwrseq = devm_pwrseq_get(dev, of_device_get_match_data(dev));
+	if (IS_ERR(data->pwrseq))
+		return dev_err_probe(dev, PTR_ERR(data->pwrseq),
+				     "Failed to get the power sequencer\n");
+
+	ret = pwrseq_power_on(data->pwrseq);
+	if (ret)
+		return dev_err_probe(dev, ret,
+				     "Failed to power-on the device\n");
+
+	ret = devm_add_action_or_reset(dev, devm_pci_pwrctl_pwrseq_power_off,
+				       data->pwrseq);
+	if (ret)
+		return ret;
+
+	pci_pwrctl_init(&data->ctx, dev);
+
+	ret = devm_pci_pwrctl_device_set_ready(dev, &data->ctx);
+	if (ret)
+		return dev_err_probe(dev, ret,
+				     "Failed to register the pwrctl wrapper\n");
+
+	return 0;
+}
+
+static const struct of_device_id pci_pwrctl_pwrseq_of_match[] = {
+	{
+		/* ATH11K in QCA6390 package. */
+		.compatible = "pci17cb,1101",
+		.data = "wlan",
+	},
+	{
+		/* ATH11K in WCN6855 package. */
+		.compatible = "pci17cb,1103",
+		.data = "wlan",
+	},
+	{
+		/* ATH12K in WCN7850 package. */
+		.compatible = "pci17cb,1107",
+		.data = "wlan",
+	},
+	{ }
+};
+MODULE_DEVICE_TABLE(of, pci_pwrctl_pwrseq_of_match);
+
+static struct platform_driver pci_pwrctl_pwrseq_driver = {
+	.driver = {
+		.name = "pci-pwrctl-pwrseq",
+		.of_match_table = pci_pwrctl_pwrseq_of_match,
+	},
+	.probe = pci_pwrctl_pwrseq_probe,
+};
+module_platform_driver(pci_pwrctl_pwrseq_driver);
+
+MODULE_AUTHOR("Bartosz Golaszewski <bartosz.golaszewski@linaro.org>");
+MODULE_DESCRIPTION("Generic PCI Power Control module for power sequenced devices");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/pci-pwrctl.h b/include/linux/pci-pwrctl.h
deleted file mode 100644
index 0d23dddf59ec..000000000000
--- a/include/linux/pci-pwrctl.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2024 Linaro Ltd.
- */
-
-#ifndef __PCI_PWRCTL_H__
-#define __PCI_PWRCTL_H__
-
-#include <linux/notifier.h>
-#include <linux/workqueue.h>
-
-struct device;
-struct device_link;
-
-/*
- * This is a simple framework for solving the issue of PCI devices that require
- * certain resources (regulators, GPIOs, clocks) to be enabled before the
- * device can actually be detected on the PCI bus.
- *
- * The idea is to reuse the platform bus to populate OF nodes describing the
- * PCI device and its resources, let these platform devices probe and enable
- * relevant resources and then trigger a rescan of the PCI bus allowing for the
- * same device (with a second associated struct device) to be registered with
- * the PCI subsystem.
- *
- * To preserve a correct hierarchy for PCI power management and device reset,
- * we create a device link between the power control platform device (parent)
- * and the supplied PCI device (child).
- */
-
-/**
- * struct pci_pwrctl - PCI device power control context.
- * @dev: Address of the power controlling device.
- *
- * An object of this type must be allocated by the PCI power control device and
- * passed to the pwrctl subsystem to trigger a bus rescan and setup a device
- * link with the device once it's up.
- */
-struct pci_pwrctl {
-	struct device *dev;
-
-	/* Private: don't use. */
-	struct notifier_block nb;
-	struct device_link *link;
-	struct work_struct work;
-};
-
-void pci_pwrctl_init(struct pci_pwrctl *pwrctl, struct device *dev);
-int pci_pwrctl_device_set_ready(struct pci_pwrctl *pwrctl);
-void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl);
-int devm_pci_pwrctl_device_set_ready(struct device *dev,
-				     struct pci_pwrctl *pwrctl);
-
-#endif /* __PCI_PWRCTL_H__ */
diff --git a/include/linux/pci-pwrctrl.h b/include/linux/pci-pwrctrl.h
new file mode 100644
index 000000000000..0d23dddf59ec
--- /dev/null
+++ b/include/linux/pci-pwrctrl.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2024 Linaro Ltd.
+ */
+
+#ifndef __PCI_PWRCTL_H__
+#define __PCI_PWRCTL_H__
+
+#include <linux/notifier.h>
+#include <linux/workqueue.h>
+
+struct device;
+struct device_link;
+
+/*
+ * This is a simple framework for solving the issue of PCI devices that require
+ * certain resources (regulators, GPIOs, clocks) to be enabled before the
+ * device can actually be detected on the PCI bus.
+ *
+ * The idea is to reuse the platform bus to populate OF nodes describing the
+ * PCI device and its resources, let these platform devices probe and enable
+ * relevant resources and then trigger a rescan of the PCI bus allowing for the
+ * same device (with a second associated struct device) to be registered with
+ * the PCI subsystem.
+ *
+ * To preserve a correct hierarchy for PCI power management and device reset,
+ * we create a device link between the power control platform device (parent)
+ * and the supplied PCI device (child).
+ */
+
+/**
+ * struct pci_pwrctl - PCI device power control context.
+ * @dev: Address of the power controlling device.
+ *
+ * An object of this type must be allocated by the PCI power control device and
+ * passed to the pwrctl subsystem to trigger a bus rescan and setup a device
+ * link with the device once it's up.
+ */
+struct pci_pwrctl {
+	struct device *dev;
+
+	/* Private: don't use. */
+	struct notifier_block nb;
+	struct device_link *link;
+	struct work_struct work;
+};
+
+void pci_pwrctl_init(struct pci_pwrctl *pwrctl, struct device *dev);
+int pci_pwrctl_device_set_ready(struct pci_pwrctl *pwrctl);
+void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl);
+int devm_pci_pwrctl_device_set_ready(struct device *dev,
+				     struct pci_pwrctl *pwrctl);
+
+#endif /* __PCI_PWRCTL_H__ */
-- 
cgit v1.2.3


From 3f925cd6287401bbc9d568f56d796a69c8bd292a Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 15 Nov 2024 15:44:28 -0600
Subject: PCI/pwrctrl: Rename pwrctrl functions and structures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename pwrctrl functions and structures from "pwrctl" to "pwrctrl" to match
the similar file renames.

Link: https://lore.kernel.org/r/20241115214428.2061153-3-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Acked-by: Krzysztof Wilczyński <kw@linux.com>
---
 drivers/pci/bus.c                        | 25 ++++++-----
 drivers/pci/pwrctrl/core.c               | 77 ++++++++++++++++----------------
 drivers/pci/pwrctrl/pci-pwrctrl-pwrseq.c | 32 ++++++-------
 drivers/pci/remove.c                     |  4 +-
 include/linux/pci-pwrctrl.h              | 22 ++++-----
 5 files changed, 81 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 90d775601789..0d27809746df 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -322,10 +322,10 @@ void __weak pcibios_resource_survey_bus(struct pci_bus *bus) { }
 void __weak pcibios_bus_add_device(struct pci_dev *pdev) { }
 
 /*
- * Create pwrctl devices (if required) for the PCI devices to handle the power
+ * Create pwrctrl devices (if required) for the PCI devices to handle the power
  * state.
  */
-static void pci_pwrctl_create_devices(struct pci_dev *dev)
+static void pci_pwrctrl_create_devices(struct pci_dev *dev)
 {
 	struct device_node *np = dev_of_node(&dev->dev);
 	struct device *parent = &dev->dev;
@@ -337,23 +337,24 @@ static void pci_pwrctl_create_devices(struct pci_dev *dev)
 	 */
 	if (np && pci_is_bridge(dev)) {
 		/*
-		 * Now look for the child PCI device nodes and create pwrctl
-		 * devices for them. The pwrctl device drivers will manage the
+		 * Now look for the child PCI device nodes and create pwrctrl
+		 * devices for them. The pwrctrl device drivers will manage the
 		 * power state of the devices.
 		 */
 		for_each_available_child_of_node_scoped(np, child) {
 			/*
-			 * First check whether the pwrctl device really needs to
-			 * be created or not. This is decided based on at least
-			 * one of the power supplies being defined in the
-			 * devicetree node of the device.
+			 * First check whether the pwrctrl device really
+			 * needs to be created or not. This is decided
+			 * based on at least one of the power supplies
+			 * being defined in the devicetree node of the
+			 * device.
 			 */
 			if (!of_pci_supply_present(child)) {
 				pci_dbg(dev, "skipping OF node: %s\n", child->name);
 				return;
 			}
 
-			/* Now create the pwrctl device */
+			/* Now create the pwrctrl device */
 			pdev = of_platform_device_create(child, NULL, parent);
 			if (!pdev)
 				pci_err(dev, "failed to create OF node: %s\n", child->name);
@@ -385,12 +386,12 @@ void pci_bus_add_device(struct pci_dev *dev)
 	pci_proc_attach_device(dev);
 	pci_bridge_d3_update(dev);
 
-	pci_pwrctl_create_devices(dev);
+	pci_pwrctrl_create_devices(dev);
 
 	/*
-	 * If the PCI device is associated with a pwrctl device with a
+	 * If the PCI device is associated with a pwrctrl device with a
 	 * power supply, create a device link between the PCI device and
-	 * pwrctl device.  This ensures that pwrctl drivers are probed
+	 * pwrctrl device.  This ensures that pwrctrl drivers are probed
 	 * before PCI client drivers.
 	 */
 	pdev = of_find_device_by_node(dn);
diff --git a/drivers/pci/pwrctrl/core.c b/drivers/pci/pwrctrl/core.c
index 88f06e9271ba..2fb174db91e5 100644
--- a/drivers/pci/pwrctrl/core.c
+++ b/drivers/pci/pwrctrl/core.c
@@ -11,13 +11,13 @@
 #include <linux/property.h>
 #include <linux/slab.h>
 
-static int pci_pwrctl_notify(struct notifier_block *nb, unsigned long action,
-			     void *data)
+static int pci_pwrctrl_notify(struct notifier_block *nb, unsigned long action,
+			      void *data)
 {
-	struct pci_pwrctl *pwrctl = container_of(nb, struct pci_pwrctl, nb);
+	struct pci_pwrctrl *pwrctrl = container_of(nb, struct pci_pwrctrl, nb);
 	struct device *dev = data;
 
-	if (dev_fwnode(dev) != dev_fwnode(pwrctl->dev))
+	if (dev_fwnode(dev) != dev_fwnode(pwrctrl->dev))
 		return NOTIFY_DONE;
 
 	switch (action) {
@@ -40,31 +40,32 @@ static int pci_pwrctl_notify(struct notifier_block *nb, unsigned long action,
 
 static void rescan_work_func(struct work_struct *work)
 {
-	struct pci_pwrctl *pwrctl = container_of(work, struct pci_pwrctl, work);
+	struct pci_pwrctrl *pwrctrl = container_of(work,
+						   struct pci_pwrctrl, work);
 
 	pci_lock_rescan_remove();
-	pci_rescan_bus(to_pci_dev(pwrctl->dev->parent)->bus);
+	pci_rescan_bus(to_pci_dev(pwrctrl->dev->parent)->bus);
 	pci_unlock_rescan_remove();
 }
 
 /**
- * pci_pwrctl_init() - Initialize the PCI power control context struct
+ * pci_pwrctrl_init() - Initialize the PCI power control context struct
  *
- * @pwrctl: PCI power control data
+ * @pwrctrl: PCI power control data
  * @dev: Parent device
  */
-void pci_pwrctl_init(struct pci_pwrctl *pwrctl, struct device *dev)
+void pci_pwrctrl_init(struct pci_pwrctrl *pwrctrl, struct device *dev)
 {
-	pwrctl->dev = dev;
-	INIT_WORK(&pwrctl->work, rescan_work_func);
+	pwrctrl->dev = dev;
+	INIT_WORK(&pwrctrl->work, rescan_work_func);
 }
-EXPORT_SYMBOL_GPL(pci_pwrctl_init);
+EXPORT_SYMBOL_GPL(pci_pwrctrl_init);
 
 /**
- * pci_pwrctl_device_set_ready() - Notify the pwrctl subsystem that the PCI
+ * pci_pwrctrl_device_set_ready() - Notify the pwrctrl subsystem that the PCI
  * device is powered-up and ready to be detected.
  *
- * @pwrctl: PCI power control data.
+ * @pwrctrl: PCI power control data.
  *
  * Returns:
  * 0 on success, negative error number on error.
@@ -74,31 +75,31 @@ EXPORT_SYMBOL_GPL(pci_pwrctl_init);
  * that the bus rescan was successfully started. The device will get bound to
  * its PCI driver asynchronously.
  */
-int pci_pwrctl_device_set_ready(struct pci_pwrctl *pwrctl)
+int pci_pwrctrl_device_set_ready(struct pci_pwrctrl *pwrctrl)
 {
 	int ret;
 
-	if (!pwrctl->dev)
+	if (!pwrctrl->dev)
 		return -ENODEV;
 
-	pwrctl->nb.notifier_call = pci_pwrctl_notify;
-	ret = bus_register_notifier(&pci_bus_type, &pwrctl->nb);
+	pwrctrl->nb.notifier_call = pci_pwrctrl_notify;
+	ret = bus_register_notifier(&pci_bus_type, &pwrctrl->nb);
 	if (ret)
 		return ret;
 
-	schedule_work(&pwrctl->work);
+	schedule_work(&pwrctrl->work);
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(pci_pwrctl_device_set_ready);
+EXPORT_SYMBOL_GPL(pci_pwrctrl_device_set_ready);
 
 /**
- * pci_pwrctl_device_unset_ready() - Notify the pwrctl subsystem that the PCI
+ * pci_pwrctrl_device_unset_ready() - Notify the pwrctrl subsystem that the PCI
  * device is about to be powered-down.
  *
- * @pwrctl: PCI power control data.
+ * @pwrctrl: PCI power control data.
  */
-void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl)
+void pci_pwrctrl_device_unset_ready(struct pci_pwrctrl *pwrctrl)
 {
 	/*
 	 * We don't have to delete the link here. Typically, this function
@@ -106,41 +107,41 @@ void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl)
 	 * it is being detached then the child PCI device must have already
 	 * been unbound too or the device core wouldn't let us unbind.
 	 */
-	bus_unregister_notifier(&pci_bus_type, &pwrctl->nb);
+	bus_unregister_notifier(&pci_bus_type, &pwrctrl->nb);
 }
-EXPORT_SYMBOL_GPL(pci_pwrctl_device_unset_ready);
+EXPORT_SYMBOL_GPL(pci_pwrctrl_device_unset_ready);
 
-static void devm_pci_pwrctl_device_unset_ready(void *data)
+static void devm_pci_pwrctrl_device_unset_ready(void *data)
 {
-	struct pci_pwrctl *pwrctl = data;
+	struct pci_pwrctrl *pwrctrl = data;
 
-	pci_pwrctl_device_unset_ready(pwrctl);
+	pci_pwrctrl_device_unset_ready(pwrctrl);
 }
 
 /**
- * devm_pci_pwrctl_device_set_ready - Managed variant of
- * pci_pwrctl_device_set_ready().
+ * devm_pci_pwrctrl_device_set_ready - Managed variant of
+ * pci_pwrctrl_device_set_ready().
  *
- * @dev: Device managing this pwrctl provider.
- * @pwrctl: PCI power control data.
+ * @dev: Device managing this pwrctrl provider.
+ * @pwrctrl: PCI power control data.
  *
  * Returns:
  * 0 on success, negative error number on error.
  */
-int devm_pci_pwrctl_device_set_ready(struct device *dev,
-				     struct pci_pwrctl *pwrctl)
+int devm_pci_pwrctrl_device_set_ready(struct device *dev,
+				      struct pci_pwrctrl *pwrctrl)
 {
 	int ret;
 
-	ret = pci_pwrctl_device_set_ready(pwrctl);
+	ret = pci_pwrctrl_device_set_ready(pwrctrl);
 	if (ret)
 		return ret;
 
 	return devm_add_action_or_reset(dev,
-					devm_pci_pwrctl_device_unset_ready,
-					pwrctl);
+					devm_pci_pwrctrl_device_unset_ready,
+					pwrctrl);
 }
-EXPORT_SYMBOL_GPL(devm_pci_pwrctl_device_set_ready);
+EXPORT_SYMBOL_GPL(devm_pci_pwrctrl_device_set_ready);
 
 MODULE_AUTHOR("Bartosz Golaszewski <bartosz.golaszewski@linaro.org>");
 MODULE_DESCRIPTION("PCI Device Power Control core driver");
diff --git a/drivers/pci/pwrctrl/pci-pwrctrl-pwrseq.c b/drivers/pci/pwrctrl/pci-pwrctrl-pwrseq.c
index cc19ad61dd6e..e9f89866b7c2 100644
--- a/drivers/pci/pwrctrl/pci-pwrctrl-pwrseq.c
+++ b/drivers/pci/pwrctrl/pci-pwrctrl-pwrseq.c
@@ -13,21 +13,21 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 
-struct pci_pwrctl_pwrseq_data {
-	struct pci_pwrctl ctx;
+struct pci_pwrctrl_pwrseq_data {
+	struct pci_pwrctrl ctx;
 	struct pwrseq_desc *pwrseq;
 };
 
-static void devm_pci_pwrctl_pwrseq_power_off(void *data)
+static void devm_pci_pwrctrl_pwrseq_power_off(void *data)
 {
 	struct pwrseq_desc *pwrseq = data;
 
 	pwrseq_power_off(pwrseq);
 }
 
-static int pci_pwrctl_pwrseq_probe(struct platform_device *pdev)
+static int pci_pwrctrl_pwrseq_probe(struct platform_device *pdev)
 {
-	struct pci_pwrctl_pwrseq_data *data;
+	struct pci_pwrctrl_pwrseq_data *data;
 	struct device *dev = &pdev->dev;
 	int ret;
 
@@ -45,22 +45,22 @@ static int pci_pwrctl_pwrseq_probe(struct platform_device *pdev)
 		return dev_err_probe(dev, ret,
 				     "Failed to power-on the device\n");
 
-	ret = devm_add_action_or_reset(dev, devm_pci_pwrctl_pwrseq_power_off,
+	ret = devm_add_action_or_reset(dev, devm_pci_pwrctrl_pwrseq_power_off,
 				       data->pwrseq);
 	if (ret)
 		return ret;
 
-	pci_pwrctl_init(&data->ctx, dev);
+	pci_pwrctrl_init(&data->ctx, dev);
 
-	ret = devm_pci_pwrctl_device_set_ready(dev, &data->ctx);
+	ret = devm_pci_pwrctrl_device_set_ready(dev, &data->ctx);
 	if (ret)
 		return dev_err_probe(dev, ret,
-				     "Failed to register the pwrctl wrapper\n");
+				     "Failed to register the pwrctrl wrapper\n");
 
 	return 0;
 }
 
-static const struct of_device_id pci_pwrctl_pwrseq_of_match[] = {
+static const struct of_device_id pci_pwrctrl_pwrseq_of_match[] = {
 	{
 		/* ATH11K in QCA6390 package. */
 		.compatible = "pci17cb,1101",
@@ -78,16 +78,16 @@ static const struct of_device_id pci_pwrctl_pwrseq_of_match[] = {
 	},
 	{ }
 };
-MODULE_DEVICE_TABLE(of, pci_pwrctl_pwrseq_of_match);
+MODULE_DEVICE_TABLE(of, pci_pwrctrl_pwrseq_of_match);
 
-static struct platform_driver pci_pwrctl_pwrseq_driver = {
+static struct platform_driver pci_pwrctrl_pwrseq_driver = {
 	.driver = {
-		.name = "pci-pwrctl-pwrseq",
-		.of_match_table = pci_pwrctl_pwrseq_of_match,
+		.name = "pci-pwrctrl-pwrseq",
+		.of_match_table = pci_pwrctrl_pwrseq_of_match,
 	},
-	.probe = pci_pwrctl_pwrseq_probe,
+	.probe = pci_pwrctrl_pwrseq_probe,
 };
-module_platform_driver(pci_pwrctl_pwrseq_driver);
+module_platform_driver(pci_pwrctrl_pwrseq_driver);
 
 MODULE_AUTHOR("Bartosz Golaszewski <bartosz.golaszewski@linaro.org>");
 MODULE_DESCRIPTION("Generic PCI Power Control module for power sequenced devices");
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 3dd9b3024956..27ef371a8443 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -17,7 +17,7 @@ static void pci_free_resources(struct pci_dev *dev)
 	}
 }
 
-static void pci_pwrctl_unregister(struct device *dev)
+static void pci_pwrctrl_unregister(struct device *dev)
 {
 	struct platform_device *pdev;
 
@@ -34,7 +34,7 @@ static void pci_stop_dev(struct pci_dev *dev)
 	pci_pme_active(dev, false);
 
 	if (pci_dev_is_added(dev)) {
-		pci_pwrctl_unregister(&dev->dev);
+		pci_pwrctrl_unregister(&dev->dev);
 		device_release_driver(&dev->dev);
 		pci_proc_detach_device(dev);
 		pci_remove_sysfs_dev_files(dev);
diff --git a/include/linux/pci-pwrctrl.h b/include/linux/pci-pwrctrl.h
index 0d23dddf59ec..7d439b0675e9 100644
--- a/include/linux/pci-pwrctrl.h
+++ b/include/linux/pci-pwrctrl.h
@@ -3,8 +3,8 @@
  * Copyright (C) 2024 Linaro Ltd.
  */
 
-#ifndef __PCI_PWRCTL_H__
-#define __PCI_PWRCTL_H__
+#ifndef __PCI_PWRCTRL_H__
+#define __PCI_PWRCTRL_H__
 
 #include <linux/notifier.h>
 #include <linux/workqueue.h>
@@ -29,14 +29,14 @@ struct device_link;
  */
 
 /**
- * struct pci_pwrctl - PCI device power control context.
+ * struct pci_pwrctrl - PCI device power control context.
  * @dev: Address of the power controlling device.
  *
  * An object of this type must be allocated by the PCI power control device and
- * passed to the pwrctl subsystem to trigger a bus rescan and setup a device
+ * passed to the pwrctrl subsystem to trigger a bus rescan and setup a device
  * link with the device once it's up.
  */
-struct pci_pwrctl {
+struct pci_pwrctrl {
 	struct device *dev;
 
 	/* Private: don't use. */
@@ -45,10 +45,10 @@ struct pci_pwrctl {
 	struct work_struct work;
 };
 
-void pci_pwrctl_init(struct pci_pwrctl *pwrctl, struct device *dev);
-int pci_pwrctl_device_set_ready(struct pci_pwrctl *pwrctl);
-void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl);
-int devm_pci_pwrctl_device_set_ready(struct device *dev,
-				     struct pci_pwrctl *pwrctl);
+void pci_pwrctrl_init(struct pci_pwrctrl *pwrctrl, struct device *dev);
+int pci_pwrctrl_device_set_ready(struct pci_pwrctrl *pwrctrl);
+void pci_pwrctrl_device_unset_ready(struct pci_pwrctrl *pwrctrl);
+int devm_pci_pwrctrl_device_set_ready(struct device *dev,
+				     struct pci_pwrctrl *pwrctrl);
 
-#endif /* __PCI_PWRCTL_H__ */
+#endif /* __PCI_PWRCTRL_H__ */
-- 
cgit v1.2.3


From 3273d8ad947dea925a65a78ca29e5351c960c801 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Fri, 8 Nov 2024 09:25:54 +0800
Subject: f2fs: fix to do cast in F2FS_{BLK_TO_BYTES, BTYES_TO_BLK} to avoid
 overflow

It missed to cast variable to unsigned long long type before
bit shift, which will cause overflow, fix it.

Fixes: f7ef9b83b583 ("f2fs: introduce macros to convert bytes and blocks in f2fs")
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/super.c         | 2 +-
 include/linux/f2fs_fs.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 7bcd6471af9a..0097a5770fa3 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3344,7 +3344,7 @@ loff_t max_file_blocks(struct inode *inode)
 	 * fit within U32_MAX + 1 data units.
 	 */
 
-	result = min(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096));
+	result = umin(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096));
 
 	return result;
 }
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index b0b821edfd97..3b2ad444c002 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -24,10 +24,10 @@
 #define NEW_ADDR		((block_t)-1)	/* used as block_t addresses */
 #define COMPRESS_ADDR		((block_t)-2)	/* used as compressed data flag */
 
-#define F2FS_BYTES_TO_BLK(bytes)	((bytes) >> F2FS_BLKSIZE_BITS)
-#define F2FS_BLK_TO_BYTES(blk)		((blk) << F2FS_BLKSIZE_BITS)
+#define F2FS_BYTES_TO_BLK(bytes)	((unsigned long long)(bytes) >> F2FS_BLKSIZE_BITS)
+#define F2FS_BLK_TO_BYTES(blk)		((unsigned long long)(blk) << F2FS_BLKSIZE_BITS)
 #define F2FS_BLK_END_BYTES(blk)		(F2FS_BLK_TO_BYTES(blk + 1) - 1)
-#define F2FS_BLK_ALIGN(x)			(F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1))
+#define F2FS_BLK_ALIGN(x)		(F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1))
 
 /* 0, 1(node nid), 2(meta nid) are reserved node id */
 #define F2FS_RESERVED_NODE_NUM		3
-- 
cgit v1.2.3


From 77569f785c8624fa4189795fb52e635a973672e5 Mon Sep 17 00:00:00 2001
From: Zhiguo Niu <zhiguo.niu@unisoc.com>
Date: Fri, 8 Nov 2024 09:25:56 +0800
Subject: f2fs: fix to adjust appropriate length for fiemap

If user give a file size as "length" parameter for fiemap
operations, but if this size is non-block size aligned,
it will show 2 segments fiemap results even this whole file
is contiguous on disk, such as the following results:

 ./f2fs_io fiemap 0 19034 ylog/analyzer.py
Fiemap: offset = 0 len = 19034
        logical addr.    physical addr.   length           flags
0       0000000000000000 0000000020baa000 0000000000004000 00001000
1       0000000000004000 0000000020bae000 0000000000001000 00001001

after this patch:
./f2fs_io fiemap 0 19034 ylog/analyzer.py
Fiemap: offset = 0 len = 19034
    logical addr.    physical addr.   length           flags
0    0000000000000000 00000000315f3000 0000000000005000 00001001

Signed-off-by: Zhiguo Niu <zhiguo.niu@unisoc.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c          | 6 +++---
 include/linux/f2fs_fs.h | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 0e8390cbdb5b..69f1cb0490ee 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1938,12 +1938,12 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			goto out;
 	}
 
-	if (F2FS_BYTES_TO_BLK(len) == 0)
-		len = F2FS_BLKSIZE;
-
 	start_blk = F2FS_BYTES_TO_BLK(start);
 	last_blk = F2FS_BYTES_TO_BLK(start + len - 1);
 
+	if (len & F2FS_BLKSIZE_MASK)
+		len = round_up(len, F2FS_BLKSIZE);
+
 next:
 	memset(&map, 0, sizeof(map));
 	map.m_lblk = start_blk;
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 3b2ad444c002..c24f8bc01045 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -24,6 +24,7 @@
 #define NEW_ADDR		((block_t)-1)	/* used as block_t addresses */
 #define COMPRESS_ADDR		((block_t)-2)	/* used as compressed data flag */
 
+#define F2FS_BLKSIZE_MASK		(F2FS_BLKSIZE - 1)
 #define F2FS_BYTES_TO_BLK(bytes)	((unsigned long long)(bytes) >> F2FS_BLKSIZE_BITS)
 #define F2FS_BLK_TO_BYTES(blk)		((unsigned long long)(blk) << F2FS_BLKSIZE_BITS)
 #define F2FS_BLK_END_BYTES(blk)		(F2FS_BLK_TO_BYTES(blk + 1) - 1)
-- 
cgit v1.2.3


From 7d2f9f870f26798699585f96fa7a2a2cab38dc02 Mon Sep 17 00:00:00 2001
From: Guixin Liu <kanie@linux.alibaba.com>
Date: Wed, 20 Nov 2024 15:10:07 +0800
Subject: nvme: introduce change ptpl and iekey definition

This is for the next tuning pr code more readble patch, make
linux/nvme.h's changes separately.

Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 2baf9a80b470..13377dde4527 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -2171,4 +2171,13 @@ enum nvme_pr_release_action {
 	NVME_PR_RELEASE_ACT_CLEAR		= 1,
 };
 
+enum nvme_pr_change_ptpl {
+	NVME_PR_CPTPL_NO_CHANGE			= 0,
+	NVME_PR_CPTPL_RESV			= 1 << 30,
+	NVME_PR_CPTPL_CLEARED			= 2 << 30,
+	NVME_PR_CPTPL_PERSIST			= 3 << 30,
+};
+
+#define NVME_PR_IGNORE_KEY (1 << 3)
+
 #endif /* _LINUX_NVME_H */
-- 
cgit v1.2.3


From 64214c2b95364d26cdff045d8bbefd37380edbe1 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 14 Nov 2024 15:55:30 -0400
Subject: iommu: Add ops->domain_alloc_nested()

It turns out all the drivers that are using this immediately call into
another function, so just make that function directly into the op. This
makes paging=NULL for domain_alloc_user and we can remove the argument in
the next patch.

The function mirrors the similar op in the viommu that allocates a nested
domain on top of the viommu's nesting parent. This version supports cases
where a viommu is not being used.

Link: https://patch.msgid.link/r/1-v1-c252ebdeb57b+329-iommu_paging_flags_jgg@nvidia.com
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/intel/iommu.c          |  9 +++------
 drivers/iommu/intel/iommu.h          |  6 ++++--
 drivers/iommu/intel/nested.c         | 11 +++++++++--
 drivers/iommu/iommufd/hw_pagetable.c |  8 ++++----
 drivers/iommu/iommufd/selftest.c     |  7 ++++---
 include/linux/iommu.h                |  9 +++++----
 6 files changed, 29 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 527f6f89d8a1..6f11a075114f 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3340,12 +3340,8 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
 	struct iommu_domain *domain;
 	bool first_stage;
 
-	/* Must be NESTING domain */
-	if (parent) {
-		if (!nested_supported(iommu) || flags)
-			return ERR_PTR(-EOPNOTSUPP);
-		return intel_nested_domain_alloc(parent, user_data);
-	}
+	if (parent)
+		return ERR_PTR(-EOPNOTSUPP);
 
 	if (flags &
 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING
@@ -4475,6 +4471,7 @@ const struct iommu_ops intel_iommu_ops = {
 	.domain_alloc_user	= intel_iommu_domain_alloc_user,
 	.domain_alloc_sva	= intel_svm_domain_alloc,
 	.domain_alloc_paging	= intel_iommu_domain_alloc_paging,
+	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
 	.probe_device		= intel_iommu_probe_device,
 	.release_device		= intel_iommu_release_device,
 	.get_resv_regions	= intel_iommu_get_resv_regions,
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 2cca094c259d..6ea7bbe26b19 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -1265,8 +1265,10 @@ int __domain_setup_first_level(struct intel_iommu *iommu,
 int dmar_ir_support(void);
 
 void iommu_flush_write_buffer(struct intel_iommu *iommu);
-struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
-					       const struct iommu_user_data *user_data);
+struct iommu_domain *
+intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
+				u32 flags,
+				const struct iommu_user_data *user_data);
 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);
 
 enum cache_tag_type {
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
index 42c4533a6ea2..aba92c00b427 100644
--- a/drivers/iommu/intel/nested.c
+++ b/drivers/iommu/intel/nested.c
@@ -186,14 +186,21 @@ static const struct iommu_domain_ops intel_nested_domain_ops = {
 	.cache_invalidate_user	= intel_nested_cache_invalidate_user,
 };
 
-struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
-					       const struct iommu_user_data *user_data)
+struct iommu_domain *
+intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
+				u32 flags,
+				const struct iommu_user_data *user_data)
 {
+	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	struct dmar_domain *s2_domain = to_dmar_domain(parent);
+	struct intel_iommu *iommu = info->iommu;
 	struct iommu_hwpt_vtd_s1 vtd;
 	struct dmar_domain *domain;
 	int ret;
 
+	if (!nested_supported(iommu) || flags)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	/* Must be nested domain */
 	if (user_data->type != IOMMU_HWPT_DATA_VTD_S1)
 		return ERR_PTR(-EOPNOTSUPP);
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 9236e8ca9aa8..ec3c64a8c796 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -227,7 +227,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
 	int rc;
 
 	if ((flags & ~IOMMU_HWPT_FAULT_ID_VALID) ||
-	    !user_data->len || !ops->domain_alloc_user)
+	    !user_data->len || !ops->domain_alloc_nested)
 		return ERR_PTR(-EOPNOTSUPP);
 	if (parent->auto_domain || !parent->nest_parent ||
 	    parent->common.domain->owner != ops)
@@ -242,9 +242,9 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
 	refcount_inc(&parent->common.obj.users);
 	hwpt_nested->parent = parent;
 
-	hwpt->domain = ops->domain_alloc_user(idev->dev,
-					      flags & ~IOMMU_HWPT_FAULT_ID_VALID,
-					      parent->common.domain, user_data);
+	hwpt->domain = ops->domain_alloc_nested(
+		idev->dev, parent->common.domain,
+		flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data);
 	if (IS_ERR(hwpt->domain)) {
 		rc = PTR_ERR(hwpt->domain);
 		hwpt->domain = NULL;
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 2f9de177dffc..c58083c3660a 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -356,8 +356,8 @@ __mock_domain_alloc_nested(const struct iommu_user_data *user_data)
 }
 
 static struct iommu_domain *
-mock_domain_alloc_nested(struct iommu_domain *parent, u32 flags,
-			 const struct iommu_user_data *user_data)
+mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
+			 u32 flags, const struct iommu_user_data *user_data)
 {
 	struct mock_iommu_domain_nested *mock_nested;
 	struct mock_iommu_domain *mock_parent;
@@ -391,7 +391,7 @@ mock_domain_alloc_user(struct device *dev, u32 flags,
 	struct iommu_domain *domain;
 
 	if (parent)
-		return mock_domain_alloc_nested(parent, flags, user_data);
+		return ERR_PTR(-EOPNOTSUPP);
 
 	if (user_data)
 		return ERR_PTR(-EOPNOTSUPP);
@@ -719,6 +719,7 @@ static const struct iommu_ops mock_ops = {
 	.hw_info = mock_domain_hw_info,
 	.domain_alloc_paging = mock_domain_alloc_paging,
 	.domain_alloc_user = mock_domain_alloc_user,
+	.domain_alloc_nested = mock_domain_alloc_nested,
 	.capable = mock_domain_capable,
 	.device_group = generic_device_group,
 	.probe_device = mock_probe_device,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d6aaaec3caf4..0472cc124519 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -559,15 +559,13 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size,
  *                the caller iommu_domain_alloc() returns.
  * @domain_alloc_user: Allocate an iommu domain corresponding to the input
  *                     parameters as defined in include/uapi/linux/iommufd.h.
- *                     Upon success, if the @user_data is valid and the @parent
- *                     points to a kernel-managed domain, the new domain must be
- *                     IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be
- *                     NULL while the @user_data can be optionally provided, the
+ *                     The @user_data can be optionally provided, the
  *                     new domain must support __IOMMU_DOMAIN_PAGING.
  *                     Upon failure, ERR_PTR must be returned.
  * @domain_alloc_paging: Allocate an iommu_domain that can be used for
  *                       UNMANAGED, DMA, and DMA_FQ domain types.
  * @domain_alloc_sva: Allocate an iommu_domain for Shared Virtual Addressing.
+ * @domain_alloc_nested: Allocate an iommu_domain for nested translation.
  * @probe_device: Add device to iommu driver handling
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
@@ -622,6 +620,9 @@ struct iommu_ops {
 	struct iommu_domain *(*domain_alloc_paging)(struct device *dev);
 	struct iommu_domain *(*domain_alloc_sva)(struct device *dev,
 						 struct mm_struct *mm);
+	struct iommu_domain *(*domain_alloc_nested)(
+		struct device *dev, struct iommu_domain *parent, u32 flags,
+		const struct iommu_user_data *user_data);
 
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
-- 
cgit v1.2.3


From d53764723ecd639a0cc0c5ad24146847fc09f78d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 14 Nov 2024 15:55:31 -0400
Subject: iommu: Rename ops->domain_alloc_user() to domain_alloc_paging_flags()

Now that the main domain allocating path is calling this function it
doesn't make sense to leave it named _user. Change the name to
alloc_paging_flags() to mirror the new iommu_paging_domain_alloc_flags()
function.

A driver should implement only one of ops->domain_alloc_paging() or
ops->domain_alloc_paging_flags(). The former is a simpler interface with
less boiler plate that the majority of drivers use. The latter is for
drivers with a greater feature set (PASID, multiple page table support,
advanced iommufd support, nesting, etc). Additional patches will be needed
to achieve this.

Link: https://patch.msgid.link/r/2-v1-c252ebdeb57b+329-iommu_paging_flags_jgg@nvidia.com
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/amd/iommu.c                   |  9 ++++-----
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  9 ++++-----
 drivers/iommu/intel/iommu.c                 | 10 +++-------
 drivers/iommu/iommu.c                       |  4 ++--
 drivers/iommu/iommufd/hw_pagetable.c        |  8 ++++----
 drivers/iommu/iommufd/selftest.c            | 10 +++-------
 include/linux/iommu.h                       | 20 ++++++++++++--------
 7 files changed, 32 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 5ce8e6504ba7..3f691e1fd22c 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2407,9 +2407,8 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
 }
 
 static struct iommu_domain *
-amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
-			    struct iommu_domain *parent,
-			    const struct iommu_user_data *user_data)
+amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
+				    const struct iommu_user_data *user_data)
 
 {
 	unsigned int type = IOMMU_DOMAIN_UNMANAGED;
@@ -2420,7 +2419,7 @@ amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
 	if (dev)
 		iommu = get_amd_iommu_from_dev(dev);
 
-	if ((flags & ~supported_flags) || parent || user_data)
+	if ((flags & ~supported_flags) || user_data)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	/* Allocate domain with v2 page table if IOMMU supports PASID. */
@@ -2884,7 +2883,7 @@ const struct iommu_ops amd_iommu_ops = {
 	.release_domain = &release_domain,
 	.identity_domain = &identity_domain.domain,
 	.domain_alloc = amd_iommu_domain_alloc,
-	.domain_alloc_user = amd_iommu_domain_alloc_user,
+	.domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags,
 	.domain_alloc_sva = amd_iommu_domain_alloc_sva,
 	.probe_device = amd_iommu_probe_device,
 	.release_device = amd_iommu_release_device,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 04630dbfedd9..e4ebd9e12ad4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3132,9 +3132,8 @@ static struct iommu_domain arm_smmu_blocked_domain = {
 };
 
 static struct iommu_domain *
-arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
-			   struct iommu_domain *parent,
-			   const struct iommu_user_data *user_data)
+arm_smmu_domain_alloc_paging_flags(struct device *dev, u32 flags,
+				   const struct iommu_user_data *user_data)
 {
 	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
 	const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
@@ -3145,7 +3144,7 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
 
 	if (flags & ~PAGING_FLAGS)
 		return ERR_PTR(-EOPNOTSUPP);
-	if (parent || user_data)
+	if (user_data)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	if (flags & IOMMU_HWPT_ALLOC_PASID)
@@ -3546,7 +3545,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.hw_info		= arm_smmu_hw_info,
 	.domain_alloc_paging    = arm_smmu_domain_alloc_paging,
 	.domain_alloc_sva       = arm_smmu_sva_domain_alloc,
-	.domain_alloc_user	= arm_smmu_domain_alloc_user,
+	.domain_alloc_paging_flags = arm_smmu_domain_alloc_paging_flags,
 	.probe_device		= arm_smmu_probe_device,
 	.release_device		= arm_smmu_release_device,
 	.device_group		= arm_smmu_device_group,
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 6f11a075114f..7d0acb74d5a5 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3328,9 +3328,8 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st
 }
 
 static struct iommu_domain *
-intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
-			      struct iommu_domain *parent,
-			      const struct iommu_user_data *user_data)
+intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
+				      const struct iommu_user_data *user_data)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
@@ -3340,9 +3339,6 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
 	struct iommu_domain *domain;
 	bool first_stage;
 
-	if (parent)
-		return ERR_PTR(-EOPNOTSUPP);
-
 	if (flags &
 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING
 	       | IOMMU_HWPT_FAULT_ID_VALID)))
@@ -4468,7 +4464,7 @@ const struct iommu_ops intel_iommu_ops = {
 	.identity_domain	= &identity_domain,
 	.capable		= intel_iommu_capable,
 	.hw_info		= intel_iommu_hw_info,
-	.domain_alloc_user	= intel_iommu_domain_alloc_user,
+	.domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
 	.domain_alloc_sva	= intel_svm_domain_alloc,
 	.domain_alloc_paging	= intel_iommu_domain_alloc_paging,
 	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 7618e9c65d3f..9bc0c74cca3c 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1987,8 +1987,8 @@ __iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type,
 
 	if (ops->domain_alloc_paging && !flags)
 		domain = ops->domain_alloc_paging(dev);
-	else if (ops->domain_alloc_user)
-		domain = ops->domain_alloc_user(dev, flags, NULL, NULL);
+	else if (ops->domain_alloc_paging_flags)
+		domain = ops->domain_alloc_paging_flags(dev, flags, NULL);
 	else if (ops->domain_alloc && !flags)
 		domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED);
 	else
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index ec3c64a8c796..ce03c3804651 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -119,7 +119,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 
 	lockdep_assert_held(&ioas->mutex);
 
-	if ((flags || user_data) && !ops->domain_alloc_user)
+	if ((flags || user_data) && !ops->domain_alloc_paging_flags)
 		return ERR_PTR(-EOPNOTSUPP);
 	if (flags & ~valid_flags)
 		return ERR_PTR(-EOPNOTSUPP);
@@ -139,9 +139,9 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 	hwpt_paging->ioas = ioas;
 	hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
 
-	if (ops->domain_alloc_user) {
-		hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL,
-						      user_data);
+	if (ops->domain_alloc_paging_flags) {
+		hwpt->domain = ops->domain_alloc_paging_flags(idev->dev, flags,
+							      user_data);
 		if (IS_ERR(hwpt->domain)) {
 			rc = PTR_ERR(hwpt->domain);
 			hwpt->domain = NULL;
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index c58083c3660a..a0de6d6d4e68 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -379,9 +379,8 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
 }
 
 static struct iommu_domain *
-mock_domain_alloc_user(struct device *dev, u32 flags,
-		       struct iommu_domain *parent,
-		       const struct iommu_user_data *user_data)
+mock_domain_alloc_paging_flags(struct device *dev, u32 flags,
+			       const struct iommu_user_data *user_data)
 {
 	bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
 	const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
@@ -390,9 +389,6 @@ mock_domain_alloc_user(struct device *dev, u32 flags,
 			    MOCK_FLAGS_DEVICE_NO_DIRTY;
 	struct iommu_domain *domain;
 
-	if (parent)
-		return ERR_PTR(-EOPNOTSUPP);
-
 	if (user_data)
 		return ERR_PTR(-EOPNOTSUPP);
 	if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
@@ -718,7 +714,7 @@ static const struct iommu_ops mock_ops = {
 	.pgsize_bitmap = MOCK_IO_PAGE_SIZE,
 	.hw_info = mock_domain_hw_info,
 	.domain_alloc_paging = mock_domain_alloc_paging,
-	.domain_alloc_user = mock_domain_alloc_user,
+	.domain_alloc_paging_flags = mock_domain_alloc_paging_flags,
 	.domain_alloc_nested = mock_domain_alloc_nested,
 	.capable = mock_domain_capable,
 	.device_group = generic_device_group,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 0472cc124519..1e3308e89996 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -557,13 +557,17 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size,
  * @domain_alloc: allocate and return an iommu domain if success. Otherwise
  *                NULL is returned. The domain is not fully initialized until
  *                the caller iommu_domain_alloc() returns.
- * @domain_alloc_user: Allocate an iommu domain corresponding to the input
- *                     parameters as defined in include/uapi/linux/iommufd.h.
- *                     The @user_data can be optionally provided, the
- *                     new domain must support __IOMMU_DOMAIN_PAGING.
- *                     Upon failure, ERR_PTR must be returned.
+ * @domain_alloc_paging_flags: Allocate an iommu domain corresponding to the
+ *                     input parameters as defined in
+ *                     include/uapi/linux/iommufd.h. The @user_data can be
+ *                     optionally provided, the new domain must support
+ *                     __IOMMU_DOMAIN_PAGING. Upon failure, ERR_PTR must be
+ *                     returned.
  * @domain_alloc_paging: Allocate an iommu_domain that can be used for
- *                       UNMANAGED, DMA, and DMA_FQ domain types.
+ *                       UNMANAGED, DMA, and DMA_FQ domain types. This is the
+ *                       same as invoking domain_alloc_paging_flags() with
+ *                       @flags=0, @user_data=NULL. A driver should implement
+ *                       only one of the two ops.
  * @domain_alloc_sva: Allocate an iommu_domain for Shared Virtual Addressing.
  * @domain_alloc_nested: Allocate an iommu_domain for nested translation.
  * @probe_device: Add device to iommu driver handling
@@ -614,8 +618,8 @@ struct iommu_ops {
 
 	/* Domain allocation and freeing by the iommu driver */
 	struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
-	struct iommu_domain *(*domain_alloc_user)(
-		struct device *dev, u32 flags, struct iommu_domain *parent,
+	struct iommu_domain *(*domain_alloc_paging_flags)(
+		struct device *dev, u32 flags,
 		const struct iommu_user_data *user_data);
 	struct iommu_domain *(*domain_alloc_paging)(struct device *dev);
 	struct iommu_domain *(*domain_alloc_sva)(struct device *dev,
-- 
cgit v1.2.3


From 9d8a2b033db179bef9b6b5bad492f611a0fe89b7 Mon Sep 17 00:00:00 2001
From: Miao Wang <shankerwangmiao@gmail.com>
Date: Thu, 21 Nov 2024 22:25:21 +0800
Subject: ACPI: introduce acpi_arch_init()

To avoid arch-specific code in general ACPI initialization flow,
introduce a weak symbol acpi_arch_init().

Currently, arm64 and riscv can utillize this to insert their
arch-specific flow.

In the future, other architectures can also have a chance to define
their own arch-specific ACPI initialization process if necessary.

Reviewed-by: Sunil V L <sunilvl@ventanamicro.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Miao Wang <shankerwangmiao@gmail.com>
Link: https://patch.msgid.link/20241121-intro-acpi-arch-init-v4-1-b1fb517e7d8b@gmail.com
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/arm64/init.c |  2 +-
 drivers/acpi/bus.c        |  5 +++--
 drivers/acpi/riscv/init.c |  2 +-
 include/linux/acpi.h      | 12 +-----------
 4 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/init.c b/drivers/acpi/arm64/init.c
index d0c8aed90fd1..7a47d8095a7d 100644
--- a/drivers/acpi/arm64/init.c
+++ b/drivers/acpi/arm64/init.c
@@ -2,7 +2,7 @@
 #include <linux/acpi.h>
 #include "init.h"
 
-void __init acpi_arm_init(void)
+void __init acpi_arch_init(void)
 {
 	if (IS_ENABLED(CONFIG_ACPI_AGDI))
 		acpi_agdi_init();
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 16917dc3ad60..058910af82bc 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -1434,6 +1434,8 @@ static int __init acpi_bus_init(void)
 struct kobject *acpi_kobj;
 EXPORT_SYMBOL_GPL(acpi_kobj);
 
+void __weak __init acpi_arch_init(void) { }
+
 static int __init acpi_init(void)
 {
 	int result;
@@ -1461,8 +1463,7 @@ static int __init acpi_init(void)
 	acpi_viot_early_init();
 	acpi_hest_init();
 	acpi_ghes_init();
-	acpi_arm_init();
-	acpi_riscv_init();
+	acpi_arch_init();
 	acpi_scan_init();
 	acpi_ec_init();
 	acpi_debugfs_init();
diff --git a/drivers/acpi/riscv/init.c b/drivers/acpi/riscv/init.c
index 5ef97905a727..673e4d5dd752 100644
--- a/drivers/acpi/riscv/init.c
+++ b/drivers/acpi/riscv/init.c
@@ -7,7 +7,7 @@
 #include <linux/acpi.h>
 #include "init.h"
 
-void __init acpi_riscv_init(void)
+void __init acpi_arch_init(void)
 {
 	riscv_acpi_init_gsi_mapping();
 }
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 7dd24acd9ffe..05f39fbfa485 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1530,17 +1530,7 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu)
 }
 #endif
 
-#ifdef CONFIG_ARM64
-void acpi_arm_init(void);
-#else
-static inline void acpi_arm_init(void) { }
-#endif
-
-#ifdef CONFIG_RISCV
-void acpi_riscv_init(void);
-#else
-static inline void acpi_riscv_init(void) { }
-#endif
+void acpi_arch_init(void);
 
 #ifdef CONFIG_ACPI_PCC
 void acpi_init_pcc(void);
-- 
cgit v1.2.3


From 0172afefbfbdd8987787c926b40b68400bd1c3d1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 22 Nov 2024 21:28:49 +0100
Subject: tracing: Record task flag NEED_RESCHED_LAZY.

The scheduler added NEED_RESCHED_LAZY scheduling. Record this state as
part of trace flags and expose it in the need_resched field.

Record and expose NEED_RESCHED_LAZY.

[bigeasy: Commit description, documentation bits.]

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20241122202849.7DfYpJR0@linutronix.de
Reviewed-by: Ankur Arora <ankur.a.arora@oracle.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.rst |  4 ++++
 include/linux/trace_events.h   |  1 +
 kernel/trace/trace.c           |  2 ++
 kernel/trace/trace_output.c    | 14 +++++++++++++-
 4 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 74d5bd801b1a..272464bb7c60 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -1033,9 +1033,13 @@ explains which is which.
   irqs-off: 'd' interrupts are disabled. '.' otherwise.
 
   need-resched:
+	- 'B' all, TIF_NEED_RESCHED, PREEMPT_NEED_RESCHED and TIF_RESCHED_LAZY is set,
 	- 'N' both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED is set,
 	- 'n' only TIF_NEED_RESCHED is set,
 	- 'p' only PREEMPT_NEED_RESCHED is set,
+	- 'L' both PREEMPT_NEED_RESCHED and TIF_RESCHED_LAZY is set,
+	- 'b' both TIF_NEED_RESCHED and TIF_RESCHED_LAZY is set,
+	- 'l' only TIF_RESCHED_LAZY is set
 	- '.' otherwise.
 
   hardirq/softirq:
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 016b29a56c87..2a5df5b62cfc 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -184,6 +184,7 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status);
 
 enum trace_flag_type {
 	TRACE_FLAG_IRQS_OFF		= 0x01,
+	TRACE_FLAG_NEED_RESCHED_LAZY	= 0x02,
 	TRACE_FLAG_NEED_RESCHED		= 0x04,
 	TRACE_FLAG_HARDIRQ		= 0x08,
 	TRACE_FLAG_SOFTIRQ		= 0x10,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3ef047ed9705..be62f0ea1814 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2552,6 +2552,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
 		trace_flags |= TRACE_FLAG_NEED_RESCHED;
 	if (test_preempt_need_resched())
 		trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
+	if (IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY) && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+		trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY;
 	return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
 		(min_t(unsigned int, migration_disable_value(), 0xf)) << 4;
 }
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e08aee34ef63..da748b7cbc4d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -462,17 +462,29 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 		bh_off ? 'b' :
 		'.';
 
-	switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
+	switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY |
 				TRACE_FLAG_PREEMPT_RESCHED)) {
+	case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED:
+		need_resched = 'B';
+		break;
 	case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
 		need_resched = 'N';
 		break;
+	case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED:
+		need_resched = 'L';
+		break;
+	case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY:
+		need_resched = 'b';
+		break;
 	case TRACE_FLAG_NEED_RESCHED:
 		need_resched = 'n';
 		break;
 	case TRACE_FLAG_PREEMPT_RESCHED:
 		need_resched = 'p';
 		break;
+	case TRACE_FLAG_NEED_RESCHED_LAZY:
+		need_resched = 'l';
+		break;
 	default:
 		need_resched = '.';
 		break;
-- 
cgit v1.2.3


From 306d40aa53b671ea72c3bf272f85ccb6c1b0bc65 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Sat, 23 Nov 2024 10:30:27 -0500
Subject: tracing: Move it_func[0] comment to the relevant context

When introducing __DO_TRACE_CALL(), the iteration over it_func moved
from __DO_TRACE() to __tracepoint_iter_##_name(), but the comment
relevant for this iterator was left in its original location.

Move the comment to the relevant context.

Fixes: d25e37d89dd2 ("tracepoint: Optimize using static_call()")
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jordan Rife <jrife@google.com>
Link: https://lore.kernel.org/20241123153031.2884933-2-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 425123e921ac..d390e8cabf02 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -210,9 +210,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
 /*
- * it_func[0] is never NULL because there is at least one element in the array
- * when the array itself is non NULL.
- *
  * With @syscall=0, the tracepoint callback array dereference is
  * protected by disabling preemption.
  * With @syscall=1, the tracepoint callback array dereference is
@@ -316,6 +313,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  * We have no guarantee that gcc and the linker won't up-align the tracepoint
  * structures, so we create an array of pointers that will be used for iteration
  * on the tracepoints.
+ *
+ * it_func[0] is never NULL because there is at least one element in the array
+ * when the array itself is non NULL.
  */
 #define __DEFINE_TRACE_EXT(_name, _ext, proto, args)			\
 	static const char __tpstrtab_##_name[]				\
-- 
cgit v1.2.3


From 89c7e17f303e2d56d50a162bb65d786d3a43963e Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Sat, 23 Nov 2024 10:30:28 -0500
Subject: tracing: Remove __idx variable from __DO_TRACE

Since the removal of SRCU-protected tracepoints, the __idx variable in
__DO_TRACE is unused. Remove this variable.

Fixes: 48bcda684823 ("tracing: Remove definition of trace_*_rcuidle()")
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jordan Rife <jrife@google.com>
Link: https://lore.kernel.org/20241123153031.2884933-3-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index d390e8cabf02..867f3c1ac7dc 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -218,8 +218,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  */
 #define __DO_TRACE(name, args, cond, syscall)				\
 	do {								\
-		int __maybe_unused __idx = 0;				\
-									\
 		if (!(cond))						\
 			return;						\
 									\
-- 
cgit v1.2.3


From 7c565a4d4e437034755a06b7d61361add3b98882 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Sat, 23 Nov 2024 10:30:29 -0500
Subject: rcupdate_trace: Define rcu_tasks_trace lock guard

Define a rcu_tasks_trace lock guard for use by the syscall enter/exit
tracepoints.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jordan Rife <jrife@google.com>
Cc: linux-trace-kernel@vger.kernel.org
Link: https://lore.kernel.org/20241123153031.2884933-4-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rcupdate_trace.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h
index eda493200663..e6c44eb428ab 100644
--- a/include/linux/rcupdate_trace.h
+++ b/include/linux/rcupdate_trace.h
@@ -10,6 +10,7 @@
 
 #include <linux/sched.h>
 #include <linux/rcupdate.h>
+#include <linux/cleanup.h>
 
 extern struct lockdep_map rcu_trace_lock_map;
 
@@ -98,4 +99,8 @@ static inline void rcu_read_lock_trace(void) { BUG(); }
 static inline void rcu_read_unlock_trace(void) { BUG(); }
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 
+DEFINE_LOCK_GUARD_0(rcu_tasks_trace,
+	rcu_read_lock_trace(),
+	rcu_read_unlock_trace())
+
 #endif /* __LINUX_RCUPDATE_TRACE_H */
-- 
cgit v1.2.3


From 98bf0fbb65226809a8df4d1948c5b6cf0c91590f Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Sat, 23 Nov 2024 10:30:30 -0500
Subject: tracing: Remove conditional locking from __DO_TRACE()

Remove conditional locking by moving the __DO_TRACE() code into
trace_##name().

When the faultable syscall tracepoints were implemented, __DO_TRACE()
had a rcuidle argument which selected between SRCU and preempt disable.
Therefore, the RCU tasks trace protection for faultable syscall
tracepoints was introduced using the same pattern.

At that point, it did not appear obvious that this feedback from Linus [1]
applied here as well, because the __DO_TRACE() modification was
extending a pre-existing pattern.

Shortly before pulling the faultable syscall tracepoints modifications,
Steven removed the rcuidle argument and SRCU protection scheme entirely
from tracepoint.h:

commit 48bcda684823 ("tracing: Remove definition of trace_*_rcuidle()")

This required a rebase of the faultable syscall tracepoints series,
which missed a perfect opportunity to integrate the prior recommendation
from Linus.

In response to the pull request, Linus pointed out [2] that he was not
pleased by the implementation, expecting this to be fixed in a follow up
patch series.

Move __DO_TRACE() code into trace_##name() within each of
__DECLARE_TRACE() and __DECLARE_TRACE_SYSCALL(). Use a scoped guard
to guard the preempt disable notrace and RCU tasks trace critical
sections.

Link: https://lore.kernel.org/all/CAHk-=wggDLDeTKbhb5hh--x=-DQd69v41137M72m6NOTmbD-cw@mail.gmail.com/ [1]
Link: https://lore.kernel.org/lkml/CAHk-=witPrLcu22dZ93VCyRQonS7+-dFYhQbna=KBa-TAhayMw@mail.gmail.com/ [2]

Fixes: a363d27cdbc2 ("tracing: Allow system call tracepoints to handle page faults")
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jordan Rife <jrife@google.com>
Cc: linux-trace-kernel@vger.kernel.org
Link: https://lore.kernel.org/20241123153031.2884933-5-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 45 ++++++++++++---------------------------------
 1 file changed, 12 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 867f3c1ac7dc..832f49b56b1f 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -209,31 +209,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 #define __DO_TRACE_CALL(name, args)	__traceiter_##name(NULL, args)
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
-/*
- * With @syscall=0, the tracepoint callback array dereference is
- * protected by disabling preemption.
- * With @syscall=1, the tracepoint callback array dereference is
- * protected by Tasks Trace RCU, which allows probes to handle page
- * faults.
- */
-#define __DO_TRACE(name, args, cond, syscall)				\
-	do {								\
-		if (!(cond))						\
-			return;						\
-									\
-		if (syscall)						\
-			rcu_read_lock_trace();				\
-		else							\
-			preempt_disable_notrace();			\
-									\
-		__DO_TRACE_CALL(name, TP_ARGS(args));			\
-									\
-		if (syscall)						\
-			rcu_read_unlock_trace();			\
-		else							\
-			preempt_enable_notrace();			\
-	} while (0)
-
 /*
  * Make sure the alignment of the structure in the __tracepoints section will
  * not add unwanted padding between the beginning of the section and the
@@ -282,10 +257,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
 	static inline void trace_##name(proto)				\
 	{								\
-		if (static_branch_unlikely(&__tracepoint_##name.key))	\
-			__DO_TRACE(name,				\
-				TP_ARGS(args),				\
-				TP_CONDITION(cond), 0);			\
+		if (static_branch_unlikely(&__tracepoint_##name.key)) { \
+			if (cond) {					\
+				scoped_guard(preempt_notrace)		\
+					__DO_TRACE_CALL(name, TP_ARGS(args)); \
+			}						\
+		}							\
 		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
 			WARN_ONCE(!rcu_is_watching(),			\
 				  "RCU not watching for tracepoint");	\
@@ -297,10 +274,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	static inline void trace_##name(proto)				\
 	{								\
 		might_fault();						\
-		if (static_branch_unlikely(&__tracepoint_##name.key))	\
-			__DO_TRACE(name,				\
-				TP_ARGS(args),				\
-				TP_CONDITION(cond), 1);			\
+		if (static_branch_unlikely(&__tracepoint_##name.key)) {	\
+			if (cond) {					\
+				scoped_guard(rcu_tasks_trace)		\
+					__DO_TRACE_CALL(name, TP_ARGS(args)); \
+			}						\
+		}							\
 		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
 			WARN_ONCE(!rcu_is_watching(),			\
 				  "RCU not watching for tracepoint");	\
-- 
cgit v1.2.3


From ef0d4186083127d2f99ed04e051fd94ba061d253 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Sat, 23 Nov 2024 10:30:31 -0500
Subject: tracing: Remove cond argument from __DECLARE_TRACE_SYSCALL

Syscall tracepoints do not require a "cond" argument, because they are
meant to be used only for sys_enter and sys_exit instrumentation, which
don't require condition evaluation.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jordan Rife <jrife@google.com>
Cc: linux-trace-kernel@vger.kernel.org
Link: https://lore.kernel.org/20241123153031.2884933-6-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 832f49b56b1f..b2633a72e871 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -220,7 +220,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  * site if it is not watching, as it will need to be active when the
  * tracepoint is enabled.
  */
-#define __DECLARE_TRACE_COMMON(name, proto, args, cond, data_proto)	\
+#define __DECLARE_TRACE_COMMON(name, proto, args, data_proto)		\
 	extern int __traceiter_##name(data_proto);			\
 	DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name);	\
 	extern struct tracepoint __tracepoint_##name;			\
@@ -254,7 +254,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	}
 
 #define __DECLARE_TRACE(name, proto, args, cond, data_proto)		\
-	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
+	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \
 	static inline void trace_##name(proto)				\
 	{								\
 		if (static_branch_unlikely(&__tracepoint_##name.key)) { \
@@ -269,18 +269,16 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		}							\
 	}
 
-#define __DECLARE_TRACE_SYSCALL(name, proto, args, cond, data_proto)	\
-	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
+#define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto)		\
+	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \
 	static inline void trace_##name(proto)				\
 	{								\
 		might_fault();						\
 		if (static_branch_unlikely(&__tracepoint_##name.key)) {	\
-			if (cond) {					\
-				scoped_guard(rcu_tasks_trace)		\
-					__DO_TRACE_CALL(name, TP_ARGS(args)); \
-			}						\
+			scoped_guard(rcu_tasks_trace)			\
+				__DO_TRACE_CALL(name, TP_ARGS(args));	\
 		}							\
-		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
+		if (IS_ENABLED(CONFIG_LOCKDEP)) {			\
 			WARN_ONCE(!rcu_is_watching(),			\
 				  "RCU not watching for tracepoint");	\
 		}							\
@@ -363,7 +361,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 
 
 #else /* !TRACEPOINTS_ENABLED */
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto)		\
+#define __DECLARE_TRACE_COMMON(name, proto, args, data_proto)		\
 	static inline void trace_##name(proto)				\
 	{ }								\
 	static inline int						\
@@ -387,7 +385,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		return false;						\
 	}
 
-#define __DECLARE_TRACE_SYSCALL	__DECLARE_TRACE
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto)		\
+	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto))
+
+#define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto)		\
+	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto))
 
 #define DEFINE_TRACE_FN(name, reg, unreg, proto, args)
 #define DEFINE_TRACE_SYSCALL(name, reg, unreg, proto, args)
@@ -453,7 +455,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 
 #define DECLARE_TRACE_SYSCALL(name, proto, args)			\
 	__DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args),	\
-				cpu_online(raw_smp_processor_id()),	\
 				PARAMS(void *__data, proto))
 
 #define TRACE_EVENT_FLAGS(event, flag)
-- 
cgit v1.2.3


From 2bd9b57d04df417f31ef54448477c212fcdd14fc Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 25 Nov 2024 09:25:14 -0500
Subject: tracing: Use guard() rather than scoped_guard()

Using scoped_guard() in the implementation of trace_##name() adds an
unnecessary level of indentation.

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jordan Rife <jrife@google.com>
Link: https://lore.kernel.org/20241125142514.2897143-1-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index b2633a72e871..e398f6e43f61 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -259,8 +259,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	{								\
 		if (static_branch_unlikely(&__tracepoint_##name.key)) { \
 			if (cond) {					\
-				scoped_guard(preempt_notrace)		\
-					__DO_TRACE_CALL(name, TP_ARGS(args)); \
+				guard(preempt_notrace)();		\
+				__DO_TRACE_CALL(name, TP_ARGS(args));	\
 			}						\
 		}							\
 		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
@@ -275,8 +275,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	{								\
 		might_fault();						\
 		if (static_branch_unlikely(&__tracepoint_##name.key)) {	\
-			scoped_guard(rcu_tasks_trace)			\
-				__DO_TRACE_CALL(name, TP_ARGS(args));	\
+			guard(rcu_tasks_trace)();			\
+			__DO_TRACE_CALL(name, TP_ARGS(args));		\
 		}							\
 		if (IS_ENABLED(CONFIG_LOCKDEP)) {			\
 			WARN_ONCE(!rcu_is_watching(),			\
-- 
cgit v1.2.3


From 222974c6ec9d901f7ad13bbe6e505ec1f1d822d4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 25 Nov 2024 18:45:33 -0800
Subject: iommu: remove stale declaration left over by a merge conflict

The merge commit ae3325f752ef ("Merge branches 'arm/smmu', 'mediatek',
's390', 'ti/omap', 'riscv' and 'core' into next") left a stale
declaration of 'iommu_present()' even though the 'core' branch that was
merged had removed the function (and the declaration).

Remove it for real.

Reported-by: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/iommu.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d6aaaec3caf4..301e97d745c1 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -841,7 +841,6 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
 	};
 }
 
-extern bool iommu_present(const struct bus_type *bus);
 extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap);
 extern bool iommu_group_has_isolated_msi(struct iommu_group *group);
 struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, unsigned int flags);
-- 
cgit v1.2.3


From bb43a59944f45e89aa158740b8a16ba8f0b0fa2b Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 7 Nov 2024 01:14:40 +0900
Subject: Rename .data.unlikely to .data..unlikely

Commit 7ccaba5314ca ("consolidate WARN_...ONCE() static variables")
was intended to collect all .data.unlikely sections into one chunk.
However, this has not worked when CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
or CONFIG_LTO_CLANG is enabled, because .data.unlikely matches the
.data.[0-9a-zA-Z_]* pattern in the DATA_MAIN macro.

Commit cb87481ee89d ("kbuild: linker script do not match C names unless
LD_DEAD_CODE_DATA_ELIMINATION is configured") was introduced to suppress
the issue for the default CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n case,
providing a minimal fix for stable backporting. We were aware this did
not address the issue for CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y. The
plan was to apply correct fixes and then revert cb87481ee89d. [1]

Seven years have passed since then, yet the #ifdef workaround remains in
place.

Using a ".." separator in the section name fixes the issue for
CONFIG_LD_DEAD_CODE_DATA_ELIMINATION and CONFIG_LTO_CLANG.

[1]: https://lore.kernel.org/linux-kbuild/CAK7LNASck6BfdLnESxXUeECYL26yUDm0cwRZuM4gmaWUkxjL5g@mail.gmail.com/

Fixes: cb87481ee89d ("kbuild: linker script do not match C names unless LD_DEAD_CODE_DATA_ELIMINATION is configured")
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/asm-generic/vmlinux.lds.h | 2 +-
 include/linux/rcupdate.h          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c995474e4c64..3c9dc1fd094d 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -357,7 +357,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
 	*(.data..decrypted)						\
 	*(.ref.data)							\
 	*(.data..shared_aligned) /* percpu related */			\
-	*(.data.unlikely)						\
+	*(.data..unlikely)						\
 	__start_once = .;						\
 	*(.data.once)							\
 	__end_once = .;							\
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 58d84c59f3dd..48e5c03df1dd 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -401,7 +401,7 @@ static inline int debug_lockdep_rcu_enabled(void)
  */
 #define RCU_LOCKDEP_WARN(c, s)						\
 	do {								\
-		static bool __section(".data.unlikely") __warned;	\
+		static bool __section(".data..unlikely") __warned;	\
 		if (debug_lockdep_rcu_enabled() && (c) &&		\
 		    debug_lockdep_rcu_enabled() && !__warned) {		\
 			__warned = true;				\
-- 
cgit v1.2.3


From dbefa1f31a91670c9e7dac9b559625336206466f Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 7 Nov 2024 01:14:41 +0900
Subject: Rename .data.once to .data..once to fix resetting WARN*_ONCE

Commit b1fca27d384e ("kernel debug: support resetting WARN*_ONCE")
added support for clearing the state of once warnings. However,
it is not functional when CONFIG_LD_DEAD_CODE_DATA_ELIMINATION or
CONFIG_LTO_CLANG is enabled, because .data.once matches the
.data.[0-9a-zA-Z_]* pattern in the DATA_MAIN macro.

Commit cb87481ee89d ("kbuild: linker script do not match C names unless
LD_DEAD_CODE_DATA_ELIMINATION is configured") was introduced to suppress
the issue for the default CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n case,
providing a minimal fix for stable backporting. We were aware this did
not address the issue for CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y. The
plan was to apply correct fixes and then revert cb87481ee89d. [1]

Seven years have passed since then, yet the #ifdef workaround remains in
place. Meanwhile, commit b1fca27d384e introduced the .data.once section,
and commit dc5723b02e52 ("kbuild: add support for Clang LTO") extended
the #ifdef.

Using a ".." separator in the section name fixes the issue for
CONFIG_LD_DEAD_CODE_DATA_ELIMINATION and CONFIG_LTO_CLANG.

[1]: https://lore.kernel.org/linux-kbuild/CAK7LNASck6BfdLnESxXUeECYL26yUDm0cwRZuM4gmaWUkxjL5g@mail.gmail.com/

Fixes: b1fca27d384e ("kernel debug: support resetting WARN*_ONCE")
Fixes: dc5723b02e52 ("kbuild: add support for Clang LTO")
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/asm-generic/vmlinux.lds.h | 2 +-
 include/linux/mmdebug.h           | 6 +++---
 include/linux/once.h              | 4 ++--
 include/linux/once_lite.h         | 2 +-
 include/net/net_debug.h           | 2 +-
 mm/internal.h                     | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 3c9dc1fd094d..54504013c749 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -359,7 +359,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
 	*(.data..shared_aligned) /* percpu related */			\
 	*(.data..unlikely)						\
 	__start_once = .;						\
-	*(.data.once)							\
+	*(.data..once)							\
 	__end_once = .;							\
 	STRUCT_ALIGN();							\
 	*(__tracepoints)						\
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 39a7714605a7..d7cb1e5ecbda 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -46,7 +46,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi);
 		}							\
 	} while (0)
 #define VM_WARN_ON_ONCE_PAGE(cond, page)	({			\
-	static bool __section(".data.once") __warned;			\
+	static bool __section(".data..once") __warned;			\
 	int __ret_warn_once = !!(cond);					\
 									\
 	if (unlikely(__ret_warn_once && !__warned)) {			\
@@ -66,7 +66,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi);
 	unlikely(__ret_warn);						\
 })
 #define VM_WARN_ON_ONCE_FOLIO(cond, folio)	({			\
-	static bool __section(".data.once") __warned;			\
+	static bool __section(".data..once") __warned;			\
 	int __ret_warn_once = !!(cond);					\
 									\
 	if (unlikely(__ret_warn_once && !__warned)) {			\
@@ -77,7 +77,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi);
 	unlikely(__ret_warn_once);					\
 })
 #define VM_WARN_ON_ONCE_MM(cond, mm)		({			\
-	static bool __section(".data.once") __warned;			\
+	static bool __section(".data..once") __warned;			\
 	int __ret_warn_once = !!(cond);					\
 									\
 	if (unlikely(__ret_warn_once && !__warned)) {			\
diff --git a/include/linux/once.h b/include/linux/once.h
index bc714d414448..30346fcdc799 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -46,7 +46,7 @@ void __do_once_sleepable_done(bool *done, struct static_key_true *once_key,
 #define DO_ONCE(func, ...)						     \
 	({								     \
 		bool ___ret = false;					     \
-		static bool __section(".data.once") ___done = false;	     \
+		static bool __section(".data..once") ___done = false;	     \
 		static DEFINE_STATIC_KEY_TRUE(___once_key);		     \
 		if (static_branch_unlikely(&___once_key)) {		     \
 			unsigned long ___flags;				     \
@@ -64,7 +64,7 @@ void __do_once_sleepable_done(bool *done, struct static_key_true *once_key,
 #define DO_ONCE_SLEEPABLE(func, ...)						\
 	({									\
 		bool ___ret = false;						\
-		static bool __section(".data.once") ___done = false;		\
+		static bool __section(".data..once") ___done = false;		\
 		static DEFINE_STATIC_KEY_TRUE(___once_key);			\
 		if (static_branch_unlikely(&___once_key)) {			\
 			___ret = __do_once_sleepable_start(&___done);		\
diff --git a/include/linux/once_lite.h b/include/linux/once_lite.h
index b7bce4983638..27de7bc32a06 100644
--- a/include/linux/once_lite.h
+++ b/include/linux/once_lite.h
@@ -12,7 +12,7 @@
 
 #define __ONCE_LITE_IF(condition)					\
 	({								\
-		static bool __section(".data.once") __already_done;	\
+		static bool __section(".data..once") __already_done;	\
 		bool __ret_cond = !!(condition);			\
 		bool __ret_once = false;				\
 									\
diff --git a/include/net/net_debug.h b/include/net/net_debug.h
index 1e74684cbbdb..4a79204c8d30 100644
--- a/include/net/net_debug.h
+++ b/include/net/net_debug.h
@@ -27,7 +27,7 @@ void netdev_info(const struct net_device *dev, const char *format, ...);
 
 #define netdev_level_once(level, dev, fmt, ...)			\
 do {								\
-	static bool __section(".data.once") __print_once;	\
+	static bool __section(".data..once") __print_once;	\
 								\
 	if (!__print_once) {					\
 		__print_once = true;				\
diff --git a/mm/internal.h b/mm/internal.h
index 93083bbeeefa..a23f7b11b760 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -48,7 +48,7 @@ struct folio_batch;
  * when we specify __GFP_NOWARN.
  */
 #define WARN_ON_ONCE_GFP(cond, gfp)	({				\
-	static bool __section(".data.once") __warned;			\
+	static bool __section(".data..once") __warned;			\
 	int __ret_warn_once = !!(cond);					\
 									\
 	if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
-- 
cgit v1.2.3


From 81de291d86b704de1809cfb06672902d003cf3a3 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wenst@chromium.org>
Date: Wed, 6 Nov 2024 17:33:28 +0800
Subject: of: dynamic: Add of_changeset_update_prop_string

Add a helper function to add string property updates to an OF changeset.
This is similar to of_changeset_add_prop_string(), but instead of adding
the property (and failing if it exists), it will update the property.

This shall be used later in the DT hardware prober.

Signed-off-by: Chen-Yu Tsai <wenst@chromium.org>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 drivers/of/dynamic.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/of.h   |  4 ++++
 2 files changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index d45a8df61380..0aba760f7577 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -1072,3 +1072,47 @@ int of_changeset_add_prop_bool(struct of_changeset *ocs, struct device_node *np,
 	return of_changeset_add_prop_helper(ocs, np, &prop);
 }
 EXPORT_SYMBOL_GPL(of_changeset_add_prop_bool);
+
+static int of_changeset_update_prop_helper(struct of_changeset *ocs,
+					   struct device_node *np,
+					   const struct property *pp)
+{
+	struct property *new_pp;
+	int ret;
+
+	new_pp = __of_prop_dup(pp, GFP_KERNEL);
+	if (!new_pp)
+		return -ENOMEM;
+
+	ret = of_changeset_update_property(ocs, np, new_pp);
+	if (ret)
+		__of_prop_free(new_pp);
+
+	return ret;
+}
+
+/**
+ * of_changeset_update_prop_string - Add a string property update to a changeset
+ *
+ * @ocs:	changeset pointer
+ * @np:		device node pointer
+ * @prop_name:	name of the property to be updated
+ * @str:	pointer to null terminated string
+ *
+ * Create a string property to be updated and add it to a changeset.
+ *
+ * Return: 0 on success, a negative error value in case of an error.
+ */
+int of_changeset_update_prop_string(struct of_changeset *ocs,
+				    struct device_node *np,
+				    const char *prop_name, const char *str)
+{
+	struct property prop = {
+		.name = (char *)prop_name,
+		.length = strlen(str) + 1,
+		.value = (void *)str,
+	};
+
+	return of_changeset_update_prop_helper(ocs, np, &prop);
+}
+EXPORT_SYMBOL_GPL(of_changeset_update_prop_string);
diff --git a/include/linux/of.h b/include/linux/of.h
index 086a60f3b8a6..d0307e3b093d 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -1651,6 +1651,10 @@ static inline int of_changeset_add_prop_u32(struct of_changeset *ocs,
 	return of_changeset_add_prop_u32_array(ocs, np, prop_name, &val, 1);
 }
 
+int of_changeset_update_prop_string(struct of_changeset *ocs,
+				    struct device_node *np,
+				    const char *prop_name, const char *str);
+
 int of_changeset_add_prop_bool(struct of_changeset *ocs, struct device_node *np,
 			       const char *prop_name);
 
-- 
cgit v1.2.3


From 1fcc67e3a354865775355eafec1fb061a755c971 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wenst@chromium.org>
Date: Wed, 6 Nov 2024 17:33:29 +0800
Subject: of: base: Add for_each_child_of_node_with_prefix()

There are cases where drivers would go through child device nodes and
operate on only the ones whose node name starts with a given prefix.

Provide a helper for these users. This will mainly be used in a
subsequent patch that implements a hardware component prober for I2C
busses.

Signed-off-by: Chen-Yu Tsai <wenst@chromium.org>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 drivers/of/base.c  | 35 +++++++++++++++++++++++++++++++++++
 include/linux/of.h |  9 +++++++++
 2 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index a8b0c42bdc8e..4cba021c89d3 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -644,6 +644,41 @@ struct device_node *of_get_next_child(const struct device_node *node,
 }
 EXPORT_SYMBOL(of_get_next_child);
 
+/**
+ * of_get_next_child_with_prefix - Find the next child node with prefix
+ * @node:	parent node
+ * @prev:	previous child of the parent node, or NULL to get first
+ *
+ * This function is like of_get_next_child(), except that it automatically
+ * skips any nodes whose name doesn't have the given prefix.
+ *
+ * Return: A node pointer with refcount incremented, use
+ * of_node_put() on it when done.
+ */
+struct device_node *of_get_next_child_with_prefix(const struct device_node *node,
+						  struct device_node *prev,
+						  const char *prefix)
+{
+	struct device_node *next;
+	unsigned long flags;
+
+	if (!node)
+		return NULL;
+
+	raw_spin_lock_irqsave(&devtree_lock, flags);
+	next = prev ? prev->sibling : node->child;
+	for (; next; next = next->sibling) {
+		if (!of_node_name_prefix(next, prefix))
+			continue;
+		if (of_node_get(next))
+			break;
+	}
+	of_node_put(prev);
+	raw_spin_unlock_irqrestore(&devtree_lock, flags);
+	return next;
+}
+EXPORT_SYMBOL(of_get_next_child_with_prefix);
+
 static struct device_node *of_get_next_status_child(const struct device_node *node,
 						    struct device_node *prev,
 						    bool (*checker)(const struct device_node *))
diff --git a/include/linux/of.h b/include/linux/of.h
index d0307e3b093d..f921786cb8ac 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -289,6 +289,9 @@ extern struct device_node *of_get_parent(const struct device_node *node);
 extern struct device_node *of_get_next_parent(struct device_node *node);
 extern struct device_node *of_get_next_child(const struct device_node *node,
 					     struct device_node *prev);
+extern struct device_node *of_get_next_child_with_prefix(const struct device_node *node,
+							 struct device_node *prev,
+							 const char *prefix);
 extern struct device_node *of_get_next_available_child(
 	const struct device_node *node, struct device_node *prev);
 extern struct device_node *of_get_next_reserved_child(
@@ -1468,6 +1471,12 @@ static inline int of_property_read_s32(const struct device_node *np,
 	     child != NULL;						\
 	     child = of_get_next_child(parent, child))
 
+#define for_each_child_of_node_with_prefix(parent, child, prefix)	\
+	for (struct device_node *child __free(device_node) =		\
+	     of_get_next_child_with_prefix(parent, NULL, prefix);	\
+	     child != NULL;						\
+	     child = of_get_next_child_with_prefix(parent, child, prefix))
+
 #define for_each_available_child_of_node(parent, child) \
 	for (child = of_get_next_available_child(parent, NULL); child != NULL; \
 	     child = of_get_next_available_child(parent, child))
-- 
cgit v1.2.3


From 157ce8f381efe264933e9366db828d845bade3a1 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wenst@chromium.org>
Date: Wed, 6 Nov 2024 17:33:30 +0800
Subject: i2c: Introduce OF component probe function

Some devices are designed and manufactured with some components having
multiple drop-in replacement options. These components are often
connected to the mainboard via ribbon cables, having the same signals
and pin assignments across all options. These may include the display
panel and touchscreen on laptops and tablets, and the trackpad on
laptops. Sometimes which component option is used in a particular device
can be detected by some firmware provided identifier, other times that
information is not available, and the kernel has to try to probe each
device.

This change attempts to make the "probe each device" case cleaner. The
current approach is to have all options added and enabled in the device
tree. The kernel would then bind each device and run each driver's probe
function. This works, but has been broken before due to the introduction
of asynchronous probing, causing multiple instances requesting "shared"
resources, such as pinmuxes, GPIO pins, interrupt lines, at the same
time, with only one instance succeeding. Work arounds for these include
moving the pinmux to the parent I2C controller, using GPIO hogs or
pinmux settings to keep the GPIO pins in some fixed configuration, and
requesting the interrupt line very late. Such configurations can be seen
on the MT8183 Krane Chromebook tablets, and the Qualcomm sc8280xp-based
Lenovo Thinkpad 13S.

Instead of this delicate dance between drivers and device tree quirks,
this change introduces a simple I2C component probe function. For a
given class of devices on the same I2C bus, it will go through all of
them, doing a simple I2C read transfer and see which one of them responds.
It will then enable the device that responds.

This requires some minor modifications in the existing device tree. The
status for all the device nodes for the component options must be set
to "fail-needs-probe". This makes it clear that some mechanism is
needed to enable one of them, and also prevents the prober and device
drivers running at the same time.

Signed-off-by: Chen-Yu Tsai <wenst@chromium.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 MAINTAINERS                      |   8 ++
 drivers/i2c/Makefile             |   1 +
 drivers/i2c/i2c-core-of-prober.c | 183 +++++++++++++++++++++++++++++++++++++++
 include/linux/i2c-of-prober.h    |  75 ++++++++++++++++
 4 files changed, 267 insertions(+)
 create mode 100644 drivers/i2c/i2c-core-of-prober.c
 create mode 100644 include/linux/i2c-of-prober.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 85cdc618a51c..0f837aca327c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10702,6 +10702,14 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml
 F:	drivers/i2c/busses/i2c-mv64xxx.c
 
+I2C OF COMPONENT PROBER
+M:	Chen-Yu Tsai <wenst@chromium.org>
+L:	linux-i2c@vger.kernel.org
+L:	devicetree@vger.kernel.org
+S:	Maintained
+F:	drivers/i2c/i2c-core-of-prober.c
+F:	include/linux-i2c-of-prober.h
+
 I2C OVER PARALLEL PORT
 M:	Jean Delvare <jdelvare@suse.com>
 L:	linux-i2c@vger.kernel.org
diff --git a/drivers/i2c/Makefile b/drivers/i2c/Makefile
index af95d6e7f004..d27de18de46f 100644
--- a/drivers/i2c/Makefile
+++ b/drivers/i2c/Makefile
@@ -9,6 +9,7 @@ i2c-core-y			:= i2c-core-base.o i2c-core-smbus.o
 i2c-core-$(CONFIG_ACPI)		+= i2c-core-acpi.o
 i2c-core-$(CONFIG_I2C_SLAVE)	+= i2c-core-slave.o
 i2c-core-$(CONFIG_OF)		+= i2c-core-of.o
+i2c-core-$(CONFIG_OF_DYNAMIC)	+= i2c-core-of-prober.o
 
 obj-$(CONFIG_I2C_SMBUS)		+= i2c-smbus.o
 obj-$(CONFIG_I2C_CHARDEV)	+= i2c-dev.o
diff --git a/drivers/i2c/i2c-core-of-prober.c b/drivers/i2c/i2c-core-of-prober.c
new file mode 100644
index 000000000000..21f7a4769293
--- /dev/null
+++ b/drivers/i2c/i2c-core-of-prober.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Linux I2C core OF component prober code
+ *
+ * Copyright (C) 2024 Google LLC
+ */
+
+#include <linux/cleanup.h>
+#include <linux/device.h>
+#include <linux/dev_printk.h>
+#include <linux/err.h>
+#include <linux/i2c.h>
+#include <linux/i2c-of-prober.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+
+/*
+ * Some devices, such as Google Hana Chromebooks, are produced by multiple
+ * vendors each using their preferred components. Such components are all
+ * in the device tree. Instead of having all of them enabled and having each
+ * driver separately try and probe its device while fighting over shared
+ * resources, they can be marked as "fail-needs-probe" and have a prober
+ * figure out which one is actually used beforehand.
+ *
+ * This prober assumes such drop-in parts are on the same I2C bus, have
+ * non-conflicting addresses, and can be directly probed by seeing which
+ * address responds.
+ *
+ * TODO:
+ * - Support handling common regulators.
+ * - Support handling common GPIOs.
+ * - Support I2C muxes
+ */
+
+static struct device_node *i2c_of_probe_get_i2c_node(struct device *dev, const char *type)
+{
+	struct device_node *node __free(device_node) = of_find_node_by_name(NULL, type);
+	if (!node) {
+		dev_err(dev, "Could not find %s device node\n", type);
+		return NULL;
+	}
+
+	struct device_node *i2c_node __free(device_node) = of_get_parent(node);
+	if (!of_node_name_eq(i2c_node, "i2c")) {
+		dev_err(dev, "%s device isn't on I2C bus\n", type);
+		return NULL;
+	}
+
+	if (!of_device_is_available(i2c_node)) {
+		dev_err(dev, "I2C controller not available\n");
+		return NULL;
+	}
+
+	return no_free_ptr(i2c_node);
+}
+
+static int i2c_of_probe_enable_node(struct device *dev, struct device_node *node)
+{
+	int ret;
+
+	dev_dbg(dev, "Enabling %pOF\n", node);
+
+	struct of_changeset *ocs __free(kfree) = kzalloc(sizeof(*ocs), GFP_KERNEL);
+	if (!ocs)
+		return -ENOMEM;
+
+	of_changeset_init(ocs);
+	ret = of_changeset_update_prop_string(ocs, node, "status", "okay");
+	if (ret)
+		return ret;
+
+	ret = of_changeset_apply(ocs);
+	if (ret) {
+		/* ocs needs to be explicitly cleaned up before being freed. */
+		of_changeset_destroy(ocs);
+	} else {
+		/*
+		 * ocs is intentionally kept around as it needs to
+		 * exist as long as the change is applied.
+		 */
+		void *ptr __always_unused = no_free_ptr(ocs);
+	}
+
+	return ret;
+}
+
+static const struct i2c_of_probe_ops i2c_of_probe_dummy_ops;
+
+/**
+ * i2c_of_probe_component() - probe for devices of "type" on the same i2c bus
+ * @dev: Pointer to the &struct device of the caller, only used for dev_printk() messages.
+ * @cfg: Pointer to the &struct i2c_of_probe_cfg containing callbacks and other options
+ *       for the prober.
+ * @ctx: Context data for callbacks.
+ *
+ * Probe for possible I2C components of the same "type" (&i2c_of_probe_cfg->type)
+ * on the same I2C bus that have their status marked as "fail-needs-probe".
+ *
+ * Assumes that across the entire device tree the only instances of nodes
+ * with "type" prefixed node names (not including the address portion) are
+ * the ones that need handling for second source components. In other words,
+ * if "type" is "touchscreen", then all device nodes named "touchscreen*"
+ * are the ones that need probing. There cannot be another "touchscreen*"
+ * node that is already enabled.
+ *
+ * Assumes that for each "type" of component, only one actually exists. In
+ * other words, only one matching and existing device will be enabled.
+ *
+ * Context: Process context only. Does non-atomic I2C transfers.
+ *          Should only be used from a driver probe function, as the function
+ *          can return -EPROBE_DEFER if the I2C adapter or other resources
+ *          are unavailable.
+ * Return: 0 on success or no-op, error code otherwise.
+ *         A no-op can happen when it seems like the device tree already
+ *         has components of the type to be probed already enabled. This
+ *         can happen when the device tree had not been updated to mark
+ *         the status of the to-be-probed components as "fail-needs-probe".
+ *         Or this function was already run with the same parameters and
+ *         succeeded in enabling a component. The latter could happen if
+ *         the user had multiple types of components to probe, and one of
+ *         them down the list caused a deferred probe. This is expected
+ *         behavior.
+ */
+int i2c_of_probe_component(struct device *dev, const struct i2c_of_probe_cfg *cfg, void *ctx)
+{
+	const struct i2c_of_probe_ops *ops;
+	const char *type;
+	struct i2c_adapter *i2c;
+	int ret;
+
+	ops = cfg->ops ?: &i2c_of_probe_dummy_ops;
+	type = cfg->type;
+
+	struct device_node *i2c_node __free(device_node) = i2c_of_probe_get_i2c_node(dev, type);
+	if (!i2c_node)
+		return -ENODEV;
+
+	/*
+	 * If any devices of the given "type" are already enabled then this function is a no-op.
+	 * Either the device tree hasn't been modified to work with this probe function, or the
+	 * function had already run before and enabled some component.
+	 */
+	for_each_child_of_node_with_prefix(i2c_node, node, type)
+		if (of_device_is_available(node))
+			return 0;
+
+	i2c = of_get_i2c_adapter_by_node(i2c_node);
+	if (!i2c)
+		return dev_err_probe(dev, -EPROBE_DEFER, "Couldn't get I2C adapter\n");
+
+	/* Grab and enable resources */
+	ret = 0;
+	if (ops->enable)
+		ret = ops->enable(dev, i2c_node, ctx);
+	if (ret)
+		goto out_put_i2c_adapter;
+
+	for_each_child_of_node_with_prefix(i2c_node, node, type) {
+		union i2c_smbus_data data;
+		u32 addr;
+
+		if (of_property_read_u32(node, "reg", &addr))
+			continue;
+		if (i2c_smbus_xfer(i2c, addr, 0, I2C_SMBUS_READ, 0, I2C_SMBUS_BYTE, &data) < 0)
+			continue;
+
+		/* Found a device that is responding */
+		if (ops->cleanup_early)
+			ops->cleanup_early(dev, ctx);
+		ret = i2c_of_probe_enable_node(dev, node);
+		break;
+	}
+
+	if (ops->cleanup)
+		ops->cleanup(dev, ctx);
+out_put_i2c_adapter:
+	i2c_put_adapter(i2c);
+
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(i2c_of_probe_component, I2C_OF_PROBER);
diff --git a/include/linux/i2c-of-prober.h b/include/linux/i2c-of-prober.h
new file mode 100644
index 000000000000..e7e052ac9e48
--- /dev/null
+++ b/include/linux/i2c-of-prober.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Definitions for the Linux I2C OF component prober
+ *
+ * Copyright (C) 2024 Google LLC
+ */
+
+#ifndef _LINUX_I2C_OF_PROBER_H
+#define _LINUX_I2C_OF_PROBER_H
+
+#include <linux/kconfig.h>
+
+struct device;
+struct device_node;
+
+/**
+ * struct i2c_of_probe_ops - I2C OF component prober callbacks
+ *
+ * A set of callbacks to be used by i2c_of_probe_component().
+ *
+ * All callbacks are optional. Callbacks are called only once per run, and are
+ * used in the order they are defined in this structure.
+ *
+ * All callbacks that have return values shall return %0 on success,
+ * or a negative error number on failure.
+ *
+ * The @dev parameter passed to the callbacks is the same as @dev passed to
+ * i2c_of_probe_component(). It should only be used for dev_printk() calls
+ * and nothing else, especially not managed device resource (devres) APIs.
+ */
+struct i2c_of_probe_ops {
+	/**
+	 * @enable: Retrieve and enable resources so that the components respond to probes.
+	 *
+	 * It is OK for this callback to return -EPROBE_DEFER since the intended use includes
+	 * retrieving resources and enables them. Resources should be reverted to their initial
+	 * state and released before returning if this fails.
+	 */
+	int (*enable)(struct device *dev, struct device_node *bus_node, void *data);
+
+	/**
+	 * @cleanup_early: Release exclusive resources prior to calling probe() on a
+	 *		   detected component.
+	 *
+	 * Only called if a matching component is actually found. If none are found,
+	 * resources that would have been released in this callback should be released in
+	 * @free_resourcs_late instead.
+	 */
+	void (*cleanup_early)(struct device *dev, void *data);
+
+	/**
+	 * @cleanup: Opposite of @enable to balance refcounts and free resources after probing.
+	 *
+	 * Should check if resources were already freed by @cleanup_early.
+	 */
+	void (*cleanup)(struct device *dev, void *data);
+};
+
+/**
+ * struct i2c_of_probe_cfg - I2C OF component prober configuration
+ * @ops: Callbacks for the prober to use.
+ * @type: A string to match the device node name prefix to probe for.
+ */
+struct i2c_of_probe_cfg {
+	const struct i2c_of_probe_ops *ops;
+	const char *type;
+};
+
+#if IS_ENABLED(CONFIG_OF_DYNAMIC)
+
+int i2c_of_probe_component(struct device *dev, const struct i2c_of_probe_cfg *cfg, void *ctx);
+
+#endif /* IS_ENABLED(CONFIG_OF_DYNAMIC) */
+
+#endif /* _LINUX_I2C_OF_PROBER_H */
-- 
cgit v1.2.3


From 897261149d255d03fc90bec6782e3835cacbfdde Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wenst@chromium.org>
Date: Wed, 6 Nov 2024 17:33:31 +0800
Subject: i2c: of-prober: Add simple helpers for regulator support

Add helpers to do regulator management for the I2C OF component prober.
Components that the prober intends to probe likely require their
regulator supplies be enabled, and GPIOs be toggled to enable them or
bring them out of reset before they will respond to probe attempts.
GPIOs will be handled in the next patch.

The assumption is that the same class of components to be probed are
always connected in the same fashion with the same regulator supply
and GPIO. The names may vary due to binding differences, but the
physical layout does not change.

This set of helpers supports at most one regulator supply. The user
must specify the node from which the supply is retrieved. The supply
name and the amount of time to wait after the supply is enabled are
also given by the user.

Signed-off-by: Chen-Yu Tsai <wenst@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 drivers/i2c/i2c-core-of-prober.c | 138 ++++++++++++++++++++++++++++++++++++++-
 include/linux/i2c-of-prober.h    |  44 +++++++++++++
 2 files changed, 181 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-of-prober.c b/drivers/i2c/i2c-core-of-prober.c
index 21f7a4769293..76df75c51cae 100644
--- a/drivers/i2c/i2c-core-of-prober.c
+++ b/drivers/i2c/i2c-core-of-prober.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/cleanup.h>
+#include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/dev_printk.h>
 #include <linux/err.h>
@@ -13,6 +14,7 @@
 #include <linux/i2c-of-prober.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/regulator/consumer.h>
 #include <linux/slab.h>
 #include <linux/stddef.h>
 
@@ -29,7 +31,6 @@
  * address responds.
  *
  * TODO:
- * - Support handling common regulators.
  * - Support handling common GPIOs.
  * - Support I2C muxes
  */
@@ -181,3 +182,138 @@ out_put_i2c_adapter:
 	return ret;
 }
 EXPORT_SYMBOL_NS_GPL(i2c_of_probe_component, I2C_OF_PROBER);
+
+static int i2c_of_probe_simple_get_supply(struct device *dev, struct device_node *node,
+					  struct i2c_of_probe_simple_ctx *ctx)
+{
+	const char *supply_name;
+	struct regulator *supply;
+
+	/*
+	 * It's entirely possible for the component's device node to not have the
+	 * regulator supplies. While it does not make sense from a hardware perspective,
+	 * the supplies could be always on or otherwise not modeled in the device tree,
+	 * but the device would still work.
+	 */
+	supply_name = ctx->opts->supply_name;
+	if (!supply_name)
+		return 0;
+
+	supply = of_regulator_get_optional(dev, node, supply_name);
+	if (IS_ERR(supply)) {
+		return dev_err_probe(dev, PTR_ERR(supply),
+				     "Failed to get regulator supply \"%s\" from %pOF\n",
+				     supply_name, node);
+	}
+
+	ctx->supply = supply;
+
+	return 0;
+}
+
+static void i2c_of_probe_simple_put_supply(struct i2c_of_probe_simple_ctx *ctx)
+{
+	regulator_put(ctx->supply);
+	ctx->supply = NULL;
+}
+
+static int i2c_of_probe_simple_enable_regulator(struct device *dev, struct i2c_of_probe_simple_ctx *ctx)
+{
+	int ret;
+
+	if (!ctx->supply)
+		return 0;
+
+	dev_dbg(dev, "Enabling regulator supply \"%s\"\n", ctx->opts->supply_name);
+
+	ret = regulator_enable(ctx->supply);
+	if (ret)
+		return ret;
+
+	if (ctx->opts->post_power_on_delay_ms)
+		msleep(ctx->opts->post_power_on_delay_ms);
+
+	return 0;
+}
+
+static void i2c_of_probe_simple_disable_regulator(struct device *dev, struct i2c_of_probe_simple_ctx *ctx)
+{
+	if (!ctx->supply)
+		return;
+
+	dev_dbg(dev, "Disabling regulator supply \"%s\"\n", ctx->opts->supply_name);
+
+	regulator_disable(ctx->supply);
+}
+
+/**
+ * i2c_of_probe_simple_enable - Simple helper for I2C OF prober to get and enable resources
+ * @dev: Pointer to the &struct device of the caller, only used for dev_printk() messages
+ * @bus_node: Pointer to the &struct device_node of the I2C adapter.
+ * @data: Pointer to &struct i2c_of_probe_simple_ctx helper context.
+ *
+ * If &i2c_of_probe_simple_opts->supply_name is given, request the named regulator supply.
+ * If a regulator supply was found, enable that regulator.
+ *
+ * Return: %0 on success or no-op, or a negative error number on failure.
+ */
+int i2c_of_probe_simple_enable(struct device *dev, struct device_node *bus_node, void *data)
+{
+	struct i2c_of_probe_simple_ctx *ctx = data;
+	struct device_node *node;
+	const char *compat;
+	int ret;
+
+	dev_dbg(dev, "Requesting resources for components under I2C bus %pOF\n", bus_node);
+
+	if (!ctx || !ctx->opts)
+		return -EINVAL;
+
+	compat = ctx->opts->res_node_compatible;
+	if (!compat)
+		return -EINVAL;
+
+	node = of_get_compatible_child(bus_node, compat);
+	if (!node)
+		return dev_err_probe(dev, -ENODEV, "No device compatible with \"%s\" found\n",
+				     compat);
+
+	ret = i2c_of_probe_simple_get_supply(dev, node, ctx);
+	if (ret)
+		goto out_put_node;
+
+	ret = i2c_of_probe_simple_enable_regulator(dev, ctx);
+	if (ret)
+		goto out_put_supply;
+
+	return 0;
+
+out_put_supply:
+	i2c_of_probe_simple_put_supply(ctx);
+out_put_node:
+	of_node_put(node);
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(i2c_of_probe_simple_enable, I2C_OF_PROBER);
+
+/**
+ * i2c_of_probe_simple_cleanup - Clean up and release resources for I2C OF prober simple helpers
+ * @dev: Pointer to the &struct device of the caller, only used for dev_printk() messages
+ * @data: Pointer to &struct i2c_of_probe_simple_ctx helper context.
+ *
+ * * If a regulator supply was found, disable that regulator and release it.
+ */
+void i2c_of_probe_simple_cleanup(struct device *dev, void *data)
+{
+	struct i2c_of_probe_simple_ctx *ctx = data;
+
+	i2c_of_probe_simple_disable_regulator(dev, ctx);
+	i2c_of_probe_simple_put_supply(ctx);
+}
+EXPORT_SYMBOL_NS_GPL(i2c_of_probe_simple_cleanup, I2C_OF_PROBER);
+
+struct i2c_of_probe_ops i2c_of_probe_simple_ops = {
+	.enable = i2c_of_probe_simple_enable,
+	.cleanup = i2c_of_probe_simple_cleanup,
+};
+EXPORT_SYMBOL_NS_GPL(i2c_of_probe_simple_ops, I2C_OF_PROBER);
diff --git a/include/linux/i2c-of-prober.h b/include/linux/i2c-of-prober.h
index e7e052ac9e48..df95aa6ad90e 100644
--- a/include/linux/i2c-of-prober.h
+++ b/include/linux/i2c-of-prober.h
@@ -70,6 +70,50 @@ struct i2c_of_probe_cfg {
 
 int i2c_of_probe_component(struct device *dev, const struct i2c_of_probe_cfg *cfg, void *ctx);
 
+/**
+ * DOC: I2C OF component prober simple helpers
+ *
+ * Components such as trackpads are commonly connected to a devices baseboard
+ * with a 6-pin ribbon cable. That gives at most one voltage supply and one
+ * GPIO (commonly a "enable" or "reset" line) besides the I2C bus, interrupt
+ * pin, and common ground. Touchscreens, while integrated into the display
+ * panel's connection, typically have the same set of connections.
+ *
+ * A simple set of helpers are provided here for use with the I2C OF component
+ * prober. This implementation targets such components, allowing for at most
+ * one regulator supply.
+ *
+ * The following helpers are provided:
+ * * i2c_of_probe_simple_enable()
+ * * i2c_of_probe_simple_cleanup()
+ */
+
+/**
+ * struct i2c_of_probe_simple_opts - Options for simple I2C component prober callbacks
+ * @res_node_compatible: Compatible string of device node to retrieve resources from.
+ * @supply_name: Name of regulator supply.
+ * @post_power_on_delay_ms: Delay after regulators are powered on. Passed to msleep().
+ */
+struct i2c_of_probe_simple_opts {
+	const char *res_node_compatible;
+	const char *supply_name;
+	unsigned int post_power_on_delay_ms;
+};
+
+struct regulator;
+
+struct i2c_of_probe_simple_ctx {
+	/* public: provided by user before helpers are used. */
+	const struct i2c_of_probe_simple_opts *opts;
+	/* private: internal fields for helpers. */
+	struct regulator *supply;
+};
+
+int i2c_of_probe_simple_enable(struct device *dev, struct device_node *bus_node, void *data);
+void i2c_of_probe_simple_cleanup(struct device *dev, void *data);
+
+extern struct i2c_of_probe_ops i2c_of_probe_simple_ops;
+
 #endif /* IS_ENABLED(CONFIG_OF_DYNAMIC) */
 
 #endif /* _LINUX_I2C_OF_PROBER_H */
-- 
cgit v1.2.3


From 39b415f84654892003cebb7c026b7daa3380610b Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wenst@chromium.org>
Date: Wed, 6 Nov 2024 17:33:32 +0800
Subject: i2c: of-prober: Add GPIO support to simple helpers

Add GPIO support to the simple helpers for the I2C OF component prober.
Components that the prober intends to probe likely require their
regulator supplies be enabled, and GPIOs be toggled to enable them or
bring them out of reset before they will respond to probe attempts.
Regulator supplies were handled in the previous patch.

The assumption is that the same class of components to be probed are
always connected in the same fashion with the same regulator supply
and GPIO. The names may vary due to binding differences, but the
physical layout does not change.

This supports at most one GPIO pin. The user must specify the GPIO name,
the polarity, and the amount of time to wait after the GPIO is toggled.
Devices with more than one GPIO pin likely require specific power
sequencing beyond what generic code can easily support.

Signed-off-by: Chen-Yu Tsai <wenst@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Tested-by: Andrey Skvortsov <andrej.skvortzov@gmail.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 drivers/i2c/i2c-core-of-prober.c | 100 ++++++++++++++++++++++++++++++++++++++-
 include/linux/i2c-of-prober.h    |  21 ++++++++
 2 files changed, 119 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-of-prober.c b/drivers/i2c/i2c-core-of-prober.c
index 76df75c51cae..b9ca785f8b17 100644
--- a/drivers/i2c/i2c-core-of-prober.c
+++ b/drivers/i2c/i2c-core-of-prober.c
@@ -10,6 +10,7 @@
 #include <linux/device.h>
 #include <linux/dev_printk.h>
 #include <linux/err.h>
+#include <linux/gpio/consumer.h>
 #include <linux/i2c.h>
 #include <linux/i2c-of-prober.h>
 #include <linux/module.h>
@@ -31,7 +32,6 @@
  * address responds.
  *
  * TODO:
- * - Support handling common GPIOs.
  * - Support I2C muxes
  */
 
@@ -246,6 +246,62 @@ static void i2c_of_probe_simple_disable_regulator(struct device *dev, struct i2c
 	regulator_disable(ctx->supply);
 }
 
+static int i2c_of_probe_simple_get_gpiod(struct device *dev, struct device_node *node,
+					 struct i2c_of_probe_simple_ctx *ctx)
+{
+	struct fwnode_handle *fwnode = of_fwnode_handle(node);
+	struct gpio_desc *gpiod;
+	const char *con_id;
+
+	/* NULL signals no GPIO needed */
+	if (!ctx->opts->gpio_name)
+		return 0;
+
+	/* An empty string signals an unnamed GPIO */
+	if (!ctx->opts->gpio_name[0])
+		con_id = NULL;
+	else
+		con_id = ctx->opts->gpio_name;
+
+	gpiod = fwnode_gpiod_get_index(fwnode, con_id, 0, GPIOD_ASIS, "i2c-of-prober");
+	if (IS_ERR(gpiod))
+		return PTR_ERR(gpiod);
+
+	ctx->gpiod = gpiod;
+
+	return 0;
+}
+
+static void i2c_of_probe_simple_put_gpiod(struct i2c_of_probe_simple_ctx *ctx)
+{
+	gpiod_put(ctx->gpiod);
+	ctx->gpiod = NULL;
+}
+
+static int i2c_of_probe_simple_set_gpio(struct device *dev, struct i2c_of_probe_simple_ctx *ctx)
+{
+	int ret;
+
+	if (!ctx->gpiod)
+		return 0;
+
+	dev_dbg(dev, "Configuring GPIO\n");
+
+	ret = gpiod_direction_output(ctx->gpiod, ctx->opts->gpio_assert_to_enable);
+	if (ret)
+		return ret;
+
+	if (ctx->opts->post_gpio_config_delay_ms)
+		msleep(ctx->opts->post_gpio_config_delay_ms);
+
+	return 0;
+}
+
+static void i2c_of_probe_simple_disable_gpio(struct device *dev, struct i2c_of_probe_simple_ctx *ctx)
+{
+	gpiod_set_value(ctx->gpiod, !ctx->opts->gpio_assert_to_enable);
+}
+
 /**
  * i2c_of_probe_simple_enable - Simple helper for I2C OF prober to get and enable resources
  * @dev: Pointer to the &struct device of the caller, only used for dev_printk() messages
@@ -253,7 +309,11 @@ static void i2c_of_probe_simple_disable_regulator(struct device *dev, struct i2c
  * @data: Pointer to &struct i2c_of_probe_simple_ctx helper context.
  *
  * If &i2c_of_probe_simple_opts->supply_name is given, request the named regulator supply.
+ * If &i2c_of_probe_simple_opts->gpio_name is given, request the named GPIO. Or if it is
+ * the empty string, request the unnamed GPIO.
  * If a regulator supply was found, enable that regulator.
+ * If a GPIO line was found, configure the GPIO line to output and set value
+ * according to given options.
  *
  * Return: %0 on success or no-op, or a negative error number on failure.
  */
@@ -282,12 +342,24 @@ int i2c_of_probe_simple_enable(struct device *dev, struct device_node *bus_node,
 	if (ret)
 		goto out_put_node;
 
-	ret = i2c_of_probe_simple_enable_regulator(dev, ctx);
+	ret = i2c_of_probe_simple_get_gpiod(dev, node, ctx);
 	if (ret)
 		goto out_put_supply;
 
+	ret = i2c_of_probe_simple_enable_regulator(dev, ctx);
+	if (ret)
+		goto out_put_gpiod;
+
+	ret = i2c_of_probe_simple_set_gpio(dev, ctx);
+	if (ret)
+		goto out_disable_regulator;
+
 	return 0;
 
+out_disable_regulator:
+	i2c_of_probe_simple_disable_regulator(dev, ctx);
+out_put_gpiod:
+	i2c_of_probe_simple_put_gpiod(ctx);
 out_put_supply:
 	i2c_of_probe_simple_put_supply(ctx);
 out_put_node:
@@ -296,17 +368,40 @@ out_put_node:
 }
 EXPORT_SYMBOL_NS_GPL(i2c_of_probe_simple_enable, I2C_OF_PROBER);
 
+/**
+ * i2c_of_probe_simple_cleanup_early - \
+ *	Simple helper for I2C OF prober to release GPIOs before component is enabled
+ * @dev: Pointer to the &struct device of the caller; unused.
+ * @data: Pointer to &struct i2c_of_probe_simple_ctx helper context.
+ *
+ * GPIO descriptors are exclusive and have to be released before the
+ * actual driver probes so that the latter can acquire them.
+ */
+void i2c_of_probe_simple_cleanup_early(struct device *dev, void *data)
+{
+	struct i2c_of_probe_simple_ctx *ctx = data;
+
+	i2c_of_probe_simple_put_gpiod(ctx);
+}
+EXPORT_SYMBOL_NS_GPL(i2c_of_probe_simple_cleanup_early, I2C_OF_PROBER);
+
 /**
  * i2c_of_probe_simple_cleanup - Clean up and release resources for I2C OF prober simple helpers
  * @dev: Pointer to the &struct device of the caller, only used for dev_printk() messages
  * @data: Pointer to &struct i2c_of_probe_simple_ctx helper context.
  *
+ * * If a GPIO line was found and not yet released, set its value to the opposite of that
+ *   set in i2c_of_probe_simple_enable() and release it.
  * * If a regulator supply was found, disable that regulator and release it.
  */
 void i2c_of_probe_simple_cleanup(struct device *dev, void *data)
 {
 	struct i2c_of_probe_simple_ctx *ctx = data;
 
+	/* GPIO operations here are no-ops if i2c_of_probe_simple_cleanup_early was called. */
+	i2c_of_probe_simple_disable_gpio(dev, ctx);
+	i2c_of_probe_simple_put_gpiod(ctx);
+
 	i2c_of_probe_simple_disable_regulator(dev, ctx);
 	i2c_of_probe_simple_put_supply(ctx);
 }
@@ -314,6 +409,7 @@ EXPORT_SYMBOL_NS_GPL(i2c_of_probe_simple_cleanup, I2C_OF_PROBER);
 
 struct i2c_of_probe_ops i2c_of_probe_simple_ops = {
 	.enable = i2c_of_probe_simple_enable,
+	.cleanup_early = i2c_of_probe_simple_cleanup_early,
 	.cleanup = i2c_of_probe_simple_cleanup,
 };
 EXPORT_SYMBOL_NS_GPL(i2c_of_probe_simple_ops, I2C_OF_PROBER);
diff --git a/include/linux/i2c-of-prober.h b/include/linux/i2c-of-prober.h
index df95aa6ad90e..bb6d47f50ee5 100644
--- a/include/linux/i2c-of-prober.h
+++ b/include/linux/i2c-of-prober.h
@@ -9,6 +9,7 @@
 #define _LINUX_I2C_OF_PROBER_H
 
 #include <linux/kconfig.h>
+#include <linux/types.h>
 
 struct device;
 struct device_node;
@@ -85,6 +86,7 @@ int i2c_of_probe_component(struct device *dev, const struct i2c_of_probe_cfg *cf
  *
  * The following helpers are provided:
  * * i2c_of_probe_simple_enable()
+ * * i2c_of_probe_simple_cleanup_early()
  * * i2c_of_probe_simple_cleanup()
  */
 
@@ -92,14 +94,31 @@ int i2c_of_probe_component(struct device *dev, const struct i2c_of_probe_cfg *cf
  * struct i2c_of_probe_simple_opts - Options for simple I2C component prober callbacks
  * @res_node_compatible: Compatible string of device node to retrieve resources from.
  * @supply_name: Name of regulator supply.
+ * @gpio_name: Name of GPIO. NULL if no GPIO line is used. Empty string ("") if GPIO
+ *	       line is unnamed.
  * @post_power_on_delay_ms: Delay after regulators are powered on. Passed to msleep().
+ * @post_gpio_config_delay_ms: Delay after GPIO is configured. Passed to msleep().
+ * @gpio_assert_to_enable: %true if GPIO should be asserted, i.e. set to logical high,
+ *			   to enable the component.
+ *
+ * This describes power sequences common for the class of components supported by the
+ * simple component prober:
+ * * @gpio_name is configured to the non-active setting according to @gpio_assert_to_enable.
+ * * @supply_name regulator supply is enabled.
+ * * Wait for @post_power_on_delay_ms to pass.
+ * * @gpio_name is configured to the active setting according to @gpio_assert_to_enable.
+ * * Wait for @post_gpio_config_delay_ms to pass.
  */
 struct i2c_of_probe_simple_opts {
 	const char *res_node_compatible;
 	const char *supply_name;
+	const char *gpio_name;
 	unsigned int post_power_on_delay_ms;
+	unsigned int post_gpio_config_delay_ms;
+	bool gpio_assert_to_enable;
 };
 
+struct gpio_desc;
 struct regulator;
 
 struct i2c_of_probe_simple_ctx {
@@ -107,9 +126,11 @@ struct i2c_of_probe_simple_ctx {
 	const struct i2c_of_probe_simple_opts *opts;
 	/* private: internal fields for helpers. */
 	struct regulator *supply;
+	struct gpio_desc *gpiod;
 };
 
 int i2c_of_probe_simple_enable(struct device *dev, struct device_node *bus_node, void *data);
+void i2c_of_probe_simple_cleanup_early(struct device *dev, void *data);
 void i2c_of_probe_simple_cleanup(struct device *dev, void *data);
 
 extern struct i2c_of_probe_ops i2c_of_probe_simple_ops;
-- 
cgit v1.2.3


From 054a9cd395a79f4a5595f2cd24542e9bca8723c8 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Wed, 20 Nov 2024 08:56:52 +0900
Subject: modpost: rename alias symbol for MODULE_DEVICE_TABLE()

This commit renames the alias symbol, __mod_<type>__<name>_device_table
to __mod_device_table__<type>__<name>.

This change simplifies the code slightly, as there is no longer a need
to check both the prefix and suffix.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/module.h   |  2 +-
 scripts/mod/file2alias.c | 17 +++++++----------
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 88ecc5e9f523..3dd79a3d0cbf 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -247,7 +247,7 @@ extern void cleanup_module(void);
 #ifdef MODULE
 /* Creates an alias so file2alias.c can find device table. */
 #define MODULE_DEVICE_TABLE(type, name)					\
-extern typeof(name) __mod_##type##__##name##_device_table		\
+extern typeof(name) __mod_device_table__##type##__##name		\
   __attribute__ ((unused, alias(__stringify(name))))
 #else  /* !MODULE */
 #define MODULE_DEVICE_TABLE(type, name)
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 038b2ab79e11..fc4bbeea1268 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -123,7 +123,7 @@ typedef struct {
 #include "../../include/linux/mod_devicetable.h"
 
 struct devtable {
-	const char *device_id; /* name of table, __mod_<name>__*_device_table. */
+	const char *device_id;
 	unsigned long id_size;
 	void (*do_entry)(struct module *mod, void *symval);
 };
@@ -190,7 +190,7 @@ static void device_id_check(const char *modname, const char *device_id,
 	int i;
 
 	if (size % id_size || size < id_size) {
-		fatal("%s: sizeof(struct %s_device_id)=%lu is not a modulo of the size of section __mod_%s__<identifier>_device_table=%lu.\n"
+		fatal("%s: sizeof(struct %s_device_id)=%lu is not a modulo of the size of section __mod_device_table__%s__<identifier>=%lu.\n"
 		      "Fix definition of struct %s_device_id in mod_devicetable.h\n",
 		      modname, device_id, id_size, device_id, size, device_id);
 	}
@@ -1505,6 +1505,7 @@ void handle_moddevtable(struct module *mod, struct elf_info *info,
 	char *zeros = NULL;
 	const char *type, *name;
 	size_t typelen;
+	static const char *prefix = "__mod_device_table__";
 
 	/* We're looking for a section relative symbol */
 	if (!sym->st_shndx || get_secindex(info, sym) >= info->num_sections)
@@ -1514,15 +1515,11 @@ void handle_moddevtable(struct module *mod, struct elf_info *info,
 	if (ELF_ST_TYPE(sym->st_info) != STT_OBJECT)
 		return;
 
-	/* All our symbols are of form __mod_<type>__<name>_device_table. */
-	if (!strstarts(symname, "__mod_"))
-		return;
-	type = symname + strlen("__mod_");
-	typelen = strlen(type);
-	if (typelen < strlen("_device_table"))
-		return;
-	if (strcmp(type + typelen - strlen("_device_table"), "_device_table"))
+	/* All our symbols are of form __mod_device_table__<type>__<name>. */
+	if (!strstarts(symname, prefix))
 		return;
+	type = symname + strlen(prefix);
+
 	name = strstr(type, "__");
 	if (!name)
 		return;
-- 
cgit v1.2.3


From 49b2b973325add467270e906d4d794aa6679b38b Mon Sep 17 00:00:00 2001
From: Michal Luczaj <mhal@rbox.co>
Date: Tue, 19 Nov 2024 14:31:43 +0100
Subject: net: Comment copy_from_sockptr() explaining its behaviour

copy_from_sockptr() has a history of misuse. Add a comment explaining that
the function follows API of copy_from_user(), i.e. returns 0 for success,
or number of bytes not copied on error.

Signed-off-by: Michal Luczaj <mhal@rbox.co>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/sockptr.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
index 195debe2b1db..3e6c8e9d67ae 100644
--- a/include/linux/sockptr.h
+++ b/include/linux/sockptr.h
@@ -53,6 +53,8 @@ static inline int copy_from_sockptr_offset(void *dst, sockptr_t src,
 /* Deprecated.
  * This is unsafe, unless caller checked user provided optlen.
  * Prefer copy_safe_from_sockptr() instead.
+ *
+ * Returns 0 for success, or number of bytes not copied on error.
  */
 static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
 {
-- 
cgit v1.2.3


From e2668c34b7e1a2288ea0a97ccf3cd12e2870ca18 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Sat, 23 Nov 2024 14:50:12 +0000
Subject: net: phy: fix phy_ethtool_set_eee() incorrectly enabling LPI

When phy_ethtool_set_eee_noneg() detects a change in the LPI
parameters, it attempts to update phylib state and trigger the link
to cycle so the MAC sees the updated parameters.

However, in doing so, it sets phydev->enable_tx_lpi depending on
whether the EEE configuration allows the MAC to generate LPI without
taking into account the result of negotiation.

This can be demonstrated with a 1000base-T FD interface by:

 # ethtool --set-eee eno0 advertise 8   # cause EEE to be not negotiated
 # ethtool --set-eee eno0 tx-lpi off
 # ethtool --set-eee eno0 tx-lpi on

This results in being true, despite EEE not having been negotiated and:
 # ethtool --show-eee eno0
	EEE status: enabled - inactive
	Tx LPI: 250 (us)
	Supported EEE link modes:  100baseT/Full
	                           1000baseT/Full
	Advertised EEE link modes:  100baseT/Full
	                                         1000baseT/Full

Fix this by keeping track of whether EEE was negotiated via a new
eee_active member in struct phy_device, and include this state in
the decision whether phydev->enable_tx_lpi should be set.

Fixes: 3e43b903da04 ("net: phy: Immediately call adjust_link if only tx_lpi_enabled changes")
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1tErSe-005RhB-2R@rmk-PC.armlinux.org.uk
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/phy/phy-c45.c |  2 +-
 drivers/net/phy/phy.c     | 30 ++++++++++++++++++------------
 include/linux/phy.h       |  2 ++
 3 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 96d0b3a5a9d3..944ae98ad110 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -1530,7 +1530,7 @@ int genphy_c45_ethtool_get_eee(struct phy_device *phydev,
 		return ret;
 
 	data->eee_enabled = is_enabled;
-	data->eee_active = ret;
+	data->eee_active = phydev->eee_active;
 	linkmode_copy(data->supported, phydev->supported_eee);
 
 	return 0;
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index a660a80f34b7..0d20b534122b 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -990,14 +990,14 @@ static int phy_check_link_status(struct phy_device *phydev)
 		phydev->state = PHY_RUNNING;
 		err = genphy_c45_eee_is_active(phydev,
 					       NULL, NULL, NULL);
-		if (err <= 0)
-			phydev->enable_tx_lpi = false;
-		else
-			phydev->enable_tx_lpi = phydev->eee_cfg.tx_lpi_enabled;
+		phydev->eee_active = err > 0;
+		phydev->enable_tx_lpi = phydev->eee_cfg.tx_lpi_enabled &&
+					phydev->eee_active;
 
 		phy_link_up(phydev);
 	} else if (!phydev->link && phydev->state != PHY_NOLINK) {
 		phydev->state = PHY_NOLINK;
+		phydev->eee_active = false;
 		phydev->enable_tx_lpi = false;
 		phy_link_down(phydev);
 	}
@@ -1685,15 +1685,21 @@ EXPORT_SYMBOL(phy_ethtool_get_eee);
 static void phy_ethtool_set_eee_noneg(struct phy_device *phydev,
 				      const struct eee_config *old_cfg)
 {
-	if (phydev->eee_cfg.tx_lpi_enabled != old_cfg->tx_lpi_enabled ||
+	bool enable_tx_lpi;
+
+	if (!phydev->link)
+		return;
+
+	enable_tx_lpi = phydev->eee_cfg.tx_lpi_enabled && phydev->eee_active;
+
+	if (phydev->enable_tx_lpi != enable_tx_lpi ||
 	    phydev->eee_cfg.tx_lpi_timer != old_cfg->tx_lpi_timer) {
-		phydev->enable_tx_lpi = eeecfg_mac_can_tx_lpi(&phydev->eee_cfg);
-		if (phydev->link) {
-			phydev->link = false;
-			phy_link_down(phydev);
-			phydev->link = true;
-			phy_link_up(phydev);
-		}
+		phydev->enable_tx_lpi = false;
+		phydev->link = false;
+		phy_link_down(phydev);
+		phydev->enable_tx_lpi = enable_tx_lpi;
+		phydev->link = true;
+		phy_link_up(phydev);
 	}
 }
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 77c6d6451638..563c46205685 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -602,6 +602,7 @@ struct macsec_ops;
  * @supported_eee: supported PHY EEE linkmodes
  * @advertising_eee: Currently advertised EEE linkmodes
  * @enable_tx_lpi: When True, MAC should transmit LPI to PHY
+ * @eee_active: phylib private state, indicating that EEE has been negotiated
  * @eee_cfg: User configuration of EEE
  * @lp_advertising: Current link partner advertised linkmodes
  * @host_interfaces: PHY interface modes supported by host
@@ -723,6 +724,7 @@ struct phy_device {
 	/* Energy efficient ethernet modes which should be prohibited */
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(eee_broken_modes);
 	bool enable_tx_lpi;
+	bool eee_active;
 	struct eee_config eee_cfg;
 
 	/* Host supported PHY interface types. Should be ignored if empty. */
-- 
cgit v1.2.3


From f69e63756f7822fcdad8a34f9967e8b243e883ee Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Wed, 2 Oct 2024 18:31:47 +0100
Subject: printf: Remove unused 'bprintf'

bprintf() is unused. Remove it. It was added in the commit 4370aa4aa753
("vsprintf: add binary printf") but as far as I can see was never used,
unlike the other two functions in that patch.

Link: https://lore.kernel.org/20241002173147.210107-1-linux@treblig.org
Reviewed-by: Andy Shevchenko <andy@kernel.org>
Acked-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/string.h |  1 -
 lib/vsprintf.c         | 23 -----------------------
 2 files changed, 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index 0dd27afcfaf7..493ac4862c77 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -335,7 +335,6 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s);
 #ifdef CONFIG_BINARY_PRINTF
 int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args);
 int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf);
-int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4);
 #endif
 
 extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 6ac02bbb7df1..9d3dac38a3f4 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -3428,29 +3428,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(bstr_printf);
 
-/**
- * bprintf - Parse a format string and place args' binary value in a buffer
- * @bin_buf: The buffer to place args' binary value
- * @size: The size of the buffer(by words(32bits), not characters)
- * @fmt: The format string to use
- * @...: Arguments for the format string
- *
- * The function returns the number of words(u32) written
- * into @bin_buf.
- */
-int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...)
-{
-	va_list args;
-	int ret;
-
-	va_start(args, fmt);
-	ret = vbin_printf(bin_buf, size, fmt, args);
-	va_end(args);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(bprintf);
-
 #endif /* CONFIG_BINARY_PRINTF */
 
 /**
-- 
cgit v1.2.3