From b1a37ed00d7908a991c1d0f18a8cba3c2aa99bdc Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 23 Jan 2023 12:39:11 +0000 Subject: HID: core: Provide new max_buffer_size attribute to over-ride the default Presently, when a report is processed, its proposed size, provided by the user of the API (as Report Size * Report Count) is compared against the subsystem default HID_MAX_BUFFER_SIZE (16k). However, some low-level HID drivers allocate a reduced amount of memory to their buffers (e.g. UHID only allocates UHID_DATA_MAX (4k) buffers), rending this check inadequate in some cases. In these circumstances, if the received report ends up being smaller than the proposed report size, the remainder of the buffer is zeroed. That is, the space between sizeof(csize) (size of the current report) and the rsize (size proposed i.e. Report Size * Report Count), which can be handled up to HID_MAX_BUFFER_SIZE (16k). Meaning that memset() shoots straight past the end of the buffer boundary and starts zeroing out in-use values, often resulting in calamity. This patch introduces a new variable into 'struct hid_ll_driver' where individual low-level drivers can over-ride the default maximum value of HID_MAX_BUFFER_SIZE (16k) with something more sympathetic to the interface. Signed-off-by: Lee Jones Signed-off-by: Jiri Kosina --- include/linux/hid.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index eaf8ab177303..1ea8c7a3570b 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -834,6 +834,7 @@ struct hid_driver { * @output_report: send output report to device * @idle: send idle request to device * @may_wakeup: return if device may act as a wakeup source during system-suspend + * @max_buffer_size: over-ride maximum data buffer size (default: HID_MAX_BUFFER_SIZE) */ struct hid_ll_driver { int (*start)(struct hid_device *hdev); @@ -859,6 +860,8 @@ struct hid_ll_driver { int (*idle)(struct hid_device *hdev, int report, int idle, int reqtype); bool (*may_wakeup)(struct hid_device *hdev); + + unsigned int max_buffer_size; }; extern bool hid_is_usb(const struct hid_device *hdev); -- cgit v1.2.3 From 80c16b2b121fbc3380dbffa9bab7559acbaaa2ed Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 6 Mar 2023 17:22:04 +0200 Subject: cpumask: Fix typo nr_cpumask_size --> nr_cpumask_bits The never used nr_cpumask_size is just a typo, hence use existing redefinition that's called nr_cpumask_bits. Signed-off-by: Andy Shevchenko Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 8fbe76607965..ce8eb7ef2107 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -66,7 +66,7 @@ static inline void set_nr_cpu_ids(unsigned int nr) * * Finally, some operations just want the exact limit, either because * they set bits or just don't have any faster fixed-sized versions. We - * call this just 'nr_cpumask_size'. + * call this just 'nr_cpumask_bits'. * * Note that these optional constants are always guaranteed to be at * least as big as 'nr_cpu_ids' itself is, and all our cpumask -- cgit v1.2.3 From c28cd1f3433c7e339315d1ddacaeacf0fdfbe252 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Wed, 1 Mar 2023 17:46:38 -0800 Subject: clk: Mark a fwnode as initialized when using CLK_OF_DECLARE() macro We already mark fwnodes as initialized when they are registered as clock providers. 
We do this so that fw_devlink can tell when a clock driver doesn't use the driver core framework to probe/initialize its device. This ensures fw_devlink doesn't block the consumers of such a clock provider indefinitely. However, some users of CLK_OF_DECLARE() macros don't use the same node that matches the macro as the node for the clock provider, but they initialize the entire node. To cover these cases, also mark the nodes that match the macros as initialized when the init callback function is called. An example of this is "stericsson,u8500-clks" that's handled using CLK_OF_DECLARE() and looks something like this: clocks { compatible = "stericsson,u8500-clks"; prcmu_clk: prcmu-clock { #clock-cells = <1>; }; prcc_pclk: prcc-periph-clock { #clock-cells = <2>; }; prcc_kclk: prcc-kernel-clock { #clock-cells = <2>; }; prcc_reset: prcc-reset-controller { #reset-cells = <2>; }; ... }; This patch makes sure that "clocks" is marked as initialized so that fw_devlink knows that all nodes under it have been initialized. If the driver creates struct devices for some of the subnodes, fw_devlink is smart enough to know to wait for those devices to probe, so no special handling is required for those cases. Cc: Greg Kroah-Hartman Reported-by: Linus Walleij Link: https://lore.kernel.org/lkml/CACRpkdamxDX6EBVjKX5=D3rkHp17f5pwGdBVhzFU90-0MHY6dQ@mail.gmail.com/ Fixes: 4a032827daa8 ("of: property: Simplify of_link_to_phandle()") Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20230302014639.297514-1-saravanak@google.com Reviewed-by: Linus Walleij Tested-by: Linus Walleij Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 842e72a5348f..c9f5276006a0 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1363,7 +1363,13 @@ struct clk_hw_onecell_data { struct clk_hw *hws[]; }; -#define CLK_OF_DECLARE(name, compat, fn) OF_DECLARE_1(clk, name, compat, fn) +#define CLK_OF_DECLARE(name, compat, fn) \ + static void __init name##_of_clk_init_declare(struct device_node *np) \ + { \ + fn(np); \ + fwnode_dev_initialized(of_fwnode_handle(np), true); \ + } \ + OF_DECLARE_1(clk, name, compat, name##_of_clk_init_declare) /* * Use this macro when you have a driver that requires two initialization -- cgit v1.2.3 From 849ad04cf562ac63b0371a825eed473d84de9c6d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Mar 2023 01:50:53 -0500 Subject: new helper: put_and_unmap_page() kunmap_local() + put_page(), as done by e.g. ext2 directory handling. 
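[ For illustration only, not part of the patch above: a hypothetical caller following the ext2-style pattern the commit message mentions. The function name and the mapping it reads from are made up; only put_and_unmap_page() comes from this change. ]

#include <linux/err.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>

/*
 * Map a pagecache page, read one byte, then drop both the temporary
 * mapping and the page reference with the new helper.
 */
static int example_read_first_byte(struct address_space *mapping, u8 *out)
{
	struct page *page = read_mapping_page(mapping, 0, NULL);
	u8 *kaddr;

	if (IS_ERR(page))
		return PTR_ERR(page);

	kaddr = kmap_local_page(page);
	*out = kaddr[0];
	put_and_unmap_page(page, kaddr);	/* kunmap_local() + put_page() */
	return 0;
}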
Signed-off-by: Al Viro --- include/linux/highmem.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index b06254e76d99..8fc10089e19e 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -481,4 +481,10 @@ static inline void folio_zero_range(struct folio *folio, zero_user_segments(&folio->page, start, start + length, 0, 0); } +static inline void put_and_unmap_page(struct page *page, void *addr) +{ + kunmap_local(addr); + put_page(page); +} + #endif /* _LINUX_HIGHMEM_H */ -- cgit v1.2.3 From 63355b9884b3d1677de6bd1517cd2b8a9bf53978 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 7 Mar 2023 12:16:18 -0800 Subject: cpumask: be more careful with 'cpumask_setall()' Commit 596ff4a09b89 ("cpumask: re-introduce constant-sized cpumask optimizations") changed cpumask_setall() to use "bitmap_set()" instead of "bitmap_fill()", because bitmap_fill() would explicitly set all the bits of a constant sized small bitmap, and that's exactly what we don't want: we want to only set bits up to 'nr_cpu_ids', which is what "bitmap_set()" does. However, Yury correctly points out that while "bitmap_set()" does indeed only set bits up to the required bitmap size, it doesn't _clear_ bits above that size, so the upper bits would still not have well-defined values. Now, none of this should really matter, since any bits set past 'nr_cpu_ids' should always be ignored in the first place. Yes, the bit scanning functions might return them as a result, but since users should always consider the ">= nr_cpu_ids" condition to mean "no more bits", that shouldn't have any actual effect (see previous commit 8ca09d5fa354 "cpumask: fix incorrect cpumask scanning result checks"). But let's just do it right, the way the code was _intended_ to work. We have had enough lazy code that works but bites us in the *rse later (again, see previous commit) that there's no reason to not just do this properly. It turns out that "bitmap_fill()" gets this all right for the complex case, and really only fails for the inlined optimized case that just fills the whole word. And while we could just fix bitmap_fill() to use the proper last word mask, there's two issues with that: - the cpumask case wants to do the _optimization_ based on "NR_CPUS is a small constant", but then wants to do the actual bit _fill_ based on "nr_cpu_ids" that isn't necessarily that same constant - we have lots of non-cpumask users of bitmap_fill(), and while they hopefully don't care, and probably would want the proper semantics anyway ("only set bits up to the limit"), I do not want the cpumask changes to impact other parts So this ends up just doing the single-word optimization by hand in the cpumask code. If our cpumask is fundamentally limited to a single word, just do the proper "fill in that word" exactly. And if it's the more complex multi-word case, then the generic bitmap_fill() will DTRT. This is all an example of how our bitmap function optimizations really are somewhat broken. They conflate the "this is size of the bitmap" optimizations with the actual bit(s) we want to set. In many cases we really want to have the two be separate things: sometimes we base our optimizations on the size of the whole bitmap ("I know this whole bitmap fits in a single word, so I'll just use single-word accesses"), and sometimes we base them on the bit we are looking at ("this is just acting on bits that are in the first word, so I'll use single-word accesses"). 
Notice how the end result of the two optimizations are the same, but the way we get to them are quite different. And all our cpumask optimization games are really about that fundamental distinction, and we'd often really want to pass in both the "this is the bit I'm working on" (which _can_ be a small constant but might be variable), and "I know it's in this range even if it's variable" (based on CONFIG_NR_CPUS). So this cpumask_setall() implementation just makes that explicit. It checks the "I statically know the size is small" using the known static size of the cpumask (which is what that 'small_cpumask_bits' is all about), but then sets the actual bits using the exact number of cpus we have (ie 'nr_cpumask_bits') Of course, in a perfect world, the compiler would have done all the range analysis (possibly with help from us just telling it that "this value is always in this range"), and would do all of this for us. But that is not the world we live in. While we dream of that perfect world, this does that manual logic to make it all work out. And this was a very long explanation for a small code change that shouldn't even matter. Reported-by: Yury Norov Link: https://lore.kernel.org/lkml/ZAV9nGG9e1%2FrV+L%2F@yury-laptop/ Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ce8eb7ef2107..63d637d18e79 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -518,14 +518,14 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask * /** * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer - * - * Note: since we set bits, we should use the tighter 'bitmap_set()' with - * the eact number of bits, not 'bitmap_fill()' that will fill past the - * end. */ static inline void cpumask_setall(struct cpumask *dstp) { - bitmap_set(cpumask_bits(dstp), 0, nr_cpumask_bits); + if (small_const_nbits(small_cpumask_bits)) { + cpumask_bits(dstp)[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits); + return; + } + bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } /** -- cgit v1.2.3 From eb59eca0d8ac15f8c1b7f1cd35999455a90292c0 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 6 Mar 2023 08:56:31 +0100 Subject: interconnect: fix provider registration API The current interconnect provider interface is inherently racy as providers are expected to be added before being fully initialised. Specifically, nodes are currently not added and the provider data is not initialised until after registering the provider which can cause racing DT lookups to fail. Add a new provider API which will be used to fix up the interconnect drivers. The old API is reimplemented using the new interface and will be removed once all drivers have been fixed. 
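[ For illustration only, not part of the patch above: the ordering a provider driver is expected to follow with the new interface. The probe function, set() callback and node id are hypothetical, and the provider->data setup needed by of_icc_xlate_onecell() is omitted for brevity. ]

#include <linux/interconnect-provider.h>
#include <linux/platform_device.h>

static int example_icc_set(struct icc_node *src, struct icc_node *dst)
{
	/* program the hardware for the aggregated bandwidth */
	return 0;
}

static int example_icc_probe(struct platform_device *pdev)
{
	struct icc_provider *provider;
	struct icc_node *node;
	int ret;

	provider = devm_kzalloc(&pdev->dev, sizeof(*provider), GFP_KERNEL);
	if (!provider)
		return -ENOMEM;

	provider->dev = &pdev->dev;
	provider->set = example_icc_set;
	provider->xlate = of_icc_xlate_onecell;

	/* 1. initialise the provider before creating any nodes */
	icc_provider_init(provider);

	/* 2. create and add the nodes */
	node = icc_node_create(1);
	if (IS_ERR(node))
		return PTR_ERR(node);
	icc_node_add(node, provider);

	/* 3. register only once the topology is in place */
	ret = icc_provider_register(provider);
	if (ret)
		icc_nodes_remove(provider);

	return ret;
}

[ Teardown would reverse the order: icc_provider_deregister() first, then icc_nodes_remove(). ]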
Fixes: 11f1ceca7031 ("interconnect: Add generic on-chip interconnect API") Fixes: 87e3031b6fbd ("interconnect: Allow endpoints translation via DT") Cc: stable@vger.kernel.org # 5.1 Reviewed-by: Konrad Dybcio Signed-off-by: Johan Hovold Tested-by: Luca Ceresoli # i.MX8MP MSC SM2-MB-EP1 Board Link: https://lore.kernel.org/r/20230306075651.2449-4-johan+linaro@kernel.org Signed-off-by: Georgi Djakov --- drivers/interconnect/core.c | 52 +++++++++++++++++++++++++---------- include/linux/interconnect-provider.h | 12 ++++++++ 2 files changed, 50 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c index cabb6f5df83e..7a24c1444ace 100644 --- a/drivers/interconnect/core.c +++ b/drivers/interconnect/core.c @@ -1033,44 +1033,68 @@ int icc_nodes_remove(struct icc_provider *provider) EXPORT_SYMBOL_GPL(icc_nodes_remove); /** - * icc_provider_add() - add a new interconnect provider - * @provider: the interconnect provider that will be added into topology + * icc_provider_init() - initialize a new interconnect provider + * @provider: the interconnect provider to initialize + * + * Must be called before adding nodes to the provider. + */ +void icc_provider_init(struct icc_provider *provider) +{ + WARN_ON(!provider->set); + + INIT_LIST_HEAD(&provider->nodes); +} +EXPORT_SYMBOL_GPL(icc_provider_init); + +/** + * icc_provider_register() - register a new interconnect provider + * @provider: the interconnect provider to register * * Return: 0 on success, or an error code otherwise */ -int icc_provider_add(struct icc_provider *provider) +int icc_provider_register(struct icc_provider *provider) { - if (WARN_ON(!provider->set)) - return -EINVAL; if (WARN_ON(!provider->xlate && !provider->xlate_extended)) return -EINVAL; mutex_lock(&icc_lock); - - INIT_LIST_HEAD(&provider->nodes); list_add_tail(&provider->provider_list, &icc_providers); - mutex_unlock(&icc_lock); - dev_dbg(provider->dev, "interconnect provider added to topology\n"); + dev_dbg(provider->dev, "interconnect provider registered\n"); return 0; } -EXPORT_SYMBOL_GPL(icc_provider_add); +EXPORT_SYMBOL_GPL(icc_provider_register); /** - * icc_provider_del() - delete previously added interconnect provider - * @provider: the interconnect provider that will be removed from topology + * icc_provider_deregister() - deregister an interconnect provider + * @provider: the interconnect provider to deregister */ -void icc_provider_del(struct icc_provider *provider) +void icc_provider_deregister(struct icc_provider *provider) { mutex_lock(&icc_lock); WARN_ON(provider->users); - WARN_ON(!list_empty(&provider->nodes)); list_del(&provider->provider_list); mutex_unlock(&icc_lock); } +EXPORT_SYMBOL_GPL(icc_provider_deregister); + +int icc_provider_add(struct icc_provider *provider) +{ + icc_provider_init(provider); + + return icc_provider_register(provider); +} +EXPORT_SYMBOL_GPL(icc_provider_add); + +void icc_provider_del(struct icc_provider *provider) +{ + WARN_ON(!list_empty(&provider->nodes)); + + icc_provider_deregister(provider); +} EXPORT_SYMBOL_GPL(icc_provider_del); static const struct of_device_id __maybe_unused ignore_list[] = { diff --git a/include/linux/interconnect-provider.h b/include/linux/interconnect-provider.h index cd5c5a27557f..d12cd18aab3f 100644 --- a/include/linux/interconnect-provider.h +++ b/include/linux/interconnect-provider.h @@ -122,6 +122,9 @@ int icc_link_destroy(struct icc_node *src, struct icc_node *dst); void icc_node_add(struct icc_node *node, struct icc_provider 
*provider); void icc_node_del(struct icc_node *node); int icc_nodes_remove(struct icc_provider *provider); +void icc_provider_init(struct icc_provider *provider); +int icc_provider_register(struct icc_provider *provider); +void icc_provider_deregister(struct icc_provider *provider); int icc_provider_add(struct icc_provider *provider); void icc_provider_del(struct icc_provider *provider); struct icc_node_data *of_icc_get_from_provider(struct of_phandle_args *spec); @@ -167,6 +170,15 @@ static inline int icc_nodes_remove(struct icc_provider *provider) return -ENOTSUPP; } +static inline void icc_provider_init(struct icc_provider *provider) { } + +static inline int icc_provider_register(struct icc_provider *provider) +{ + return -ENOTSUPP; +} + +static inline void icc_provider_deregister(struct icc_provider *provider) { } + static inline int icc_provider_add(struct icc_provider *provider) { return -ENOTSUPP; -- cgit v1.2.3 From 03c835f498b540087244a6757e87dfe7ef10999b Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Sun, 26 Feb 2023 23:26:52 +0100 Subject: i2c: Switch .probe() to not take an id parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b8a1a4cd5a98 ("i2c: Provide a temporary .probe_new() call-back type") introduced a new probe callback to convert i2c init routines to not take an i2c_device_id parameter. Now that all in-tree drivers are converted to the temporary .probe_new() callback, .probe() can be modified to match the desired prototype. Now that .probe() and .probe_new() have the same semantic, they can be defined as members of an anonymous union to save some memory and simplify the core code a bit. Signed-off-by: Uwe Kleine-König Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 11 ++--------- include/linux/i2c.h | 18 +++++++++++------- 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index cb5fa971d67e..63253e2b2c1f 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -561,15 +561,8 @@ static int i2c_device_probe(struct device *dev) goto err_detach_pm_domain; } - /* - * When there are no more users of probe(), - * rename probe_new to probe. - */ - if (driver->probe_new) - status = driver->probe_new(client); - else if (driver->probe) - status = driver->probe(client, - i2c_match_id(driver->id_table, client)); + if (driver->probe) + status = driver->probe(client); else status = -EINVAL; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 500404d85141..5ba89663ea86 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -236,8 +236,8 @@ enum i2c_driver_flags { /** * struct i2c_driver - represent an I2C device driver * @class: What kind of i2c device we instantiate (for detect) - * @probe: Callback for device binding - soon to be deprecated - * @probe_new: New callback for device binding + * @probe: Callback for device binding + * @probe_new: Transitional callback for device binding - do not use * @remove: Callback for device unbinding * @shutdown: Callback for device shutdown * @alert: Alert callback, for example for the SMBus alert protocol @@ -272,14 +272,18 @@ enum i2c_driver_flags { struct i2c_driver { unsigned int class; + union { /* Standard driver model interfaces */ - int (*probe)(struct i2c_client *client, const struct i2c_device_id *id); + int (*probe)(struct i2c_client *client); + /* + * Legacy callback that was part of a conversion of .probe(). 
+ * Today it has the same semantic as .probe(). Don't use for new + * code. + */ + int (*probe_new)(struct i2c_client *client); + }; void (*remove)(struct i2c_client *client); - /* New driver model interface to aid the seamless removal of the - * current probe()'s, more commonly unused than used second parameter. - */ - int (*probe_new)(struct i2c_client *client); /* driver model interfaces that don't relate to enumeration */ void (*shutdown)(struct i2c_client *client); -- cgit v1.2.3 From 5cf9d015be160e2d90d29ae74ef1364390e8fce8 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 8 Mar 2023 13:47:11 -0700 Subject: clk: Avoid invalid function names in CLK_OF_DECLARE() After commit c28cd1f3433c ("clk: Mark a fwnode as initialized when using CLK_OF_DECLARE() macro"), drivers/clk/mvebu/kirkwood.c fails to build: drivers/clk/mvebu/kirkwood.c:358:1: error: expected identifier or '(' CLK_OF_DECLARE(98dx1135_clk, "marvell,mv98dx1135-core-clock", ^ include/linux/clk-provider.h:1367:21: note: expanded from macro 'CLK_OF_DECLARE' static void __init name##_of_clk_init_declare(struct device_node *np) \ ^ :124:1: note: expanded from here 98dx1135_clk_of_clk_init_declare ^ drivers/clk/mvebu/kirkwood.c:358:1: error: invalid digit 'd' in decimal constant include/linux/clk-provider.h:1372:34: note: expanded from macro 'CLK_OF_DECLARE' OF_DECLARE_1(clk, name, compat, name##_of_clk_init_declare) ^ :125:3: note: expanded from here 98dx1135_clk_of_clk_init_declare ^ drivers/clk/mvebu/kirkwood.c:358:1: error: invalid digit 'd' in decimal constant include/linux/clk-provider.h:1372:34: note: expanded from macro 'CLK_OF_DECLARE' OF_DECLARE_1(clk, name, compat, name##_of_clk_init_declare) ^ :125:3: note: expanded from here 98dx1135_clk_of_clk_init_declare ^ drivers/clk/mvebu/kirkwood.c:358:1: error: invalid digit 'd' in decimal constant include/linux/clk-provider.h:1372:34: note: expanded from macro 'CLK_OF_DECLARE' OF_DECLARE_1(clk, name, compat, name##_of_clk_init_declare) ^ :125:3: note: expanded from here 98dx1135_clk_of_clk_init_declare ^ C function names must start with either an alphabetic letter or an underscore. To avoid generating invalid function names from clock names, add two underscores to the beginning of the identifier. 
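[ For illustration only, not part of the patch above: a typical CLK_OF_DECLARE() user with made-up names. After this change the generated wrapper is __example_clk_of_clk_init_declare(); a clock name beginning with a digit, such as the 98dx1135_clk case above, likewise gains the leading underscores and stays a valid C identifier. ]

#include <linux/clk-provider.h>
#include <linux/init.h>
#include <linux/of.h>

static void __init example_clk_init(struct device_node *np)
{
	/* register the clocks described by 'np', e.g. via of_clk_add_hw_provider() */
}
CLK_OF_DECLARE(example_clk, "vendor,example-clk", example_clk_init);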
Fixes: c28cd1f3433c ("clk: Mark a fwnode as initialized when using CLK_OF_DECLARE() macro") Suggested-by: Saravana Kannan Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20230308-clk_of_declare-fix-v1-1-317b741e2532@kernel.org Reviewed-by: Saravana Kannan Reported-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index c9f5276006a0..6f3175f0678a 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1364,12 +1364,12 @@ struct clk_hw_onecell_data { }; #define CLK_OF_DECLARE(name, compat, fn) \ - static void __init name##_of_clk_init_declare(struct device_node *np) \ + static void __init __##name##_of_clk_init_declare(struct device_node *np) \ { \ fn(np); \ fwnode_dev_initialized(of_fwnode_handle(np), true); \ } \ - OF_DECLARE_1(clk, name, compat, name##_of_clk_init_declare) + OF_DECLARE_1(clk, name, compat, __##name##_of_clk_init_declare) /* * Use this macro when you have a driver that requires two initialization -- cgit v1.2.3 From fe9ae05cfbe587dda724fcf537c00bc2f287da62 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 8 Mar 2023 11:50:12 +0100 Subject: fbdev: Fix incorrect page mapping clearance at fb_deferred_io_release() The recent fix for the deferred I/O by the commit 3efc61d95259 ("fbdev: Fix invalid page access after closing deferred I/O devices") caused a regression when the same fb device is opened/closed while it's being used. It resulted in a frozen screen even if something is redrawn there after the close. The breakage is because the patch was made under a wrong assumption of a single open; in the current code, fb_deferred_io_release() cleans up the page mapping of the pageref list and it calls cancel_delayed_work_sync() unconditionally, where both are no correct behavior for multiple opens. This patch adds a refcount for the opens of the device, and applies the cleanup only when all files get closed. As both fb_deferred_io_open() and _close() are called always in the fb_info lock (mutex), it's safe to use the normal int for the refcounting. Also, a useless BUG_ON() is dropped. 
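[ For illustration only, not part of the patch above: how a fbdev driver typically wires up deferred I/O. The names are hypothetical; the new open_count field is maintained entirely by the core in fb_deferred_io_open()/fb_deferred_io_release(), so drivers never touch it. ]

#include <linux/fb.h>

static void example_defio_flush(struct fb_info *info, struct list_head *pagereflist)
{
	/* push the touched pages out to the device (USB, SPI, ...) */
}

static struct fb_deferred_io example_defio = {
	.delay		= HZ / 20,		/* batch writes for 50 ms */
	.deferred_io	= example_defio_flush,
};

static int example_setup_defio(struct fb_info *info)
{
	info->fbdefio = &example_defio;
	return fb_deferred_io_init(info);
}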
Fixes: 3efc61d95259 ("fbdev: Fix invalid page access after closing deferred I/O devices") Cc: Signed-off-by: Takashi Iwai Reviewed-by: Patrik Jakobsson Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20230308105012.1845-1-tiwai@suse.de --- drivers/video/fbdev/core/fb_defio.c | 17 +++++++++++++---- include/linux/fb.h | 1 + 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 583cbcf09446..a3cf1f764f29 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -309,17 +309,18 @@ void fb_deferred_io_open(struct fb_info *info, struct inode *inode, struct file *file) { + struct fb_deferred_io *fbdefio = info->fbdefio; + file->f_mapping->a_ops = &fb_deferred_io_aops; + fbdefio->open_count++; } EXPORT_SYMBOL_GPL(fb_deferred_io_open); -void fb_deferred_io_release(struct fb_info *info) +static void fb_deferred_io_lastclose(struct fb_info *info) { - struct fb_deferred_io *fbdefio = info->fbdefio; struct page *page; int i; - BUG_ON(!fbdefio); cancel_delayed_work_sync(&info->deferred_work); /* clear out the mapping that we setup */ @@ -328,13 +329,21 @@ void fb_deferred_io_release(struct fb_info *info) page->mapping = NULL; } } + +void fb_deferred_io_release(struct fb_info *info) +{ + struct fb_deferred_io *fbdefio = info->fbdefio; + + if (!--fbdefio->open_count) + fb_deferred_io_lastclose(info); +} EXPORT_SYMBOL_GPL(fb_deferred_io_release); void fb_deferred_io_cleanup(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; - fb_deferred_io_release(info); + fb_deferred_io_lastclose(info); kvfree(info->pagerefs); mutex_destroy(&fbdefio->lock); diff --git a/include/linux/fb.h b/include/linux/fb.h index 73eb1f85ea8e..05e40fcc7696 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -212,6 +212,7 @@ struct fb_deferred_io { /* delay between mkwrite and deferred handler */ unsigned long delay; bool sort_pagereflist; /* sort pagelist by offset */ + int open_count; /* number of opened files; protected by fb_info lock */ struct mutex lock; /* mutex that protects the pageref list */ struct list_head pagereflist; /* list of pagerefs for touched pages */ /* callback */ -- cgit v1.2.3 From 62913ae96de747091c4dacd06d158e7729c1a76d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 7 Mar 2023 23:15:49 -0500 Subject: ext4, jbd2: add an optimized bmap for the journal inode The generic bmap() function exported by the VFS takes locks and does checks that are not necessary for the journal inode. So allow the file system to set a journal-optimized bmap function in journal->j_bmap. 
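[ For illustration only, not part of the patch above: another journalling user could hook the new callback in the same way ext4 does in the diff below. The filesystem name and the translation logic are hypothetical. ]

#include <linux/jbd2.h>

static int example_fs_journal_bmap(journal_t *journal, sector_t *block)
{
	/*
	 * Translate the journal-relative block number in *block to an
	 * on-disk block, without the locking done by the generic bmap().
	 */
	return 0;
}

static journal_t *example_fs_get_journal(struct super_block *sb,
					  struct inode *journal_inode)
{
	journal_t *journal = jbd2_journal_init_inode(journal_inode);

	if (!journal)
		return NULL;
	journal->j_private = sb;
	journal->j_bmap = example_fs_journal_bmap;	/* used by jbd2_journal_bmap() */
	return journal;
}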
Reported-by: syzbot+9543479984ae9e576000@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?id=e4aaa78795e490421c79f76ec3679006c8ff4cf0 Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 23 +++++++++++++++++++++++ fs/jbd2/journal.c | 9 ++++++--- include/linux/jbd2.h | 8 ++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2192b4111442..46b7345d2b6a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5742,6 +5742,28 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, return journal_inode; } +static int ext4_journal_bmap(journal_t *journal, sector_t *block) +{ + struct ext4_map_blocks map; + int ret; + + if (journal->j_inode == NULL) + return 0; + + map.m_lblk = *block; + map.m_len = 1; + ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0); + if (ret <= 0) { + ext4_msg(journal->j_inode->i_sb, KERN_CRIT, + "journal bmap failed: block %llu ret %d\n", + *block, ret); + jbd2_journal_abort(journal, ret ? ret : -EIO); + return ret; + } + *block = map.m_pblk; + return 0; +} + static journal_t *ext4_get_journal(struct super_block *sb, unsigned int journal_inum) { @@ -5762,6 +5784,7 @@ static journal_t *ext4_get_journal(struct super_block *sb, return NULL; } journal->j_private = sb; + journal->j_bmap = ext4_journal_bmap; ext4_init_journal_params(sb, journal); return journal; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2696f43e7239..c84f588fdcd0 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -970,10 +970,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, { int err = 0; unsigned long long ret; - sector_t block = 0; + sector_t block = blocknr; - if (journal->j_inode) { - block = blocknr; + if (journal->j_bmap) { + err = journal->j_bmap(journal, &block); + if (err == 0) + *retp = block; + } else if (journal->j_inode) { ret = bmap(journal->j_inode, &block); if (ret || !block) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 2170e0cc279d..6ffa34c51a11 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1308,6 +1308,14 @@ struct journal_s struct buffer_head *bh, enum passtype pass, int off, tid_t expected_commit_id); + + /** + * @j_bmap: + * + * Bmap function that should be used instead of the generic + * VFS bmap function. + */ + int (*j_bmap)(struct journal_s *journal, sector_t *block); }; #define jbd2_might_wait_for_commit(j) \ -- cgit v1.2.3 From e7304080e0e50d979ce9eaf694ad8283e2e539ea Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 12 Mar 2023 08:52:03 -0700 Subject: cpumask: relax sanity checking constraints The cpumask_check() was unnecessarily tight, and causes problems for the users of cpumask_next(). We have a number of users that take the previous return value of one of the bit scanning functions and subtract one to keep it in "range". But since the scanning functions end up returning up to 'small_cpumask_bits' instead of the tighter 'nr_cpumask_bits', the range really needs to be using that widened form. [ This "previous-1" behavior is also the reason we have all those comments about /* -1 is a legal arg here. */ and separate checks for that being ok. So we could have just made "small_cpumask_bits-1" be a similar special "don't check this" value. Tetsuo Handa even suggested a patch that only does that for cpumask_next(), since that seems to be the only actual case that triggers, but that all makes it even _more_ magical and special. 
So just relax the check ] One example of this kind of pattern being the 'c_start()' function in arch/x86/kernel/cpu/proc.c, but also duplicated in various forms on other architectures. Reported-by: syzbot+96cae094d90877641f32@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=96cae094d90877641f32 Reported-by: Tetsuo Handa Link: https://lore.kernel.org/lkml/c1f4cc16-feea-b83c-82cf-1a1f007b7eb9@I-love.SAKURA.ne.jp/ Fixes: 596ff4a09b89 ("cpumask: re-introduce constant-sized cpumask optimizations") Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 63d637d18e79..d4901ca8883c 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -147,7 +147,7 @@ static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bit /* verify cpu argument to cpumask_* operators */ static __always_inline unsigned int cpumask_check(unsigned int cpu) { - cpu_max_bits_warn(cpu, nr_cpumask_bits); + cpu_max_bits_warn(cpu, small_cpumask_bits); return cpu; } -- cgit v1.2.3 From ab909509850b27fd39b8ba99e44cda39dbc3858c Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Mon, 6 Mar 2023 16:10:11 +0100 Subject: PCI: s390: Fix use-after-free of PCI resources with per-function hotplug On s390 PCI functions may be hotplugged individually even when they belong to a multi-function device. In particular on an SR-IOV device VFs may be removed and later re-added. In commit a50297cf8235 ("s390/pci: separate zbus creation from scanning") it was missed however that struct pci_bus and struct zpci_bus's resource list retained a reference to the PCI functions MMIO resources even though those resources are released and freed on hot-unplug. These stale resources may subsequently be claimed when the PCI function re-appears resulting in use-after-free. One idea of fixing this use-after-free in s390 specific code that was investigated was to simply keep resources around from the moment a PCI function first appeared until the whole virtual PCI bus created for a multi-function device disappears. The problem with this however is that due to the requirement of artificial MMIO addreesses (address cookies) extra logic is then needed to keep the address cookies compatible on re-plug. At the same time the MMIO resources semantically belong to the PCI function so tying their lifecycle to the function seems more logical. Instead a simpler approach is to remove the resources of an individually hot-unplugged PCI function from the PCI bus's resource list while keeping the resources of other PCI functions on the PCI bus untouched. This is done by introducing pci_bus_remove_resource() to remove an individual resource. Similarly the resource also needs to be removed from the struct zpci_bus's resource list. It turns out however, that there is really no need to add the MMIO resources to the struct zpci_bus's resource list at all and instead we can simply use the zpci_bar_struct's resource pointer directly. 
Fixes: a50297cf8235 ("s390/pci: separate zbus creation from scanning") Signed-off-by: Niklas Schnelle Reviewed-by: Matthew Rosato Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20230306151014.60913-2-schnelle@linux.ibm.com Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci.c | 16 ++++++++++------ arch/s390/pci/pci_bus.c | 12 +++++------- arch/s390/pci/pci_bus.h | 3 +-- drivers/pci/bus.c | 21 +++++++++++++++++++++ include/linux/pci.h | 1 + 5 files changed, 38 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index ef38b1514c77..e16afacc8fd1 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -544,8 +544,7 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start, return r; } -int zpci_setup_bus_resources(struct zpci_dev *zdev, - struct list_head *resources) +int zpci_setup_bus_resources(struct zpci_dev *zdev) { unsigned long addr, size, flags; struct resource *res; @@ -581,7 +580,6 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev, return -ENOMEM; } zdev->bars[i].res = res; - pci_add_resource(resources, res); } zdev->has_resources = 1; @@ -590,17 +588,23 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev, static void zpci_cleanup_bus_resources(struct zpci_dev *zdev) { + struct resource *res; int i; + pci_lock_rescan_remove(); for (i = 0; i < PCI_STD_NUM_BARS; i++) { - if (!zdev->bars[i].size || !zdev->bars[i].res) + res = zdev->bars[i].res; + if (!res) continue; + release_resource(res); + pci_bus_remove_resource(zdev->zbus->bus, res); zpci_free_iomap(zdev, zdev->bars[i].map_idx); - release_resource(zdev->bars[i].res); - kfree(zdev->bars[i].res); + zdev->bars[i].res = NULL; + kfree(res); } zdev->has_resources = 0; + pci_unlock_rescan_remove(); } int pcibios_device_add(struct pci_dev *pdev) diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 6a8da1b742ae..a99926af2b69 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -41,9 +41,7 @@ static int zpci_nb_devices; */ static int zpci_bus_prepare_device(struct zpci_dev *zdev) { - struct resource_entry *window, *n; - struct resource *res; - int rc; + int rc, i; if (!zdev_enabled(zdev)) { rc = zpci_enable_device(zdev); @@ -57,10 +55,10 @@ static int zpci_bus_prepare_device(struct zpci_dev *zdev) } if (!zdev->has_resources) { - zpci_setup_bus_resources(zdev, &zdev->zbus->resources); - resource_list_for_each_entry_safe(window, n, &zdev->zbus->resources) { - res = window->res; - pci_bus_add_resource(zdev->zbus->bus, res, 0); + zpci_setup_bus_resources(zdev); + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (zdev->bars[i].res) + pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res, 0); } } diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h index e96c9860e064..af9f0ac79a1b 100644 --- a/arch/s390/pci/pci_bus.h +++ b/arch/s390/pci/pci_bus.h @@ -30,8 +30,7 @@ static inline void zpci_zdev_get(struct zpci_dev *zdev) int zpci_alloc_domain(int domain); void zpci_free_domain(int domain); -int zpci_setup_bus_resources(struct zpci_dev *zdev, - struct list_head *resources); +int zpci_setup_bus_resources(struct zpci_dev *zdev); static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus, unsigned int devfn) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 83ae838ceb5f..549c4bd5caec 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -76,6 +76,27 @@ struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n) } EXPORT_SYMBOL_GPL(pci_bus_resource_n); +void 
pci_bus_remove_resource(struct pci_bus *bus, struct resource *res) +{ + struct pci_bus_resource *bus_res, *tmp; + int i; + + for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) { + if (bus->resource[i] == res) { + bus->resource[i] = NULL; + return; + } + } + + list_for_each_entry_safe(bus_res, tmp, &bus->resources, list) { + if (bus_res->res == res) { + list_del(&bus_res->list); + kfree(bus_res); + return; + } + } +} + void pci_bus_remove_resources(struct pci_bus *bus) { int i; diff --git a/include/linux/pci.h b/include/linux/pci.h index fafd8020c6d7..b50e5c79f7e3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1438,6 +1438,7 @@ void pci_bus_add_resource(struct pci_bus *bus, struct resource *res, unsigned int flags); struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n); void pci_bus_remove_resources(struct pci_bus *bus); +void pci_bus_remove_resource(struct pci_bus *bus, struct resource *res); int devm_request_pci_bus_resources(struct device *dev, struct list_head *resources); -- cgit v1.2.3 From 34e0a279a993debaff03158fc2fbf6a00c093643 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 13 Mar 2023 10:30:02 +0100 Subject: block: do not reverse request order when flushing plug list Commit 26fed4ac4eab ("block: flush plug based on hardware and software queue order") changed flushing of plug list to submit requests one device at a time. However while doing that it also started using list_add_tail() instead of list_add() used previously thus effectively submitting requests in reverse order. Also when forming a rq_list with remaining requests (in case two or more devices are used), we effectively reverse the ordering of the plug list for each device we process. Submitting requests in reverse order has negative impact on performance for rotational disks (when BFQ is not in use). We observe 10-25% regression in random 4k write throughput, as well as ~20% regression in MariaDB OLTP benchmark on rotational storage on btrfs filesystem. Fix the problem by preserving ordering of the plug list when inserting requests into the queuelist as well as by appending to requeue_list instead of prepending to it. 
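[ For illustration only, not part of the patch above: the new tail-append helper builds a list that preserves submission order, in contrast to rq_list_add(), which prepends. The function and its arguments are hypothetical. ]

#include <linux/blk-mq.h>

static void example_collect_in_order(struct request **rqs, int nr,
				     struct request **listp)
{
	struct request **lastp = listp;
	int i;

	*listp = NULL;				/* start with an empty rq_list */
	for (i = 0; i < nr; i++)
		rq_list_add_tail(&lastp, rqs[i]);	/* append, keeping order */
}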
Fixes: 26fed4ac4eab ("block: flush plug based on hardware and software queue order") Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230313093002.11756-1-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- include/linux/blk-mq.h | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index d0cb2ef18fe2..cf1a39adf9a5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2725,6 +2725,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; struct request *requeue_list = NULL; + struct request **requeue_lastp = &requeue_list; unsigned int depth = 0; LIST_HEAD(list); @@ -2735,10 +2736,10 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) this_hctx = rq->mq_hctx; this_ctx = rq->mq_ctx; } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { - rq_list_add(&requeue_list, rq); + rq_list_add_tail(&requeue_lastp, rq); continue; } - list_add_tail(&rq->queuelist, &list); + list_add(&rq->queuelist, &list); depth++; } while (!rq_list_empty(plug->mq_list)); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index dd5ce1137f04..de0b0c3e7395 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -228,6 +228,12 @@ static inline unsigned short req_get_ioprio(struct request *req) *(listptr) = rq; \ } while (0) +#define rq_list_add_tail(lastpptr, rq) do { \ + (rq)->rq_next = NULL; \ + **(lastpptr) = rq; \ + *(lastpptr) = &rq->rq_next; \ +} while (0) + #define rq_list_pop(listptr) \ ({ \ struct request *__req = NULL; \ -- cgit v1.2.3 From c2679254b9c9980d9045f0f722cf093a2b1f7590 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 10 Mar 2023 17:28:56 -0500 Subject: tracing: Make tracepoint lockdep check actually test something A while ago where the trace events had the following: rcu_read_lock_sched_notrace(); rcu_dereference_sched(...); rcu_read_unlock_sched_notrace(); If the tracepoint is enabled, it could trigger RCU issues if called in the wrong place. And this warning was only triggered if lockdep was enabled. If the tracepoint was never enabled with lockdep, the bug would not be caught. To handle this, the above sequence was done when lockdep was enabled regardless if the tracepoint was enabled or not (although the always enabled code really didn't do anything, it would still trigger a warning). But a lot has changed since that lockdep code was added. One is, that sequence no longer triggers any warning. Another is, the tracepoint when enabled doesn't even do that sequence anymore. The main check we care about today is whether RCU is "watching" or not. So if lockdep is enabled, always check if rcu_is_watching() which will trigger a warning if it is not (tracepoints require RCU to be watching). Note, that old sequence did add a bit of overhead when lockdep was enabled, and with the latest kernel updates, would cause the system to slow down enough to trigger kernel "stalled" warnings. Link: http://lore.kernel.org/lkml/20140806181801.GA4605@redhat.com Link: http://lore.kernel.org/lkml/20140807175204.C257CAC5@viggo.jf.intel.com Link: https://lore.kernel.org/lkml/20230307184645.521db5c9@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20230310172856.77406446@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Dave Hansen Cc: "Paul E. 
McKenney" Cc: Mathieu Desnoyers Cc: Joel Fernandes Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Fixes: e6753f23d961 ("tracepoint: Make rcuidle tracepoint callers use SRCU") Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index fa1004fcf810..2083f2d2f05b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -231,12 +231,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. * - * When lockdep is enabled, we make sure to always do the RCU portions of - * the tracepoint code, regardless of whether tracing is on. However, - * don't check if the condition is false, due to interaction with idle - * instrumentation. This lets us find RCU issues triggered with tracepoints - * even when this tracepoint is off. This code has no purpose other than - * poking RCU a bit. + * When lockdep is enabled, we make sure to always test if RCU is + * "watching" regardless if the tracepoint is enabled or not. Tracepoints + * require RCU to be active, and it should always warn at the tracepoint + * site if it is not watching, as it will need to be active when the + * tracepoint is enabled. */ #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ extern int __traceiter_##name(data_proto); \ @@ -249,9 +248,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) TP_ARGS(args), \ TP_CONDITION(cond), 0); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ - rcu_read_lock_sched_notrace(); \ - rcu_dereference_sched(__tracepoint_##name.funcs);\ - rcu_read_unlock_sched_notrace(); \ + WARN_ON_ONCE(!rcu_is_watching()); \ } \ } \ __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ -- cgit v1.2.3 From 4b397c06cb987935b1b097336532aa6b4210e091 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Mar 2023 19:11:09 +0000 Subject: net: tunnels: annotate lockless accesses to dev->needed_headroom IP tunnels can apparently update dev->needed_headroom in their xmit path. This patch takes care of three tunnels xmit, and also the core LL_RESERVED_SPACE() and LL_RESERVED_SPACE_EXTRA() helpers. More changes might be needed for completeness. 
BUG: KCSAN: data-race in ip_tunnel_xmit / ip_tunnel_xmit read to 0xffff88815b9da0ec of 2 bytes by task 888 on cpu 1: ip_tunnel_xmit+0x1270/0x1730 net/ipv4/ip_tunnel.c:803 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND 
include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 write to 0xffff88815b9da0ec of 2 bytes by task 2379 on cpu 0: ip_tunnel_xmit+0x1294/0x1730 net/ipv4/ip_tunnel.c:804 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip6_finish_output2+0x9bc/0xc50 net/ipv6/ip6_output.c:134 __ip6_finish_output net/ipv6/ip6_output.c:195 [inline] ip6_finish_output+0x39a/0x4e0 net/ipv6/ip6_output.c:206 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip6_output+0xeb/0x220 net/ipv6/ip6_output.c:227 dst_output include/net/dst.h:444 [inline] NF_HOOK include/linux/netfilter.h:302 [inline] mld_sendpack+0x438/0x6a0 net/ipv6/mcast.c:1820 mld_send_cr net/ipv6/mcast.c:2121 [inline] mld_ifc_work+0x519/0x7b0 
net/ipv6/mcast.c:2653 process_one_work+0x3e6/0x750 kernel/workqueue.c:2390 worker_thread+0x5f2/0xa10 kernel/workqueue.c:2537 kthread+0x1ac/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 value changed: 0x0dd4 -> 0x0e14 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 2379 Comm: kworker/0:0 Not tainted 6.3.0-rc1-syzkaller-00002-g8ca09d5fa354-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/02/2023 Workqueue: mld mld_ifc_work Fixes: 8eb30be0352d ("ipv6: Create ip6_tnl_xmit") Reported-by: syzbot Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230310191109.2384387-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 ++++-- net/ipv4/ip_tunnel.c | 12 ++++++------ net/ipv6/ip6_tunnel.c | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6a14b7b11766..470085b121d3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -297,9 +297,11 @@ struct hh_cache { * relationship HH alignment <= LL alignment. */ #define LL_RESERVED_SPACE(dev) \ - ((((dev)->hard_header_len+(dev)->needed_headroom)&~(HH_DATA_MOD - 1)) + HH_DATA_MOD) + ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom)) \ + & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) #define LL_RESERVED_SPACE_EXTRA(dev,extra) \ - ((((dev)->hard_header_len+(dev)->needed_headroom+(extra))&~(HH_DATA_MOD - 1)) + HH_DATA_MOD) + ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \ + & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) struct header_ops { int (*create) (struct sk_buff *skb, struct net_device *dev, diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index de90b09dfe78..2541083d49ad 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -614,10 +614,10 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; - if (headroom > dev->needed_headroom) - dev->needed_headroom = headroom; + if (headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, headroom); - if (skb_cow_head(skb, dev->needed_headroom)) { + if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { ip_rt_put(rt); goto tx_dropped; } @@ -800,10 +800,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; + if (max_headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, max_headroom); - if (skb_cow_head(skb, dev->needed_headroom)) { + if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 47b6607a1370..5e80e517f071 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1240,8 +1240,8 @@ route_lookup: */ max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; + if (max_headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) -- cgit v1.2.3 From 8e19b87cfce2de2125f11363d7dea3d08f16ccae Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: 
Thu, 9 Mar 2023 23:31:18 +0900 Subject: nvme-trace: show more opcode names We have more commands to show in the trace. Sync up. Signed-off-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 4fad4aa245fb..779507ac750b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -812,6 +812,7 @@ enum nvme_opcode { nvme_opcode_name(nvme_cmd_compare), \ nvme_opcode_name(nvme_cmd_write_zeroes), \ nvme_opcode_name(nvme_cmd_dsm), \ + nvme_opcode_name(nvme_cmd_verify), \ nvme_opcode_name(nvme_cmd_resv_register), \ nvme_opcode_name(nvme_cmd_resv_report), \ nvme_opcode_name(nvme_cmd_resv_acquire), \ @@ -1144,10 +1145,14 @@ enum nvme_admin_opcode { nvme_admin_opcode_name(nvme_admin_ns_mgmt), \ nvme_admin_opcode_name(nvme_admin_activate_fw), \ nvme_admin_opcode_name(nvme_admin_download_fw), \ + nvme_admin_opcode_name(nvme_admin_dev_self_test), \ nvme_admin_opcode_name(nvme_admin_ns_attach), \ nvme_admin_opcode_name(nvme_admin_keep_alive), \ nvme_admin_opcode_name(nvme_admin_directive_send), \ nvme_admin_opcode_name(nvme_admin_directive_recv), \ + nvme_admin_opcode_name(nvme_admin_virtual_mgmt), \ + nvme_admin_opcode_name(nvme_admin_nvme_mi_send), \ + nvme_admin_opcode_name(nvme_admin_nvme_mi_recv), \ nvme_admin_opcode_name(nvme_admin_dbbuf), \ nvme_admin_opcode_name(nvme_admin_format_nvm), \ nvme_admin_opcode_name(nvme_admin_security_send), \ -- cgit v1.2.3 From 5f27571382ca42daa3e3d40d1b252bf18c2b61d2 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 23 Feb 2023 17:12:26 +0800 Subject: block: count 'ios' and 'sectors' when io is done for bio-based device While using iostat for raid, I observed very strange 'await' occasionally, and turns out it's due to that 'ios' and 'sectors' is counted in bdev_start_io_acct(), while 'nsecs' is counted in bdev_end_io_acct(). I'm not sure why they are ccounted like that but I think this behaviour is obviously wrong because user will get wrong disk stats. Fix the problem by counting 'ios' and 'sectors' when io is done, like what rq-based device does. 
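[ For illustration only, not part of the patch above: a bio-based driver adapting to the reworked helpers now passes the sector count at completion instead of at start. The wrapper structure and function names are hypothetical. ]

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/jiffies.h>

struct example_io {
	struct bio	*orig_bio;
	unsigned long	start_time;
};

static void example_io_start(struct example_io *io, struct block_device *bdev)
{
	/* 'sectors' is no longer passed here */
	io->start_time = bdev_start_io_acct(bdev, bio_op(io->orig_bio), jiffies);
}

static void example_io_done(struct example_io *io, struct block_device *bdev)
{
	/* ios[] and sectors[] are now accounted at completion time */
	bdev_end_io_acct(bdev, bio_op(io->orig_bio),
			 bio_sectors(io->orig_bio), io->start_time);
}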
Fixes: 394ffa503bc4 ("blk: introduce generic io stat accounting help function") Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230223091226.1135678-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-core.c | 16 ++++++---------- drivers/md/dm.c | 6 +++--- drivers/nvme/host/multipath.c | 8 ++++---- include/linux/blkdev.h | 5 ++--- 4 files changed, 15 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 9e5e0277a4d9..42926e6cb83c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -959,16 +959,11 @@ again: } } -unsigned long bdev_start_io_acct(struct block_device *bdev, - unsigned int sectors, enum req_op op, +unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time) { - const int sgrp = op_stat_group(op); - part_stat_lock(); update_io_ticks(bdev, start_time, false); - part_stat_inc(bdev, ios[sgrp]); - part_stat_add(bdev, sectors[sgrp], sectors); part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); @@ -984,13 +979,12 @@ EXPORT_SYMBOL(bdev_start_io_acct); */ unsigned long bio_start_io_acct(struct bio *bio) { - return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), - bio_op(bio), jiffies); + return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies); } EXPORT_SYMBOL_GPL(bio_start_io_acct); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, - unsigned long start_time) + unsigned int sectors, unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); @@ -998,6 +992,8 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op, part_stat_lock(); update_io_ticks(bdev, now, true); + part_stat_inc(bdev, ios[sgrp]); + part_stat_add(bdev, sectors[sgrp], sectors); part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_local_dec(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); @@ -1007,7 +1003,7 @@ EXPORT_SYMBOL(bdev_end_io_acct); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev) { - bdev_end_io_acct(orig_bdev, bio_op(bio), start_time); + bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time); } EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index eace45a18d45..f5cc330bb549 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -512,10 +512,10 @@ static void dm_io_acct(struct dm_io *io, bool end) sectors = io->sectors; if (!end) - bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio), - start_time); + bdev_start_io_acct(bio->bi_bdev, bio_op(bio), start_time); else - bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); + bdev_end_io_acct(bio->bi_bdev, bio_op(bio), sectors, + start_time); if (static_branch_unlikely(&stats_enabled) && unlikely(dm_stats_used(&md->stats))) { diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index fc39d01e7b63..9171452e2f6d 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -123,9 +123,8 @@ void nvme_mpath_start_request(struct request *rq) return; nvme_req(rq)->flags |= NVME_MPATH_IO_STATS; - nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, - blk_rq_bytes(rq) >> SECTOR_SHIFT, - req_op(rq), jiffies); + nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq), + jiffies); } EXPORT_SYMBOL_GPL(nvme_mpath_start_request); @@ -136,7 +135,8 @@ void nvme_mpath_end_request(struct request *rq) if 
(!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS)) return; bdev_end_io_acct(ns->head->disk->part0, req_op(rq), - nvme_req(rq)->start_time); + blk_rq_bytes(rq) >> SECTOR_SHIFT, + nvme_req(rq)->start_time); } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d1aee08f8c18..941304f17492 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1446,11 +1446,10 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } -unsigned long bdev_start_io_acct(struct block_device *bdev, - unsigned int sectors, enum req_op op, +unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, - unsigned long start_time); + unsigned int sectors, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, -- cgit v1.2.3