From 25af7406df5915f04d5f1c8f081dabb0ead1cdcc Mon Sep 17 00:00:00 2001
From: Isaac Manjarres <isaacmanjarres@google.com>
Date: Thu, 18 Aug 2022 18:28:51 +0100
Subject: ARM: 9229/1: amba: Fix use-after-free in amba_read_periphid()

After commit f2d3b9a46e0e ("ARM: 9220/1: amba: Remove deferred device
addition"), it became possible for amba_read_periphid() to be invoked
concurrently from two threads for a particular AMBA device.

Consider the case where a thread (T0) is registering an AMBA driver, and
searching for all of the devices it can match with on the AMBA bus.
Suppose that another thread (T1) is executing the deferred probe work,
and is searching through all of the AMBA drivers on the bus for a driver
that matches a particular AMBA device. Assume that both threads begin
operating on the same AMBA device and the device's peripheral ID is
still unknown.

In this scenario, the amba_match() function will be invoked for the
same AMBA device by both threads, which means amba_read_periphid()
can also be invoked by both threads, and both threads will be able
to manipulate the AMBA device's pclk pointer without any synchronization.
It's possible that one thread will initialize the pclk pointer, then the
other thread will re-initialize it, overwriting the previous value, and
both will race to free the same pclk, resulting in a use-after-free for
whichever thread frees the pclk last.

Add a lock per AMBA device to synchronize the handling with detecting the
peripheral ID to avoid the use-after-free scenario.

The following KFENCE bug report helped detect this problem:
==================================================================
BUG: KFENCE: use-after-free read in clk_disable+0x14/0x34

Use-after-free read at 0x(ptrval) (in kfence-#19):
 clk_disable+0x14/0x34
 amba_read_periphid+0xdc/0x134
 amba_match+0x3c/0x84
 __driver_attach+0x20/0x158
 bus_for_each_dev+0x74/0xc0
 bus_add_driver+0x154/0x1e8
 driver_register+0x88/0x11c
 do_one_initcall+0x8c/0x2fc
 kernel_init_freeable+0x190/0x220
 kernel_init+0x10/0x108
 ret_from_fork+0x14/0x3c
 0x0

kfence-#19: 0x(ptrval)-0x(ptrval), size=36, cache=kmalloc-64

allocated by task 8 on cpu 0 at 11.629931s:
 clk_hw_create_clk+0x38/0x134
 amba_get_enable_pclk+0x10/0x68
 amba_read_periphid+0x28/0x134
 amba_match+0x3c/0x84
 __device_attach_driver+0x2c/0xc4
 bus_for_each_drv+0x80/0xd0
 __device_attach+0xb0/0x1f0
 bus_probe_device+0x88/0x90
 deferred_probe_work_func+0x8c/0xc0
 process_one_work+0x23c/0x690
 worker_thread+0x34/0x488
 kthread+0xd4/0xfc
 ret_from_fork+0x14/0x3c
 0x0

freed by task 8 on cpu 0 at 11.630095s:
 amba_read_periphid+0xec/0x134
 amba_match+0x3c/0x84
 __device_attach_driver+0x2c/0xc4
 bus_for_each_drv+0x80/0xd0
 __device_attach+0xb0/0x1f0
 bus_probe_device+0x88/0x90
 deferred_probe_work_func+0x8c/0xc0
 process_one_work+0x23c/0x690
 worker_thread+0x34/0x488
 kthread+0xd4/0xfc
 ret_from_fork+0x14/0x3c
 0x0

Cc: Saravana Kannan <saravanak@google.com>
Cc: patches@armlinux.org.uk
Fixes: f2d3b9a46e0e ("ARM: 9220/1: amba: Remove deferred device addition")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
---
 include/linux/amba/bus.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index e94cdf235f1d..5001e14c5c06 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -67,6 +67,7 @@ struct amba_device {
 	struct clk		*pclk;
 	struct device_dma_parameters dma_parms;
 	unsigned int		periphid;
+	struct mutex		periphid_lock;
 	unsigned int		cid;
 	struct amba_cs_uci_id	uci;
 	unsigned int		irq[AMBA_NR_IRQS];
-- 
cgit v1.2.3


From a02875c4cbd6f3d2f33d70cc158a19ef02d4b84f Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea@microchip.com>
Date: Fri, 26 Aug 2022 11:39:20 +0300
Subject: ARM: at91: pm: fix self-refresh for sama7g5

It has been discovered that on some parts, from time to time, self-refresh
procedure doesn't work as expected. Debugging and investigating it proved
that disabling AC DLL introduce glitches in RAM controllers which
leads to unexpected behavior. This is confirmed as a hardware bug. DLL
bypass disables 3 DLLs: 2 DX DLLs and AC DLL. Thus, keep only DX DLLs
disabled. This introduce 6mA extra current consumption on VDDCORE when
switching to any ULP mode or standby mode but the self-refresh procedure
still works.

Fixes: f0bbf17958e8 ("ARM: at91: pm: add self-refresh support for sama7g5")
Suggested-by: Frederic Schumacher <frederic.schumacher@microchip.com>
Signed-off-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Tested-by: Cristian Birsan <cristian.birsan@microchip.com>
Link: https://lore.kernel.org/r/20220826083927.3107272-3-claudiu.beznea@microchip.com
---
 arch/arm/mach-at91/pm_suspend.S | 24 +++++++++++++++++-------
 include/soc/at91/sama7-ddr.h    |  4 ++++
 2 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mach-at91/pm_suspend.S b/arch/arm/mach-at91/pm_suspend.S
index abe4ced33eda..ffed4d949042 100644
--- a/arch/arm/mach-at91/pm_suspend.S
+++ b/arch/arm/mach-at91/pm_suspend.S
@@ -172,9 +172,15 @@ sr_ena_2:
 	/* Put DDR PHY's DLL in bypass mode for non-backup modes. */
 	cmp	r7, #AT91_PM_BACKUP
 	beq	sr_ena_3
-	ldr	tmp1, [r3, #DDR3PHY_PIR]
-	orr	tmp1, tmp1, #DDR3PHY_PIR_DLLBYP
-	str	tmp1, [r3, #DDR3PHY_PIR]
+
+	/* Disable DX DLLs. */
+	ldr	tmp1, [r3, #DDR3PHY_DX0DLLCR]
+	orr	tmp1, tmp1, #DDR3PHY_DXDLLCR_DLLDIS
+	str	tmp1, [r3, #DDR3PHY_DX0DLLCR]
+
+	ldr	tmp1, [r3, #DDR3PHY_DX1DLLCR]
+	orr	tmp1, tmp1, #DDR3PHY_DXDLLCR_DLLDIS
+	str	tmp1, [r3, #DDR3PHY_DX1DLLCR]
 
 sr_ena_3:
 	/* Power down DDR PHY data receivers. */
@@ -221,10 +227,14 @@ sr_ena_3:
 	bic	tmp1, tmp1, #DDR3PHY_DSGCR_ODTPDD_ODT0
 	str	tmp1, [r3, #DDR3PHY_DSGCR]
 
-	/* Take DDR PHY's DLL out of bypass mode. */
-	ldr	tmp1, [r3, #DDR3PHY_PIR]
-	bic	tmp1, tmp1, #DDR3PHY_PIR_DLLBYP
-	str	tmp1, [r3, #DDR3PHY_PIR]
+	/* Enable DX DLLs. */
+	ldr	tmp1, [r3, #DDR3PHY_DX0DLLCR]
+	bic	tmp1, tmp1, #DDR3PHY_DXDLLCR_DLLDIS
+	str	tmp1, [r3, #DDR3PHY_DX0DLLCR]
+
+	ldr	tmp1, [r3, #DDR3PHY_DX1DLLCR]
+	bic	tmp1, tmp1, #DDR3PHY_DXDLLCR_DLLDIS
+	str	tmp1, [r3, #DDR3PHY_DX1DLLCR]
 
 	/* Enable quasi-dynamic programming. */
 	mov	tmp1, #0
diff --git a/include/soc/at91/sama7-ddr.h b/include/soc/at91/sama7-ddr.h
index 9e17247474fa..2706bc48c076 100644
--- a/include/soc/at91/sama7-ddr.h
+++ b/include/soc/at91/sama7-ddr.h
@@ -39,6 +39,10 @@
 
 #define DDR3PHY_ZQ0SR0				(0x188)		/* ZQ status register 0 */
 
+#define	DDR3PHY_DX0DLLCR			(0x1CC)		/* DDR3PHY DATX8 DLL Control Register */
+#define	DDR3PHY_DX1DLLCR			(0x20C)		/* DDR3PHY DATX8 DLL Control Register */
+#define		DDR3PHY_DXDLLCR_DLLDIS		(1 << 31)	/* DLL Disable */
+
 /* UDDRC */
 #define UDDRC_STAT				(0x04)		/* UDDRC Operating Mode Status Register */
 #define		UDDRC_STAT_SELFREF_TYPE_DIS	(0x0 << 4)	/* SDRAM is not in Self-refresh */
-- 
cgit v1.2.3


From 7a94b83a7dc551607b6c4400df29151e6a951f07 Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea@microchip.com>
Date: Fri, 26 Aug 2022 11:39:21 +0300
Subject: ARM: at91: pm: fix DDR recalibration when resuming from backup and
 self-refresh

On SAMA7G5, when resuming from backup and self-refresh, the bootloader
performs DDR PHY recalibration by restoring the value of ZQ0SR0 (stored
in RAM by Linux before going to backup and self-refresh). It has been
discovered that the current procedure doesn't work for all possible values
that might go to ZQ0SR0 due to hardware bug. The workaround to this is to
avoid storing some values in ZQ0SR0. Thus Linux will read the ZQ0SR0
register and cache its value in RAM after processing it (using
modified_gray_code array). The bootloader will restore the processed value.

Fixes: d2d4716d8384 ("ARM: at91: pm: save ddr phy calibration data to securam")
Suggested-by: Frederic Schumacher <frederic.schumacher@microchip.com>
Signed-off-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Link: https://lore.kernel.org/r/20220826083927.3107272-4-claudiu.beznea@microchip.com
---
 arch/arm/mach-at91/pm.c      | 36 ++++++++++++++++++++++++++++++++----
 include/soc/at91/sama7-ddr.h |  4 ++++
 2 files changed, 36 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index df6d673e83d5..f4501dea98b0 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -541,9 +541,41 @@ extern u32 at91_pm_suspend_in_sram_sz;
 
 static int at91_suspend_finish(unsigned long val)
 {
+	unsigned char modified_gray_code[] = {
+		0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x04, 0x05, 0x0c, 0x0d,
+		0x0e, 0x0f, 0x0a, 0x0b, 0x08, 0x09, 0x18, 0x19, 0x1a, 0x1b,
+		0x1e, 0x1f, 0x1c, 0x1d, 0x14, 0x15, 0x16, 0x17, 0x12, 0x13,
+		0x10, 0x11,
+	};
+	unsigned int tmp, index;
 	int i;
 
 	if (soc_pm.data.mode == AT91_PM_BACKUP && soc_pm.data.ramc_phy) {
+		/*
+		 * Bootloader will perform DDR recalibration and will try to
+		 * restore the ZQ0SR0 with the value saved here. But the
+		 * calibration is buggy and restoring some values from ZQ0SR0
+		 * is forbidden and risky thus we need to provide processed
+		 * values for these (modified gray code values).
+		 */
+		tmp = readl(soc_pm.data.ramc_phy + DDR3PHY_ZQ0SR0);
+
+		/* Store pull-down output impedance select. */
+		index = (tmp >> DDR3PHY_ZQ0SR0_PDO_OFF) & 0x1f;
+		soc_pm.bu->ddr_phy_calibration[0] = modified_gray_code[index];
+
+		/* Store pull-up output impedance select. */
+		index = (tmp >> DDR3PHY_ZQ0SR0_PUO_OFF) & 0x1f;
+		soc_pm.bu->ddr_phy_calibration[0] |= modified_gray_code[index];
+
+		/* Store pull-down on-die termination impedance select. */
+		index = (tmp >> DDR3PHY_ZQ0SR0_PDODT_OFF) & 0x1f;
+		soc_pm.bu->ddr_phy_calibration[0] |= modified_gray_code[index];
+
+		/* Store pull-up on-die termination impedance select. */
+		index = (tmp >> DDR3PHY_ZQ0SRO_PUODT_OFF) & 0x1f;
+		soc_pm.bu->ddr_phy_calibration[0] |= modified_gray_code[index];
+
 		/*
 		 * The 1st 8 words of memory might get corrupted in the process
 		 * of DDR PHY recalibration; it is saved here in securam and it
@@ -1066,10 +1098,6 @@ static int __init at91_pm_backup_init(void)
 		of_scan_flat_dt(at91_pm_backup_scan_memcs, &located);
 		if (!located)
 			goto securam_fail;
-
-		/* DDR3PHY_ZQ0SR0 */
-		soc_pm.bu->ddr_phy_calibration[0] = readl(soc_pm.data.ramc_phy +
-							  0x188);
 	}
 
 	return 0;
diff --git a/include/soc/at91/sama7-ddr.h b/include/soc/at91/sama7-ddr.h
index 2706bc48c076..6ce3bd22f6c6 100644
--- a/include/soc/at91/sama7-ddr.h
+++ b/include/soc/at91/sama7-ddr.h
@@ -38,6 +38,10 @@
 #define		DDR3PHY_DSGCR_ODTPDD_ODT0	(1 << 20)	/* ODT[0] Power Down Driver */
 
 #define DDR3PHY_ZQ0SR0				(0x188)		/* ZQ status register 0 */
+#define DDR3PHY_ZQ0SR0_PDO_OFF			(0)		/* Pull-down output impedance select offset */
+#define DDR3PHY_ZQ0SR0_PUO_OFF			(5)		/* Pull-up output impedance select offset */
+#define DDR3PHY_ZQ0SR0_PDODT_OFF		(10)		/* Pull-down on-die termination impedance select offset */
+#define DDR3PHY_ZQ0SRO_PUODT_OFF		(15)		/* Pull-up on-die termination impedance select offset */
 
 #define	DDR3PHY_DX0DLLCR			(0x1CC)		/* DDR3PHY DATX8 DLL Control Register */
 #define	DDR3PHY_DX1DLLCR			(0x20C)		/* DDR3PHY DATX8 DLL Control Register */
-- 
cgit v1.2.3


From b118509076b39cc5e616c0680312b5caaca535fe Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 26 Aug 2022 08:49:16 +0200
Subject: netfilter: remove nf_conntrack_helper sysctl and modparam toggles

__nf_ct_try_assign_helper() remains in place but it now requires a
template to configure the helper.

A toggle to disable automatic helper assignment was added by:

  a9006892643a ("netfilter: nf_ct_helper: allow to disable automatic helper assignment")

in 2012 to address the issues described in "Secure use of iptables and
connection tracking helpers". Automatic conntrack helper assignment was
disabled by:

  3bb398d925ec ("netfilter: nf_ct_helper: disable automatic helper assignment")

back in 2016.

This patch removes the sysctl and modparam toggles, users now have to
rely on explicit conntrack helper configuration via ruleset.

Update tools/testing/selftests/netfilter/nft_conntrack_helper.sh to
check that auto-assignment does not happen anymore.

Acked-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h               |  2 -
 include/net/netns/conntrack.h                      |  1 -
 net/netfilter/nf_conntrack_core.c                  |  7 +-
 net/netfilter/nf_conntrack_helper.c                | 80 +++-------------------
 net/netfilter/nf_conntrack_netlink.c               |  5 --
 net/netfilter/nf_conntrack_standalone.c            | 10 ---
 net/netfilter/nft_ct.c                             |  3 -
 .../selftests/netfilter/nft_conntrack_helper.sh    | 36 +++++++---
 8 files changed, 37 insertions(+), 107 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index a32be8aa7ed2..6a2019aaa464 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -53,8 +53,6 @@ struct nf_conntrack_net {
 	/* only used when new connection is allocated: */
 	atomic_t count;
 	unsigned int expect_count;
-	u8 sysctl_auto_assign_helper;
-	bool auto_assign_helper_warned;
 
 	/* only used from work queues, configuration plane, and so on: */
 	unsigned int users4;
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index c396a3862e80..e1290c159184 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -101,7 +101,6 @@ struct netns_ct {
 	u8			sysctl_log_invalid; /* Log invalid packets */
 	u8			sysctl_events;
 	u8			sysctl_acct;
-	u8			sysctl_auto_assign_helper;
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 71c2f4f95d36..1357a2729a4b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1782,7 +1782,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 		}
 		spin_unlock_bh(&nf_conntrack_expect_lock);
 	}
-	if (!exp)
+	if (!exp && tmpl)
 		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
 
 	/* Other CPU might have obtained a pointer to this object before it was
@@ -2068,10 +2068,6 @@ void nf_conntrack_alter_reply(struct nf_conn *ct,
 	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
 	if (ct->master || (help && !hlist_empty(&help->expectations)))
 		return;
-
-	rcu_read_lock();
-	__nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
-	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
 
@@ -2797,7 +2793,6 @@ int nf_conntrack_init_net(struct net *net)
 	nf_conntrack_acct_pernet_init(net);
 	nf_conntrack_tstamp_pernet_init(net);
 	nf_conntrack_ecache_pernet_init(net);
-	nf_conntrack_helper_pernet_init(net);
 	nf_conntrack_proto_pernet_init(net);
 
 	return 0;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index e96b32221444..ff737a76052e 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -35,11 +35,6 @@ unsigned int nf_ct_helper_hsize __read_mostly;
 EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
 static unsigned int nf_ct_helper_count __read_mostly;
 
-static bool nf_ct_auto_assign_helper __read_mostly = false;
-module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
-MODULE_PARM_DESC(nf_conntrack_helper,
-		 "Enable automatic conntrack helper assignment (default 0)");
-
 static DEFINE_MUTEX(nf_ct_nat_helpers_mutex);
 static struct list_head nf_ct_nat_helpers __read_mostly;
 
@@ -51,24 +46,6 @@ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple)
 		(__force __u16)tuple->src.u.all) % nf_ct_helper_hsize;
 }
 
-static struct nf_conntrack_helper *
-__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
-{
-	struct nf_conntrack_helper *helper;
-	struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
-	unsigned int h;
-
-	if (!nf_ct_helper_count)
-		return NULL;
-
-	h = helper_hash(tuple);
-	hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) {
-		if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask))
-			return helper;
-	}
-	return NULL;
-}
-
 struct nf_conntrack_helper *
 __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum)
 {
@@ -209,33 +186,11 @@ nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
 }
 EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
 
-static struct nf_conntrack_helper *
-nf_ct_lookup_helper(struct nf_conn *ct, struct net *net)
-{
-	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-
-	if (!cnet->sysctl_auto_assign_helper) {
-		if (cnet->auto_assign_helper_warned)
-			return NULL;
-		if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple))
-			return NULL;
-		pr_info("nf_conntrack: default automatic helper assignment "
-			"has been turned off for security reasons and CT-based "
-			"firewall rule not found. Use the iptables CT target "
-			"to attach helpers instead.\n");
-		cnet->auto_assign_helper_warned = true;
-		return NULL;
-	}
-
-	return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-}
-
 int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 			      gfp_t flags)
 {
 	struct nf_conntrack_helper *helper = NULL;
 	struct nf_conn_help *help;
-	struct net *net = nf_ct_net(ct);
 
 	/* We already got a helper explicitly attached. The function
 	 * nf_conntrack_alter_reply - in case NAT is in use - asks for looking
@@ -246,23 +201,21 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 	if (test_bit(IPS_HELPER_BIT, &ct->status))
 		return 0;
 
-	if (tmpl != NULL) {
-		help = nfct_help(tmpl);
-		if (help != NULL) {
-			helper = rcu_dereference(help->helper);
-			set_bit(IPS_HELPER_BIT, &ct->status);
-		}
+	if (WARN_ON_ONCE(!tmpl))
+		return 0;
+
+	help = nfct_help(tmpl);
+	if (help != NULL) {
+		helper = rcu_dereference(help->helper);
+		set_bit(IPS_HELPER_BIT, &ct->status);
 	}
 
 	help = nfct_help(ct);
 
 	if (helper == NULL) {
-		helper = nf_ct_lookup_helper(ct, net);
-		if (helper == NULL) {
-			if (help)
-				RCU_INIT_POINTER(help->helper, NULL);
-			return 0;
-		}
+		if (help)
+			RCU_INIT_POINTER(help->helper, NULL);
+		return 0;
 	}
 
 	if (help == NULL) {
@@ -545,19 +498,6 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat)
 }
 EXPORT_SYMBOL_GPL(nf_nat_helper_unregister);
 
-void nf_ct_set_auto_assign_helper_warned(struct net *net)
-{
-	nf_ct_pernet(net)->auto_assign_helper_warned = true;
-}
-EXPORT_SYMBOL_GPL(nf_ct_set_auto_assign_helper_warned);
-
-void nf_conntrack_helper_pernet_init(struct net *net)
-{
-	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-
-	cnet->sysctl_auto_assign_helper = nf_ct_auto_assign_helper;
-}
-
 int nf_conntrack_helper_init(void)
 {
 	nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 04169b54f2a2..7562b215b932 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2298,11 +2298,6 @@ ctnetlink_create_conntrack(struct net *net,
 			ct->status |= IPS_HELPER;
 			RCU_INIT_POINTER(help->helper, helper);
 		}
-	} else {
-		/* try an implicit helper assignation */
-		err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
-		if (err < 0)
-			goto err2;
 	}
 
 	err = ctnetlink_setup_nat(ct, cda);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 05895878610c..4ffe84c5a82c 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -561,7 +561,6 @@ enum nf_ct_sysctl_index {
 	NF_SYSCTL_CT_LOG_INVALID,
 	NF_SYSCTL_CT_EXPECT_MAX,
 	NF_SYSCTL_CT_ACCT,
-	NF_SYSCTL_CT_HELPER,
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	NF_SYSCTL_CT_EVENTS,
 #endif
@@ -680,14 +679,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 		.extra1 	= SYSCTL_ZERO,
 		.extra2 	= SYSCTL_ONE,
 	},
-	[NF_SYSCTL_CT_HELPER] = {
-		.procname	= "nf_conntrack_helper",
-		.maxlen		= sizeof(u8),
-		.mode		= 0644,
-		.proc_handler	= proc_dou8vec_minmax,
-		.extra1 	= SYSCTL_ZERO,
-		.extra2 	= SYSCTL_ONE,
-	},
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	[NF_SYSCTL_CT_EVENTS] = {
 		.procname	= "nf_conntrack_events",
@@ -1100,7 +1091,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 	table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
 	table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
 	table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
-	table[NF_SYSCTL_CT_HELPER].data = &cnet->sysctl_auto_assign_helper;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events;
 #endif
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index b04995c3e17f..a3f01f209a53 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -1089,9 +1089,6 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		goto err_put_helper;
 
-	/* Avoid the bogus warning, helper will be assigned after CT init */
-	nf_ct_set_auto_assign_helper_warned(ctx->net);
-
 	return 0;
 
 err_put_helper:
diff --git a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh
index bf6b9626c7dd..faa7778d7bd1 100755
--- a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh
+++ b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh
@@ -102,26 +102,42 @@ check_for_helper()
 
 	ip netns exec ${netns} conntrack -L -f $family -p tcp --dport $port 2> /dev/null |grep -q 'helper=ftp'
 	if [ $? -ne 0 ] ; then
-		echo "FAIL: ${netns} did not show attached helper $message" 1>&2
-		ret=1
+		if [ $autoassign -eq 0 ] ;then
+			echo "FAIL: ${netns} did not show attached helper $message" 1>&2
+			ret=1
+		else
+			echo "PASS: ${netns} did not show attached helper $message" 1>&2
+		fi
+	else
+		if [ $autoassign -eq 0 ] ;then
+			echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2
+		else
+			echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2
+			ret=1
+		fi
 	fi
 
-	echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2
 	return 0
 }
 
 test_helper()
 {
 	local port=$1
-	local msg=$2
+	local autoassign=$2
+
+	if [ $autoassign -eq 0 ] ;then
+		msg="set via ruleset"
+	else
+		msg="auto-assign"
+	fi
 
 	sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null &
 
 	sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null &
 	sleep 1
 
-	check_for_helper "$ns1" "ip $msg" $port
-	check_for_helper "$ns2" "ip $msg" $port
+	check_for_helper "$ns1" "ip $msg" $port $autoassign
+	check_for_helper "$ns2" "ip $msg" $port $autoassign
 
 	wait
 
@@ -173,9 +189,9 @@ if [ $? -ne 0 ];then
 	fi
 fi
 
-test_helper 2121 "set via ruleset"
-ip netns exec ${ns1} sysctl -q 'net.netfilter.nf_conntrack_helper=1'
-ip netns exec ${ns2} sysctl -q 'net.netfilter.nf_conntrack_helper=1'
-test_helper 21 "auto-assign"
+test_helper 2121 0
+ip netns exec ${ns1} sysctl -qe 'net.netfilter.nf_conntrack_helper=1'
+ip netns exec ${ns2} sysctl -qe 'net.netfilter.nf_conntrack_helper=1'
+test_helper 21 1
 
 exit $ret
-- 
cgit v1.2.3


From d94b2d00f7bfa0c555e7b8724b1faa037d57d150 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Sun, 21 Aug 2022 15:05:01 -0700
Subject: scsi: core: Revert "Make sure that hosts outlive targets"

Revert the patch series "Call blk_mq_free_tag_set() earlier" because it
introduces a deadlock if the scsi_remove_host() caller holds a reference on
a device, target or host.

Link: https://lore.kernel.org/r/20220821220502.13685-4-bvanassche@acm.org
Fixes: 16728aaba62e ("scsi: core: Make sure that hosts outlive targets")
Reported-by: syzbot+bafeb834708b1bb750bc@syzkaller.appspotmail.com
Tested-by: Kenneth R. Crudup <kenny@panix.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/hosts.c     | 8 --------
 drivers/scsi/scsi_scan.c | 7 -------
 include/scsi/scsi_host.h | 3 ---
 3 files changed, 18 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 20c1f5420ba6..26bf3b153595 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -190,13 +190,6 @@ void scsi_remove_host(struct Scsi_Host *shost)
 	transport_unregister_device(&shost->shost_gendev);
 	device_unregister(&shost->shost_dev);
 	device_del(&shost->shost_gendev);
-
-	/*
-	 * After scsi_remove_host() has returned the scsi LLD module can be
-	 * unloaded and/or the host resources can be released. Hence wait until
-	 * the dependent SCSI targets and devices are gone before returning.
-	 */
-	wait_event(shost->targets_wq, atomic_read(&shost->target_count) == 0);
 }
 EXPORT_SYMBOL(scsi_remove_host);
 
@@ -406,7 +399,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	INIT_LIST_HEAD(&shost->starved_list);
 	init_waitqueue_head(&shost->host_wait);
 	mutex_init(&shost->scan_mutex);
-	init_waitqueue_head(&shost->targets_wq);
 
 	index = ida_alloc(&host_index_ida, GFP_KERNEL);
 	if (index < 0) {
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index ac6059702d13..4c1efd6a3b0c 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -406,14 +406,9 @@ static void scsi_target_destroy(struct scsi_target *starget)
 static void scsi_target_dev_release(struct device *dev)
 {
 	struct device *parent = dev->parent;
-	struct Scsi_Host *shost = dev_to_shost(parent);
 	struct scsi_target *starget = to_scsi_target(dev);
 
 	kfree(starget);
-
-	if (atomic_dec_return(&shost->target_count) == 0)
-		wake_up(&shost->targets_wq);
-
 	put_device(parent);
 }
 
@@ -528,8 +523,6 @@ static struct scsi_target *scsi_alloc_target(struct device *parent,
 	starget->max_target_blocked = SCSI_DEFAULT_TARGET_BLOCKED;
 	init_waitqueue_head(&starget->sdev_wq);
 
-	atomic_inc(&shost->target_count);
-
  retry:
 	spin_lock_irqsave(shost->host_lock, flags);
 
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index aa7b7496c93a..b6e41ee3d566 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -690,9 +690,6 @@ struct Scsi_Host {
 	/* ldm bits */
 	struct device		shost_gendev, shost_dev;
 
-	atomic_t		target_count;
-	wait_queue_head_t	targets_wq;
-
 	/*
 	 * Points to the transport data (if any) which is allocated
 	 * separately
-- 
cgit v1.2.3


From f782201ebc2b5f6c7a22e586e8eb81fe1d9e4801 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Sun, 21 Aug 2022 15:05:02 -0700
Subject: scsi: core: Revert "Make sure that targets outlive devices"

Revert the patch series "Call blk_mq_free_tag_set() earlier" because it
introduces a deadlock if the scsi_remove_host() caller holds a reference on
a device, target or host.

Link: https://lore.kernel.org/r/20220821220502.13685-5-bvanassche@acm.org
Fixes: fe442604199e ("scsi: core: Make sure that targets outlive devices")
Reported-by: syzbot+bafeb834708b1bb750bc@syzkaller.appspotmail.com
Tested-by: Kenneth R. Crudup <kenny@panix.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_scan.c   |  2 --
 drivers/scsi/scsi_sysfs.c  | 20 +++-----------------
 include/scsi/scsi_device.h |  2 --
 3 files changed, 3 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 4c1efd6a3b0c..91ac901a6682 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -521,8 +521,6 @@ static struct scsi_target *scsi_alloc_target(struct device *parent,
 	starget->state = STARGET_CREATED;
 	starget->scsi_level = SCSI_2;
 	starget->max_target_blocked = SCSI_DEFAULT_TARGET_BLOCKED;
-	init_waitqueue_head(&starget->sdev_wq);
-
  retry:
 	spin_lock_irqsave(shost->host_lock, flags);
 
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 282b32781e8c..aa70d9282161 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -443,9 +443,7 @@ static void scsi_device_cls_release(struct device *class_dev)
 
 static void scsi_device_dev_release_usercontext(struct work_struct *work)
 {
-	struct scsi_device *sdev = container_of(work, struct scsi_device,
-						ew.work);
-	struct scsi_target *starget = sdev->sdev_target;
+	struct scsi_device *sdev;
 	struct device *parent;
 	struct list_head *this, *tmp;
 	struct scsi_vpd *vpd_pg80 = NULL, *vpd_pg83 = NULL;
@@ -454,6 +452,8 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work)
 	unsigned long flags;
 	struct module *mod;
 
+	sdev = container_of(work, struct scsi_device, ew.work);
+
 	mod = sdev->host->hostt->module;
 
 	scsi_dh_release_device(sdev);
@@ -516,9 +516,6 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work)
 	kfree(sdev->inquiry);
 	kfree(sdev);
 
-	if (starget && atomic_dec_return(&starget->sdev_count) == 0)
-		wake_up(&starget->sdev_wq);
-
 	if (parent)
 		put_device(parent);
 	module_put(mod);
@@ -1538,14 +1535,6 @@ static void __scsi_remove_target(struct scsi_target *starget)
 		goto restart;
 	}
 	spin_unlock_irqrestore(shost->host_lock, flags);
-
-	/*
-	 * After scsi_remove_target() returns its caller can remove resources
-	 * associated with @starget, e.g. an rport or session. Wait until all
-	 * devices associated with @starget have been removed to prevent that
-	 * a SCSI error handling callback function triggers a use-after-free.
-	 */
-	wait_event(starget->sdev_wq, atomic_read(&starget->sdev_count) == 0);
 }
 
 /**
@@ -1656,9 +1645,6 @@ void scsi_sysfs_device_initialize(struct scsi_device *sdev)
 	list_add_tail(&sdev->same_target_siblings, &starget->devices);
 	list_add_tail(&sdev->siblings, &shost->__devices);
 	spin_unlock_irqrestore(shost->host_lock, flags);
-
-	atomic_inc(&starget->sdev_count);
-
 	/*
 	 * device can now only be removed via __scsi_remove_device() so hold
 	 * the target.  Target will be held in CREATED state until something
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 3113471ca375..2493bd65351a 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -309,8 +309,6 @@ struct scsi_target {
 	struct list_head	devices;
 	struct device		dev;
 	struct kref		reap_ref; /* last put renders target invisible */
-	atomic_t		sdev_count;
-	wait_queue_head_t	sdev_wq;
 	unsigned int		channel;
 	unsigned int		id; /* target id ... replace
 				     * scsi_device.id eventually */
-- 
cgit v1.2.3


From 8fe4ce5836e932f5766317cb651c1ff2a4cd0506 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 25 Aug 2022 17:26:34 -0700
Subject: scsi: core: Fix a use-after-free

There are two .exit_cmd_priv implementations. Both implementations use
resources associated with the SCSI host. Make sure that these resources are
still available when .exit_cmd_priv is called by waiting inside
scsi_remove_host() until the tag set has been freed.

This commit fixes the following use-after-free:

==================================================================
BUG: KASAN: use-after-free in srp_exit_cmd_priv+0x27/0xd0 [ib_srp]
Read of size 8 at addr ffff888100337000 by task multipathd/16727
Call Trace:
 <TASK>
 dump_stack_lvl+0x34/0x44
 print_report.cold+0x5e/0x5db
 kasan_report+0xab/0x120
 srp_exit_cmd_priv+0x27/0xd0 [ib_srp]
 scsi_mq_exit_request+0x4d/0x70
 blk_mq_free_rqs+0x143/0x410
 __blk_mq_free_map_and_rqs+0x6e/0x100
 blk_mq_free_tag_set+0x2b/0x160
 scsi_host_dev_release+0xf3/0x1a0
 device_release+0x54/0xe0
 kobject_put+0xa5/0x120
 device_release+0x54/0xe0
 kobject_put+0xa5/0x120
 scsi_device_dev_release_usercontext+0x4c1/0x4e0
 execute_in_process_context+0x23/0x90
 device_release+0x54/0xe0
 kobject_put+0xa5/0x120
 scsi_disk_release+0x3f/0x50
 device_release+0x54/0xe0
 kobject_put+0xa5/0x120
 disk_release+0x17f/0x1b0
 device_release+0x54/0xe0
 kobject_put+0xa5/0x120
 dm_put_table_device+0xa3/0x160 [dm_mod]
 dm_put_device+0xd0/0x140 [dm_mod]
 free_priority_group+0xd8/0x110 [dm_multipath]
 free_multipath+0x94/0xe0 [dm_multipath]
 dm_table_destroy+0xa2/0x1e0 [dm_mod]
 __dm_destroy+0x196/0x350 [dm_mod]
 dev_remove+0x10c/0x160 [dm_mod]
 ctl_ioctl+0x2c2/0x590 [dm_mod]
 dm_ctl_ioctl+0x5/0x10 [dm_mod]
 __x64_sys_ioctl+0xb4/0xf0
 dm_ctl_ioctl+0x5/0x10 [dm_mod]
 __x64_sys_ioctl+0xb4/0xf0
 do_syscall_64+0x3b/0x90
 entry_SYSCALL_64_after_hwframe+0x46/0xb0

Link: https://lore.kernel.org/r/20220826002635.919423-1-bvanassche@acm.org
Fixes: 65ca846a5314 ("scsi: core: Introduce {init,exit}_cmd_priv()")
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: John Garry <john.garry@huawei.com>
Cc: Li Zhijian <lizhijian@fujitsu.com>
Reported-by: Li Zhijian <lizhijian@fujitsu.com>
Tested-by: Li Zhijian <lizhijian@fujitsu.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/hosts.c      | 16 +++++++++++++---
 drivers/scsi/scsi_lib.c   |  6 +++++-
 drivers/scsi/scsi_priv.h  |  2 +-
 drivers/scsi/scsi_scan.c  |  1 +
 drivers/scsi/scsi_sysfs.c |  1 +
 include/scsi/scsi_host.h  |  2 ++
 6 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 26bf3b153595..9857dba09c95 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -182,6 +182,15 @@ void scsi_remove_host(struct Scsi_Host *shost)
 	mutex_unlock(&shost->scan_mutex);
 	scsi_proc_host_rm(shost);
 
+	/*
+	 * New SCSI devices cannot be attached anymore because of the SCSI host
+	 * state so drop the tag set refcnt. Wait until the tag set refcnt drops
+	 * to zero because .exit_cmd_priv implementations may need the host
+	 * pointer.
+	 */
+	kref_put(&shost->tagset_refcnt, scsi_mq_free_tags);
+	wait_for_completion(&shost->tagset_freed);
+
 	spin_lock_irqsave(shost->host_lock, flags);
 	if (scsi_host_set_state(shost, SHOST_DEL))
 		BUG_ON(scsi_host_set_state(shost, SHOST_DEL_RECOVERY));
@@ -245,6 +254,9 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
 	if (error)
 		goto fail;
 
+	kref_init(&shost->tagset_refcnt);
+	init_completion(&shost->tagset_freed);
+
 	/*
 	 * Increase usage count temporarily here so that calling
 	 * scsi_autopm_put_host() will trigger runtime idle if there is
@@ -317,6 +329,7 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
 	pm_runtime_disable(&shost->shost_gendev);
 	pm_runtime_set_suspended(&shost->shost_gendev);
 	pm_runtime_put_noidle(&shost->shost_gendev);
+	kref_put(&shost->tagset_refcnt, scsi_mq_free_tags);
  fail:
 	return error;
 }
@@ -350,9 +363,6 @@ static void scsi_host_dev_release(struct device *dev)
 		kfree(dev_name(&shost->shost_dev));
 	}
 
-	if (shost->tag_set.tags)
-		scsi_mq_destroy_tags(shost);
-
 	kfree(shost->shost_data);
 
 	ida_free(&host_index_ida, shost->host_no);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index ef08029a0079..96e7e3eaca29 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1983,9 +1983,13 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
 	return blk_mq_alloc_tag_set(tag_set);
 }
 
-void scsi_mq_destroy_tags(struct Scsi_Host *shost)
+void scsi_mq_free_tags(struct kref *kref)
 {
+	struct Scsi_Host *shost = container_of(kref, typeof(*shost),
+					       tagset_refcnt);
+
 	blk_mq_free_tag_set(&shost->tag_set);
+	complete(&shost->tagset_freed);
 }
 
 /**
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 429663bd78ec..f385b3f04d6e 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -94,7 +94,7 @@ extern void scsi_run_host_queues(struct Scsi_Host *shost);
 extern void scsi_requeue_run_queue(struct work_struct *work);
 extern void scsi_start_queue(struct scsi_device *sdev);
 extern int scsi_mq_setup_tags(struct Scsi_Host *shost);
-extern void scsi_mq_destroy_tags(struct Scsi_Host *shost);
+extern void scsi_mq_free_tags(struct kref *kref);
 extern void scsi_exit_queue(void);
 extern void scsi_evt_thread(struct work_struct *work);
 
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 91ac901a6682..5d27f5196de6 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -340,6 +340,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 		kfree(sdev);
 		goto out;
 	}
+	kref_get(&sdev->host->tagset_refcnt);
 	sdev->request_queue = q;
 	q->queuedata = sdev;
 	__scsi_init_queue(sdev->host, q);
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index aa70d9282161..5d61f58399dc 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -1476,6 +1476,7 @@ void __scsi_remove_device(struct scsi_device *sdev)
 	mutex_unlock(&sdev->state_mutex);
 
 	blk_mq_destroy_queue(sdev->request_queue);
+	kref_put(&sdev->host->tagset_refcnt, scsi_mq_free_tags);
 	cancel_work_sync(&sdev->requeue_work);
 
 	if (sdev->host->hostt->slave_destroy)
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index b6e41ee3d566..9b0a028bf053 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -557,6 +557,8 @@ struct Scsi_Host {
 	struct scsi_host_template *hostt;
 	struct scsi_transport_template *transportt;
 
+	struct kref		tagset_refcnt;
+	struct completion	tagset_freed;
 	/* Area to keep a shared tag map */
 	struct blk_mq_tag_set	tag_set;
 
-- 
cgit v1.2.3


From ac56a0b48da86fd1b4389632fb7c4c8a5d86eefa Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 26 Aug 2022 15:39:28 +0100
Subject: rxrpc: Fix ICMP/ICMP6 error handling

Because rxrpc pretends to be a tunnel on top of a UDP/UDP6 socket, allowing
it to siphon off UDP packets early in the handling of received UDP packets
thereby avoiding the packet going through the UDP receive queue, it doesn't
get ICMP packets through the UDP ->sk_error_report() callback.  In fact, it
doesn't appear that there's any usable option for getting hold of ICMP
packets.

Fix this by adding a new UDP encap hook to distribute error messages for
UDP tunnels.  If the hook is set, then the tunnel driver will be able to
see ICMP packets.  The hook provides the offset into the packet of the UDP
header of the original packet that caused the notification.

An alternative would be to call the ->error_handler() hook - but that
requires that the skbuff be cloned (as ip_icmp_error() or ipv6_cmp_error()
do, though isn't really necessary or desirable in rxrpc's case is we want
to parse them there and then, not queue them).

Changes
=======
ver #3)
 - Fixed an uninitialised variable.

ver #2)
 - Fixed some missing CONFIG_AF_RXRPC_IPV6 conditionals.

Fixes: 5271953cad31 ("rxrpc: Use the UDP encap_rcv hook")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/linux/udp.h        |   1 +
 include/net/udp_tunnel.h   |   4 +
 net/ipv4/udp.c             |   2 +
 net/ipv4/udp_tunnel_core.c |   1 +
 net/ipv6/udp.c             |   5 +-
 net/rxrpc/ar-internal.h    |   1 +
 net/rxrpc/local_object.c   |   1 +
 net/rxrpc/peer_event.c     | 293 +++++++++++++++++++++++++++++++++++++++------
 8 files changed, 270 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 254a2654400f..e96da4157d04 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -70,6 +70,7 @@ struct udp_sock {
 	 * For encapsulation sockets.
 	 */
 	int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+	void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset);
 	int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb);
 	void (*encap_destroy)(struct sock *sk);
 
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index afc7ce713657..72394f441dad 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -67,6 +67,9 @@ static inline int udp_sock_create(struct net *net,
 typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb);
 typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk,
 					     struct sk_buff *skb);
+typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk,
+					   struct sk_buff *skb,
+					   unsigned int udp_offset);
 typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk);
 typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk,
 						    struct list_head *head,
@@ -80,6 +83,7 @@ struct udp_tunnel_sock_cfg {
 	__u8  encap_type;
 	udp_tunnel_encap_rcv_t encap_rcv;
 	udp_tunnel_encap_err_lookup_t encap_err_lookup;
+	udp_tunnel_encap_err_rcv_t encap_err_rcv;
 	udp_tunnel_encap_destroy_t encap_destroy;
 	udp_tunnel_gro_receive_t gro_receive;
 	udp_tunnel_gro_complete_t gro_complete;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 34eda973bbf1..cd72158e953a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -783,6 +783,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	 */
 	if (tunnel) {
 		/* ...not for tunnels though: we don't have a sending socket */
+		if (udp_sk(sk)->encap_err_rcv)
+			udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2);
 		goto out;
 	}
 	if (!inet->recverr) {
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 8efaf8c3fe2a..8242c8947340 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -72,6 +72,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 
 	udp_sk(sk)->encap_type = cfg->encap_type;
 	udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+	udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv;
 	udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;
 	udp_sk(sk)->encap_destroy = cfg->encap_destroy;
 	udp_sk(sk)->gro_receive = cfg->gro_receive;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 16c176e7c69a..3366d6a77ff2 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -616,8 +616,11 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	}
 
 	/* Tunnels don't have an application socket: don't pass errors back */
-	if (tunnel)
+	if (tunnel) {
+		if (udp_sk(sk)->encap_err_rcv)
+			udp_sk(sk)->encap_err_rcv(sk, skb, offset);
 		goto out;
+	}
 
 	if (!np->recverr) {
 		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 571436064cd6..62c70709d798 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -982,6 +982,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *);
 /*
  * peer_event.c
  */
+void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset);
 void rxrpc_error_report(struct sock *);
 void rxrpc_peer_keepalive_worker(struct work_struct *);
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 96ecb7356c0f..79bb02eb67b2 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -137,6 +137,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
 
 	tuncfg.encap_type = UDP_ENCAP_RXRPC;
 	tuncfg.encap_rcv = rxrpc_input_packet;
+	tuncfg.encap_err_rcv = rxrpc_encap_err_rcv;
 	tuncfg.sk_user_data = local;
 	setup_udp_tunnel_sock(net, local->socket, &tuncfg);
 
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index be032850ae8c..32561e9567fe 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -16,22 +16,105 @@
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include <net/ip.h>
+#include <net/icmp.h>
 #include "ar-internal.h"
 
+static void rxrpc_adjust_mtu(struct rxrpc_peer *, unsigned int);
 static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *);
 static void rxrpc_distribute_error(struct rxrpc_peer *, int,
 				   enum rxrpc_call_completion);
 
 /*
- * Find the peer associated with an ICMP packet.
+ * Find the peer associated with an ICMPv4 packet.
  */
 static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
-						     const struct sk_buff *skb,
+						     struct sk_buff *skb,
+						     unsigned int udp_offset,
+						     unsigned int *info,
 						     struct sockaddr_rxrpc *srx)
 {
-	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+	struct iphdr *ip, *ip0 = ip_hdr(skb);
+	struct icmphdr *icmp = icmp_hdr(skb);
+	struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset);
 
-	_enter("");
+	_enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code);
+
+	switch (icmp->type) {
+	case ICMP_DEST_UNREACH:
+		*info = ntohs(icmp->un.frag.mtu);
+		fallthrough;
+	case ICMP_TIME_EXCEEDED:
+	case ICMP_PARAMETERPROB:
+		ip = (struct iphdr *)((void *)icmp + 8);
+		break;
+	default:
+		return NULL;
+	}
+
+	memset(srx, 0, sizeof(*srx));
+	srx->transport_type = local->srx.transport_type;
+	srx->transport_len = local->srx.transport_len;
+	srx->transport.family = local->srx.transport.family;
+
+	/* Can we see an ICMP4 packet on an ICMP6 listening socket?  and vice
+	 * versa?
+	 */
+	switch (srx->transport.family) {
+	case AF_INET:
+		srx->transport_len = sizeof(srx->transport.sin);
+		srx->transport.family = AF_INET;
+		srx->transport.sin.sin_port = udp->dest;
+		memcpy(&srx->transport.sin.sin_addr, &ip->daddr,
+		       sizeof(struct in_addr));
+		break;
+
+#ifdef CONFIG_AF_RXRPC_IPV6
+	case AF_INET6:
+		srx->transport_len = sizeof(srx->transport.sin);
+		srx->transport.family = AF_INET;
+		srx->transport.sin.sin_port = udp->dest;
+		memcpy(&srx->transport.sin.sin_addr, &ip->daddr,
+		       sizeof(struct in_addr));
+		break;
+#endif
+
+	default:
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+
+	_net("ICMP {%pISp}", &srx->transport);
+	return rxrpc_lookup_peer_rcu(local, srx);
+}
+
+#ifdef CONFIG_AF_RXRPC_IPV6
+/*
+ * Find the peer associated with an ICMPv6 packet.
+ */
+static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local,
+						      struct sk_buff *skb,
+						      unsigned int udp_offset,
+						      unsigned int *info,
+						      struct sockaddr_rxrpc *srx)
+{
+	struct icmp6hdr *icmp = icmp6_hdr(skb);
+	struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb);
+	struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset);
+
+	_enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code);
+
+	switch (icmp->icmp6_type) {
+	case ICMPV6_DEST_UNREACH:
+		*info = ntohl(icmp->icmp6_mtu);
+		fallthrough;
+	case ICMPV6_PKT_TOOBIG:
+	case ICMPV6_TIME_EXCEED:
+	case ICMPV6_PARAMPROB:
+		ip = (struct ipv6hdr *)((void *)icmp + 8);
+		break;
+	default:
+		return NULL;
+	}
 
 	memset(srx, 0, sizeof(*srx));
 	srx->transport_type = local->srx.transport_type;
@@ -41,6 +124,165 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
 	/* Can we see an ICMP4 packet on an ICMP6 listening socket?  and vice
 	 * versa?
 	 */
+	switch (srx->transport.family) {
+	case AF_INET:
+		_net("Rx ICMP6 on v4 sock");
+		srx->transport_len = sizeof(srx->transport.sin);
+		srx->transport.family = AF_INET;
+		srx->transport.sin.sin_port = udp->dest;
+		memcpy(&srx->transport.sin.sin_addr,
+		       &ip->daddr.s6_addr32[3], sizeof(struct in_addr));
+		break;
+	case AF_INET6:
+		_net("Rx ICMP6");
+		srx->transport.sin.sin_port = udp->dest;
+		memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr,
+		       sizeof(struct in6_addr));
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+
+	_net("ICMP {%pISp}", &srx->transport);
+	return rxrpc_lookup_peer_rcu(local, srx);
+}
+#endif /* CONFIG_AF_RXRPC_IPV6 */
+
+/*
+ * Handle an error received on the local endpoint as a tunnel.
+ */
+void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb,
+			 unsigned int udp_offset)
+{
+	struct sock_extended_err ee;
+	struct sockaddr_rxrpc srx;
+	struct rxrpc_local *local;
+	struct rxrpc_peer *peer;
+	unsigned int info = 0;
+	int err;
+	u8 version = ip_hdr(skb)->version;
+	u8 type = icmp_hdr(skb)->type;
+	u8 code = icmp_hdr(skb)->code;
+
+	rcu_read_lock();
+	local = rcu_dereference_sk_user_data(sk);
+	if (unlikely(!local)) {
+		rcu_read_unlock();
+		return;
+	}
+
+	rxrpc_new_skb(skb, rxrpc_skb_received);
+
+	switch (ip_hdr(skb)->version) {
+	case IPVERSION:
+		peer = rxrpc_lookup_peer_icmp_rcu(local, skb, udp_offset,
+						  &info, &srx);
+		break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+	case 6:
+		peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, udp_offset,
+						   &info, &srx);
+		break;
+#endif
+	default:
+		rcu_read_unlock();
+		return;
+	}
+
+	if (peer && !rxrpc_get_peer_maybe(peer))
+		peer = NULL;
+	if (!peer) {
+		rcu_read_unlock();
+		return;
+	}
+
+	memset(&ee, 0, sizeof(ee));
+
+	switch (version) {
+	case IPVERSION:
+		switch (type) {
+		case ICMP_DEST_UNREACH:
+			switch (code) {
+			case ICMP_FRAG_NEEDED:
+				rxrpc_adjust_mtu(peer, info);
+				rcu_read_unlock();
+				rxrpc_put_peer(peer);
+				return;
+			default:
+				break;
+			}
+
+			err = EHOSTUNREACH;
+			if (code <= NR_ICMP_UNREACH) {
+				/* Might want to do something different with
+				 * non-fatal errors
+				 */
+				//harderr = icmp_err_convert[code].fatal;
+				err = icmp_err_convert[code].errno;
+			}
+			break;
+
+		case ICMP_TIME_EXCEEDED:
+			err = EHOSTUNREACH;
+			break;
+		default:
+			err = EPROTO;
+			break;
+		}
+
+		ee.ee_origin = SO_EE_ORIGIN_ICMP;
+		ee.ee_type = type;
+		ee.ee_code = code;
+		ee.ee_errno = err;
+		break;
+
+#ifdef CONFIG_AF_RXRPC_IPV6
+	case 6:
+		switch (type) {
+		case ICMPV6_PKT_TOOBIG:
+			rxrpc_adjust_mtu(peer, info);
+			rcu_read_unlock();
+			rxrpc_put_peer(peer);
+			return;
+		}
+
+		icmpv6_err_convert(type, code, &err);
+
+		if (err == EACCES)
+			err = EHOSTUNREACH;
+
+		ee.ee_origin = SO_EE_ORIGIN_ICMP6;
+		ee.ee_type = type;
+		ee.ee_code = code;
+		ee.ee_errno = err;
+		break;
+#endif
+	}
+
+	trace_rxrpc_rx_icmp(peer, &ee, &srx);
+
+	rxrpc_distribute_error(peer, err, RXRPC_CALL_NETWORK_ERROR);
+	rcu_read_unlock();
+	rxrpc_put_peer(peer);
+}
+
+/*
+ * Find the peer associated with a local error.
+ */
+static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local,
+						      const struct sk_buff *skb,
+						      struct sockaddr_rxrpc *srx)
+{
+	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+
+	_enter("");
+
+	memset(srx, 0, sizeof(*srx));
+	srx->transport_type = local->srx.transport_type;
+	srx->transport_len = local->srx.transport_len;
+	srx->transport.family = local->srx.transport.family;
+
 	switch (srx->transport.family) {
 	case AF_INET:
 		srx->transport_len = sizeof(srx->transport.sin);
@@ -104,10 +346,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
 /*
  * Handle an MTU/fragmentation problem.
  */
-static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, struct sock_exterr_skb *serr)
+static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu)
 {
-	u32 mtu = serr->ee.ee_info;
-
 	_net("Rx ICMP Fragmentation Needed (%d)", mtu);
 
 	/* wind down the local interface MTU */
@@ -148,7 +388,7 @@ void rxrpc_error_report(struct sock *sk)
 	struct sock_exterr_skb *serr;
 	struct sockaddr_rxrpc srx;
 	struct rxrpc_local *local;
-	struct rxrpc_peer *peer;
+	struct rxrpc_peer *peer = NULL;
 	struct sk_buff *skb;
 
 	rcu_read_lock();
@@ -172,41 +412,20 @@ void rxrpc_error_report(struct sock *sk)
 	}
 	rxrpc_new_skb(skb, rxrpc_skb_received);
 	serr = SKB_EXT_ERR(skb);
-	if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
-		_leave("UDP empty message");
-		rcu_read_unlock();
-		rxrpc_free_skb(skb, rxrpc_skb_freed);
-		return;
-	}
 
-	peer = rxrpc_lookup_peer_icmp_rcu(local, skb, &srx);
-	if (peer && !rxrpc_get_peer_maybe(peer))
-		peer = NULL;
-	if (!peer) {
-		rcu_read_unlock();
-		rxrpc_free_skb(skb, rxrpc_skb_freed);
-		_leave(" [no peer]");
-		return;
-	}
-
-	trace_rxrpc_rx_icmp(peer, &serr->ee, &srx);
-
-	if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP &&
-	     serr->ee.ee_type == ICMP_DEST_UNREACH &&
-	     serr->ee.ee_code == ICMP_FRAG_NEEDED)) {
-		rxrpc_adjust_mtu(peer, serr);
-		rcu_read_unlock();
-		rxrpc_free_skb(skb, rxrpc_skb_freed);
-		rxrpc_put_peer(peer);
-		_leave(" [MTU update]");
-		return;
+	if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) {
+		peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx);
+		if (peer && !rxrpc_get_peer_maybe(peer))
+			peer = NULL;
+		if (peer) {
+			trace_rxrpc_rx_icmp(peer, &serr->ee, &srx);
+			rxrpc_store_error(peer, serr);
+		}
 	}
 
-	rxrpc_store_error(peer, serr);
 	rcu_read_unlock();
 	rxrpc_free_skb(skb, rxrpc_skb_freed);
 	rxrpc_put_peer(peer);
-
 	_leave("");
 }
 
-- 
cgit v1.2.3


From 21457f4a91cb522f1a3ad9741ff1d25fadfaa3c5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 31 Aug 2022 13:24:44 +0100
Subject: rxrpc: Remove rxrpc_get_reply_time() which is no longer used

Remove rxrpc_get_reply_time() as that is no longer used now that the call
issue time is used instead of the reply time.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/networking/rxrpc.rst | 11 ----------
 include/net/af_rxrpc.h             |  2 --
 net/rxrpc/recvmsg.c                | 43 --------------------------------------
 3 files changed, 56 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst
index 39c2249c7aa7..39494a6ea739 100644
--- a/Documentation/networking/rxrpc.rst
+++ b/Documentation/networking/rxrpc.rst
@@ -1055,17 +1055,6 @@ The kernel interface functions are as follows:
      first function to change.  Note that this must be called in TASK_RUNNING
      state.
 
- (#) Get reply timestamp::
-
-	bool rxrpc_kernel_get_reply_time(struct socket *sock,
-					 struct rxrpc_call *call,
-					 ktime_t *_ts)
-
-     This allows the timestamp on the first DATA packet of the reply of a
-     client call to be queried, provided that it is still in the Rx ring.  If
-     successful, the timestamp will be stored into ``*_ts`` and true will be
-     returned; false will be returned otherwise.
-
  (#) Get remote client epoch::
 
 	u32 rxrpc_kernel_get_epoch(struct socket *sock,
diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index cee5f83c0f11..b69ca695935c 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -66,8 +66,6 @@ int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t,
 void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64);
 bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *);
 u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *);
-bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *,
-				 ktime_t *);
 bool rxrpc_kernel_call_is_complete(struct rxrpc_call *);
 void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *,
 			       unsigned long);
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 250f23bc1c07..7e39c262fd79 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -771,46 +771,3 @@ call_complete:
 	goto out;
 }
 EXPORT_SYMBOL(rxrpc_kernel_recv_data);
-
-/**
- * rxrpc_kernel_get_reply_time - Get timestamp on first reply packet
- * @sock: The socket that the call exists on
- * @call: The call to query
- * @_ts: Where to put the timestamp
- *
- * Retrieve the timestamp from the first DATA packet of the reply if it is
- * in the ring.  Returns true if successful, false if not.
- */
-bool rxrpc_kernel_get_reply_time(struct socket *sock, struct rxrpc_call *call,
-				 ktime_t *_ts)
-{
-	struct sk_buff *skb;
-	rxrpc_seq_t hard_ack, top, seq;
-	bool success = false;
-
-	mutex_lock(&call->user_mutex);
-
-	if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_RECV_REPLY)
-		goto out;
-
-	hard_ack = call->rx_hard_ack;
-	if (hard_ack != 0)
-		goto out;
-
-	seq = hard_ack + 1;
-	top = smp_load_acquire(&call->rx_top);
-	if (after(seq, top))
-		goto out;
-
-	skb = call->rxtx_buffer[seq & RXRPC_RXTX_BUFF_MASK];
-	if (!skb)
-		goto out;
-
-	*_ts = skb_get_ktime(skb);
-	success = true;
-
-out:
-	mutex_unlock(&call->user_mutex);
-	return success;
-}
-EXPORT_SYMBOL(rxrpc_kernel_get_reply_time);
-- 
cgit v1.2.3


From aded3cad909581c60335037112c4f86bbfe90f17 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sun, 21 Aug 2022 17:01:47 +0200
Subject: kunit: fix assert_type for comparison macros

When replacing KUNIT_BINARY_*_MSG_ASSERTION() macros with
KUNIT_BINARY_INT_ASSERTION(), the assert_type parameter was not always
correctly transferred.  Specifically, the following errors were
introduced:
  - KUNIT_EXPECT_LE_MSG() uses KUNIT_ASSERTION
  - KUNIT_ASSERT_LT_MSG() uses KUNIT_EXPECTATION
  - KUNIT_ASSERT_GT_MSG() uses KUNIT_EXPECTATION

A failing KUNIT_EXPECT_LE_MSG() test thus prevents further tests from
running, while failing KUNIT_ASSERT_{LT,GT}_MSG() tests do not prevent
further tests from running.  This is contrary to the documentation,
which states that failing KUNIT_EXPECT_* macros allow further tests to
run, while failing KUNIT_ASSERT_* macros should prevent this.

Revert the KUNIT_{ASSERTION,EXPECTATION} switches to fix the behaviour
for the affected macros.

Fixes: 40f39777ce4f ("kunit: decrease macro layering for integer asserts")
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Reviewed-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/test.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/kunit/test.h b/include/kunit/test.h
index c958855681cc..840a2c375065 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -826,7 +826,7 @@ do {									       \
 
 #define KUNIT_EXPECT_LE_MSG(test, left, right, fmt, ...)		       \
 	KUNIT_BINARY_INT_ASSERTION(test,				       \
-				   KUNIT_ASSERTION,			       \
+				   KUNIT_EXPECTATION,			       \
 				   left, <=, right,			       \
 				   fmt,					       \
 				    ##__VA_ARGS__)
@@ -1116,7 +1116,7 @@ do {									       \
 
 #define KUNIT_ASSERT_LT_MSG(test, left, right, fmt, ...)		       \
 	KUNIT_BINARY_INT_ASSERTION(test,				       \
-				   KUNIT_EXPECTATION,			       \
+				   KUNIT_ASSERTION,			       \
 				   left, <, right,			       \
 				   fmt,					       \
 				    ##__VA_ARGS__)
@@ -1157,7 +1157,7 @@ do {									       \
 
 #define KUNIT_ASSERT_GT_MSG(test, left, right, fmt, ...)		       \
 	KUNIT_BINARY_INT_ASSERTION(test,				       \
-				   KUNIT_EXPECTATION,			       \
+				   KUNIT_ASSERTION,			       \
 				   left, >, right,			       \
 				   fmt,					       \
 				    ##__VA_ARGS__)
-- 
cgit v1.2.3


From 3261400639463a853ba2b3be8bd009c2a8089775 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 31 Aug 2022 23:38:09 +0000
Subject: tcp: TX zerocopy should not sense pfmemalloc status

We got a recent syzbot report [1] showing a possible misuse
of pfmemalloc page status in TCP zerocopy paths.

Indeed, for pages coming from user space or other layers,
using page_is_pfmemalloc() is moot, and possibly could give
false positives.

There has been attempts to make page_is_pfmemalloc() more robust,
but not using it in the first place in this context is probably better,
removing cpu cycles.

Note to stable teams :

You need to backport 84ce071e38a6 ("net: introduce
__skb_fill_page_desc_noacc") as a prereq.

Race is more probable after commit c07aea3ef4d4
("mm: add a signature in struct page") because page_is_pfmemalloc()
is now using low order bit from page->lru.next, which can change
more often than page->index.

Low order bit should never be set for lru.next (when used as an anchor
in LRU list), so KCSAN report is mostly a false positive.

Backporting to older kernel versions seems not necessary.

[1]
BUG: KCSAN: data-race in lru_add_fn / tcp_build_frag

write to 0xffffea0004a1d2c8 of 8 bytes by task 18600 on cpu 0:
__list_add include/linux/list.h:73 [inline]
list_add include/linux/list.h:88 [inline]
lruvec_add_folio include/linux/mm_inline.h:105 [inline]
lru_add_fn+0x440/0x520 mm/swap.c:228
folio_batch_move_lru+0x1e1/0x2a0 mm/swap.c:246
folio_batch_add_and_move mm/swap.c:263 [inline]
folio_add_lru+0xf1/0x140 mm/swap.c:490
filemap_add_folio+0xf8/0x150 mm/filemap.c:948
__filemap_get_folio+0x510/0x6d0 mm/filemap.c:1981
pagecache_get_page+0x26/0x190 mm/folio-compat.c:104
grab_cache_page_write_begin+0x2a/0x30 mm/folio-compat.c:116
ext4_da_write_begin+0x2dd/0x5f0 fs/ext4/inode.c:2988
generic_perform_write+0x1d4/0x3f0 mm/filemap.c:3738
ext4_buffered_write_iter+0x235/0x3e0 fs/ext4/file.c:270
ext4_file_write_iter+0x2e3/0x1210
call_write_iter include/linux/fs.h:2187 [inline]
new_sync_write fs/read_write.c:491 [inline]
vfs_write+0x468/0x760 fs/read_write.c:578
ksys_write+0xe8/0x1a0 fs/read_write.c:631
__do_sys_write fs/read_write.c:643 [inline]
__se_sys_write fs/read_write.c:640 [inline]
__x64_sys_write+0x3e/0x50 fs/read_write.c:640
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd

read to 0xffffea0004a1d2c8 of 8 bytes by task 18611 on cpu 1:
page_is_pfmemalloc include/linux/mm.h:1740 [inline]
__skb_fill_page_desc include/linux/skbuff.h:2422 [inline]
skb_fill_page_desc include/linux/skbuff.h:2443 [inline]
tcp_build_frag+0x613/0xb20 net/ipv4/tcp.c:1018
do_tcp_sendpages+0x3e8/0xaf0 net/ipv4/tcp.c:1075
tcp_sendpage_locked net/ipv4/tcp.c:1140 [inline]
tcp_sendpage+0x89/0xb0 net/ipv4/tcp.c:1150
inet_sendpage+0x7f/0xc0 net/ipv4/af_inet.c:833
kernel_sendpage+0x184/0x300 net/socket.c:3561
sock_sendpage+0x5a/0x70 net/socket.c:1054
pipe_to_sendpage+0x128/0x160 fs/splice.c:361
splice_from_pipe_feed fs/splice.c:415 [inline]
__splice_from_pipe+0x222/0x4d0 fs/splice.c:559
splice_from_pipe fs/splice.c:594 [inline]
generic_splice_sendpage+0x89/0xc0 fs/splice.c:743
do_splice_from fs/splice.c:764 [inline]
direct_splice_actor+0x80/0xa0 fs/splice.c:931
splice_direct_to_actor+0x305/0x620 fs/splice.c:886
do_splice_direct+0xfb/0x180 fs/splice.c:974
do_sendfile+0x3bf/0x910 fs/read_write.c:1249
__do_sys_sendfile64 fs/read_write.c:1317 [inline]
__se_sys_sendfile64 fs/read_write.c:1303 [inline]
__x64_sys_sendfile64+0x10c/0x150 fs/read_write.c:1303
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd

value changed: 0x0000000000000000 -> 0xffffea0004a1d288

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 18611 Comm: syz-executor.4 Not tainted 6.0.0-rc2-syzkaller-00248-ge022620b5d05-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/22/2022

Fixes: c07aea3ef4d4 ("mm: add a signature in struct page")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 21 +++++++++++++++++++++
 net/core/datagram.c    |  2 +-
 net/ipv4/tcp.c         |  2 +-
 3 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ca8afa382bf2..18e163a3460d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2444,6 +2444,27 @@ static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
 	skb_shinfo(skb)->nr_frags = i + 1;
 }
 
+/**
+ * skb_fill_page_desc_noacc - initialise a paged fragment in an skb
+ * @skb: buffer containing fragment to be initialised
+ * @i: paged fragment index to initialise
+ * @page: the page to use for this fragment
+ * @off: the offset to the data with @page
+ * @size: the length of the data
+ *
+ * Variant of skb_fill_page_desc() which does not deal with
+ * pfmemalloc, if page is not owned by us.
+ */
+static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i,
+					    struct page *page, int off,
+					    int size)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	__skb_fill_page_desc_noacc(shinfo, i, page, off, size);
+	shinfo->nr_frags = i + 1;
+}
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		     int size, unsigned int truesize);
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 7255531f63ae..e4ff2db40c98 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -677,7 +677,7 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 				page_ref_sub(last_head, refs);
 				refs = 0;
 			}
-			skb_fill_page_desc(skb, frag++, head, start, size);
+			skb_fill_page_desc_noacc(skb, frag++, head, start, size);
 		}
 		if (refs)
 			page_ref_sub(last_head, refs);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e5011c136fdb..6cdfce6f2867 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1015,7 +1015,7 @@ new_segment:
 		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 	} else {
 		get_page(page);
-		skb_fill_page_desc(skb, i, page, offset, copy);
+		skb_fill_page_desc_noacc(skb, i, page, offset, copy);
 	}
 
 	if (!(flags & MSG_NO_SHARED_FRAGS))
-- 
cgit v1.2.3


From b30f7c8eb0780e1479a9882526e838664271f4c9 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 1 Sep 2022 13:07:32 +0100
Subject: spi: mux: Fix mux interaction with fast path optimisations

The spi-mux driver is rather too clever and attempts to resubmit any
message that is submitted to it to the parent controller with some
adjusted callbacks.  This does not play at all nicely with the fast
path which now sets flags on the message indicating that it's being
handled through the fast path, we see async messages flagged as being on
the fast path.  Ideally the spi-mux code would duplicate the message but
that's rather invasive and a bit fragile in that it relies on the mux
knowing which fields in the message to copy.  Instead teach the core
that there are controllers which can't cope with the fast path and have
the mux flag itself as being such a controller, ensuring that messages
going via the mux don't get partially handled via the fast path.

This will reduce the performance of any spi-mux connected device since
we'll now always use the thread for both the actual controller and the
mux controller instead of just the actual controller but given that we
were always hitting the slow path anyway it's hopefully not too much of
an additional cost and it allows us to keep the fast path.

Fixes: ae7d2346dc89 ("spi: Don't use the message queue if possible in spi_sync")
Reported-by: Casper Andersson <casper.casan@gmail.com>
Tested-by: Casper Andersson <casper.casan@gmail.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20220901120732.49245-1-broonie@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-mux.c   | 1 +
 drivers/spi/spi.c       | 2 +-
 include/linux/spi/spi.h | 2 ++
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/spi/spi-mux.c b/drivers/spi/spi-mux.c
index f5d32ec4634e..0709e987bd5a 100644
--- a/drivers/spi/spi-mux.c
+++ b/drivers/spi/spi-mux.c
@@ -161,6 +161,7 @@ static int spi_mux_probe(struct spi_device *spi)
 	ctlr->num_chipselect = mux_control_states(priv->mux);
 	ctlr->bus_num = -1;
 	ctlr->dev.of_node = spi->dev.of_node;
+	ctlr->must_async = true;
 
 	ret = devm_spi_register_controller(&spi->dev, ctlr);
 	if (ret)
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 8f97a3eacdea..5cbe090f7676 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -4033,7 +4033,7 @@ static int __spi_sync(struct spi_device *spi, struct spi_message *message)
 	 * guard against reentrancy from a different context. The io_mutex
 	 * will catch those cases.
 	 */
-	if (READ_ONCE(ctlr->queue_empty)) {
+	if (READ_ONCE(ctlr->queue_empty) && !ctlr->must_async) {
 		message->actual_length = 0;
 		message->status = -EINPROGRESS;
 
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index e6c73d5ff1a8..f089ee1ead58 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -469,6 +469,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  *	SPI_TRANS_FAIL_NO_START.
  * @queue_empty: signal green light for opportunistically skipping the queue
  *	for spi_sync transfers.
+ * @must_async: disable all fast paths in the core
  *
  * Each SPI controller can communicate with one or more @spi_device
  * children.  These make a small bus, sharing MOSI, MISO and SCK signals
@@ -690,6 +691,7 @@ struct spi_controller {
 
 	/* Flag for enabling opportunistic skipping of the queue in spi_sync */
 	bool			queue_empty;
+	bool			must_async;
 };
 
 static inline void *spi_controller_get_devdata(struct spi_controller *ctlr)
-- 
cgit v1.2.3


From c7943bb324e503baeeba3df2bc5ca8a377111bfa Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Sat, 27 Aug 2022 00:34:51 +0300
Subject: drm/edid: Handle EDID 1.4 range descriptor h/vfreq offsets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

EDID 1.4 introduced some extra flags in the range
descriptor to support min/max h/vfreq >= 255. Consult them
to correctly parse the vfreq limits.

Note that some combinations of the flags are documented
as "reserved" (as are some other values in the descriptor)
but explicitly checking for those doesn't seem particularly
worthwile since we end up with bogus results whether we
decode them or not.

v2: Increase the storage to u16 to make it work (Jani)
    Note the "reserved" values situation (Jani)
v3: Document the EDID version number in the defines
    Drop some bogus (u8) casts

Cc: stable@vger.kernel.org
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/6519
References: https://gitlab.freedesktop.org/drm/intel/-/issues/6484
Reviewed-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220826213501.31490-2-ville.syrjala@linux.intel.com
Reviewed-by: Manasi Navare <manasi.d.navare@intel.com>
---
 drivers/gpu/drm/drm_debugfs.c |  4 ++--
 drivers/gpu/drm/drm_edid.c    | 24 ++++++++++++++++++------
 include/drm/drm_connector.h   |  4 ++--
 include/drm/drm_edid.h        |  5 +++++
 4 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_debugfs.c b/drivers/gpu/drm/drm_debugfs.c
index 493922069c90..01ee3febb813 100644
--- a/drivers/gpu/drm/drm_debugfs.c
+++ b/drivers/gpu/drm/drm_debugfs.c
@@ -377,8 +377,8 @@ static int vrr_range_show(struct seq_file *m, void *data)
 	if (connector->status != connector_status_connected)
 		return -ENODEV;
 
-	seq_printf(m, "Min: %u\n", (u8)connector->display_info.monitor_range.min_vfreq);
-	seq_printf(m, "Max: %u\n", (u8)connector->display_info.monitor_range.max_vfreq);
+	seq_printf(m, "Min: %u\n", connector->display_info.monitor_range.min_vfreq);
+	seq_printf(m, "Max: %u\n", connector->display_info.monitor_range.max_vfreq);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index bbc25e3b7220..eaa819381281 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -5971,12 +5971,14 @@ static void drm_parse_cea_ext(struct drm_connector *connector,
 }
 
 static
-void get_monitor_range(const struct detailed_timing *timing,
-		       void *info_monitor_range)
+void get_monitor_range(const struct detailed_timing *timing, void *c)
 {
-	struct drm_monitor_range_info *monitor_range = info_monitor_range;
+	struct detailed_mode_closure *closure = c;
+	struct drm_display_info *info = &closure->connector->display_info;
+	struct drm_monitor_range_info *monitor_range = &info->monitor_range;
 	const struct detailed_non_pixel *data = &timing->data.other_data;
 	const struct detailed_data_monitor_range *range = &data->data.range;
+	const struct edid *edid = closure->drm_edid->edid;
 
 	if (!is_display_descriptor(timing, EDID_DETAIL_MONITOR_RANGE))
 		return;
@@ -5992,18 +5994,28 @@ void get_monitor_range(const struct detailed_timing *timing,
 
 	monitor_range->min_vfreq = range->min_vfreq;
 	monitor_range->max_vfreq = range->max_vfreq;
+
+	if (edid->revision >= 4) {
+		if (data->pad2 & DRM_EDID_RANGE_OFFSET_MIN_VFREQ)
+			monitor_range->min_vfreq += 255;
+		if (data->pad2 & DRM_EDID_RANGE_OFFSET_MAX_VFREQ)
+			monitor_range->max_vfreq += 255;
+	}
 }
 
 static void drm_get_monitor_range(struct drm_connector *connector,
 				  const struct drm_edid *drm_edid)
 {
-	struct drm_display_info *info = &connector->display_info;
+	const struct drm_display_info *info = &connector->display_info;
+	struct detailed_mode_closure closure = {
+		.connector = connector,
+		.drm_edid = drm_edid,
+	};
 
 	if (!version_greater(drm_edid, 1, 1))
 		return;
 
-	drm_for_each_detailed_block(drm_edid, get_monitor_range,
-				    &info->monitor_range);
+	drm_for_each_detailed_block(drm_edid, get_monitor_range, &closure);
 
 	DRM_DEBUG_KMS("Supported Monitor Refresh rate range is %d Hz - %d Hz\n",
 		      info->monitor_range.min_vfreq,
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index a1705d6b3fba..7df7876b2ad5 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -319,8 +319,8 @@ enum drm_panel_orientation {
  *             EDID's detailed monitor range
  */
 struct drm_monitor_range_info {
-	u8 min_vfreq;
-	u8 max_vfreq;
+	u16 min_vfreq;
+	u16 max_vfreq;
 };
 
 /**
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 2181977ae683..1ed61e2b30a4 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -92,6 +92,11 @@ struct detailed_data_string {
 	u8 str[13];
 } __attribute__((packed));
 
+#define DRM_EDID_RANGE_OFFSET_MIN_VFREQ (1 << 0) /* 1.4 */
+#define DRM_EDID_RANGE_OFFSET_MAX_VFREQ (1 << 1) /* 1.4 */
+#define DRM_EDID_RANGE_OFFSET_MIN_HFREQ (1 << 2) /* 1.4 */
+#define DRM_EDID_RANGE_OFFSET_MAX_HFREQ (1 << 3) /* 1.4 */
+
 #define DRM_EDID_DEFAULT_GTF_SUPPORT_FLAG   0x00
 #define DRM_EDID_RANGE_LIMITS_ONLY_FLAG     0x01
 #define DRM_EDID_SECONDARY_GTF_SUPPORT_FLAG 0x02
-- 
cgit v1.2.3


From 2aec909912da55a6e469fd6ee8412080a5433ed2 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 29 Aug 2022 11:46:38 +0200
Subject: wifi: use struct_group to copy addresses

We sometimes copy all the addresses from the 802.11 header
for the AAD, which may cause complaints from fortify checks.
Use struct_group() to avoid the compiler warnings/errors.

Change-Id: Ic3ea389105e7813b22095b295079eecdabde5045
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h          | 8 +++++---
 net/mac80211/wpa.c                 | 4 ++--
 net/wireless/lib80211_crypt_ccmp.c | 2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 55e6f4ad0ca6..b6e6d5b40774 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -310,9 +310,11 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 struct ieee80211_hdr {
 	__le16 frame_control;
 	__le16 duration_id;
-	u8 addr1[ETH_ALEN];
-	u8 addr2[ETH_ALEN];
-	u8 addr3[ETH_ALEN];
+	struct_group(addrs,
+		u8 addr1[ETH_ALEN];
+		u8 addr2[ETH_ALEN];
+		u8 addr3[ETH_ALEN];
+	);
 	__le16 seq_ctrl;
 	u8 addr4[ETH_ALEN];
 } __packed __aligned(2);
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 93ec2f349748..20f742b5503b 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -351,7 +351,7 @@ static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad)
 	 * FC | A1 | A2 | A3 | SC | [A4] | [QC] */
 	put_unaligned_be16(len_a, &aad[0]);
 	put_unaligned(mask_fc, (__le16 *)&aad[2]);
-	memcpy(&aad[4], &hdr->addr1, 3 * ETH_ALEN);
+	memcpy(&aad[4], &hdr->addrs, 3 * ETH_ALEN);
 
 	/* Mask Seq#, leave Frag# */
 	aad[22] = *((u8 *) &hdr->seq_ctrl) & 0x0f;
@@ -792,7 +792,7 @@ static void bip_aad(struct sk_buff *skb, u8 *aad)
 				IEEE80211_FCTL_MOREDATA);
 	put_unaligned(mask_fc, (__le16 *) &aad[0]);
 	/* A1 || A2 || A3 */
-	memcpy(aad + 2, &hdr->addr1, 3 * ETH_ALEN);
+	memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN);
 }
 
 
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
index 6a5f08f7491e..cca5e1cf089e 100644
--- a/net/wireless/lib80211_crypt_ccmp.c
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -136,7 +136,7 @@ static int ccmp_init_iv_and_aad(const struct ieee80211_hdr *hdr,
 	pos = (u8 *) hdr;
 	aad[0] = pos[0] & 0x8f;
 	aad[1] = pos[1] & 0xc7;
-	memcpy(aad + 2, hdr->addr1, 3 * ETH_ALEN);
+	memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN);
 	pos = (u8 *) & hdr->seq_ctrl;
 	aad[20] = pos[0] & 0x0f;
 	aad[21] = 0;		/* all bits masked */
-- 
cgit v1.2.3


From dec9b2f1e0455a151a7293c367da22ab973f713e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 2 Sep 2022 16:59:15 +0200
Subject: debugfs: add debugfs_lookup_and_remove()

There is a very common pattern of using
debugfs_remove(debufs_lookup(..)) which results in a dentry leak of the
dentry that was looked up.  Instead of having to open-code the correct
pattern of calling dput() on the dentry, create
debugfs_lookup_and_remove() to handle this pattern automatically and
properly without any memory leaks.

Cc: stable <stable@kernel.org>
Reported-by: Kuyo Chang <kuyo.chang@mediatek.com>
Tested-by: Kuyo Chang <kuyo.chang@mediatek.com>
Link: https://lore.kernel.org/r/YxIaQ8cSinDR881k@kroah.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/inode.c      | 22 ++++++++++++++++++++++
 include/linux/debugfs.h |  6 ++++++
 2 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dcf0b8b4e93..232cfdf095ae 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -744,6 +744,28 @@ void debugfs_remove(struct dentry *dentry)
 }
 EXPORT_SYMBOL_GPL(debugfs_remove);
 
+/**
+ * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it
+ * @name: a pointer to a string containing the name of the item to look up.
+ * @parent: a pointer to the parent dentry of the item.
+ *
+ * This is the equlivant of doing something like
+ * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting
+ * handled for the directory being looked up.
+ */
+void debugfs_lookup_and_remove(const char *name, struct dentry *parent)
+{
+	struct dentry *dentry;
+
+	dentry = debugfs_lookup(name, parent);
+	if (!dentry)
+		return;
+
+	debugfs_remove(dentry);
+	dput(dentry);
+}
+EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove);
+
 /**
  * debugfs_rename - rename a file/directory in the debugfs filesystem
  * @old_dir: a pointer to the parent dentry for the renamed object. This
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index c869f1e73d75..f60674692d36 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -91,6 +91,8 @@ struct dentry *debugfs_create_automount(const char *name,
 void debugfs_remove(struct dentry *dentry);
 #define debugfs_remove_recursive debugfs_remove
 
+void debugfs_lookup_and_remove(const char *name, struct dentry *parent);
+
 const struct file_operations *debugfs_real_fops(const struct file *filp);
 
 int debugfs_file_get(struct dentry *dentry);
@@ -225,6 +227,10 @@ static inline void debugfs_remove(struct dentry *dentry)
 static inline void debugfs_remove_recursive(struct dentry *dentry)
 { }
 
+static inline void debugfs_lookup_and_remove(const char *name,
+					     struct dentry *parent)
+{ }
+
 const struct file_operations *debugfs_real_fops(const struct file *filp);
 
 static inline int debugfs_file_get(struct dentry *dentry)
-- 
cgit v1.2.3


From 9ca05b0f27de928be121cccf07735819dc9e1ed3 Mon Sep 17 00:00:00 2001
From: Maher Sanalla <msanalla@nvidia.com>
Date: Mon, 29 Aug 2022 12:02:27 +0300
Subject: RDMA/mlx5: Rely on RoCE fw cap instead of devlink when setting
 profile

When the RDMA auxiliary driver probes, it sets its profile based on
devlink driverinit value. The latter might not be in sync with FW yet
(In case devlink reload is not performed), thus causing a mismatch
between RDMA driver and FW. This results in the following FW syndrome
when the RDMA driver tries to adjust RoCE state, which fails the probe:

"0xC1F678 | modify_nic_vport_context: roce_en set on a vport that
doesn't support roce"

To prevent this, select the PF profile based on FW RoCE capability
instead of relying on devlink driverinit value.
To provide backward compatibility of the RoCE disable feature, on older
FW's where roce_rw is not set (FW RoCE capability is read-only), keep
the current behavior e.g., rely on devlink driverinit value.

Fixes: fbfa97b4d79f ("net/mlx5: Disable roce at HCA level")
Reviewed-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Maher Sanalla <msanalla@nvidia.com>
Link: https://lore.kernel.org/r/cb34ce9a1df4a24c135cb804db87f7d2418bd6cc.1661763459.git.leonro@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mlx5/main.c              |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 23 +++++++++++++++++++++--
 include/linux/mlx5/driver.h                    | 19 ++++++++++---------
 3 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index fc94a1b25485..883d7c60143e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4336,7 +4336,7 @@ static int mlx5r_probe(struct auxiliary_device *adev,
 	dev->mdev = mdev;
 	dev->num_ports = num_ports;
 
-	if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_init_enabled(mdev))
+	if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev))
 		profile = &raw_eth_profile;
 	else
 		profile = &pf_profile;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index bec8d6d0b5f6..4870a88cecb7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -494,6 +494,24 @@ static int max_uc_list_get_devlink_param(struct mlx5_core_dev *dev)
 	return err;
 }
 
+bool mlx5_is_roce_on(struct mlx5_core_dev *dev)
+{
+	struct devlink *devlink = priv_to_devlink(dev);
+	union devlink_param_value val;
+	int err;
+
+	err = devlink_param_driverinit_value_get(devlink,
+						 DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
+						 &val);
+
+	if (!err)
+		return val.vbool;
+
+	mlx5_core_dbg(dev, "Failed to get param. err = %d\n", err);
+	return MLX5_CAP_GEN(dev, roce);
+}
+EXPORT_SYMBOL(mlx5_is_roce_on);
+
 static int handle_hca_cap_2(struct mlx5_core_dev *dev, void *set_ctx)
 {
 	void *set_hca_cap;
@@ -597,7 +615,8 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
 			 MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix));
 
 	if (MLX5_CAP_GEN(dev, roce_rw_supported))
-		MLX5_SET(cmd_hca_cap, set_hca_cap, roce, mlx5_is_roce_init_enabled(dev));
+		MLX5_SET(cmd_hca_cap, set_hca_cap, roce,
+			 mlx5_is_roce_on(dev));
 
 	max_uc_list = max_uc_list_get_devlink_param(dev);
 	if (max_uc_list > 0)
@@ -623,7 +642,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
  */
 static bool is_roce_fw_disabled(struct mlx5_core_dev *dev)
 {
-	return (MLX5_CAP_GEN(dev, roce_rw_supported) && !mlx5_is_roce_init_enabled(dev)) ||
+	return (MLX5_CAP_GEN(dev, roce_rw_supported) && !mlx5_is_roce_on(dev)) ||
 		(!MLX5_CAP_GEN(dev, roce_rw_supported) && !MLX5_CAP_GEN(dev, roce));
 }
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 96b16fbe1aa4..d6338fb449c8 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1279,16 +1279,17 @@ enum {
 	MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32,
 };
 
-static inline bool mlx5_is_roce_init_enabled(struct mlx5_core_dev *dev)
+bool mlx5_is_roce_on(struct mlx5_core_dev *dev);
+
+static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev)
 {
-	struct devlink *devlink = priv_to_devlink(dev);
-	union devlink_param_value val;
-	int err;
-
-	err = devlink_param_driverinit_value_get(devlink,
-						 DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
-						 &val);
-	return err ? MLX5_CAP_GEN(dev, roce) : val.vbool;
+	if (MLX5_CAP_GEN(dev, roce_rw_supported))
+		return MLX5_CAP_GEN(dev, roce);
+
+	/* If RoCE cap is read-only in FW, get RoCE state from devlink
+	 * in order to support RoCE enable/disable feature
+	 */
+	return mlx5_is_roce_on(dev);
 }
 
 #endif /* MLX5_DRIVER_H */
-- 
cgit v1.2.3


From 8cbb2b50ee2dcb082675237eaaa48fe8479f8aa5 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 25 Aug 2022 10:25:05 +0200
Subject: asm-generic: Conditionally enable do_softirq_own_stack() via Kconfig.

Remove the CONFIG_PREEMPT_RT symbol from the ifdef around
do_softirq_own_stack() and move it to Kconfig instead.

Enable softirq stacks based on SOFTIRQ_ON_OWN_STACK which depends on
HAVE_SOFTIRQ_ON_OWN_STACK and its default value is set to !PREEMPT_RT.
This ensures that softirq stacks are not used on PREEMPT_RT and avoids
a 'select' statement on an option which has a 'depends' statement.

Link: https://lore.kernel.org/YvN5E%2FPrHfUhggr7@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/Kconfig                          | 3 +++
 arch/arm/kernel/irq.c                 | 2 +-
 arch/parisc/kernel/irq.c              | 2 +-
 arch/powerpc/kernel/irq.c             | 4 ++--
 arch/s390/include/asm/softirq_stack.h | 2 +-
 arch/sh/kernel/irq.c                  | 2 +-
 arch/sparc/kernel/irq_64.c            | 2 +-
 arch/x86/include/asm/irq_stack.h      | 2 +-
 arch/x86/kernel/irq_32.c              | 2 +-
 include/asm-generic/softirq_stack.h   | 2 +-
 10 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/Kconfig b/arch/Kconfig
index 5dbf11a5ba4e..8b311e400ec1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -923,6 +923,9 @@ config HAVE_SOFTIRQ_ON_OWN_STACK
 	  Architecture provides a function to run __do_softirq() on a
 	  separate stack.
 
+config SOFTIRQ_ON_OWN_STACK
+	def_bool HAVE_SOFTIRQ_ON_OWN_STACK && !PREEMPT_RT
+
 config ALTERNATE_USER_ADDRESS_SPACE
 	bool
 	help
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 034cb48c9eeb..fe28fc1f759d 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -70,7 +70,7 @@ static void __init init_irq_stacks(void)
 	}
 }
 
-#ifndef CONFIG_PREEMPT_RT
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
 static void ____do_softirq(void *arg)
 {
 	__do_softirq();
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index fbb882cb8dbb..b05055f3ba4b 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -480,7 +480,7 @@ static void execute_on_irq_stack(void *func, unsigned long param1)
 	*irq_stack_in_use = 1;
 }
 
-#ifndef CONFIG_PREEMPT_RT
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
 void do_softirq_own_stack(void)
 {
 	execute_on_irq_stack(__do_softirq, 0);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 0f17268c1f0b..9ede61a5a469 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -199,7 +199,7 @@ static inline void check_stack_overflow(unsigned long sp)
 	}
 }
 
-#ifndef CONFIG_PREEMPT_RT
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
 static __always_inline void call_do_softirq(const void *sp)
 {
 	/* Temporarily switch r1 to sp, call __do_softirq() then restore r1. */
@@ -335,7 +335,7 @@ void *mcheckirq_ctx[NR_CPUS] __read_mostly;
 void *softirq_ctx[NR_CPUS] __read_mostly;
 void *hardirq_ctx[NR_CPUS] __read_mostly;
 
-#ifndef CONFIG_PREEMPT_RT
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
 void do_softirq_own_stack(void)
 {
 	call_do_softirq(softirq_ctx[smp_processor_id()]);
diff --git a/arch/s390/include/asm/softirq_stack.h b/arch/s390/include/asm/softirq_stack.h
index af68d6c1d584..1ac5115d3115 100644
--- a/arch/s390/include/asm/softirq_stack.h
+++ b/arch/s390/include/asm/softirq_stack.h
@@ -5,7 +5,7 @@
 #include <asm/lowcore.h>
 #include <asm/stacktrace.h>
 
-#ifndef CONFIG_PREEMPT_RT
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
 static inline void do_softirq_own_stack(void)
 {
 	call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq);
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 909276738078..4e6835de54cf 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -149,7 +149,7 @@ void irq_ctx_exit(int cpu)
 	hardirq_ctx[cpu] = NULL;
 }
 
-#ifndef CONFIG_PREEMPT_RT
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
 void do_softirq_own_stack(void)
 {
 	struct thread_info *curctx;
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 41fa1be980a3..72da2e10e255 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -855,7 +855,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
 	set_irq_regs(old_regs);
 }
 
-#ifndef CONFIG_PREEMPT_RT
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
 void do_softirq_own_stack(void)
 {
 	void *orig_sp, *sp = softirq_stack[smp_processor_id()];
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 63f818aedf77..147cb8fdda92 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -203,7 +203,7 @@
 			      IRQ_CONSTRAINTS, regs, vector);		\
 }
 
-#ifndef CONFIG_PREEMPT_RT
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
 /*
  * Macro to invoke __do_softirq on the irq stack. This is only called from
  * task context when bottom halves are about to be reenabled and soft
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index e5dd6da78713..01833ebf5e8e 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -132,7 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu)
 	return 0;
 }
 
-#ifndef CONFIG_PREEMPT_RT
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
 void do_softirq_own_stack(void)
 {
 	struct irq_stack *irqstk;
diff --git a/include/asm-generic/softirq_stack.h b/include/asm-generic/softirq_stack.h
index d3e2d81656e0..2a67aed9ac52 100644
--- a/include/asm-generic/softirq_stack.h
+++ b/include/asm-generic/softirq_stack.h
@@ -2,7 +2,7 @@
 #ifndef __ASM_GENERIC_SOFTIRQ_STACK_H
 #define __ASM_GENERIC_SOFTIRQ_STACK_H
 
-#if defined(CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK) && !defined(CONFIG_PREEMPT_RT)
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
 void do_softirq_own_stack(void);
 #else
 static inline void do_softirq_own_stack(void)
-- 
cgit v1.2.3


From 9fc18f6d56d5b79d527c17a8100a0965d18345cf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 21 Aug 2022 16:06:44 +0200
Subject: dma-mapping: mark dma_supported static

Now that the remaining users in drivers are gone, this function can be
marked static.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 5 -----
 kernel/dma/mapping.c        | 3 +--
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 25a30906289d..0ee20b764000 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -139,7 +139,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
 		void *cpu_addr, dma_addr_t dma_addr, size_t size,
 		unsigned long attrs);
 bool dma_can_mmap(struct device *dev);
-int dma_supported(struct device *dev, u64 mask);
 bool dma_pci_p2pdma_supported(struct device *dev);
 int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
@@ -248,10 +247,6 @@ static inline bool dma_can_mmap(struct device *dev)
 {
 	return false;
 }
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-	return 0;
-}
 static inline bool dma_pci_p2pdma_supported(struct device *dev)
 {
 	return false;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 49cbf3e33de7..27f272381cf2 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -707,7 +707,7 @@ int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
 
-int dma_supported(struct device *dev, u64 mask)
+static int dma_supported(struct device *dev, u64 mask)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
@@ -721,7 +721,6 @@ int dma_supported(struct device *dev, u64 mask)
 		return 1;
 	return ops->dma_supported(dev, mask);
 }
-EXPORT_SYMBOL(dma_supported);
 
 bool dma_pci_p2pdma_supported(struct device *dev)
 {
-- 
cgit v1.2.3


From 9cb252c4c1c53ae58bc565bab76e98133288f23a Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Mon, 5 Sep 2022 11:50:15 +0800
Subject: net: skb: export skb drop reaons to user by TRACE_DEFINE_ENUM

As Eric reported, the 'reason' field is not presented when trace the
kfree_skb event by perf:

$ perf record -e skb:kfree_skb -a sleep 10
$ perf script
  ip_defrag 14605 [021]   221.614303:   skb:kfree_skb:
  skbaddr=0xffff9d2851242700 protocol=34525 location=0xffffffffa39346b1
  reason:

The cause seems to be passing kernel address directly to TP_printk(),
which is not right. As the enum 'skb_drop_reason' is not exported to
user space through TRACE_DEFINE_ENUM(), perf can't get the drop reason
string from the 'reason' field, which is a number.

Therefore, we introduce the macro DEFINE_DROP_REASON(), which is used
to define the trace enum by TRACE_DEFINE_ENUM(). With the help of
DEFINE_DROP_REASON(), now we can remove the auto-generate that we
introduced in the commit ec43908dd556
("net: skb: use auto-generation to convert skb drop reason to string"),
and define the string array 'drop_reasons'.

Hmmmm...now we come back to the situation that have to maintain drop
reasons in both enum skb_drop_reason and DEFINE_DROP_REASON. But they
are both in dropreason.h, which makes it easier.

After this commit, now the format of kfree_skb is like this:

$ cat /tracing/events/skb/kfree_skb/format
name: kfree_skb
ID: 1524
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;

        field:void * skbaddr;   offset:8;       size:8; signed:0;
        field:void * location;  offset:16;      size:8; signed:0;
        field:unsigned short protocol;  offset:24;      size:2; signed:0;
        field:enum skb_drop_reason reason;      offset:28;      size:4; signed:0;

print fmt: "skbaddr=%p protocol=%u location=%p reason: %s", REC->skbaddr, REC->protocol, REC->location, __print_symbolic(REC->reason, { 1, "NOT_SPECIFIED" }, { 2, "NO_SOCKET" } ......

Fixes: ec43908dd556 ("net: skb: use auto-generation to convert skb drop reason to string")
Link: https://lore.kernel.org/netdev/CANn89i+bx0ybvE55iMYf5GJM48WwV1HNpdm9Q6t-HaEstqpCSA@mail.gmail.com/
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dropreason.h   | 67 ++++++++++++++++++++++++++++++++++++++++++++++
 include/trace/events/skb.h | 15 ++++++++++-
 net/core/.gitignore        |  1 -
 net/core/Makefile          | 22 +--------------
 net/core/skbuff.c          |  6 ++++-
 5 files changed, 87 insertions(+), 24 deletions(-)
 delete mode 100644 net/core/.gitignore

(limited to 'include')

diff --git a/include/net/dropreason.h b/include/net/dropreason.h
index fae9b40e54fa..c1cbcdbaf149 100644
--- a/include/net/dropreason.h
+++ b/include/net/dropreason.h
@@ -3,6 +3,73 @@
 #ifndef _LINUX_DROPREASON_H
 #define _LINUX_DROPREASON_H
 
+#define DEFINE_DROP_REASON(FN, FNe)	\
+	FN(NOT_SPECIFIED)		\
+	FN(NO_SOCKET)			\
+	FN(PKT_TOO_SMALL)		\
+	FN(TCP_CSUM)			\
+	FN(SOCKET_FILTER)		\
+	FN(UDP_CSUM)			\
+	FN(NETFILTER_DROP)		\
+	FN(OTHERHOST)			\
+	FN(IP_CSUM)			\
+	FN(IP_INHDR)			\
+	FN(IP_RPFILTER)			\
+	FN(UNICAST_IN_L2_MULTICAST)	\
+	FN(XFRM_POLICY)			\
+	FN(IP_NOPROTO)			\
+	FN(SOCKET_RCVBUFF)		\
+	FN(PROTO_MEM)			\
+	FN(TCP_MD5NOTFOUND)		\
+	FN(TCP_MD5UNEXPECTED)		\
+	FN(TCP_MD5FAILURE)		\
+	FN(SOCKET_BACKLOG)		\
+	FN(TCP_FLAGS)			\
+	FN(TCP_ZEROWINDOW)		\
+	FN(TCP_OLD_DATA)		\
+	FN(TCP_OVERWINDOW)		\
+	FN(TCP_OFOMERGE)		\
+	FN(TCP_RFC7323_PAWS)		\
+	FN(TCP_INVALID_SEQUENCE)	\
+	FN(TCP_RESET)			\
+	FN(TCP_INVALID_SYN)		\
+	FN(TCP_CLOSE)			\
+	FN(TCP_FASTOPEN)		\
+	FN(TCP_OLD_ACK)			\
+	FN(TCP_TOO_OLD_ACK)		\
+	FN(TCP_ACK_UNSENT_DATA)		\
+	FN(TCP_OFO_QUEUE_PRUNE)		\
+	FN(TCP_OFO_DROP)		\
+	FN(IP_OUTNOROUTES)		\
+	FN(BPF_CGROUP_EGRESS)		\
+	FN(IPV6DISABLED)		\
+	FN(NEIGH_CREATEFAIL)		\
+	FN(NEIGH_FAILED)		\
+	FN(NEIGH_QUEUEFULL)		\
+	FN(NEIGH_DEAD)			\
+	FN(TC_EGRESS)			\
+	FN(QDISC_DROP)			\
+	FN(CPU_BACKLOG)			\
+	FN(XDP)				\
+	FN(TC_INGRESS)			\
+	FN(UNHANDLED_PROTO)		\
+	FN(SKB_CSUM)			\
+	FN(SKB_GSO_SEG)			\
+	FN(SKB_UCOPY_FAULT)		\
+	FN(DEV_HDR)			\
+	FN(DEV_READY)			\
+	FN(FULL_RING)			\
+	FN(NOMEM)			\
+	FN(HDR_TRUNC)			\
+	FN(TAP_FILTER)			\
+	FN(TAP_TXFILTER)		\
+	FN(ICMP_CSUM)			\
+	FN(INVALID_PROTO)		\
+	FN(IP_INADDRERRORS)		\
+	FN(IP_INNOROUTES)		\
+	FN(PKT_TOO_BIG)			\
+	FNe(MAX)
+
 /**
  * enum skb_drop_reason - the reasons of skb drops
  *
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 45264e4bb254..50a974f7dfb4 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -9,6 +9,15 @@
 #include <linux/netdevice.h>
 #include <linux/tracepoint.h>
 
+#undef FN
+#define FN(reason)	TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason);
+DEFINE_DROP_REASON(FN, FN)
+
+#undef FN
+#undef FNe
+#define FN(reason)	{ SKB_DROP_REASON_##reason, #reason },
+#define FNe(reason)	{ SKB_DROP_REASON_##reason, #reason }
+
 /*
  * Tracepoint for free an sk_buff:
  */
@@ -35,9 +44,13 @@ TRACE_EVENT(kfree_skb,
 
 	TP_printk("skbaddr=%p protocol=%u location=%p reason: %s",
 		  __entry->skbaddr, __entry->protocol, __entry->location,
-		  drop_reasons[__entry->reason])
+		  __print_symbolic(__entry->reason,
+				   DEFINE_DROP_REASON(FN, FNe)))
 );
 
+#undef FN
+#undef FNe
+
 TRACE_EVENT(consume_skb,
 
 	TP_PROTO(struct sk_buff *skb),
diff --git a/net/core/.gitignore b/net/core/.gitignore
deleted file mode 100644
index df1e74372cce..000000000000
--- a/net/core/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-dropreason_str.c
diff --git a/net/core/Makefile b/net/core/Makefile
index e8ce3bd283a6..5857cec87b83 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -5,7 +5,7 @@
 
 obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
 	 gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \
-	 flow_dissector.o dropreason_str.o
+	 flow_dissector.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
@@ -40,23 +40,3 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
 obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
 obj-$(CONFIG_OF)	+= of_net.o
-
-clean-files := dropreason_str.c
-
-quiet_cmd_dropreason_str = GEN     $@
-cmd_dropreason_str = awk -F ',' 'BEGIN{ print "\#include <net/dropreason.h>\n"; \
-	print "const char * const drop_reasons[] = {" }\
-	/^enum skb_drop/ { dr=1; }\
-	/^\};/ { dr=0; }\
-	/^\tSKB_DROP_REASON_/ {\
-		if (dr) {\
-			sub(/\tSKB_DROP_REASON_/, "", $$1);\
-			printf "\t[SKB_DROP_REASON_%s] = \"%s\",\n", $$1, $$1;\
-		}\
-	}\
-	END{ print "};" }' $< > $@
-
-$(obj)/dropreason_str.c: $(srctree)/include/net/dropreason.h
-	$(call cmd,dropreason_str)
-
-$(obj)/dropreason_str.o: $(obj)/dropreason_str.c
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 84bb5e188d0d..417463da4fac 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -91,7 +91,11 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init;
 int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
 EXPORT_SYMBOL(sysctl_max_skb_frags);
 
-/* The array 'drop_reasons' is auto-generated in dropreason_str.c */
+#undef FN
+#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
+const char * const drop_reasons[] = {
+	DEFINE_DROP_REASON(FN, FN)
+};
 EXPORT_SYMBOL(drop_reasons);
 
 /**
-- 
cgit v1.2.3


From 2f79cdfe58c13949bbbb65ba5926abfe9561d0ec Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 31 Aug 2022 09:46:12 -0700
Subject: fs: only do a memory barrier for the first set_buffer_uptodate()

Commit d4252071b97d ("add barriers to buffer_uptodate and
set_buffer_uptodate") added proper memory barriers to the buffer head
BH_Uptodate bit, so that anybody who tests a buffer for being up-to-date
will be guaranteed to actually see initialized state.

However, that commit didn't _just_ add the memory barrier, it also ended
up dropping the "was it already set" logic that the BUFFER_FNS() macro
had.

That's conceptually the right thing for a generic "this is a memory
barrier" operation, but in the case of the buffer contents, we really
only care about the memory barrier for the _first_ time we set the bit,
in that the only memory ordering protection we need is to avoid anybody
seeing uninitialized memory contents.

Any other access ordering wouldn't be about the BH_Uptodate bit anyway,
and would require some other proper lock (typically BH_Lock or the folio
lock).  A reader that races with somebody invalidating the buffer head
isn't an issue wrt the memory ordering, it's a serialization issue.

Now, you'd think that the buffer head operations don't matter in this
day and age (and I certainly thought so), but apparently some loads
still end up being heavy users of buffer heads.  In particular, the
kernel test robot reported that not having this bit access optimization
in place caused a noticeable direct IO performance regression on ext4:

  fxmark.ssd_ext4_no_jnl_DWTL_54_directio.works/sec -26.5% regression

although you presumably need a fast disk and a lot of cores to actually
notice.

Link: https://lore.kernel.org/all/Yw8L7HTZ%2FdE2%2Fo9C@xsang-OptiPlex-9020/
Reported-by: kernel test robot <oliver.sang@intel.com>
Tested-by: Fengwei Yin <fengwei.yin@intel.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/buffer_head.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 089c9ade4325..df518c429667 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -137,6 +137,17 @@ BUFFER_FNS(Defer_Completion, defer_completion)
 
 static __always_inline void set_buffer_uptodate(struct buffer_head *bh)
 {
+	/*
+	 * If somebody else already set this uptodate, they will
+	 * have done the memory barrier, and a reader will thus
+	 * see *some* valid buffer state.
+	 *
+	 * Any other serialization (with IO errors or whatever that
+	 * might clear the bit) has to come from other state (eg BH_Lock).
+	 */
+	if (test_bit(BH_Uptodate, &bh->b_state))
+		return;
+
 	/*
 	 * make it consistent with folio_mark_uptodate
 	 * pairs with smp_load_acquire in buffer_uptodate
-- 
cgit v1.2.3


From 9cd4f1434479f1ac25c440c421fbf52069079914 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Sun, 11 Sep 2022 11:18:45 +0800
Subject: iommu/vt-d: Fix possible recursive locking in intel_iommu_init()

The global rwsem dmar_global_lock was introduced by commit 3a5670e8ac932
("iommu/vt-d: Introduce a rwsem to protect global data structures"). It
is used to protect DMAR related global data from DMAR hotplug operations.

The dmar_global_lock used in the intel_iommu_init() might cause recursive
locking issue, for example, intel_iommu_get_resv_regions() is taking the
dmar_global_lock from within a section where intel_iommu_init() already
holds it via probe_acpi_namespace_devices().

Using dmar_global_lock in intel_iommu_init() could be relaxed since it is
unlikely that any IO board must be hot added before the IOMMU subsystem is
initialized. This eliminates the possible recursive locking issue by moving
down DMAR hotplug support after the IOMMU is initialized and removing the
uses of dmar_global_lock in intel_iommu_init().

Fixes: d5692d4af08cd ("iommu/vt-d: Fix suspicious RCU usage in probe_acpi_namespace_devices()")
Reported-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/894db0ccae854b35c73814485569b634237b5538.1657034828.git.robin.murphy@arm.com
Link: https://lore.kernel.org/r/20220718235325.3952426-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/dmar.c  |  7 +++++++
 drivers/iommu/intel/iommu.c | 27 ++-------------------------
 include/linux/dmar.h        |  4 +++-
 3 files changed, 12 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 5a8f780e7ffd..497c912ad9e1 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -2349,6 +2349,13 @@ static int dmar_device_hotplug(acpi_handle handle, bool insert)
 	if (!dmar_in_use())
 		return 0;
 
+	/*
+	 * It's unlikely that any I/O board is hot added before the IOMMU
+	 * subsystem is initialized.
+	 */
+	if (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled)
+		return -EOPNOTSUPP;
+
 	if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) {
 		tmp = handle;
 	} else {
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 64d30895a4c8..1f2cd43cf9bc 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3019,13 +3019,7 @@ static int __init init_dmars(void)
 
 #ifdef CONFIG_INTEL_IOMMU_SVM
 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
-			/*
-			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
-			 * could cause possible lock race condition.
-			 */
-			up_write(&dmar_global_lock);
 			ret = intel_svm_enable_prq(iommu);
-			down_write(&dmar_global_lock);
 			if (ret)
 				goto free_iommu;
 		}
@@ -3938,7 +3932,6 @@ int __init intel_iommu_init(void)
 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
 		    platform_optin_force_iommu();
 
-	down_write(&dmar_global_lock);
 	if (dmar_table_init()) {
 		if (force_on)
 			panic("tboot: Failed to initialize DMAR table\n");
@@ -3951,16 +3944,6 @@ int __init intel_iommu_init(void)
 		goto out_free_dmar;
 	}
 
-	up_write(&dmar_global_lock);
-
-	/*
-	 * The bus notifier takes the dmar_global_lock, so lockdep will
-	 * complain later when we register it under the lock.
-	 */
-	dmar_register_bus_notifier();
-
-	down_write(&dmar_global_lock);
-
 	if (!no_iommu)
 		intel_iommu_debugfs_init();
 
@@ -4005,11 +3988,9 @@ int __init intel_iommu_init(void)
 		pr_err("Initialization failed\n");
 		goto out_free_dmar;
 	}
-	up_write(&dmar_global_lock);
 
 	init_iommu_pm_ops();
 
-	down_read(&dmar_global_lock);
 	for_each_active_iommu(iommu, drhd) {
 		/*
 		 * The flush queue implementation does not perform
@@ -4027,13 +4008,11 @@ int __init intel_iommu_init(void)
 				       "%s", iommu->name);
 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
 	}
-	up_read(&dmar_global_lock);
 
 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
 	if (si_domain && !hw_pass_through)
 		register_memory_notifier(&intel_iommu_memory_nb);
 
-	down_read(&dmar_global_lock);
 	if (probe_acpi_namespace_devices())
 		pr_warn("ACPI name space devices didn't probe correctly\n");
 
@@ -4044,17 +4023,15 @@ int __init intel_iommu_init(void)
 
 		iommu_disable_protect_mem_regions(iommu);
 	}
-	up_read(&dmar_global_lock);
-
-	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
 
 	intel_iommu_enabled = 1;
+	dmar_register_bus_notifier();
+	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
 
 	return 0;
 
 out_free_dmar:
 	intel_iommu_free_dmars();
-	up_write(&dmar_global_lock);
 	return ret;
 }
 
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index d81a51978d01..8917a32173c4 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -65,6 +65,7 @@ struct dmar_pci_notify_info {
 
 extern struct rw_semaphore dmar_global_lock;
 extern struct list_head dmar_drhd_units;
+extern int intel_iommu_enabled;
 
 #define for_each_drhd_unit(drhd)					\
 	list_for_each_entry_rcu(drhd, &dmar_drhd_units, list,		\
@@ -88,7 +89,8 @@ extern struct list_head dmar_drhd_units;
 static inline bool dmar_rcu_check(void)
 {
 	return rwsem_is_locked(&dmar_global_lock) ||
-	       system_state == SYSTEM_BOOTING;
+	       system_state == SYSTEM_BOOTING ||
+	       (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled);
 }
 
 #define	dmar_rcu_dereference(p)	rcu_dereference_check((p), dmar_rcu_check())
-- 
cgit v1.2.3