From e509996b16728e37d5a909a5c63c1bd64f23b306 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Mon, 2 Sep 2024 17:07:09 -0700 Subject: xfrm: extract dst lookup parameters into a struct Preparation for adding more fields to dst lookup functions without changing their signatures. Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b6bfdc6416c7..f3ae50372707 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -349,20 +349,23 @@ struct xfrm_if_cb { void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); void xfrm_if_unregister_cb(void); +struct xfrm_dst_lookup_params { + struct net *net; + int tos; + int oif; + xfrm_address_t *saddr; + xfrm_address_t *daddr; + u32 mark; +}; + struct net_device; struct xfrm_type; struct xfrm_dst; struct xfrm_policy_afinfo { struct dst_ops *dst_ops; - struct dst_entry *(*dst_lookup)(struct net *net, - int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - u32 mark); - int (*get_saddr)(struct net *net, int oif, - xfrm_address_t *saddr, - xfrm_address_t *daddr, - u32 mark); + struct dst_entry *(*dst_lookup)(const struct xfrm_dst_lookup_params *params); + int (*get_saddr)(xfrm_address_t *saddr, + const struct xfrm_dst_lookup_params *params); int (*fill_dst)(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl); @@ -1764,10 +1767,7 @@ static inline int xfrm_user_policy(struct sock *sk, int optname, } #endif -struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - int family, u32 mark); +struct dst_entry *__xfrm_dst_lookup(int family, const struct xfrm_dst_lookup_params *params); struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); -- cgit v1.2.3 From b8469721034300bbb6dec5b4bf32492c95e16a0c Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Mon, 2 Sep 2024 17:07:10 -0700 Subject: xfrm: respect ip protocols rules criteria when performing dst lookups The series in the "fixes" tag added the ability to consider L4 attributes in routing rules. The dst lookup on the outer packet of encapsulated traffic in the xfrm code was not adapted to this change, thus routing behavior that relies on L4 information is not respected. Pass the ip protocol information when performing dst lookups. 
Fixes: a25724b05af0 ("Merge branch 'fib_rules-support-sport-dport-and-proto-match'") Signed-off-by: Eyal Birger Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 ++ net/ipv4/xfrm4_policy.c | 2 ++ net/ipv6/xfrm6_policy.c | 3 +++ net/xfrm/xfrm_policy.c | 15 +++++++++++++++ 4 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f3ae50372707..a0bdd58f401c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -356,6 +356,8 @@ struct xfrm_dst_lookup_params { xfrm_address_t *saddr; xfrm_address_t *daddr; u32 mark; + __u8 ipproto; + union flowi_uli uli; }; struct net_device; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index ac1a28ef0c56..7e1c2faed1ff 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -30,6 +30,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, fl4->flowi4_mark = params->mark; if (params->saddr) fl4->saddr = params->saddr->a4; + fl4->flowi4_proto = params->ipproto; + fl4->uli = params->uli; rt = __ip_route_output_key(params->net, fl4); if (!IS_ERR(rt)) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index fc3f5eec6898..1f19b6f14484 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -37,6 +37,9 @@ static struct dst_entry *xfrm6_dst_lookup(const struct xfrm_dst_lookup_params *p if (params->saddr) memcpy(&fl6.saddr, params->saddr, sizeof(fl6.saddr)); + fl6.flowi4_proto = params->ipproto; + fl6.uli = params->uli; + dst = ip6_route_output(params->net, NULL, &fl6); err = dst->error; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ec8f2fe13a51..55cc9f4cb42b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -315,6 +315,21 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, params.tos = tos; params.oif = oif; params.mark = mark; + params.ipproto = x->id.proto; + if (x->encap) { + switch (x->encap->encap_type) { + case UDP_ENCAP_ESPINUDP: + params.ipproto = IPPROTO_UDP; + params.uli.ports.sport = x->encap->encap_sport; + params.uli.ports.dport = x->encap->encap_dport; + break; + case TCP_ENCAP_ESPINTCP: + params.ipproto = IPPROTO_TCP; + params.uli.ports.sport = x->encap->encap_sport; + params.uli.ports.dport = x->encap->encap_dport; + break; + } + } dst = __xfrm_dst_lookup(family, ¶ms); -- cgit v1.2.3 From 645546a05b0370391c0eac0f14f5b9ddf8d00731 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Sep 2024 11:12:49 +0200 Subject: xfrm: policy: remove last remnants of pernet inexact list xfrm_net still contained the no-longer-used inexact policy list heads, remove them. 
Fixes: a54ad727f745 ("xfrm: policy: remove remaining use of inexact list") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 1 - net/xfrm/xfrm_policy.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index d489d9250bff..ae60d6664095 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -51,7 +51,6 @@ struct netns_xfrm { struct hlist_head *policy_byidx; unsigned int policy_idx_hmask; unsigned int idx_generator; - struct hlist_head policy_inexact[XFRM_POLICY_MAX]; struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; struct work_struct policy_hash_work; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 55cc9f4cb42b..a2ea9dbac90b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4206,7 +4206,6 @@ static int __net_init xfrm_policy_init(struct net *net) net->xfrm.policy_count[dir] = 0; net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0; - INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); htab = &net->xfrm.policy_bydst[dir]; htab->table = xfrm_hash_alloc(sz); @@ -4260,8 +4259,6 @@ static void xfrm_policy_fini(struct net *net) for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; - WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir])); - htab = &net->xfrm.policy_bydst[dir]; sz = (htab->hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(htab->table)); -- cgit v1.2.3 From 09d88791c7cd888d5195c84733caf9183dcfbd16 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 20 Sep 2024 14:56:24 +0200 Subject: bpf: Make sure internal and UAPI bpf_redirect flags don't overlap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_redirect_info is shared between the SKB and XDP redirect paths, and the two paths use the same numeric flag values in the ri->flags field (specifically, BPF_F_BROADCAST == BPF_F_NEXTHOP). This means that if skb bpf_redirect_neigh() is used with a non-NULL params argument and, subsequently, an XDP redirect is performed using the same bpf_redirect_info struct, the XDP path will get confused and end up crashing, which syzbot managed to trigger. With the stack-allocated bpf_redirect_info, the structure is no longer shared between the SKB and XDP paths, so the crash doesn't happen anymore. However, different code paths using identically-numbered flag values in the same struct field still seems like a bit of a mess, so this patch cleans that up by moving the flag definitions together and redefining the three flags in BPF_F_REDIRECT_INTERNAL to not overlap with the flags used for XDP. It also adds a BUILD_BUG_ON() check to make sure the overlap is not re-introduced by mistake. 
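As an illustration of why the two paths must share one flag space, a minimal sketch of how each helper is typically invoked (the map and section names are illustrative, not taken from this patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(max_entries, 8);
	__type(key, __u32);
	__type(value, __u32);
} tx_ports SEC(".maps");

SEC("tc")
int tc_to_ingress(struct __sk_buff *skb)
{
	/* skb path: BPF_F_INGRESS selects the ingress side of ifindex 4 */
	return bpf_redirect(4, BPF_F_INGRESS);
}

SEC("xdp")
int xdp_flood(struct xdp_md *ctx)
{
	/* XDP path: BPF_F_BROADCAST / BPF_F_EXCLUDE_INGRESS land in the
	 * same ri->flags word, so their numeric values must not collide
	 * with BPF_F_INGRESS or the internal BPF_F_NEIGH/PEER/NEXTHOP bits.
	 */
	return bpf_redirect_map(&tx_ports, 0,
				BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}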
Fixes: e624d4ed4aa8 ("xdp: Extend xdp_redirect_map with broadcast support") Reported-by: syzbot+cca39e6e84a367a7e6f6@syzkaller.appspotmail.com Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Closes: https://syzkaller.appspot.com/bug?extid=cca39e6e84a367a7e6f6 Link: https://lore.kernel.org/bpf/20240920125625.59465-1-toke@redhat.com --- include/uapi/linux/bpf.h | 13 +++++-------- net/core/filter.c | 8 +++++--- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c6cd7c7aeeee..e8241b320c6d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6047,11 +6047,6 @@ enum { BPF_F_MARK_ENFORCE = (1ULL << 6), }; -/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ -enum { - BPF_F_INGRESS = (1ULL << 0), -}; - /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ enum { BPF_F_TUNINFO_IPV6 = (1ULL << 0), @@ -6198,10 +6193,12 @@ enum { BPF_F_BPRM_SECUREEXEC = (1ULL << 0), }; -/* Flags for bpf_redirect_map helper */ +/* Flags for bpf_redirect and bpf_redirect_map helpers */ enum { - BPF_F_BROADCAST = (1ULL << 3), - BPF_F_EXCLUDE_INGRESS = (1ULL << 4), + BPF_F_INGRESS = (1ULL << 0), /* used for skb path */ + BPF_F_BROADCAST = (1ULL << 3), /* used for XDP path */ + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), /* used for XDP path */ +#define BPF_F_REDIRECT_FLAGS (BPF_F_INGRESS | BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS) }; #define __bpf_md_ptr(type, name) \ diff --git a/net/core/filter.c b/net/core/filter.c index cd3524cb326b..4e3f42cc6611 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2438,9 +2438,9 @@ out: /* Internal, non-exposed redirect flags. */ enum { - BPF_F_NEIGH = (1ULL << 1), - BPF_F_PEER = (1ULL << 2), - BPF_F_NEXTHOP = (1ULL << 3), + BPF_F_NEIGH = (1ULL << 16), + BPF_F_PEER = (1ULL << 17), + BPF_F_NEXTHOP = (1ULL << 18), #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; @@ -2450,6 +2450,8 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) struct sk_buff *clone; int ret; + BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS); + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; -- cgit v1.2.3 From 0a97195d2181caced187acd7454464b8e37021d7 Mon Sep 17 00:00:00 2001 From: Rajendra Nayak Date: Tue, 3 Sep 2024 15:45:10 +0530 Subject: EDAC/qcom: Make irq configuration optional On most modern qualcomm SoCs, the configuration necessary to enable the Tag/Data RAM related irqs being propagated to the SoC irq controller is already done in firmware (in DSF or 'DDR System Firmware') On some like the x1e80100, these registers aren't even accesible to the kernel causing a crash when edac device is probed. Hence, make the irq configuration optional in the driver and mark x1e80100 as the SoC on which this should be avoided. 
Fixes: af16b00578a7 ("arm64: dts: qcom: Add base X1E80100 dtsi and the QCP dts") Reported-by: Bjorn Andersson Signed-off-by: Rajendra Nayak Reviewed-by: Manivannan Sadhasivam Reviewed-by: Abel Vesa Link: https://lore.kernel.org/r/20240903101510.3452734-1-quic_rjendra@quicinc.com Signed-off-by: Bjorn Andersson --- drivers/edac/qcom_edac.c | 8 +++++--- drivers/soc/qcom/llcc-qcom.c | 3 +++ include/linux/soc/qcom/llcc-qcom.h | 2 ++ 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c index d3cd4cc54ace..a9a8ba067007 100644 --- a/drivers/edac/qcom_edac.c +++ b/drivers/edac/qcom_edac.c @@ -342,9 +342,11 @@ static int qcom_llcc_edac_probe(struct platform_device *pdev) int ecc_irq; int rc; - rc = qcom_llcc_core_setup(llcc_driv_data, llcc_driv_data->bcast_regmap); - if (rc) - return rc; + if (!llcc_driv_data->ecc_irq_configured) { + rc = qcom_llcc_core_setup(llcc_driv_data, llcc_driv_data->bcast_regmap); + if (rc) + return rc; + } /* Allocate edac control info */ edev_ctl = edac_device_alloc_ctl_info(0, "qcom-llcc", 1, "bank", diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c index 8fa4ffd3a9b5..28bcc65e91be 100644 --- a/drivers/soc/qcom/llcc-qcom.c +++ b/drivers/soc/qcom/llcc-qcom.c @@ -139,6 +139,7 @@ struct qcom_llcc_config { int size; bool need_llcc_cfg; bool no_edac; + bool irq_configured; }; struct qcom_sct_config { @@ -718,6 +719,7 @@ static const struct qcom_llcc_config x1e80100_cfg[] = { .need_llcc_cfg = true, .reg_offset = llcc_v2_1_reg_offset, .edac_reg_offset = &llcc_v2_1_edac_reg_offset, + .irq_configured = true, }, }; @@ -1345,6 +1347,7 @@ static int qcom_llcc_probe(struct platform_device *pdev) drv_data->cfg = llcc_cfg; drv_data->cfg_size = sz; drv_data->edac_reg_offset = cfg->edac_reg_offset; + drv_data->ecc_irq_configured = cfg->irq_configured; mutex_init(&drv_data->lock); platform_set_drvdata(pdev, drv_data); diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 9e9f528b1370..2f20281d4ad4 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -125,6 +125,7 @@ struct llcc_edac_reg_offset { * @num_banks: Number of llcc banks * @bitmap: Bit map to track the active slice ids * @ecc_irq: interrupt for llcc cache error detection and reporting + * @ecc_irq_configured: 'True' if firmware has already configured the irq propagation * @version: Indicates the LLCC version */ struct llcc_drv_data { @@ -139,6 +140,7 @@ struct llcc_drv_data { u32 num_banks; unsigned long *bitmap; int ecc_irq; + bool ecc_irq_configured; u32 version; }; -- cgit v1.2.3 From 796a4049640b54cb1daf9e7fe543292c5ca02c74 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 4 Oct 2024 15:33:58 +0100 Subject: netfs: In readahead, put the folio refs as soon extracted netfslib currently defers dropping the ref on the folios it obtains during readahead to after it has started I/O on the basis that we can do it whilst we wait for the I/O to complete, but this runs the risk of the I/O collection racing with this in future. Furthermore, Matthew Wilcox strongly suggests that the refs should be dropped immediately, as readahead_folio() does (netfslib is using __readahead_batch() which doesn't drop the refs). 
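The batched-put pattern relied on below is roughly the following (a simplified sketch; the iteration helper is hypothetical):

	struct folio_batch put_batch;
	struct folio *folio;

	folio_batch_init(&put_batch);
	while ((folio = next_readahead_folio())) {	/* hypothetical */
		/* folio_batch_add() returns the slots left; 0 means full */
		if (!folio_batch_add(&put_batch, folio))
			folio_batch_release(&put_batch);
	}
	/* drop the remaining readahead refs; the folio locks still pin them */
	folio_batch_release(&put_batch);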
Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Suggested-by: Matthew Wilcox Signed-off-by: David Howells Link: https://lore.kernel.org/r/3771538.1728052438@warthog.procyon.org.uk cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 47 +++++++++++++------------------------------- fs/netfs/read_collect.c | 2 ++ include/trace/events/netfs.h | 1 - 3 files changed, 16 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index c40e226053cc..af46a598f4d7 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -67,7 +67,8 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in * Decant the list of folios to read into a rolling buffer. */ static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq, - struct folio_queue *folioq) + struct folio_queue *folioq, + struct folio_batch *put_batch) { unsigned int order, nr; size_t size = 0; @@ -82,6 +83,9 @@ static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq, order = folio_order(folio); folioq->orders[i] = order; size += PAGE_SIZE << order; + + if (!folio_batch_add(put_batch, folio)) + folio_batch_release(put_batch); } for (int i = nr; i < folioq_nr_slots(folioq); i++) @@ -120,6 +124,9 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) * that we will need to release later - but we don't want to do * that until after we've started the I/O. */ + struct folio_batch put_batch; + + folio_batch_init(&put_batch); while (rreq->submitted < subreq->start + rsize) { struct folio_queue *tail = rreq->buffer_tail, *new; size_t added; @@ -132,10 +139,11 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) new->prev = tail; tail->next = new; rreq->buffer_tail = new; - added = netfs_load_buffer_from_ra(rreq, new); + added = netfs_load_buffer_from_ra(rreq, new, &put_batch); rreq->iter.count += added; rreq->submitted += added; } + folio_batch_release(&put_batch); } subreq->len = rsize; @@ -348,6 +356,7 @@ static int netfs_wait_for_read(struct netfs_io_request *rreq) static int netfs_prime_buffer(struct netfs_io_request *rreq) { struct folio_queue *folioq; + struct folio_batch put_batch; size_t added; folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); @@ -360,39 +369,14 @@ static int netfs_prime_buffer(struct netfs_io_request *rreq) rreq->submitted = rreq->start; iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0); - added = netfs_load_buffer_from_ra(rreq, folioq); + folio_batch_init(&put_batch); + added = netfs_load_buffer_from_ra(rreq, folioq, &put_batch); + folio_batch_release(&put_batch); rreq->iter.count += added; rreq->submitted += added; return 0; } -/* - * Drop the ref on each folio that we inherited from the VM readahead code. We - * still have the folio locks to pin the page until we complete the I/O. - * - * Note that we can't just release the batch in each queue struct as we use the - * occupancy count in other places. 
- */ -static void netfs_put_ra_refs(struct folio_queue *folioq) -{ - struct folio_batch fbatch; - - folio_batch_init(&fbatch); - while (folioq) { - for (unsigned int slot = 0; slot < folioq_count(folioq); slot++) { - struct folio *folio = folioq_folio(folioq, slot); - if (!folio) - continue; - trace_netfs_folio(folio, netfs_folio_trace_read_put); - if (!folio_batch_add(&fbatch, folio)) - folio_batch_release(&fbatch); - } - folioq = folioq->next; - } - - folio_batch_release(&fbatch); -} - /** * netfs_readahead - Helper to manage a read request * @ractl: The description of the readahead request @@ -436,9 +420,6 @@ void netfs_readahead(struct readahead_control *ractl) goto cleanup_free; netfs_read_to_pagecache(rreq); - /* Release the folio refs whilst we're waiting for the I/O. */ - netfs_put_ra_refs(rreq->buffer); - netfs_put_request(rreq, true, netfs_rreq_trace_put_return); return; diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index b18c65ba5580..3cbb289535a8 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -77,6 +77,8 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, folio_unlock(folio); } } + + folioq_clear(folioq, slot); } /* diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 1d7c52821e55..69975c9c6823 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -172,7 +172,6 @@ EM(netfs_folio_trace_read, "read") \ EM(netfs_folio_trace_read_done, "read-done") \ EM(netfs_folio_trace_read_gaps, "read-gaps") \ - EM(netfs_folio_trace_read_put, "read-put") \ EM(netfs_folio_trace_read_unlock, "read-unlock") \ EM(netfs_folio_trace_redirtied, "redirtied") \ EM(netfs_folio_trace_store, "store") \ -- cgit v1.2.3 From 6ef6a0e821d3dad6bf8a5d5508762dba9042c84b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 3 Oct 2024 08:09:16 -0700 Subject: iomap: share iomap_unshare_iter predicate code with fsdax The predicate code that iomap_unshare_iter uses to decide if it's really needs to unshare a file range mapping should be shared with the fsdax version, because right now they're opencoded and inconsistent. Note that we simplify the predicate logic a bit -- we no longer allow unsharing of inline data mappings, but there aren't any filesystems that allow shared inline data currently. This is a fix in the sense that it should have been ported to fsdax. Fixes: b53fdb215d13 ("iomap: improve shared block detection in iomap_unshare_iter") Signed-off-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/172796813294.1131942.15762084021076932620.stgit@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/dax.c | 3 +-- fs/iomap/buffered-io.c | 30 ++++++++++++++++-------------- include/linux/iomap.h | 1 + 3 files changed, 18 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index c62acd2812f8..5064eefb1c1e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1268,8 +1268,7 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) s64 ret = 0; void *daddr = NULL, *saddr = NULL; - /* don't bother with blocks that are not shared to start with */ - if (!(iomap->flags & IOMAP_F_SHARED)) + if (!iomap_want_unshare_iter(iter)) return length; id = dax_read_lock(); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 78ebd265f425..3899169b2cf7 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1309,19 +1309,12 @@ void iomap_file_buffered_write_punch_delalloc(struct inode *inode, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); -static loff_t iomap_unshare_iter(struct iomap_iter *iter) +bool iomap_want_unshare_iter(const struct iomap_iter *iter) { - struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; - - /* Don't bother with blocks that are not shared to start with. */ - if (!(iomap->flags & IOMAP_F_SHARED)) - return length; - /* - * Don't bother with delalloc reservations, holes or unwritten extents. + * Don't bother with blocks that are not shared to start with; or + * mappings that cannot be shared, such as inline data, delalloc + * reservations, holes or unwritten extents. * * Note that we use srcmap directly instead of iomap_iter_srcmap as * unsharing requires providing a separate source map, and the presence @@ -1329,9 +1322,18 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) * IOMAP_F_SHARED which can be set for any data that goes into the COW * fork for XFS. 
*/ - if (iter->srcmap.type == IOMAP_HOLE || - iter->srcmap.type == IOMAP_DELALLOC || - iter->srcmap.type == IOMAP_UNWRITTEN) + return (iter->iomap.flags & IOMAP_F_SHARED) && + iter->srcmap.type == IOMAP_MAPPED; +} + +static loff_t iomap_unshare_iter(struct iomap_iter *iter) +{ + struct iomap *iomap = &iter->iomap; + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); + loff_t written = 0; + + if (!iomap_want_unshare_iter(iter)) return length; do { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4ad12a3c8bae..d8a7fc84348c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -267,6 +267,7 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); +bool iomap_want_unshare_iter(const struct iomap_iter *iter); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, -- cgit v1.2.3 From 1442ee0011983f0c5c4b92380e6853afb513841a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Oct 2024 21:49:59 +0100 Subject: irqchip/gic-v4: Don't allow a VMOVP on a dying VPE Kunkun Jiang reported that there is a small window of opportunity for userspace to force a change of affinity for a VPE while the VPE has already been unmapped, but the corresponding doorbell interrupt still visible in /proc/irq/. Plug the race by checking the value of vmapp_count, which tracks whether the VPE is mapped ot not, and returning an error in this case. This involves making vmapp_count common to both GICv4.1 and its v4.0 ancestor. 
Fixes: 64edfaa9a234 ("irqchip/gic-v4.1: Implement the v4.1 flavour of VMAPP") Reported-by: Kunkun Jiang Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/c182ece6-2ba0-ce4f-3404-dba7a3ab6c52@huawei.com Link: https://lore.kernel.org/all/20241002204959.2051709-1-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 18 ++++++++++++------ include/linux/irqchip/arm-gic-v4.h | 4 +++- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index fdec478ba5e7..ab597e74ba08 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -797,8 +797,8 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, its_encode_valid(cmd, desc->its_vmapp_cmd.valid); if (!desc->its_vmapp_cmd.valid) { + alloc = !atomic_dec_return(&desc->its_vmapp_cmd.vpe->vmapp_count); if (is_v4_1(its)) { - alloc = !atomic_dec_return(&desc->its_vmapp_cmd.vpe->vmapp_count); its_encode_alloc(cmd, alloc); /* * Unmapping a VPE is self-synchronizing on GICv4.1, @@ -817,13 +817,13 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, its_encode_vpt_addr(cmd, vpt_addr); its_encode_vpt_size(cmd, LPI_NRBITS - 1); + alloc = !atomic_fetch_inc(&desc->its_vmapp_cmd.vpe->vmapp_count); + if (!is_v4_1(its)) goto out; vconf_addr = virt_to_phys(page_address(desc->its_vmapp_cmd.vpe->its_vm->vprop_page)); - alloc = !atomic_fetch_inc(&desc->its_vmapp_cmd.vpe->vmapp_count); - its_encode_alloc(cmd, alloc); /* @@ -3806,6 +3806,13 @@ static int its_vpe_set_affinity(struct irq_data *d, struct cpumask *table_mask; unsigned long flags; + /* + * Check if we're racing against a VPE being destroyed, for + * which we don't want to allow a VMOVP. + */ + if (!atomic_read(&vpe->vmapp_count)) + return -EINVAL; + /* * Changing affinity is mega expensive, so let's be as lazy as * we can and only do it if we really have to. Also, if mapped @@ -4463,9 +4470,8 @@ static int its_vpe_init(struct its_vpe *vpe) raw_spin_lock_init(&vpe->vpe_lock); vpe->vpe_id = vpe_id; vpe->vpt_page = vpt_page; - if (gic_rdists->has_rvpeid) - atomic_set(&vpe->vmapp_count, 0); - else + atomic_set(&vpe->vmapp_count, 0); + if (!gic_rdists->has_rvpeid) vpe->vpe_proxy_event = -1; return 0; diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index ecabed6d3307..7f1f11a5e4e4 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -66,10 +66,12 @@ struct its_vpe { bool enabled; bool group; } sgi_config[16]; - atomic_t vmapp_count; }; }; + /* Track the VPE being mapped */ + atomic_t vmapp_count; + /* * Ensures mutual exclusion between affinity setting of the * vPE and vLPI operations using vpe->col_idx. -- cgit v1.2.3 From 68d0021fe7231eec0fb84cd110cf62a6e782b72d Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Tue, 24 Sep 2024 21:28:04 +0200 Subject: wifi: cfg80211: Add wiphy_delayed_work_pending() Add wiphy_delayed_work_pending() to check if any delayed work timer is pending, that can be used to be sure that wiphy_delayed_work_queue() won't postpone an already pending delayed work. 
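A minimal usage sketch (the work item and timeout are hypothetical driver state):

	/* Only (re)arm the delayed work when nothing is pending, so an
	 * earlier deadline queued by a previous caller is not pushed back
	 * by mod_timer() inside wiphy_delayed_work_queue().
	 */
	if (!wiphy_delayed_work_pending(wiphy, &link->probe_work))
		wiphy_delayed_work_queue(wiphy, &link->probe_work,
					 msecs_to_jiffies(probe_timeout_ms));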
Signed-off-by: Remi Pommarel Link: https://patch.msgid.link/20240924192805.13859-2-repk@triplefau.lt [fix return value kernel-doc] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/core.c | 7 +++++++ 2 files changed, 51 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 69ec1eb41a09..941dc62f3027 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6129,6 +6129,50 @@ void wiphy_delayed_work_cancel(struct wiphy *wiphy, void wiphy_delayed_work_flush(struct wiphy *wiphy, struct wiphy_delayed_work *dwork); +/** + * wiphy_delayed_work_pending - Find out whether a wiphy delayable + * work item is currently pending. + * + * @wiphy: the wiphy, for debug purposes + * @dwork: the delayed work in question + * + * Return: true if timer is pending, false otherwise + * + * How wiphy_delayed_work_queue() works is by setting a timer which + * when it expires calls wiphy_work_queue() to queue the wiphy work. + * Because wiphy_delayed_work_queue() uses mod_timer(), if it is + * called twice and the second call happens before the first call + * deadline, the work will rescheduled for the second deadline and + * won't run before that. + * + * wiphy_delayed_work_pending() can be used to detect if calling + * wiphy_work_delayed_work_queue() would start a new work schedule + * or delayed a previous one. As seen below it cannot be used to + * detect precisely if the work has finished to execute nor if it + * is currently executing. + * + * CPU0 CPU1 + * wiphy_delayed_work_queue(wk) + * mod_timer(wk->timer) + * wiphy_delayed_work_pending(wk) -> true + * + * [...] + * expire_timers(wk->timer) + * detach_timer(wk->timer) + * wiphy_delayed_work_pending(wk) -> false + * wk->timer->function() | + * wiphy_work_queue(wk) | delayed work pending + * list_add_tail() | returns false but + * queue_work(cfg80211_wiphy_work) | wk->func() has not + * | been run yet + * [...] | + * cfg80211_wiphy_work() | + * wk->func() V + * + */ +bool wiphy_delayed_work_pending(struct wiphy *wiphy, + struct wiphy_delayed_work *dwork); + /** * enum ieee80211_ap_reg_power - regulatory power for an Access Point * diff --git a/net/wireless/core.c b/net/wireless/core.c index 661adfc77644..8331064de9dd 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1704,6 +1704,13 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy, } EXPORT_SYMBOL_GPL(wiphy_delayed_work_flush); +bool wiphy_delayed_work_pending(struct wiphy *wiphy, + struct wiphy_delayed_work *dwork) +{ + return timer_pending(&dwork->timer); +} +EXPORT_SYMBOL_GPL(wiphy_delayed_work_pending); + static int __init cfg80211_init(void) { int err; -- cgit v1.2.3 From 57be3d3562ca4aa62b8047bc681028cc402af8ce Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 4 Oct 2024 14:14:44 -0600 Subject: wifi: radiotap: Avoid -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. So, in order to avoid ending up with a flexible-array member in the middle of multiple other structs, we use the `__struct_group()` helper to create a new tagged `struct ieee80211_radiotap_header_fixed`. This structure groups together all the members of the flexible `struct ieee80211_radiotap_header` except the flexible array. As a result, the array is effectively separated from the rest of the members without modifying the memory layout of the flexible structure. 
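A simplified illustration of the `__struct_group()` pattern (not the actual radiotap definition):

struct example_hdr {
	__struct_group(example_hdr_fixed, hdr, __packed,
		u8	version;
		u8	pad;
		__le16	len;
		__le32	present;
	);
	__le32 optional[];	/* flexible array stays outside the group */
} __packed;

/* both views of the fixed members share one memory layout */
static_assert(offsetof(struct example_hdr, optional) ==
	      sizeof(struct example_hdr_fixed));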
We then change the type of the middle struct members currently causing trouble from `struct ieee80211_radiotap_header` to `struct ieee80211_radiotap_header_fixed`.

We also want to ensure that in case new members need to be added to the flexible structure, they are always included within the newly created tagged struct. For this, we use `static_assert()`. This ensures that the memory layout for both the flexible structure and the new tagged struct is the same after any changes.

This approach avoids having to implement `struct ieee80211_radiotap_header_fixed` as a completely separate structure, thus preventing having to maintain two independent but basically identical structures, closing the door to potential bugs in the future.

So, with these changes, fix the following warnings:

drivers/net/wireless/ath/wil6210/txrx.c:309:50: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
drivers/net/wireless/intel/ipw2x00/ipw2100.c:2521:50: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
drivers/net/wireless/intel/ipw2x00/ipw2200.h:1146:42: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
drivers/net/wireless/intel/ipw2x00/libipw.h:595:36: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
drivers/net/wireless/marvell/libertas/radiotap.h:34:42: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
drivers/net/wireless/marvell/libertas/radiotap.h:5:42: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
drivers/net/wireless/microchip/wilc1000/mon.c:10:42: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
drivers/net/wireless/microchip/wilc1000/mon.c:15:42: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
drivers/net/wireless/virtual/mac80211_hwsim.c:758:42: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
drivers/net/wireless/virtual/mac80211_hwsim.c:767:42: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]

Signed-off-by: Gustavo A. R.
Silva Link: https://patch.msgid.link/ZwBMtBZKcrzwU7l4@kspp Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/wil6210/txrx.c | 2 +- drivers/net/wireless/intel/ipw2x00/ipw2100.c | 2 +- drivers/net/wireless/intel/ipw2x00/ipw2200.h | 2 +- drivers/net/wireless/marvell/libertas/radiotap.h | 4 +-- drivers/net/wireless/microchip/wilc1000/mon.c | 4 +-- drivers/net/wireless/virtual/mac80211_hwsim.c | 4 +-- include/net/ieee80211_radiotap.h | 43 +++++++++++++----------- 7 files changed, 33 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/wil6210/txrx.c b/drivers/net/wireless/ath/wil6210/txrx.c index f29ac6de7139..19702b6f09c3 100644 --- a/drivers/net/wireless/ath/wil6210/txrx.c +++ b/drivers/net/wireless/ath/wil6210/txrx.c @@ -306,7 +306,7 @@ static void wil_rx_add_radiotap_header(struct wil6210_priv *wil, struct sk_buff *skb) { struct wil6210_rtap { - struct ieee80211_radiotap_header rthdr; + struct ieee80211_radiotap_header_fixed rthdr; /* fields should be in the order of bits in rthdr.it_present */ /* flags */ u8 flags; diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c index b6636002c7d2..fe75941c584d 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c @@ -2518,7 +2518,7 @@ static void isr_rx_monitor(struct ipw2100_priv *priv, int i, * to build this manually element by element, we can write it much * more efficiently than we can parse it. ORDER MATTERS HERE */ struct ipw_rt_hdr { - struct ieee80211_radiotap_header rt_hdr; + struct ieee80211_radiotap_header_fixed rt_hdr; s8 rt_dbmsignal; /* signal in dbM, kluged to signed */ } *ipw_rt; diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.h b/drivers/net/wireless/intel/ipw2x00/ipw2200.h index 8ebf09121e17..226286cb7eb8 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.h +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.h @@ -1143,7 +1143,7 @@ struct ipw_prom_priv { * structure is provided regardless of any bits unset. 
*/ struct ipw_rt_hdr { - struct ieee80211_radiotap_header rt_hdr; + struct ieee80211_radiotap_header_fixed rt_hdr; u64 rt_tsf; /* TSF */ /* XXX */ u8 rt_flags; /* radiotap packet flags */ u8 rt_rate; /* rate in 500kb/s */ diff --git a/drivers/net/wireless/marvell/libertas/radiotap.h b/drivers/net/wireless/marvell/libertas/radiotap.h index 1ed5608d353f..d543bfe739dc 100644 --- a/drivers/net/wireless/marvell/libertas/radiotap.h +++ b/drivers/net/wireless/marvell/libertas/radiotap.h @@ -2,7 +2,7 @@ #include struct tx_radiotap_hdr { - struct ieee80211_radiotap_header hdr; + struct ieee80211_radiotap_header_fixed hdr; u8 rate; u8 txpower; u8 rts_retries; @@ -31,7 +31,7 @@ struct tx_radiotap_hdr { #define IEEE80211_FC_DSTODS 0x0300 struct rx_radiotap_hdr { - struct ieee80211_radiotap_header hdr; + struct ieee80211_radiotap_header_fixed hdr; u8 flags; u8 rate; u8 antsignal; diff --git a/drivers/net/wireless/microchip/wilc1000/mon.c b/drivers/net/wireless/microchip/wilc1000/mon.c index 03b7229a0ff5..c3d27aaec297 100644 --- a/drivers/net/wireless/microchip/wilc1000/mon.c +++ b/drivers/net/wireless/microchip/wilc1000/mon.c @@ -7,12 +7,12 @@ #include "cfg80211.h" struct wilc_wfi_radiotap_hdr { - struct ieee80211_radiotap_header hdr; + struct ieee80211_radiotap_header_fixed hdr; u8 rate; } __packed; struct wilc_wfi_radiotap_cb_hdr { - struct ieee80211_radiotap_header hdr; + struct ieee80211_radiotap_header_fixed hdr; u8 rate; u8 dump; u16 tx_flags; diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index f0e528abb1b4..3f424f14de4e 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -763,7 +763,7 @@ static const struct rhashtable_params hwsim_rht_params = { }; struct hwsim_radiotap_hdr { - struct ieee80211_radiotap_header hdr; + struct ieee80211_radiotap_header_fixed hdr; __le64 rt_tsft; u8 rt_flags; u8 rt_rate; @@ -772,7 +772,7 @@ struct hwsim_radiotap_hdr { } __packed; struct hwsim_radiotap_ack_hdr { - struct ieee80211_radiotap_header hdr; + struct ieee80211_radiotap_header_fixed hdr; u8 rt_flags; u8 pad; __le16 rt_channel; diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index 91762faecc13..1458d3695005 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -24,25 +24,27 @@ * struct ieee80211_radiotap_header - base radiotap header */ struct ieee80211_radiotap_header { - /** - * @it_version: radiotap version, always 0 - */ - uint8_t it_version; - - /** - * @it_pad: padding (or alignment) - */ - uint8_t it_pad; - - /** - * @it_len: overall radiotap header length - */ - __le16 it_len; - - /** - * @it_present: (first) present word - */ - __le32 it_present; + __struct_group(ieee80211_radiotap_header_fixed, hdr, __packed, + /** + * @it_version: radiotap version, always 0 + */ + uint8_t it_version; + + /** + * @it_pad: padding (or alignment) + */ + uint8_t it_pad; + + /** + * @it_len: overall radiotap header length + */ + __le16 it_len; + + /** + * @it_present: (first) present word + */ + __le32 it_present; + ); /** * @it_optional: all remaining presence bitmaps @@ -50,6 +52,9 @@ struct ieee80211_radiotap_header { __le32 it_optional[]; } __packed; +static_assert(offsetof(struct ieee80211_radiotap_header, it_optional) == sizeof(struct ieee80211_radiotap_header_fixed), + "struct member likely outside of __struct_group()"); + /* version is always 0 */ #define PKTHDR_RADIOTAP_VERSION 0 -- cgit v1.2.3 From 
04b670de2859a8a8b0830779f9c9bda5d39662ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Oct 2024 16:54:11 -0400 Subject: closures: Add closure_wait_event_timeout() Add a closure version of wait_event_timeout(), with the same semantics. The closure version is useful because unlike wait_event(), it allows blocking code to run in the conditional expression. Cc: Coly Li Signed-off-by: Kent Overstreet --- include/linux/closure.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/closure.h b/include/linux/closure.h index 2af44427107d..880fe85e35e9 100644 --- a/include/linux/closure.h +++ b/include/linux/closure.h @@ -454,4 +454,39 @@ do { \ __closure_wait_event(waitlist, _cond); \ } while (0) +#define __closure_wait_event_timeout(waitlist, _cond, _until) \ +({ \ + struct closure cl; \ + long _t; \ + \ + closure_init_stack(&cl); \ + \ + while (1) { \ + closure_wait(waitlist, &cl); \ + if (_cond) { \ + _t = max_t(long, 1L, _until - jiffies); \ + break; \ + } \ + _t = max_t(long, 0L, _until - jiffies); \ + if (!_t) \ + break; \ + closure_sync_timeout(&cl, _t); \ + } \ + closure_wake_up(waitlist); \ + closure_sync(&cl); \ + _t; \ +}) + +/* + * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if + * condition became true + */ +#define closure_wait_event_timeout(waitlist, _cond, _timeout) \ +({ \ + unsigned long _until = jiffies + _timeout; \ + (_cond) \ + ? max_t(long, 1L, _until - jiffies) \ + : __closure_wait_event_timeout(waitlist, _cond, _until);\ +}) + #endif /* _LINUX_CLOSURE_H */ -- cgit v1.2.3 From 9eb2142a2ae8c8fdfce2aaa4c110f5a6f6b0b56e Mon Sep 17 00:00:00 2001 From: Amadeusz Sławiński Date: Wed, 9 Oct 2024 10:12:30 +0200 Subject: ASoC: topology: Bump minimal topology ABI version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When v4 topology support was removed, minimal topology ABI version should have been bumped. Fixes: fe4a07454256 ("ASoC: Drop soc-topology ABI v4 support") Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://patch.msgid.link/20241009081230.304918-1-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- include/uapi/sound/asoc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h index 99333cbd3114..c117672d4439 100644 --- a/include/uapi/sound/asoc.h +++ b/include/uapi/sound/asoc.h @@ -88,7 +88,7 @@ /* ABI version */ #define SND_SOC_TPLG_ABI_VERSION 0x5 /* current version */ -#define SND_SOC_TPLG_ABI_VERSION_MIN 0x4 /* oldest version supported */ +#define SND_SOC_TPLG_ABI_VERSION_MIN 0x5 /* oldest version supported */ /* Max size of TLV data */ #define SND_SOC_TPLG_TLV_SIZE 32 -- cgit v1.2.3 From c657243ae12000dc57e3648b0ddd30da9ffd1f14 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2024 16:51:07 +0200 Subject: serial: qcom-geni: fix rx cancel dma status bit Cancelling an rx command is signalled using bit 14 of the rx DMA status register and not bit 11. This bit is currently unused, but this error becomes apparent, for example, when tracing the status register when closing the port. 
Fixes: eddac5af0654 ("soc: qcom: Add GENI based QUP Wrapper driver") Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20241009145110.16847-7-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/soc/qcom/geni-se.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/soc/qcom/geni-se.h b/include/linux/soc/qcom/geni-se.h index c3bca9c0bf2c..2996a3c28ef3 100644 --- a/include/linux/soc/qcom/geni-se.h +++ b/include/linux/soc/qcom/geni-se.h @@ -258,8 +258,8 @@ struct geni_se { #define RX_DMA_PARITY_ERR BIT(5) #define RX_DMA_BREAK GENMASK(8, 7) #define RX_GENI_GP_IRQ GENMASK(10, 5) -#define RX_GENI_CANCEL_IRQ BIT(11) #define RX_GENI_GP_IRQ_EXT GENMASK(13, 12) +#define RX_GENI_CANCEL_IRQ BIT(14) /* SE_HW_PARAM_0 fields */ #define TX_FIFO_WIDTH_MSK GENMASK(29, 24) -- cgit v1.2.3 From 73ab05aa46b02d96509cb029a8d04fca7bbde8c7 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 9 Oct 2024 21:44:32 -0400 Subject: sched/core: Disable page allocation in task_tick_mm_cid() With KASAN and PREEMPT_RT enabled, calling task_work_add() in task_tick_mm_cid() may cause the following splat. [ 63.696416] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 [ 63.696416] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 610, name: modprobe [ 63.696416] preempt_count: 10001, expected: 0 [ 63.696416] RCU nest depth: 1, expected: 1 This problem is caused by the following call trace. sched_tick() [ acquire rq->__lock ] -> task_tick_mm_cid() -> task_work_add() -> __kasan_record_aux_stack() -> kasan_save_stack() -> stack_depot_save_flags() -> alloc_pages_mpol_noprof() -> __alloc_pages_noprof() -> get_page_from_freelist() -> rmqueue() -> rmqueue_pcplist() -> __rmqueue_pcplist() -> rmqueue_bulk() -> rt_spin_lock() The rq lock is a raw_spinlock_t. We can't sleep while holding it. IOW, we can't call alloc_pages() in stack_depot_save_flags(). The task_tick_mm_cid() function with its task_work_add() call was introduced by commit 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") in v6.4 kernel. Fortunately, there is a kasan_record_aux_stack_noalloc() variant that calls stack_depot_save_flags() while not allowing it to allocate new pages. To allow task_tick_mm_cid() to use task_work without page allocation, a new TWAF_NO_ALLOC flag is added to enable calling kasan_record_aux_stack_noalloc() instead of kasan_record_aux_stack() if set. The task_tick_mm_cid() function is modified to add this new flag. The possible downside is the missing stack trace in a KASAN report due to new page allocation required when task_work_add_noallloc() is called which should be rare. 
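How the new flag travels alongside the notify mode can be sketched as follows (the surrounding caller is hypothetical):

	enum task_work_notify_mode notify = TWA_RESUME | TWAF_NO_ALLOC;
	int flags = notify & TWA_FLAGS;		/* -> TWAF_NO_ALLOC */

	notify &= ~TWA_FLAGS;			/* -> plain TWA_RESUME */
	/* flags & TWAF_NO_ALLOC selects kasan_record_aux_stack_noalloc() */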
Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241010014432.194742-1-longman@redhat.com --- include/linux/task_work.h | 5 ++++- kernel/sched/core.c | 4 +++- kernel/task_work.c | 15 +++++++++++++-- 3 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/task_work.h b/include/linux/task_work.h index cf5e7e891a77..2964171856e0 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -14,11 +14,14 @@ init_task_work(struct callback_head *twork, task_work_func_t func) } enum task_work_notify_mode { - TWA_NONE, + TWA_NONE = 0, TWA_RESUME, TWA_SIGNAL, TWA_SIGNAL_NO_IPI, TWA_NMI_CURRENT, + + TWA_FLAGS = 0xff00, + TWAF_NO_ALLOC = 0x0100, }; static inline bool task_work_pending(struct task_struct *task) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 43e453ab7e20..0259301e572e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10458,7 +10458,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) return; - task_work_add(curr, work, TWA_RESUME); + + /* No page allocation under rq lock */ + task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); } void sched_mm_cid_exit_signals(struct task_struct *t) diff --git a/kernel/task_work.c b/kernel/task_work.c index 5d14d639ac71..c969f1f26be5 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -55,15 +55,26 @@ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; + int flags = notify & TWA_FLAGS; + notify &= ~TWA_FLAGS; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { - /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + /* + * Record the work call stack in order to print it in KASAN + * reports. + * + * Note that stack allocation can fail if TWAF_NO_ALLOC flag + * is set and new page is needed to expand the stack buffer. + */ + if (flags & TWAF_NO_ALLOC) + kasan_record_aux_stack_noalloc(work); + else + kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); -- cgit v1.2.3 From 1d7b2ce43d2c22a21dadaf689cb36a69570346a6 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 11 Oct 2024 11:01:03 +0800 Subject: net: enetc: add missing static descriptor and inline keyword Fix the build warnings when CONFIG_FSL_ENETC_MDIO is not enabled. The detailed warnings are shown as follows. include/linux/fsl/enetc_mdio.h:62:18: warning: no previous prototype for function 'enetc_hw_alloc' [-Wmissing-prototypes] 62 | struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) | ^ include/linux/fsl/enetc_mdio.h:62:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 62 | struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) | ^ | static 8 warnings generated. 
Fixes: 6517798dd343 ("enetc: Make MDIO accessors more generic and export to include/linux/fsl") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410102136.jQHZOcS4-lkp@intel.com/ Signed-off-by: Wei Fang Reviewed-by: Claudiu Manoil Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241011030103.392362-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/fsl/enetc_mdio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fsl/enetc_mdio.h b/include/linux/fsl/enetc_mdio.h index df25fffdc0ae..623ccfcbf39c 100644 --- a/include/linux/fsl/enetc_mdio.h +++ b/include/linux/fsl/enetc_mdio.h @@ -59,7 +59,8 @@ static inline int enetc_mdio_read_c45(struct mii_bus *bus, int phy_id, static inline int enetc_mdio_write_c45(struct mii_bus *bus, int phy_id, int devad, int regnum, u16 value) { return -EINVAL; } -struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) +static inline struct enetc_hw *enetc_hw_alloc(struct device *dev, + void __iomem *port_regs) { return ERR_PTR(-EINVAL); } #endif -- cgit v1.2.3 From cd9626e9ebc77edec33023fe95dab4b04ffc819d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Oct 2024 11:38:10 +0200 Subject: sched/fair: Fix external p->on_rq users Sean noted that ever since commit 152e11f6df29 ("sched/fair: Implement delayed dequeue") KVM's preemption notifiers have started mis-classifying preemption vs blocking. Notably p->on_rq is no longer sufficient to determine if a task is runnable or blocked -- the aforementioned commit introduces tasks that remain on the runqueue even through they will not run again, and should be considered blocked for many cases. Add the task_is_runnable() helper to classify things and audit all external users of the p->on_rq state. Also add a few comments. 
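The states the helper distinguishes for external users can be summarized as follows (the caller shown is hypothetical):

	/*
	 *   p->on_rq   p->se.sched_delayed   task_is_runnable(p)
	 *      0               -                   false  (blocked)
	 *      1               1                   false  (delayed dequeue)
	 *      1               0                   true   (runnable)
	 */
	if (task_is_runnable(p))	/* was: if (p->on_rq) */
		mark_task_preempted(p);	/* hypothetical external user */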
Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") Reported-by: Sean Christopherson Tested-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20241010091843.GK33184@noisy.programming.kicks-ass.net --- include/linux/sched.h | 5 +++++ kernel/events/core.c | 2 +- kernel/freezer.c | 7 ++++++- kernel/rcu/tasks.h | 9 +++++++++ kernel/sched/core.c | 12 +++++++++--- kernel/time/tick-sched.c | 6 ++++++ kernel/trace/trace_selftest.c | 2 +- virt/kvm/kvm_main.c | 2 +- 8 files changed, 38 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index e6ee4258169a..8a9517e6640c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2133,6 +2133,11 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +static inline bool task_is_runnable(struct task_struct *p) +{ + return p->on_rq && !p->se.sched_delayed; +} + extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); diff --git a/kernel/events/core.c b/kernel/events/core.c index e3589c4287cb..cdd09769e6c5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9251,7 +9251,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - if (!sched_in && task->on_rq) { + if (!sched_in && task_is_runnable(task)) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; } diff --git a/kernel/freezer.c b/kernel/freezer.c index 44bbd7dbd2c8..8d530d0949ff 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -109,7 +109,12 @@ static int __set_task_frozen(struct task_struct *p, void *arg) { unsigned int state = READ_ONCE(p->__state); - if (p->on_rq) + /* + * Allow freezing the sched_delayed tasks; they will not execute until + * ttwu() fixes them up, so it is safe to swap their state now, instead + * of waiting for them to get fully dequeued. + */ + if (task_is_runnable(p)) return 0; if (p != current && task_curr(p)) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6333f4ccf024..4d7ee95df06e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -985,6 +985,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t) if (!READ_ONCE(t->on_rq)) return false; + /* + * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but + * since it is a spurious state (it will transition into the + * traditional blocked state or get woken up without outside + * dependencies), not considering it such should only affect timing. + * + * Be conservative for now and not include it. + */ + /* * Idle tasks (or idle injection) within the idle loop are RCU-tasks * quiescent states. But CPU boot code performed by the idle task diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 71232f8f9b96..7db711ba6d12 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). * + * Additionally it is possible to be ->on_rq but still be considered not + * runnable when p->se.sched_delayed is true. These tasks are on the runqueue + * but will be dequeued as soon as they get picked again. See the + * task_is_runnable() helper. 
+ * * p->on_cpu <- { 0, 1 }: * * is set by prepare_task() and cleared by finish_task() such that it will be @@ -4317,9 +4322,10 @@ static bool __task_needs_rq_lock(struct task_struct *p) * @arg: Argument to function. * * Fix the task in it's current state by avoiding wakeups and or rq operations - * and call @func(@arg) on it. This function can use ->on_rq and task_curr() - * to work out what the state is, if required. Given that @func can be invoked - * with a runqueue lock held, it had better be quite lightweight. + * and call @func(@arg) on it. This function can use task_is_runnable() and + * task_curr() to work out what the state is, if required. Given that @func + * can be invoked with a runqueue lock held, it had better be quite + * lightweight. * * Returns: * Whatever @func returns diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 753a184c7090..f203f000da1a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -434,6 +434,12 @@ static void tick_nohz_kick_task(struct task_struct *tsk) * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask + * + * XXX given a task picks up the dependency on schedule(), should we + * only care about tasks that are currently on the CPU instead of all + * that are on the runqueue? + * + * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c4ad7cd7e778..1469dd8075fa 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1485,7 +1485,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* reset the max latency */ tr->max_latency = 0; - while (p->on_rq) { + while (task_is_runnable(p)) { /* * Sleep to make sure the -deadline thread is asleep too. * On virtual machines we can't rely on timings, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 05cbb2548d99..0c666f1870af 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6387,7 +6387,7 @@ static void kvm_sched_out(struct preempt_notifier *pn, WRITE_ONCE(vcpu->scheduled_out, true); - if (current->on_rq && vcpu->wants_to_run) { + if (task_is_runnable(current) && vcpu->wants_to_run) { WRITE_ONCE(vcpu->preempted, true); WRITE_ONCE(vcpu->ready, true); } -- cgit v1.2.3 From 71dce222d5865ccb19b231a84d26ca316a65e255 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 1 Oct 2024 15:06:11 +0800 Subject: ALSA/hda: intel-sdw-acpi: add support for sdw-manager-list property read The DisCo for SoundWire 2.0 spec adds support for a new sdw-manager-list property. Add it in backwards-compatible mode with 'sdw-master-count', which assumed that all links between 0..count-1 exist. 
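The backwards-compatible derivation can be sketched as follows (simplified from the diff below):

	if (!fwnode_property_read_u32(fwnode, "mipi-sdw-manager-list", &tmp)) {
		list = tmp;			/* e.g. 0xb -> links 0, 1, 3 */
		count = hweight32(list);
	} else if (!fwnode_property_read_u32(fwnode, "mipi-sdw-master-count",
					     &count)) {
		list = GENMASK(count - 1, 0);	/* e.g. 4 -> links 0..3 */
	}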
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241001070611.63288-5-yung-chuan.liao@linux.intel.com --- include/linux/soundwire/sdw_intel.h | 2 +- sound/hda/intel-sdw-acpi.c | 25 +++++++++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 37ae69365fe2..734dc1fa3b5b 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -227,7 +227,7 @@ struct sdw_intel_ops { /** * struct sdw_intel_acpi_info - Soundwire Intel information found in ACPI tables * @handle: ACPI controller handle - * @count: link count found with "sdw-master-count" property + * @count: link count found with "sdw-master-count" or "sdw-manager-list" property * @link_mask: bit-wise mask listing links enabled by BIOS menu * * this structure could be expanded to e.g. provide all the _ADR diff --git a/sound/hda/intel-sdw-acpi.c b/sound/hda/intel-sdw-acpi.c index 582e761e7b9f..ed530e0dd4dd 100644 --- a/sound/hda/intel-sdw-acpi.c +++ b/sound/hda/intel-sdw-acpi.c @@ -57,8 +57,10 @@ sdw_intel_scan_controller(struct sdw_intel_acpi_info *info) { struct acpi_device *adev = acpi_fetch_acpi_dev(info->handle); struct fwnode_handle *fwnode; + unsigned long list; unsigned int i; u32 count; + u32 tmp; int ret; if (!adev) @@ -66,10 +68,9 @@ sdw_intel_scan_controller(struct sdw_intel_acpi_info *info) fwnode = acpi_fwnode_handle(adev); - /* Found controller, find links supported */ - ret = fwnode_property_read_u32(fwnode, "mipi-sdw-master-count", &count); - /* + * Found controller, find links supported + * * In theory we could check the number of links supported in * hardware, but in that step we cannot assume SoundWire IP is * powered. @@ -80,11 +81,19 @@ sdw_intel_scan_controller(struct sdw_intel_acpi_info *info) * * We will check the hardware capabilities in the startup() step */ - + ret = fwnode_property_read_u32(fwnode, "mipi-sdw-manager-list", &tmp); if (ret) { - dev_err(&adev->dev, - "Failed to read mipi-sdw-master-count: %d\n", ret); - return ret; + ret = fwnode_property_read_u32(fwnode, "mipi-sdw-master-count", &count); + if (ret) { + dev_err(&adev->dev, + "Failed to read mipi-sdw-master-count: %d\n", + ret); + return ret; + } + list = GENMASK(count - 1, 0); + } else { + list = tmp; + count = hweight32(list); } /* Check count is within bounds */ @@ -103,7 +112,7 @@ sdw_intel_scan_controller(struct sdw_intel_acpi_info *info) info->count = count; info->link_mask = 0; - for (i = 0; i < count; i++) { + for_each_set_bit(i, &list, SDW_INTEL_MAX_LINKS) { if (ctrl_link_mask && !(ctrl_link_mask & BIT(i))) { dev_dbg(&adev->dev, "Link %d masked, will not be enabled\n", i); -- cgit v1.2.3 From bb94f56b9cfa4a946a43492b2a13ecb5b9b571bc Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 15 Oct 2024 08:53:29 +0200 Subject: fbdev: da8xx: remove the driver This driver is no longer used on any platform. It has been replaced by tilcdc on the two DaVinci boards we still support and can be removed. 
Signed-off-by: Bartosz Golaszewski Acked-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/Kconfig | 13 - drivers/video/fbdev/Makefile | 1 - drivers/video/fbdev/da8xx-fb.c | 1665 ---------------------------------------- include/video/da8xx-fb.h | 94 --- 4 files changed, 1773 deletions(-) delete mode 100644 drivers/video/fbdev/da8xx-fb.c delete mode 100644 include/video/da8xx-fb.h (limited to 'include') diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 44a8fdbab6df..77b50579bd09 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -1659,19 +1659,6 @@ config FB_SH7760 and 8, 15 or 16 bpp color; 90 degrees clockwise display rotation for panels <= 320 pixel horizontal resolution. -config FB_DA8XX - tristate "DA8xx/OMAP-L1xx/AM335x Framebuffer support" - depends on FB && HAVE_CLK && HAS_IOMEM - depends on ARCH_DAVINCI_DA8XX || SOC_AM33XX || COMPILE_TEST - select FB_CFB_REV_PIXELS_IN_BYTE - select FB_IOMEM_HELPERS - select FB_MODE_HELPERS - select VIDEOMODE_HELPERS - help - This is the frame buffer device driver for the TI LCD controller - found on DA8xx/OMAP-L1xx/AM335x SoCs. - If unsure, say N. - config FB_VIRTUAL tristate "Virtual Frame Buffer support (ONLY FOR TESTING!)" depends on FB diff --git a/drivers/video/fbdev/Makefile b/drivers/video/fbdev/Makefile index 3eecd51267fa..b3d12f977c06 100644 --- a/drivers/video/fbdev/Makefile +++ b/drivers/video/fbdev/Makefile @@ -121,7 +121,6 @@ obj-$(CONFIG_FB_VESA) += vesafb.o obj-$(CONFIG_FB_EFI) += efifb.o obj-$(CONFIG_FB_VGA16) += vga16fb.o obj-$(CONFIG_FB_OF) += offb.o -obj-$(CONFIG_FB_DA8XX) += da8xx-fb.o obj-$(CONFIG_FB_SSD1307) += ssd1307fb.o obj-$(CONFIG_FB_SIMPLE) += simplefb.o diff --git a/drivers/video/fbdev/da8xx-fb.c b/drivers/video/fbdev/da8xx-fb.c deleted file mode 100644 index fad1e13c6332..000000000000 --- a/drivers/video/fbdev/da8xx-fb.c +++ /dev/null @@ -1,1665 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2008-2009 MontaVista Software Inc. - * Copyright (C) 2008-2009 Texas Instruments Inc - * - * Based on the LCD driver for TI Avalanche processors written by - * Ajay Singh and Shalom Hai. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include