From 7445cf31d2e25e3f8ad7b1c5342e624c09ab23a2 Mon Sep 17 00:00:00 2001
From: Zvi Effron <zeffron@riotgames.com>
Date: Wed, 7 Jul 2021 22:16:54 +0000
Subject: bpf: Add function for XDP meta data length check

This commit prepares to use the XDP meta data length check in multiple
places by making it into a static inline function instead of a literal.

Co-developed-by: Cody Haas <chaas@riotgames.com>
Co-developed-by: Lisa Watanabe <lwatanabe@riotgames.com>
Signed-off-by: Cody Haas <chaas@riotgames.com>
Signed-off-by: Lisa Watanabe <lwatanabe@riotgames.com>
Signed-off-by: Zvi Effron <zeffron@riotgames.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210707221657.3985075-2-zeffron@riotgames.com
---
 include/net/xdp.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 5533f0ab2afc..ad5b02dcb6f4 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -276,6 +276,11 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp)
 	return unlikely(xdp->data_meta > xdp->data);
 }
 
+static inline bool xdp_metalen_invalid(unsigned long metalen)
+{
+	return (metalen & (sizeof(__u32) - 1)) || (metalen > 32);
+}
+
 struct xdp_attachment_info {
 	struct bpf_prog *prog;
 	u32 flags;
-- 
cgit v1.2.3


From 47316f4a305367794fc04f23e5c778678d8f1d8e Mon Sep 17 00:00:00 2001
From: Zvi Effron <zeffron@riotgames.com>
Date: Wed, 7 Jul 2021 22:16:55 +0000
Subject: bpf: Support input xdp_md context in BPF_PROG_TEST_RUN

Support passing a xdp_md via ctx_in/ctx_out in bpf_attr for
BPF_PROG_TEST_RUN.

The intended use case is to pass some XDP meta data to the test runs of
XDP programs that are used as tail calls.

For programs that use bpf_prog_test_run_xdp, support xdp_md input and
output. Unlike with an actual xdp_md during a non-test run, data_meta must
be 0 because it must point to the start of the provided user data. From
the initial xdp_md, use data and data_end to adjust the pointers in the
generated xdp_buff. All other non-zero fields are prohibited (with
EINVAL). If the user has set ctx_out/ctx_size_out, copy the (potentially
different) xdp_md back to the userspace.

We require all fields of input xdp_md except the ones we explicitly
support to be set to zero. The expectation is that in the future we might
add support for more fields and we want to fail explicitly if the user
runs the program on the kernel where we don't yet support them.

Co-developed-by: Cody Haas <chaas@riotgames.com>
Co-developed-by: Lisa Watanabe <lwatanabe@riotgames.com>
Signed-off-by: Cody Haas <chaas@riotgames.com>
Signed-off-by: Lisa Watanabe <lwatanabe@riotgames.com>
Signed-off-by: Zvi Effron <zeffron@riotgames.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210707221657.3985075-3-zeffron@riotgames.com
---
 include/uapi/linux/bpf.h |  3 ---
 net/bpf/test_run.c       | 67 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 59 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bf9252c7381e..b46a383e8db7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -324,9 +324,6 @@ union bpf_iter_link_info {
  *		**BPF_PROG_TYPE_SK_LOOKUP**
  *			*data_in* and *data_out* must be NULL.
  *
- *		**BPF_PROG_TYPE_XDP**
- *			*ctx_in* and *ctx_out* must be NULL.
- *
  *		**BPF_PROG_TYPE_RAW_TRACEPOINT**,
  *		**BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE**
  *
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index aa47af349ba8..229c5deb813c 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -15,6 +15,7 @@
 #include <linux/error-injection.h>
 #include <linux/smp.h>
 #include <linux/sock_diag.h>
+#include <net/xdp.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/bpf_test_run.h>
@@ -687,6 +688,22 @@ out:
 	return ret;
 }
 
+static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp)
+{
+	if (!xdp_md)
+		return 0;
+
+	if (xdp_md->egress_ifindex != 0)
+		return -EINVAL;
+
+	if (xdp_md->ingress_ifindex != 0 || xdp_md->rx_queue_index != 0)
+		return -EINVAL;
+
+	xdp->data = xdp->data_meta + xdp_md->data;
+
+	return 0;
+}
+
 int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 			  union bpf_attr __user *uattr)
 {
@@ -697,35 +714,69 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 	struct netdev_rx_queue *rxqueue;
 	struct xdp_buff xdp = {};
 	u32 retval, duration;
+	struct xdp_md *ctx;
 	u32 max_data_sz;
 	void *data;
-	int ret;
+	int ret = -EINVAL;
 
-	if (kattr->test.ctx_in || kattr->test.ctx_out)
-		return -EINVAL;
+	ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md));
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	if (ctx) {
+		/* There can't be user provided data before the meta data */
+		if (ctx->data_meta || ctx->data_end != size ||
+		    ctx->data > ctx->data_end ||
+		    unlikely(xdp_metalen_invalid(ctx->data)))
+			goto free_ctx;
+		/* Meta data is allocated from the headroom */
+		headroom -= ctx->data;
+	}
 
 	/* XDP have extra tailroom as (most) drivers use full page */
 	max_data_sz = 4096 - headroom - tailroom;
 
 	data = bpf_test_init(kattr, max_data_sz, headroom, tailroom);
-	if (IS_ERR(data))
-		return PTR_ERR(data);
+	if (IS_ERR(data)) {
+		ret = PTR_ERR(data);
+		goto free_ctx;
+	}
 
 	rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0);
 	xdp_init_buff(&xdp, headroom + max_data_sz + tailroom,
 		      &rxqueue->xdp_rxq);
 	xdp_prepare_buff(&xdp, data, headroom, size, true);
 
+	ret = xdp_convert_md_to_buff(ctx, &xdp);
+	if (ret)
+		goto free_data;
+
 	bpf_prog_change_xdp(NULL, prog);
 	ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true);
 	if (ret)
 		goto out;
-	if (xdp.data != data + headroom || xdp.data_end != xdp.data + size)
-		size = xdp.data_end - xdp.data;
-	ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
+
+	if (xdp.data_meta != data + headroom ||
+	    xdp.data_end != xdp.data_meta + size)
+		size = xdp.data_end - xdp.data_meta;
+
+	if (ctx) {
+		ctx->data = xdp.data - xdp.data_meta;
+		ctx->data_end = xdp.data_end - xdp.data_meta;
+	}
+
+	ret = bpf_test_finish(kattr, uattr, xdp.data_meta, size, retval,
+			      duration);
+	if (!ret)
+		ret = bpf_ctx_finish(kattr, uattr, ctx,
+				     sizeof(struct xdp_md));
+
 out:
 	bpf_prog_change_xdp(prog, NULL);
+free_data:
 	kfree(data);
+free_ctx:
+	kfree(ctx);
 	return ret;
 }
 
-- 
cgit v1.2.3


From fe21cb91ae7bca1ae7805454be80b6d03bec85f7 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 2 Jul 2021 16:48:21 +0530
Subject: net: core: Split out code to run generic XDP prog
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This helper can later be utilized in code that runs cpumap and devmap
programs in generic redirect mode and adjust skb based on changes made
to xdp_buff.

When returning XDP_REDIRECT/XDP_TX, it invokes __skb_push, so whenever a
generic redirect path invokes devmap/cpumap prog if set, it must
__skb_pull again as we expect mac header to be pulled.

It also drops the skb_reset_mac_len call after do_xdp_generic, as the
mac_header and network_header are advanced by the same offset, so the
difference (mac_len) remains constant.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210702111825.491065-2-memxor@gmail.com
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 84 ++++++++++++++++++++++++++++++-----------------
 2 files changed, 55 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eaf5bb008aa9..42f6f866d5f3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3984,6 +3984,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
 	__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
 }
 
+u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
+			     struct bpf_prog *xdp_prog);
 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index c253c2aafe97..93e80c36cc97 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4744,45 +4744,18 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 	return rxqueue;
 }
 
-static u32 netif_receive_generic_xdp(struct sk_buff *skb,
-				     struct xdp_buff *xdp,
-				     struct bpf_prog *xdp_prog)
+u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
+			     struct bpf_prog *xdp_prog)
 {
 	void *orig_data, *orig_data_end, *hard_start;
 	struct netdev_rx_queue *rxqueue;
-	u32 metalen, act = XDP_DROP;
 	bool orig_bcast, orig_host;
 	u32 mac_len, frame_sz;
 	__be16 orig_eth_type;
 	struct ethhdr *eth;
+	u32 metalen, act;
 	int off;
 
-	/* Reinjected packets coming from act_mirred or similar should
-	 * not get XDP generic processing.
-	 */
-	if (skb_is_redirected(skb))
-		return XDP_PASS;
-
-	/* XDP packets must be linear and must have sufficient headroom
-	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
-	 * native XDP provides, thus we need to do it here as well.
-	 */
-	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
-	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
-		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
-		int troom = skb->tail + skb->data_len - skb->end;
-
-		/* In case we have to go down the path and also linearize,
-		 * then lets do the pskb_expand_head() work just once here.
-		 */
-		if (pskb_expand_head(skb,
-				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
-				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
-			goto do_drop;
-		if (skb_linearize(skb))
-			goto do_drop;
-	}
-
 	/* The XDP program wants to see the packet starting at the MAC
 	 * header.
 	 */
@@ -4837,6 +4810,13 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 		skb->protocol = eth_type_trans(skb, skb->dev);
 	}
 
+	/* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
+	 * before calling us again on redirect path. We do not call do_redirect
+	 * as we leave that up to the caller.
+	 *
+	 * Caller is responsible for managing lifetime of skb (i.e. calling
+	 * kfree_skb in response to actions it cannot handle/XDP_DROP).
+	 */
 	switch (act) {
 	case XDP_REDIRECT:
 	case XDP_TX:
@@ -4847,6 +4827,49 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 		if (metalen)
 			skb_metadata_set(skb, metalen);
 		break;
+	}
+
+	return act;
+}
+
+static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+				     struct xdp_buff *xdp,
+				     struct bpf_prog *xdp_prog)
+{
+	u32 act = XDP_DROP;
+
+	/* Reinjected packets coming from act_mirred or similar should
+	 * not get XDP generic processing.
+	 */
+	if (skb_is_redirected(skb))
+		return XDP_PASS;
+
+	/* XDP packets must be linear and must have sufficient headroom
+	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
+	 * native XDP provides, thus we need to do it here as well.
+	 */
+	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
+	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+		int troom = skb->tail + skb->data_len - skb->end;
+
+		/* In case we have to go down the path and also linearize,
+		 * then lets do the pskb_expand_head() work just once here.
+		 */
+		if (pskb_expand_head(skb,
+				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
+			goto do_drop;
+		if (skb_linearize(skb))
+			goto do_drop;
+	}
+
+	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
+	switch (act) {
+	case XDP_REDIRECT:
+	case XDP_TX:
+	case XDP_PASS:
+		break;
 	default:
 		bpf_warn_invalid_xdp_action(act);
 		fallthrough;
@@ -5312,7 +5335,6 @@ another_round:
 			ret = NET_RX_DROP;
 			goto out;
 		}
-		skb_reset_mac_len(skb);
 	}
 
 	if (eth_type_vlan(skb->protocol)) {
-- 
cgit v1.2.3


From cb0f80039fb7ec9981a74d22019daaa85ff51a3d Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 2 Jul 2021 16:48:22 +0530
Subject: bitops: Add non-atomic bitops for pointers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cpumap needs to set, clear, and test the lowest bit in skb pointer in
various places. To make these checks less noisy, add pointer friendly
bitop macros that also do some typechecking to sanitize the argument.

These wrap the non-atomic bitops __set_bit, __clear_bit, and test_bit
but for pointer arguments. Pointer's address has to be passed in and it
is treated as an unsigned long *, since width and representation of
pointer and unsigned long match on targets Linux supports. They are
prefixed with double underscore to indicate lack of atomicity.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210702111825.491065-3-memxor@gmail.com
---
 include/linux/bitops.h    | 50 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/typecheck.h |  9 +++++++++
 2 files changed, 59 insertions(+)

(limited to 'include')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 26bf15e6cd35..5e62e2383b7f 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -4,6 +4,7 @@
 
 #include <asm/types.h>
 #include <linux/bits.h>
+#include <linux/typecheck.h>
 
 #include <uapi/linux/kernel.h>
 
@@ -253,6 +254,55 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
 		__clear_bit(nr, addr);
 }
 
+/**
+ * __ptr_set_bit - Set bit in a pointer's value
+ * @nr: the bit to set
+ * @addr: the address of the pointer variable
+ *
+ * Example:
+ *	void *p = foo();
+ *	__ptr_set_bit(bit, &p);
+ */
+#define __ptr_set_bit(nr, addr)                         \
+	({                                              \
+		typecheck_pointer(*(addr));             \
+		__set_bit(nr, (unsigned long *)(addr)); \
+	})
+
+/**
+ * __ptr_clear_bit - Clear bit in a pointer's value
+ * @nr: the bit to clear
+ * @addr: the address of the pointer variable
+ *
+ * Example:
+ *	void *p = foo();
+ *	__ptr_clear_bit(bit, &p);
+ */
+#define __ptr_clear_bit(nr, addr)                         \
+	({                                                \
+		typecheck_pointer(*(addr));               \
+		__clear_bit(nr, (unsigned long *)(addr)); \
+	})
+
+/**
+ * __ptr_test_bit - Test bit in a pointer's value
+ * @nr: the bit to test
+ * @addr: the address of the pointer variable
+ *
+ * Example:
+ *	void *p = foo();
+ *	if (__ptr_test_bit(bit, &p)) {
+ *	        ...
+ *	} else {
+ *		...
+ *	}
+ */
+#define __ptr_test_bit(nr, addr)                       \
+	({                                             \
+		typecheck_pointer(*(addr));            \
+		test_bit(nr, (unsigned long *)(addr)); \
+	})
+
 #ifdef __KERNEL__
 
 #ifndef set_mask_bits
diff --git a/include/linux/typecheck.h b/include/linux/typecheck.h
index 20d310331eb5..46b15e2aaefb 100644
--- a/include/linux/typecheck.h
+++ b/include/linux/typecheck.h
@@ -22,4 +22,13 @@
 	(void)__tmp; \
 })
 
+/*
+ * Check at compile time that something is a pointer type.
+ */
+#define typecheck_pointer(x) \
+({	typeof(x) __dummy; \
+	(void)sizeof(*__dummy); \
+	1; \
+})
+
 #endif		/* TYPECHECK_H_INCLUDED */
-- 
cgit v1.2.3


From 11941f8a85362f612df61f4aaab0e41b64d2111d Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 2 Jul 2021 16:48:23 +0530
Subject: bpf: cpumap: Implement generic cpumap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change implements CPUMAP redirect support for generic XDP programs.
The idea is to reuse the cpu map entry's queue that is used to push
native xdp frames for redirecting skb to a different CPU. This will
match native XDP behavior (in that RPS is invoked again for packet
reinjected into networking stack).

To be able to determine whether the incoming skb is from the driver or
cpumap, we reuse skb->redirected bit that skips generic XDP processing
when it is set. To always make use of this, CONFIG_NET_REDIRECT guard on
it has been lifted and it is always available.

>From the redirect side, we add the skb to ptr_ring with its lowest bit
set to 1.  This should be safe as skb is not 1-byte aligned. This allows
kthread to discern between xdp_frames and sk_buff. On consumption of the
ptr_ring item, the lowest bit is unset.

In the end, the skb is simply added to the list that kthread is anyway
going to maintain for xdp_frames converted to skb, and then received
again by using netif_receive_skb_list.

Bulking optimization for generic cpumap is left as an exercise for a
future patch for now.

Since cpumap entry progs are now supported, also remove check in
generic_xdp_install for the cpumap.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/bpf/20210702111825.491065-4-memxor@gmail.com
---
 include/linux/bpf.h    |   9 +++-
 include/linux/skbuff.h |  10 +----
 kernel/bpf/cpumap.c    | 116 +++++++++++++++++++++++++++++++++++++++++--------
 net/core/dev.c         |   3 +-
 net/core/filter.c      |   6 ++-
 5 files changed, 114 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f309fc1509f2..095aaa104c56 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1513,7 +1513,8 @@ bool dev_map_can_have_prog(struct bpf_map *map);
 void __cpu_map_flush(void);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
-bool cpu_map_prog_allowed(struct bpf_map *map);
+int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
+			     struct sk_buff *skb);
 
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
@@ -1710,6 +1711,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 	return 0;
 }
 
+static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
+					   struct sk_buff *skb)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline bool cpu_map_prog_allowed(struct bpf_map *map)
 {
 	return false;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b2db9cd9a73f..f19190820e63 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -863,8 +863,8 @@ struct sk_buff {
 	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;
 #endif
-#ifdef CONFIG_NET_REDIRECT
 	__u8			redirected:1;
+#ifdef CONFIG_NET_REDIRECT
 	__u8			from_ingress:1;
 #endif
 #ifdef CONFIG_TLS_DEVICE
@@ -4664,17 +4664,13 @@ static inline __wsum lco_csum(struct sk_buff *skb)
 
 static inline bool skb_is_redirected(const struct sk_buff *skb)
 {
-#ifdef CONFIG_NET_REDIRECT
 	return skb->redirected;
-#else
-	return false;
-#endif
 }
 
 static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
 {
-#ifdef CONFIG_NET_REDIRECT
 	skb->redirected = 1;
+#ifdef CONFIG_NET_REDIRECT
 	skb->from_ingress = from_ingress;
 	if (skb->from_ingress)
 		skb->tstamp = 0;
@@ -4683,9 +4679,7 @@ static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
 
 static inline void skb_reset_redirect(struct sk_buff *skb)
 {
-#ifdef CONFIG_NET_REDIRECT
 	skb->redirected = 0;
-#endif
 }
 
 static inline bool skb_csum_is_sctp(struct sk_buff *skb)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 480e936c54d0..585b2b77ccc4 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -16,6 +16,7 @@
  * netstack, and assigning dedicated CPUs for this stage.  This
  * basically allows for 10G wirespeed pre-filtering via bpf.
  */
+#include <linux/bitops.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/ptr_ring.h>
@@ -168,6 +169,46 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 	}
 }
 
+static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+				     struct list_head *listp,
+				     struct xdp_cpumap_stats *stats)
+{
+	struct sk_buff *skb, *tmp;
+	struct xdp_buff xdp;
+	u32 act;
+	int err;
+
+	list_for_each_entry_safe(skb, tmp, listp, list) {
+		act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
+		switch (act) {
+		case XDP_PASS:
+			break;
+		case XDP_REDIRECT:
+			skb_list_del_init(skb);
+			err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
+						      rcpu->prog);
+			if (unlikely(err)) {
+				kfree_skb(skb);
+				stats->drop++;
+			} else {
+				stats->redirect++;
+			}
+			return;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+			fallthrough;
+		case XDP_ABORTED:
+			trace_xdp_exception(skb->dev, rcpu->prog, act);
+			fallthrough;
+		case XDP_DROP:
+			skb_list_del_init(skb);
+			kfree_skb(skb);
+			stats->drop++;
+			return;
+		}
+	}
+}
+
 static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 				    void **frames, int n,
 				    struct xdp_cpumap_stats *stats)
@@ -176,11 +217,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 	struct xdp_buff xdp;
 	int i, nframes = 0;
 
-	if (!rcpu->prog)
-		return n;
-
-	rcu_read_lock_bh();
-
 	xdp_set_return_frame_no_direct();
 	xdp.rxq = &rxq;
 
@@ -227,17 +263,37 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 		}
 	}
 
+	xdp_clear_return_frame_no_direct();
+
+	return nframes;
+}
+
+#define CPUMAP_BATCH 8
+
+static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+				int xdp_n, struct xdp_cpumap_stats *stats,
+				struct list_head *list)
+{
+	int nframes;
+
+	if (!rcpu->prog)
+		return xdp_n;
+
+	rcu_read_lock_bh();
+
+	nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
+
 	if (stats->redirect)
-		xdp_do_flush_map();
+		xdp_do_flush();
 
-	xdp_clear_return_frame_no_direct();
+	if (unlikely(!list_empty(list)))
+		cpu_map_bpf_prog_run_skb(rcpu, list, stats);
 
 	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
 
 	return nframes;
 }
 
-#define CPUMAP_BATCH 8
 
 static int cpu_map_kthread_run(void *data)
 {
@@ -254,9 +310,9 @@ static int cpu_map_kthread_run(void *data)
 		struct xdp_cpumap_stats stats = {}; /* zero stats */
 		unsigned int kmem_alloc_drops = 0, sched = 0;
 		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+		int i, n, m, nframes, xdp_n;
 		void *frames[CPUMAP_BATCH];
 		void *skbs[CPUMAP_BATCH];
-		int i, n, m, nframes;
 		LIST_HEAD(list);
 
 		/* Release CPU reschedule checks */
@@ -280,9 +336,20 @@ static int cpu_map_kthread_run(void *data)
 		 */
 		n = __ptr_ring_consume_batched(rcpu->queue, frames,
 					       CPUMAP_BATCH);
-		for (i = 0; i < n; i++) {
+		for (i = 0, xdp_n = 0; i < n; i++) {
 			void *f = frames[i];
-			struct page *page = virt_to_page(f);
+			struct page *page;
+
+			if (unlikely(__ptr_test_bit(0, &f))) {
+				struct sk_buff *skb = f;
+
+				__ptr_clear_bit(0, &skb);
+				list_add_tail(&skb->list, &list);
+				continue;
+			}
+
+			frames[xdp_n++] = f;
+			page = virt_to_page(f);
 
 			/* Bring struct page memory area to curr CPU. Read by
 			 * build_skb_around via page_is_pfmemalloc(), and when
@@ -292,7 +359,7 @@ static int cpu_map_kthread_run(void *data)
 		}
 
 		/* Support running another XDP prog on this CPU */
-		nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats);
+		nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
 		if (nframes) {
 			m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
 			if (unlikely(m == 0)) {
@@ -330,12 +397,6 @@ static int cpu_map_kthread_run(void *data)
 	return 0;
 }
 
-bool cpu_map_prog_allowed(struct bpf_map *map)
-{
-	return map->map_type == BPF_MAP_TYPE_CPUMAP &&
-	       map->value_size != offsetofend(struct bpf_cpumap_val, qsize);
-}
-
 static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
 {
 	struct bpf_prog *prog;
@@ -701,6 +762,25 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 	return 0;
 }
 
+int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
+			     struct sk_buff *skb)
+{
+	int ret;
+
+	__skb_pull(skb, skb->mac_len);
+	skb_set_redirected(skb, false);
+	__ptr_set_bit(0, &skb);
+
+	ret = ptr_ring_produce(rcpu->queue, skb);
+	if (ret < 0)
+		goto trace;
+
+	wake_up_process(rcpu->kthread);
+trace:
+	trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu);
+	return ret;
+}
+
 void __cpu_map_flush(void)
 {
 	struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
diff --git a/net/core/dev.c b/net/core/dev.c
index 93e80c36cc97..4c51d1f81633 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5669,8 +5669,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 		 * have a bpf_prog installed on an entry
 		 */
 		for (i = 0; i < new->aux->used_map_cnt; i++) {
-			if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
-			    cpu_map_prog_allowed(new->aux->used_maps[i])) {
+			if (dev_map_can_have_prog(new->aux->used_maps[i])) {
 				mutex_unlock(&new->aux->used_maps_mutex);
 				return -EINVAL;
 			}
diff --git a/net/core/filter.c b/net/core/filter.c
index f2c15b2a057a..3b4986e96e9c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4040,8 +4040,12 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 			goto err;
 		consume_skb(skb);
 		break;
+	case BPF_MAP_TYPE_CPUMAP:
+		err = cpu_map_generic_redirect(fwd, skb);
+		if (unlikely(err))
+			goto err;
+		break;
 	default:
-		/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
 		err = -EBADRQC;
 		goto err;
 	}
-- 
cgit v1.2.3


From 2ea5eabaf04a1829383aefe98ac38a2e5ae2d698 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 2 Jul 2021 16:48:24 +0530
Subject: bpf: devmap: Implement devmap prog execution for generic XDP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This lifts the restriction on running devmap BPF progs in generic
redirect mode. To match native XDP behavior, it is invoked right before
generic_xdp_tx is called, and only supports XDP_PASS/XDP_ABORTED/
XDP_DROP actions.

We also return 0 even if devmap program drops the packet, as
semantically redirect has already succeeded and the devmap prog is the
last point before TX of the packet to device where it can deliver a
verdict on the packet.

This also means it must take care of freeing the skb, as
xdp_do_generic_redirect callers only do that in case an error is
returned.

Since devmap entry prog is supported, remove the check in
generic_xdp_install entirely.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210702111825.491065-5-memxor@gmail.com
---
 include/linux/bpf.h |  1 -
 kernel/bpf/devmap.c | 49 +++++++++++++++++++++++++++++++++++++++----------
 net/core/dev.c      | 18 ------------------
 3 files changed, 39 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 095aaa104c56..4afbff308ca3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1508,7 +1508,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
 			   struct bpf_prog *xdp_prog, struct bpf_map *map,
 			   bool exclude_ingress);
-bool dev_map_can_have_prog(struct bpf_map *map);
 
 void __cpu_map_flush(void);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2546dafd6672..fa26eac5e4b6 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -322,16 +322,6 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
 	return -ENOENT;
 }
 
-bool dev_map_can_have_prog(struct bpf_map *map)
-{
-	if ((map->map_type == BPF_MAP_TYPE_DEVMAP ||
-	     map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) &&
-	    map->value_size != offsetofend(struct bpf_devmap_val, ifindex))
-		return true;
-
-	return false;
-}
-
 static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
 				struct xdp_frame **frames, int n,
 				struct net_device *dev)
@@ -499,6 +489,37 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 	return 0;
 }
 
+static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
+{
+	struct xdp_txq_info txq = { .dev = dst->dev };
+	struct xdp_buff xdp;
+	u32 act;
+
+	if (!dst->xdp_prog)
+		return XDP_PASS;
+
+	__skb_pull(skb, skb->mac_len);
+	xdp.txq = &txq;
+
+	act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
+	switch (act) {
+	case XDP_PASS:
+		__skb_push(skb, skb->mac_len);
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		fallthrough;
+	case XDP_ABORTED:
+		trace_xdp_exception(dst->dev, dst->xdp_prog, act);
+		fallthrough;
+	case XDP_DROP:
+		kfree_skb(skb);
+		break;
+	}
+
+	return act;
+}
+
 int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
@@ -614,6 +635,14 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 	err = xdp_ok_fwd_dev(dst->dev, skb->len);
 	if (unlikely(err))
 		return err;
+
+	/* Redirect has already succeeded semantically at this point, so we just
+	 * return 0 even if packet is dropped. Helper below takes care of
+	 * freeing skb.
+	 */
+	if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
+		return 0;
+
 	skb->dev = dst->dev;
 	generic_xdp_tx(skb, xdp_prog);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 4c51d1f81633..71f7175cad9a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5660,24 +5660,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 	struct bpf_prog *new = xdp->prog;
 	int ret = 0;
 
-	if (new) {
-		u32 i;
-
-		mutex_lock(&new->aux->used_maps_mutex);
-
-		/* generic XDP does not work with DEVMAPs that can
-		 * have a bpf_prog installed on an entry
-		 */
-		for (i = 0; i < new->aux->used_map_cnt; i++) {
-			if (dev_map_can_have_prog(new->aux->used_maps[i])) {
-				mutex_unlock(&new->aux->used_maps_mutex);
-				return -EINVAL;
-			}
-		}
-
-		mutex_unlock(&new->aux->used_maps_mutex);
-	}
-
 	switch (xdp->command) {
 	case XDP_SETUP_PROG:
 		rcu_assign_pointer(dev->xdp_prog, new);
-- 
cgit v1.2.3


From f170acda7ffaf0473d06e1e17c12cd9fd63904f5 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Wed, 14 Jul 2021 21:43:17 +0900
Subject: bpf: Fix a typo of reuseport map in bpf.h.

Fix s/BPF_MAP_TYPE_REUSEPORT_ARRAY/BPF_MAP_TYPE_REUSEPORT_SOCKARRAY/ typo
in bpf.h.

Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210714124317.67526-1-kuniyu@amazon.co.jp
---
 include/uapi/linux/bpf.h       | 2 +-
 tools/include/uapi/linux/bpf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b46a383e8db7..bafb6282032b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3246,7 +3246,7 @@ union bpf_attr {
  * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
  *	Description
  *		Select a **SO_REUSEPORT** socket from a
- *		**BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*.
+ *		**BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*.
  *		It checks the selected socket is matching the incoming
  *		request in the socket buffer.
  *	Return
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b46a383e8db7..bafb6282032b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3246,7 +3246,7 @@ union bpf_attr {
  * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
  *	Description
  *		Select a **SO_REUSEPORT** socket from a
- *		**BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*.
+ *		**BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*.
  *		It checks the selected socket is matching the incoming
  *		request in the socket buffer.
  *	Return
-- 
cgit v1.2.3


From 5c2c85315948c42c6c0258cf9bad596acaa79043 Mon Sep 17 00:00:00 2001
From: Richard Laing <richard.laing@alliedtelesis.co.nz>
Date: Thu, 15 Jul 2021 09:18:05 +1200
Subject: bus: mhi: pci-generic: configurable network interface MRU

The MRU value used by the MHI MBIM network interface affects
the throughput performance of the interface. Different modem
models use different default MRU sizes based on their bandwidth
capabilities. Large values generally result in higher throughput
for larger packet sizes.

In addition if the MRU used by the MHI device is larger than that
specified in the MHI net device the data is fragmented and needs
to be re-assembled which generates a (single) warning message about
the fragmented packets. Setting the MRU on both ends avoids the
extra processing to re-assemble the packets.

This patch allows the documented MRU for a modem to be automatically
set as the MHI net device MRU avoiding fragmentation and improving
throughput performance.

Signed-off-by: Richard Laing <richard.laing@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/bus/mhi/pci_generic.c | 6 +++++-
 drivers/net/mhi/net.c         | 1 +
 drivers/net/mhi/proto_mbim.c  | 4 +++-
 include/linux/mhi.h           | 2 ++
 4 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/mhi/pci_generic.c b/drivers/bus/mhi/pci_generic.c
index ca3bc40427f8..19413daa0917 100644
--- a/drivers/bus/mhi/pci_generic.c
+++ b/drivers/bus/mhi/pci_generic.c
@@ -32,6 +32,7 @@
  * @edl: emergency download mode firmware path (if any)
  * @bar_num: PCI base address register to use for MHI MMIO register space
  * @dma_data_width: DMA transfer word size (32 or 64 bits)
+ * @mru_default: default MRU size for MBIM network packets
  */
 struct mhi_pci_dev_info {
 	const struct mhi_controller_config *config;
@@ -40,6 +41,7 @@ struct mhi_pci_dev_info {
 	const char *edl;
 	unsigned int bar_num;
 	unsigned int dma_data_width;
+	unsigned int mru_default;
 };
 
 #define MHI_CHANNEL_CONFIG_UL(ch_num, ch_name, el_count, ev_ring) \
@@ -251,7 +253,8 @@ static const struct mhi_pci_dev_info mhi_qcom_sdx55_info = {
 	.edl = "qcom/sdx55m/edl.mbn",
 	.config = &modem_qcom_v1_mhiv_config,
 	.bar_num = MHI_PCI_DEFAULT_BAR_NUM,
-	.dma_data_width = 32
+	.dma_data_width = 32,
+	.mru_default = 32768
 };
 
 static const struct mhi_pci_dev_info mhi_qcom_sdx24_info = {
@@ -643,6 +646,7 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	mhi_cntrl->wake_get = mhi_pci_wake_get_nop;
 	mhi_cntrl->wake_put = mhi_pci_wake_put_nop;
 	mhi_cntrl->wake_toggle = mhi_pci_wake_toggle_nop;
+	mhi_cntrl->mru = info->mru_default;
 
 	err = mhi_pci_claim(mhi_cntrl, info->bar_num, DMA_BIT_MASK(info->dma_data_width));
 	if (err)
diff --git a/drivers/net/mhi/net.c b/drivers/net/mhi/net.c
index e60e38c1f09d..a5a2aa19bb91 100644
--- a/drivers/net/mhi/net.c
+++ b/drivers/net/mhi/net.c
@@ -329,6 +329,7 @@ static int mhi_net_newlink(void *ctxt, struct net_device *ndev, u32 if_id,
 	mhi_netdev->mdev = mhi_dev;
 	mhi_netdev->skbagg_head = NULL;
 	mhi_netdev->proto = info->proto;
+	mhi_netdev->mru = mhi_dev->mhi_cntrl->mru;
 
 	INIT_DELAYED_WORK(&mhi_netdev->rx_refill, mhi_net_rx_refill_work);
 	u64_stats_init(&mhi_netdev->stats.rx_syncp);
diff --git a/drivers/net/mhi/proto_mbim.c b/drivers/net/mhi/proto_mbim.c
index bf1ad863237d..f1cc7f35bb85 100644
--- a/drivers/net/mhi/proto_mbim.c
+++ b/drivers/net/mhi/proto_mbim.c
@@ -292,7 +292,9 @@ static int mbim_init(struct mhi_net_dev *mhi_netdev)
 
 	ndev->needed_headroom = sizeof(struct mbim_tx_hdr);
 	ndev->mtu = MHI_MBIM_DEFAULT_MTU;
-	mhi_netdev->mru = MHI_MBIM_DEFAULT_MRU;
+
+	if (!mhi_netdev->mru)
+		mhi_netdev->mru = MHI_MBIM_DEFAULT_MRU;
 
 	return 0;
 }
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 944aa3aa3035..beb918328eef 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -356,6 +356,7 @@ struct mhi_controller_config {
  * @fbc_download: MHI host needs to do complete image transfer (optional)
  * @wake_set: Device wakeup set flag
  * @irq_flags: irq flags passed to request_irq (optional)
+ * @mru: the default MRU for the MHI device
  *
  * Fields marked as (required) need to be populated by the controller driver
  * before calling mhi_register_controller(). For the fields marked as (optional)
@@ -448,6 +449,7 @@ struct mhi_controller {
 	bool fbc_download;
 	bool wake_set;
 	unsigned long irq_flags;
+	u32 mru;
 };
 
 /**
-- 
cgit v1.2.3


From b00628b1c7d595ae5b544e059c27b1f5828314b4 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 14 Jul 2021 17:54:09 -0700
Subject: bpf: Introduce bpf timers.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded
in hash/array/lru maps as a regular field and helpers to operate on it:

// Initialize the timer.
// First 4 bits of 'flags' specify clockid.
// Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, int flags);

// Configure the timer to call 'callback_fn' static function.
long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn);

// Arm the timer to expire 'nsec' nanoseconds from the current time.
long bpf_timer_start(struct bpf_timer *timer, u64 nsec, u64 flags);

// Cancel the timer and wait for callback_fn to finish if it was running.
long bpf_timer_cancel(struct bpf_timer *timer);

Here is how BPF program might look like:
struct map_elem {
    int counter;
    struct bpf_timer timer;
};

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 1000);
    __type(key, int);
    __type(value, struct map_elem);
} hmap SEC(".maps");

static int timer_cb(void *map, int *key, struct map_elem *val);
/* val points to particular map element that contains bpf_timer. */

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(test1, int a)
{
    struct map_elem *val;
    int key = 0;

    val = bpf_map_lookup_elem(&hmap, &key);
    if (val) {
        bpf_timer_init(&val->timer, &hmap, CLOCK_REALTIME);
        bpf_timer_set_callback(&val->timer, timer_cb);
        bpf_timer_start(&val->timer, 1000 /* call timer_cb2 in 1 usec */, 0);
    }
}

This patch adds helper implementations that rely on hrtimers
to call bpf functions as timers expire.
The following patches add necessary safety checks.

Only programs with CAP_BPF are allowed to use bpf_timer.

The amount of timers used by the program is constrained by
the memcg recorded at map creation time.

The bpf_timer_init() helper needs explicit 'map' argument because inner maps
are dynamic and not known at load time. While the bpf_timer_set_callback() is
receiving hidden 'aux->prog' argument supplied by the verifier.

The prog pointer is needed to do refcnting of bpf program to make sure that
program doesn't get freed while the timer is armed. This approach relies on
"user refcnt" scheme used in prog_array that stores bpf programs for
bpf_tail_call. The bpf_timer_set_callback() will increment the prog refcnt which is
paired with bpf_timer_cancel() that will drop the prog refcnt. The
ops->map_release_uref is responsible for cancelling the timers and dropping
prog refcnt when user space reference to a map reaches zero.
This uref approach is done to make sure that Ctrl-C of user space process will
not leave timers running forever unless the user space explicitly pinned a map
that contained timers in bpffs.

bpf_timer_init() and bpf_timer_set_callback() will return -EPERM if map doesn't
have user references (is not held by open file descriptor from user space and
not pinned in bpffs).

The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel
and free the timer if given map element had it allocated.
"bpftool map update" command can be used to cancel timers.

The 'struct bpf_timer' is explicitly __attribute__((aligned(8))) because
'__u64 :64' has 1 byte alignment of 8 byte padding.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210715005417.78572-4-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h            |   3 +
 include/uapi/linux/bpf.h       |  73 ++++++++++
 kernel/bpf/helpers.c           | 324 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 109 ++++++++++++++
 kernel/trace/bpf_trace.c       |   2 +-
 scripts/bpf_doc.py             |   2 +
 tools/include/uapi/linux/bpf.h |  73 ++++++++++
 7 files changed, 585 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4afbff308ca3..125240b7cefb 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -168,6 +168,7 @@ struct bpf_map {
 	u32 max_entries;
 	u32 map_flags;
 	int spin_lock_off; /* >=0 valid offset, <0 error */
+	int timer_off; /* >=0 valid offset, <0 error */
 	u32 id;
 	int numa_node;
 	u32 btf_key_type_id;
@@ -221,6 +222,7 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 }
 void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
 			   bool lock_src);
+void bpf_timer_cancel_and_free(void *timer);
 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
 
 struct bpf_offload_dev;
@@ -314,6 +316,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_FUNC,	/* pointer to a bpf program function */
 	ARG_PTR_TO_STACK_OR_NULL,	/* pointer to stack or NULL */
 	ARG_PTR_TO_CONST_STR,	/* pointer to a null terminated read-only string */
+	ARG_PTR_TO_TIMER,	/* pointer to bpf_timer */
 	__BPF_ARG_TYPE_MAX,
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bafb6282032b..3544ec5234f0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4777,6 +4777,70 @@ union bpf_attr {
  * 		Execute close syscall for given FD.
  * 	Return
  * 		A syscall result.
+ *
+ * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags)
+ *	Description
+ *		Initialize the timer.
+ *		First 4 bits of *flags* specify clockid.
+ *		Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
+ *		All other bits of *flags* are reserved.
+ *		The verifier will reject the program if *timer* is not from
+ *		the same *map*.
+ *	Return
+ *		0 on success.
+ *		**-EBUSY** if *timer* is already initialized.
+ *		**-EINVAL** if invalid *flags* are passed.
+ *		**-EPERM** if *timer* is in a map that doesn't have any user references.
+ *		The user space should either hold a file descriptor to a map with timers
+ *		or pin such map in bpffs. When map is unpinned or file descriptor is
+ *		closed all timers in the map will be cancelled and freed.
+ *
+ * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn)
+ *	Description
+ *		Configure the timer to call *callback_fn* static function.
+ *	Return
+ *		0 on success.
+ *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ *		**-EPERM** if *timer* is in a map that doesn't have any user references.
+ *		The user space should either hold a file descriptor to a map with timers
+ *		or pin such map in bpffs. When map is unpinned or file descriptor is
+ *		closed all timers in the map will be cancelled and freed.
+ *
+ * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags)
+ *	Description
+ *		Set timer expiration N nanoseconds from the current time. The
+ *		configured callback will be invoked in soft irq context on some cpu
+ *		and will not repeat unless another bpf_timer_start() is made.
+ *		In such case the next invocation can migrate to a different cpu.
+ *		Since struct bpf_timer is a field inside map element the map
+ *		owns the timer. The bpf_timer_set_callback() will increment refcnt
+ *		of BPF program to make sure that callback_fn code stays valid.
+ *		When user space reference to a map reaches zero all timers
+ *		in a map are cancelled and corresponding program's refcnts are
+ *		decremented. This is done to make sure that Ctrl-C of a user
+ *		process doesn't leave any timers running. If map is pinned in
+ *		bpffs the callback_fn can re-arm itself indefinitely.
+ *		bpf_map_update/delete_elem() helpers and user space sys_bpf commands
+ *		cancel and free the timer in the given map element.
+ *		The map can contain timers that invoke callback_fn-s from different
+ *		programs. The same callback_fn can serve different timers from
+ *		different maps if key/value layout matches across maps.
+ *		Every bpf_timer_set_callback() can have different callback_fn.
+ *
+ *	Return
+ *		0 on success.
+ *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
+ *		or invalid *flags* are passed.
+ *
+ * long bpf_timer_cancel(struct bpf_timer *timer)
+ *	Description
+ *		Cancel the timer and wait for callback_fn to finish if it was running.
+ *	Return
+ *		0 if the timer was not active.
+ *		1 if the timer was active.
+ *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ *		**-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its
+ *		own timer which would have led to a deadlock otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4948,6 +5012,10 @@ union bpf_attr {
 	FN(sys_bpf),			\
 	FN(btf_find_by_name_kind),	\
 	FN(sys_close),			\
+	FN(timer_init),			\
+	FN(timer_set_callback),		\
+	FN(timer_start),		\
+	FN(timer_cancel),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6074,6 +6142,11 @@ struct bpf_spin_lock {
 	__u32	val;
 };
 
+struct bpf_timer {
+	__u64 :64;
+	__u64 :64;
+} __attribute__((aligned(8)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 38be3cfc2f58..74b16593983d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -999,6 +999,322 @@ const struct bpf_func_proto bpf_snprintf_proto = {
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
+/* BPF map elements can contain 'struct bpf_timer'.
+ * Such map owns all of its BPF timers.
+ * 'struct bpf_timer' is allocated as part of map element allocation
+ * and it's zero initialized.
+ * That space is used to keep 'struct bpf_timer_kern'.
+ * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
+ * remembers 'struct bpf_map *' pointer it's part of.
+ * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
+ * bpf_timer_start() arms the timer.
+ * If user space reference to a map goes to zero at this point
+ * ops->map_release_uref callback is responsible for cancelling the timers,
+ * freeing their memory, and decrementing prog's refcnts.
+ * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
+ * Inner maps can contain bpf timers as well. ops->map_release_uref is
+ * freeing the timers when inner map is replaced or deleted by user space.
+ */
+struct bpf_hrtimer {
+	struct hrtimer timer;
+	struct bpf_map *map;
+	struct bpf_prog *prog;
+	void __rcu *callback_fn;
+	void *value;
+};
+
+/* the actual struct hidden inside uapi struct bpf_timer */
+struct bpf_timer_kern {
+	struct bpf_hrtimer *timer;
+	/* bpf_spin_lock is used here instead of spinlock_t to make
+	 * sure that it always fits into space resereved by struct bpf_timer
+	 * regardless of LOCKDEP and spinlock debug flags.
+	 */
+	struct bpf_spin_lock lock;
+} __attribute__((aligned(8)));
+
+static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+
+static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+{
+	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
+	struct bpf_map *map = t->map;
+	void *value = t->value;
+	void *callback_fn;
+	void *key;
+	u32 idx;
+	int ret;
+
+	callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
+	if (!callback_fn)
+		goto out;
+
+	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
+	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
+	 * Remember the timer this callback is servicing to prevent
+	 * deadlock if callback_fn() calls bpf_timer_cancel() or
+	 * bpf_map_delete_elem() on the same timer.
+	 */
+	this_cpu_write(hrtimer_running, t);
+	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+		/* compute the key */
+		idx = ((char *)value - array->value) / array->elem_size;
+		key = &idx;
+	} else { /* hash or lru */
+		key = value - round_up(map->key_size, 8);
+	}
+
+	ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
+					 (u64)(long)key,
+					 (u64)(long)value, 0, 0);
+	WARN_ON(ret != 0); /* Next patch moves this check into the verifier */
+
+	this_cpu_write(hrtimer_running, NULL);
+out:
+	return HRTIMER_NORESTART;
+}
+
+BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
+	   u64, flags)
+{
+	clockid_t clockid = flags & (MAX_CLOCKS - 1);
+	struct bpf_hrtimer *t;
+	int ret = 0;
+
+	BUILD_BUG_ON(MAX_CLOCKS != 16);
+	BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
+	BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
+
+	if (in_nmi())
+		return -EOPNOTSUPP;
+
+	if (flags >= MAX_CLOCKS ||
+	    /* similar to timerfd except _ALARM variants are not supported */
+	    (clockid != CLOCK_MONOTONIC &&
+	     clockid != CLOCK_REALTIME &&
+	     clockid != CLOCK_BOOTTIME))
+		return -EINVAL;
+	__bpf_spin_lock_irqsave(&timer->lock);
+	t = timer->timer;
+	if (t) {
+		ret = -EBUSY;
+		goto out;
+	}
+	if (!atomic64_read(&map->usercnt)) {
+		/* maps with timers must be either held by user space
+		 * or pinned in bpffs.
+		 */
+		ret = -EPERM;
+		goto out;
+	}
+	/* allocate hrtimer via map_kmalloc to use memcg accounting */
+	t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
+	if (!t) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	t->value = (void *)timer - map->timer_off;
+	t->map = map;
+	t->prog = NULL;
+	rcu_assign_pointer(t->callback_fn, NULL);
+	hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+	t->timer.function = bpf_timer_cb;
+	timer->timer = t;
+out:
+	__bpf_spin_unlock_irqrestore(&timer->lock);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_init_proto = {
+	.func		= bpf_timer_init,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
+	   struct bpf_prog_aux *, aux)
+{
+	struct bpf_prog *prev, *prog = aux->prog;
+	struct bpf_hrtimer *t;
+	int ret = 0;
+
+	if (in_nmi())
+		return -EOPNOTSUPP;
+	__bpf_spin_lock_irqsave(&timer->lock);
+	t = timer->timer;
+	if (!t) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (!atomic64_read(&t->map->usercnt)) {
+		/* maps with timers must be either held by user space
+		 * or pinned in bpffs. Otherwise timer might still be
+		 * running even when bpf prog is detached and user space
+		 * is gone, since map_release_uref won't ever be called.
+		 */
+		ret = -EPERM;
+		goto out;
+	}
+	prev = t->prog;
+	if (prev != prog) {
+		/* Bump prog refcnt once. Every bpf_timer_set_callback()
+		 * can pick different callback_fn-s within the same prog.
+		 */
+		prog = bpf_prog_inc_not_zero(prog);
+		if (IS_ERR(prog)) {
+			ret = PTR_ERR(prog);
+			goto out;
+		}
+		if (prev)
+			/* Drop prev prog refcnt when swapping with new prog */
+			bpf_prog_put(prev);
+		t->prog = prog;
+	}
+	rcu_assign_pointer(t->callback_fn, callback_fn);
+out:
+	__bpf_spin_unlock_irqrestore(&timer->lock);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_set_callback_proto = {
+	.func		= bpf_timer_set_callback,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+	.arg2_type	= ARG_PTR_TO_FUNC,
+};
+
+BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
+{
+	struct bpf_hrtimer *t;
+	int ret = 0;
+
+	if (in_nmi())
+		return -EOPNOTSUPP;
+	if (flags)
+		return -EINVAL;
+	__bpf_spin_lock_irqsave(&timer->lock);
+	t = timer->timer;
+	if (!t || !t->prog) {
+		ret = -EINVAL;
+		goto out;
+	}
+	hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
+out:
+	__bpf_spin_unlock_irqrestore(&timer->lock);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_start_proto = {
+	.func		= bpf_timer_start,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+static void drop_prog_refcnt(struct bpf_hrtimer *t)
+{
+	struct bpf_prog *prog = t->prog;
+
+	if (prog) {
+		bpf_prog_put(prog);
+		t->prog = NULL;
+		rcu_assign_pointer(t->callback_fn, NULL);
+	}
+}
+
+BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+{
+	struct bpf_hrtimer *t;
+	int ret = 0;
+
+	if (in_nmi())
+		return -EOPNOTSUPP;
+	__bpf_spin_lock_irqsave(&timer->lock);
+	t = timer->timer;
+	if (!t) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (this_cpu_read(hrtimer_running) == t) {
+		/* If bpf callback_fn is trying to bpf_timer_cancel()
+		 * its own timer the hrtimer_cancel() will deadlock
+		 * since it waits for callback_fn to finish
+		 */
+		ret = -EDEADLK;
+		goto out;
+	}
+	drop_prog_refcnt(t);
+out:
+	__bpf_spin_unlock_irqrestore(&timer->lock);
+	/* Cancel the timer and wait for associated callback to finish
+	 * if it was running.
+	 */
+	ret = ret ?: hrtimer_cancel(&t->timer);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_cancel_proto = {
+	.func		= bpf_timer_cancel,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+};
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+	struct bpf_timer_kern *timer = val;
+	struct bpf_hrtimer *t;
+
+	/* Performance optimization: read timer->timer without lock first. */
+	if (!READ_ONCE(timer->timer))
+		return;
+
+	__bpf_spin_lock_irqsave(&timer->lock);
+	/* re-read it under lock */
+	t = timer->timer;
+	if (!t)
+		goto out;
+	drop_prog_refcnt(t);
+	/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+	 * this timer, since it won't be initialized.
+	 */
+	timer->timer = NULL;
+out:
+	__bpf_spin_unlock_irqrestore(&timer->lock);
+	if (!t)
+		return;
+	/* Cancel the timer and wait for callback to complete if it was running.
+	 * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
+	 * right after for both preallocated and non-preallocated maps.
+	 * The timer->timer = NULL was already done and no code path can
+	 * see address 't' anymore.
+	 *
+	 * Check that bpf_map_delete/update_elem() wasn't called from timer
+	 * callback_fn. In such case don't call hrtimer_cancel() (since it will
+	 * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
+	 * return -1). Though callback_fn is still running on this cpu it's
+	 * safe to do kfree(t) because bpf_timer_cb() read everything it needed
+	 * from 't'. The bpf subprog callback_fn won't be able to access 't',
+	 * since timer->timer = NULL was already done. The timer will be
+	 * effectively cancelled because bpf_timer_cb() will return
+	 * HRTIMER_NORESTART.
+	 */
+	if (this_cpu_read(hrtimer_running) != t)
+		hrtimer_cancel(&t->timer);
+	kfree(t);
+}
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -1065,6 +1381,14 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_per_cpu_ptr_proto;
 	case BPF_FUNC_this_cpu_ptr:
 		return &bpf_this_cpu_ptr_proto;
+	case BPF_FUNC_timer_init:
+		return &bpf_timer_init_proto;
+	case BPF_FUNC_timer_set_callback:
+		return &bpf_timer_set_callback_proto;
+	case BPF_FUNC_timer_start:
+		return &bpf_timer_start_proto;
+	case BPF_FUNC_timer_cancel:
+		return &bpf_timer_cancel_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3dbb3b40b754..e8645c819803 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4656,6 +4656,38 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+			      struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	bool is_const = tnum_is_const(reg->var_off);
+	struct bpf_map *map = reg->map_ptr;
+	u64 val = reg->var_off.value;
+
+	if (!is_const) {
+		verbose(env,
+			"R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
+			regno);
+		return -EINVAL;
+	}
+	if (!map->btf) {
+		verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
+			map->name);
+		return -EINVAL;
+	}
+	if (val) {
+		/* This restriction will be removed in the next patch */
+		verbose(env, "bpf_timer field can only be first in the map value element\n");
+		return -EINVAL;
+	}
+	if (meta->map_ptr) {
+		verbose(env, "verifier bug. Two map pointers in a timer helper\n");
+		return -EFAULT;
+	}
+	meta->map_ptr = map;
+	return 0;
+}
+
 static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
 {
 	return type == ARG_PTR_TO_MEM ||
@@ -4788,6 +4820,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER
 static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
 static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
 static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types,
@@ -4819,6 +4852,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_FUNC]		= &func_ptr_types,
 	[ARG_PTR_TO_STACK_OR_NULL]	= &stack_ptr_types,
 	[ARG_PTR_TO_CONST_STR]		= &const_str_ptr_types,
+	[ARG_PTR_TO_TIMER]		= &timer_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -4948,6 +4982,10 @@ skip_type_check:
 
 	if (arg_type == ARG_CONST_MAP_PTR) {
 		/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+		if (meta->map_ptr && meta->map_ptr != reg->map_ptr) {
+			verbose(env, "Map pointer doesn't match bpf_timer.\n");
+			return -EINVAL;
+		}
 		meta->map_ptr = reg->map_ptr;
 	} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
 		/* bpf_map_xxx(..., map_ptr, ..., key) call:
@@ -5000,6 +5038,9 @@ skip_type_check:
 			verbose(env, "verifier internal error\n");
 			return -EFAULT;
 		}
+	} else if (arg_type == ARG_PTR_TO_TIMER) {
+		if (process_timer_func(env, regno, meta))
+			return -EACCES;
 	} else if (arg_type == ARG_PTR_TO_FUNC) {
 		meta->subprogno = reg->subprogno;
 	} else if (arg_type_is_mem_ptr(arg_type)) {
@@ -5742,6 +5783,34 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int set_timer_callback_state(struct bpf_verifier_env *env,
+				    struct bpf_func_state *caller,
+				    struct bpf_func_state *callee,
+				    int insn_idx)
+{
+	struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+	/* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn);
+	 * callback_fn(struct bpf_map *map, void *key, void *value);
+	 */
+	callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+	callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+	callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+	callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+	callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+	callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+	/* unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+	return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -6069,6 +6138,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_timer_set_callback) {
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_timer_callback_state);
+		if (err < 0)
+			return -EINVAL;
+	}
+
 	if (func_id == BPF_FUNC_snprintf) {
 		err = check_bpf_snprintf_call(env, regs);
 		if (err < 0)
@@ -12591,6 +12667,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			continue;
 		}
 
+		if (insn->imm == BPF_FUNC_timer_set_callback) {
+			/* The verifier will process callback_fn as many times as necessary
+			 * with different maps and the register states prepared by
+			 * set_timer_callback_state will be accurate.
+			 *
+			 * The following use case is valid:
+			 *   map1 is shared by prog1, prog2, prog3.
+			 *   prog1 calls bpf_timer_init for some map1 elements
+			 *   prog2 calls bpf_timer_set_callback for some map1 elements.
+			 *     Those that were not bpf_timer_init-ed will return -EINVAL.
+			 *   prog3 calls bpf_timer_start for some map1 elements.
+			 *     Those that were not both bpf_timer_init-ed and
+			 *     bpf_timer_set_callback-ed will return -EINVAL.
+			 */
+			struct bpf_insn ld_addrs[2] = {
+				BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
+			};
+
+			insn_buf[0] = ld_addrs[0];
+			insn_buf[1] = ld_addrs[1];
+			insn_buf[2] = *insn;
+			cnt = 3;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
+
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * and other inlining handlers are currently limited to 64 bit
 		 * only.
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 64bd2d84367f..6c77d25137e0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1059,7 +1059,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_snprintf:
 		return &bpf_snprintf_proto;
 	default:
-		return NULL;
+		return bpf_base_func_proto(func_id);
 	}
 }
 
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index 2d94025b38e9..00ac7b79cddb 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -547,6 +547,7 @@ class PrinterHelpers(Printer):
             'struct inode',
             'struct socket',
             'struct file',
+            'struct bpf_timer',
     ]
     known_types = {
             '...',
@@ -594,6 +595,7 @@ class PrinterHelpers(Printer):
             'struct inode',
             'struct socket',
             'struct file',
+            'struct bpf_timer',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index bafb6282032b..3544ec5234f0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4777,6 +4777,70 @@ union bpf_attr {
  * 		Execute close syscall for given FD.
  * 	Return
  * 		A syscall result.
+ *
+ * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags)
+ *	Description
+ *		Initialize the timer.
+ *		First 4 bits of *flags* specify clockid.
+ *		Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
+ *		All other bits of *flags* are reserved.
+ *		The verifier will reject the program if *timer* is not from
+ *		the same *map*.
+ *	Return
+ *		0 on success.
+ *		**-EBUSY** if *timer* is already initialized.
+ *		**-EINVAL** if invalid *flags* are passed.
+ *		**-EPERM** if *timer* is in a map that doesn't have any user references.
+ *		The user space should either hold a file descriptor to a map with timers
+ *		or pin such map in bpffs. When map is unpinned or file descriptor is
+ *		closed all timers in the map will be cancelled and freed.
+ *
+ * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn)
+ *	Description
+ *		Configure the timer to call *callback_fn* static function.
+ *	Return
+ *		0 on success.
+ *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ *		**-EPERM** if *timer* is in a map that doesn't have any user references.
+ *		The user space should either hold a file descriptor to a map with timers
+ *		or pin such map in bpffs. When map is unpinned or file descriptor is
+ *		closed all timers in the map will be cancelled and freed.
+ *
+ * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags)
+ *	Description
+ *		Set timer expiration N nanoseconds from the current time. The
+ *		configured callback will be invoked in soft irq context on some cpu
+ *		and will not repeat unless another bpf_timer_start() is made.
+ *		In such case the next invocation can migrate to a different cpu.
+ *		Since struct bpf_timer is a field inside map element the map
+ *		owns the timer. The bpf_timer_set_callback() will increment refcnt
+ *		of BPF program to make sure that callback_fn code stays valid.
+ *		When user space reference to a map reaches zero all timers
+ *		in a map are cancelled and corresponding program's refcnts are
+ *		decremented. This is done to make sure that Ctrl-C of a user
+ *		process doesn't leave any timers running. If map is pinned in
+ *		bpffs the callback_fn can re-arm itself indefinitely.
+ *		bpf_map_update/delete_elem() helpers and user space sys_bpf commands
+ *		cancel and free the timer in the given map element.
+ *		The map can contain timers that invoke callback_fn-s from different
+ *		programs. The same callback_fn can serve different timers from
+ *		different maps if key/value layout matches across maps.
+ *		Every bpf_timer_set_callback() can have different callback_fn.
+ *
+ *	Return
+ *		0 on success.
+ *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
+ *		or invalid *flags* are passed.
+ *
+ * long bpf_timer_cancel(struct bpf_timer *timer)
+ *	Description
+ *		Cancel the timer and wait for callback_fn to finish if it was running.
+ *	Return
+ *		0 if the timer was not active.
+ *		1 if the timer was active.
+ *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ *		**-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its
+ *		own timer which would have led to a deadlock otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4948,6 +5012,10 @@ union bpf_attr {
 	FN(sys_bpf),			\
 	FN(btf_find_by_name_kind),	\
 	FN(sys_close),			\
+	FN(timer_init),			\
+	FN(timer_set_callback),		\
+	FN(timer_start),		\
+	FN(timer_cancel),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6074,6 +6142,11 @@ struct bpf_spin_lock {
 	__u32	val;
 };
 
+struct bpf_timer {
+	__u64 :64;
+	__u64 :64;
+} __attribute__((aligned(8)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
-- 
cgit v1.2.3


From 68134668c17f31f51930478f75495b552a411550 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 14 Jul 2021 17:54:10 -0700
Subject: bpf: Add map side support for bpf timers.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Restrict bpf timers to array, hash (both preallocated and kmalloced), and
lru map types. The per-cpu maps with timers don't make sense, since 'struct
bpf_timer' is a part of map value. bpf timers in per-cpu maps would mean that
the number of timers depends on number of possible cpus and timers would not be
accessible from all cpus. lpm map support can be added in the future.
The timers in inner maps are supported.

The bpf_map_update/delete_elem() helpers and sys_bpf commands cancel and free
bpf_timer in a given map element.

Similar to 'struct bpf_spin_lock' BTF is required and it is used to validate
that map element indeed contains 'struct bpf_timer'.

Make check_and_init_map_value() init both bpf_spin_lock and bpf_timer when
map element data is reused in preallocated htab and lru maps.

Teach copy_map_value() to support both bpf_spin_lock and bpf_timer in a single
map element. There could be one of each, but not more than one. Due to 'one
bpf_timer in one element' restriction do not support timers in global data,
since global data is a map of single element, but from bpf program side it's
seen as many global variables and restriction of single global timer would be
odd. The sys_bpf map_freeze and sys_mmap syscalls are not allowed on maps with
timers, since user space could have corrupted mmap element and crashed the
kernel. The maps with timers cannot be readonly. Due to these restrictions
search for bpf_timer in datasec BTF in case it was placed in the global data to
report clear error.

The previous patch allowed 'struct bpf_timer' as a first field in a map
element only. Relax this restriction.

Refactor lru map to s/bpf_lru_push_free/htab_lru_push_free/ to cancel and free
the timer when lru map deletes an element as a part of it eviction algorithm.

Make sure that bpf program cannot access 'struct bpf_timer' via direct load/store.
The timer operation are done through helpers only.
This is similar to 'struct bpf_spin_lock'.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210715005417.78572-5-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h        |  44 ++++++++++++++-----
 include/linux/btf.h        |   1 +
 kernel/bpf/arraymap.c      |  21 +++++++++
 kernel/bpf/btf.c           |  77 +++++++++++++++++++++++++++------
 kernel/bpf/hashtab.c       | 105 +++++++++++++++++++++++++++++++++++++++------
 kernel/bpf/local_storage.c |   4 +-
 kernel/bpf/map_in_map.c    |   2 +
 kernel/bpf/syscall.c       |  21 +++++++--
 kernel/bpf/verifier.c      |  30 +++++++++++--
 9 files changed, 259 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 125240b7cefb..a9a4a480a6d0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -198,24 +198,46 @@ static inline bool map_value_has_spin_lock(const struct bpf_map *map)
 	return map->spin_lock_off >= 0;
 }
 
-static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
+static inline bool map_value_has_timer(const struct bpf_map *map)
 {
-	if (likely(!map_value_has_spin_lock(map)))
-		return;
-	*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
-		(struct bpf_spin_lock){};
+	return map->timer_off >= 0;
 }
 
-/* copy everything but bpf_spin_lock */
+static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
+{
+	if (unlikely(map_value_has_spin_lock(map)))
+		*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
+			(struct bpf_spin_lock){};
+	if (unlikely(map_value_has_timer(map)))
+		*(struct bpf_timer *)(dst + map->timer_off) =
+			(struct bpf_timer){};
+}
+
+/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */
 static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 {
+	u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0;
+
 	if (unlikely(map_value_has_spin_lock(map))) {
-		u32 off = map->spin_lock_off;
+		s_off = map->spin_lock_off;
+		s_sz = sizeof(struct bpf_spin_lock);
+	} else if (unlikely(map_value_has_timer(map))) {
+		t_off = map->timer_off;
+		t_sz = sizeof(struct bpf_timer);
+	}
 
-		memcpy(dst, src, off);
-		memcpy(dst + off + sizeof(struct bpf_spin_lock),
-		       src + off + sizeof(struct bpf_spin_lock),
-		       map->value_size - off - sizeof(struct bpf_spin_lock));
+	if (unlikely(s_sz || t_sz)) {
+		if (s_off < t_off || !s_sz) {
+			swap(s_off, t_off);
+			swap(s_sz, t_sz);
+		}
+		memcpy(dst, src, t_off);
+		memcpy(dst + t_off + t_sz,
+		       src + t_off + t_sz,
+		       s_off - t_off - t_sz);
+		memcpy(dst + s_off + s_sz,
+		       src + s_off + s_sz,
+		       map->value_size - s_off - s_sz);
 	} else {
 		memcpy(dst, src, map->value_size);
 	}
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 94a0c976c90f..214fde93214b 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -99,6 +99,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   const struct btf_member *m,
 			   u32 expected_offset, u32 expected_size);
 int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
+int btf_find_timer(const struct btf *btf, const struct btf_type *t);
 bool btf_type_is_void(const struct btf_type *t);
 s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
 const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3c4105603f9d..cebd4fb06d19 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -287,6 +287,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
 	return 0;
 }
 
+static void check_and_free_timer_in_array(struct bpf_array *arr, void *val)
+{
+	if (unlikely(map_value_has_timer(&arr->map)))
+		bpf_timer_cancel_and_free(val + arr->map.timer_off);
+}
+
 /* Called from syscall or from eBPF program */
 static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 				 u64 map_flags)
@@ -321,6 +327,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 			copy_map_value_locked(map, val, value, false);
 		else
 			copy_map_value(map, val, value);
+		check_and_free_timer_in_array(array, val);
 	}
 	return 0;
 }
@@ -374,6 +381,19 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
 	return (void *)round_down((unsigned long)array, PAGE_SIZE);
 }
 
+static void array_map_free_timers(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	if (likely(!map_value_has_timer(map)))
+		return;
+
+	for (i = 0; i < array->map.max_entries; i++)
+		bpf_timer_cancel_and_free(array->value + array->elem_size * i +
+					  map->timer_off);
+}
+
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
 static void array_map_free(struct bpf_map *map)
 {
@@ -668,6 +688,7 @@ const struct bpf_map_ops array_map_ops = {
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
+	.map_release_uref = array_map_free_timers,
 	.map_lookup_elem = array_map_lookup_elem,
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cb4b72997d9b..7780131f710e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3046,43 +3046,92 @@ static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
+				 const char *name, int sz, int align)
 {
 	const struct btf_member *member;
 	u32 i, off = -ENOENT;
 
-	if (!__btf_type_is_struct(t))
-		return -EINVAL;
-
 	for_each_member(i, t, member) {
 		const struct btf_type *member_type = btf_type_by_id(btf,
 								    member->type);
 		if (!__btf_type_is_struct(member_type))
 			continue;
-		if (member_type->size != sizeof(struct bpf_spin_lock))
+		if (member_type->size != sz)
 			continue;
-		if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
-			   "bpf_spin_lock"))
+		if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
 			continue;
 		if (off != -ENOENT)
-			/* only one 'struct bpf_spin_lock' is allowed */
+			/* only one such field is allowed */
 			return -E2BIG;
 		off = btf_member_bit_offset(t, member);
 		if (off % 8)
 			/* valid C code cannot generate such BTF */
 			return -EINVAL;
 		off /= 8;
-		if (off % __alignof__(struct bpf_spin_lock))
-			/* valid struct bpf_spin_lock will be 4 byte aligned */
+		if (off % align)
+			return -EINVAL;
+	}
+	return off;
+}
+
+static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
+				const char *name, int sz, int align)
+{
+	const struct btf_var_secinfo *vsi;
+	u32 i, off = -ENOENT;
+
+	for_each_vsi(i, t, vsi) {
+		const struct btf_type *var = btf_type_by_id(btf, vsi->type);
+		const struct btf_type *var_type = btf_type_by_id(btf, var->type);
+
+		if (!__btf_type_is_struct(var_type))
+			continue;
+		if (var_type->size != sz)
+			continue;
+		if (vsi->size != sz)
+			continue;
+		if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
+			continue;
+		if (off != -ENOENT)
+			/* only one such field is allowed */
+			return -E2BIG;
+		off = vsi->offset;
+		if (off % align)
 			return -EINVAL;
 	}
 	return off;
 }
 
+static int btf_find_field(const struct btf *btf, const struct btf_type *t,
+			  const char *name, int sz, int align)
+{
+
+	if (__btf_type_is_struct(t))
+		return btf_find_struct_field(btf, t, name, sz, align);
+	else if (btf_type_is_datasec(t))
+		return btf_find_datasec_var(btf, t, name, sz, align);
+	return -EINVAL;
+}
+
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+	return btf_find_field(btf, t, "bpf_spin_lock",
+			      sizeof(struct bpf_spin_lock),
+			      __alignof__(struct bpf_spin_lock));
+}
+
+int btf_find_timer(const struct btf *btf, const struct btf_type *t)
+{
+	return btf_find_field(btf, t, "bpf_timer",
+			      sizeof(struct bpf_timer),
+			      __alignof__(struct bpf_timer));
+}
+
 static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
 			      u32 type_id, void *data, u8 bits_offset,
 			      struct btf_show *show)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 72c58cc516a3..6dc3fae46a56 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -228,6 +228,32 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
 	return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
 }
 
+static bool htab_has_extra_elems(struct bpf_htab *htab)
+{
+	return !htab_is_percpu(htab) && !htab_is_lru(htab);
+}
+
+static void htab_free_prealloced_timers(struct bpf_htab *htab)
+{
+	u32 num_entries = htab->map.max_entries;
+	int i;
+
+	if (likely(!map_value_has_timer(&htab->map)))
+		return;
+	if (htab_has_extra_elems(htab))
+		num_entries += num_possible_cpus();
+
+	for (i = 0; i < num_entries; i++) {
+		struct htab_elem *elem;
+
+		elem = get_htab_elem(htab, i);
+		bpf_timer_cancel_and_free(elem->key +
+					  round_up(htab->map.key_size, 8) +
+					  htab->map.timer_off);
+		cond_resched();
+	}
+}
+
 static void htab_free_elems(struct bpf_htab *htab)
 {
 	int i;
@@ -265,8 +291,12 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 	struct htab_elem *l;
 
 	if (node) {
+		u32 key_size = htab->map.key_size;
+
 		l = container_of(node, struct htab_elem, lru_node);
-		memcpy(l->key, key, htab->map.key_size);
+		memcpy(l->key, key, key_size);
+		check_and_init_map_value(&htab->map,
+					 l->key + round_up(key_size, 8));
 		return l;
 	}
 
@@ -278,7 +308,7 @@ static int prealloc_init(struct bpf_htab *htab)
 	u32 num_entries = htab->map.max_entries;
 	int err = -ENOMEM, i;
 
-	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+	if (htab_has_extra_elems(htab))
 		num_entries += num_possible_cpus();
 
 	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
@@ -695,6 +725,14 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
 	return insn - insn_buf;
 }
 
+static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
+{
+	if (unlikely(map_value_has_timer(&htab->map)))
+		bpf_timer_cancel_and_free(elem->key +
+					  round_up(htab->map.key_size, 8) +
+					  htab->map.timer_off);
+}
+
 /* It is called from the bpf_lru_list when the LRU needs to delete
  * older elements from the htab.
  */
@@ -719,6 +757,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l == tgt_l) {
 			hlist_nulls_del_rcu(&l->hash_node);
+			check_and_free_timer(htab, l);
 			break;
 		}
 
@@ -790,6 +829,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+	check_and_free_timer(htab, l);
 	kfree(l);
 }
 
@@ -817,6 +857,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	htab_put_fd_value(htab, l);
 
 	if (htab_is_prealloc(htab)) {
+		check_and_free_timer(htab, l);
 		__pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		atomic_dec(&htab->count);
@@ -920,8 +961,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			l_new = ERR_PTR(-ENOMEM);
 			goto dec_count;
 		}
-		check_and_init_map_lock(&htab->map,
-					l_new->key + round_up(key_size, 8));
+		check_and_init_map_value(&htab->map,
+					 l_new->key + round_up(key_size, 8));
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -1062,6 +1103,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		hlist_nulls_del_rcu(&l_old->hash_node);
 		if (!htab_is_prealloc(htab))
 			free_htab_elem(htab, l_old);
+		else
+			check_and_free_timer(htab, l_old);
 	}
 	ret = 0;
 err:
@@ -1069,6 +1112,12 @@ err:
 	return ret;
 }
 
+static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
+{
+	check_and_free_timer(htab, elem);
+	bpf_lru_push_free(&htab->lru, &elem->lru_node);
+}
+
 static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 				    u64 map_flags)
 {
@@ -1102,7 +1151,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	l_new = prealloc_lru_pop(htab, key, hash);
 	if (!l_new)
 		return -ENOMEM;
-	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+	copy_map_value(&htab->map,
+		       l_new->key + round_up(map->key_size, 8), value);
 
 	ret = htab_lock_bucket(htab, b, hash, &flags);
 	if (ret)
@@ -1128,9 +1178,9 @@ err:
 	htab_unlock_bucket(htab, b, hash, flags);
 
 	if (ret)
-		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+		htab_lru_push_free(htab, l_new);
 	else if (l_old)
-		bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+		htab_lru_push_free(htab, l_old);
 
 	return ret;
 }
@@ -1339,7 +1389,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 
 	htab_unlock_bucket(htab, b, hash, flags);
 	if (l)
-		bpf_lru_push_free(&htab->lru, &l->lru_node);
+		htab_lru_push_free(htab, l);
 	return ret;
 }
 
@@ -1359,6 +1409,35 @@ static void delete_all_elements(struct bpf_htab *htab)
 	}
 }
 
+static void htab_free_malloced_timers(struct bpf_htab *htab)
+{
+	int i;
+
+	rcu_read_lock();
+	for (i = 0; i < htab->n_buckets; i++) {
+		struct hlist_nulls_head *head = select_bucket(htab, i);
+		struct hlist_nulls_node *n;
+		struct htab_elem *l;
+
+		hlist_nulls_for_each_entry(l, n, head, hash_node)
+			check_and_free_timer(htab, l);
+		cond_resched_rcu();
+	}
+	rcu_read_unlock();
+}
+
+static void htab_map_free_timers(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	if (likely(!map_value_has_timer(&htab->map)))
+		return;
+	if (!htab_is_prealloc(htab))
+		htab_free_malloced_timers(htab);
+	else
+		htab_free_prealloced_timers(htab);
+}
+
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
 static void htab_map_free(struct bpf_map *map)
 {
@@ -1456,7 +1535,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 			else
 				copy_map_value(map, value, l->key +
 					       roundup_key_size);
-			check_and_init_map_lock(map, value);
+			check_and_init_map_value(map, value);
 		}
 
 		hlist_nulls_del_rcu(&l->hash_node);
@@ -1467,7 +1546,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 	htab_unlock_bucket(htab, b, hash, bflags);
 
 	if (is_lru_map && l)
-		bpf_lru_push_free(&htab->lru, &l->lru_node);
+		htab_lru_push_free(htab, l);
 
 	return ret;
 }
@@ -1645,7 +1724,7 @@ again_nocopy:
 						      true);
 			else
 				copy_map_value(map, dst_val, value);
-			check_and_init_map_lock(map, dst_val);
+			check_and_init_map_value(map, dst_val);
 		}
 		if (do_delete) {
 			hlist_nulls_del_rcu(&l->hash_node);
@@ -1672,7 +1751,7 @@ again_nocopy:
 	while (node_to_free) {
 		l = node_to_free;
 		node_to_free = node_to_free->batch_flink;
-		bpf_lru_push_free(&htab->lru, &l->lru_node);
+		htab_lru_push_free(htab, l);
 	}
 
 next_batch:
@@ -2034,6 +2113,7 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
+	.map_release_uref = htab_map_free_timers,
 	.map_lookup_elem = htab_map_lookup_elem,
 	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
 	.map_update_elem = htab_map_update_elem,
@@ -2055,6 +2135,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
+	.map_release_uref = htab_map_free_timers,
 	.map_lookup_elem = htab_lru_map_lookup_elem,
 	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
 	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index bd11db9774c3..95d70a08325d 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -173,7 +173,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
 		return -ENOMEM;
 
 	memcpy(&new->data[0], value, map->value_size);
-	check_and_init_map_lock(map, new->data);
+	check_and_init_map_value(map, new->data);
 
 	new = xchg(&storage->buf, new);
 	kfree_rcu(new, rcu);
@@ -509,7 +509,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
 						    map->numa_node);
 		if (!storage->buf)
 			goto enomem;
-		check_and_init_map_lock(map, storage->buf->data);
+		check_and_init_map_value(map, storage->buf->data);
 	} else {
 		storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
 		if (!storage->percpu_buf)
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 39ab0b68cade..890dfe14e731 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -50,6 +50,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 	inner_map_meta->map_flags = inner_map->map_flags;
 	inner_map_meta->max_entries = inner_map->max_entries;
 	inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
+	inner_map_meta->timer_off = inner_map->timer_off;
 
 	/* Misc members not needed in bpf_map_meta_equal() check. */
 	inner_map_meta->ops = inner_map->ops;
@@ -75,6 +76,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
 	return meta0->map_type == meta1->map_type &&
 		meta0->key_size == meta1->key_size &&
 		meta0->value_size == meta1->value_size &&
+		meta0->timer_off == meta1->timer_off &&
 		meta0->map_flags == meta1->map_flags;
 }
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5d1fee634be8..9a2068e39d23 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -260,8 +260,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
 				copy_map_value_locked(map, value, ptr, true);
 			else
 				copy_map_value(map, value, ptr);
-			/* mask lock, since value wasn't zero inited */
-			check_and_init_map_lock(map, value);
+			/* mask lock and timer, since value wasn't zero inited */
+			check_and_init_map_value(map, value);
 		}
 		rcu_read_unlock();
 	}
@@ -623,7 +623,8 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
 	struct bpf_map *map = filp->private_data;
 	int err;
 
-	if (!map->ops->map_mmap || map_value_has_spin_lock(map))
+	if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
+	    map_value_has_timer(map))
 		return -ENOTSUPP;
 
 	if (!(vma->vm_flags & VM_SHARED))
@@ -793,6 +794,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 		}
 	}
 
+	map->timer_off = btf_find_timer(btf, value_type);
+	if (map_value_has_timer(map)) {
+		if (map->map_flags & BPF_F_RDONLY_PROG)
+			return -EACCES;
+		if (map->map_type != BPF_MAP_TYPE_HASH &&
+		    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+		    map->map_type != BPF_MAP_TYPE_ARRAY)
+			return -EOPNOTSUPP;
+	}
+
 	if (map->ops->map_check_btf)
 		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
 
@@ -844,6 +855,7 @@ static int map_create(union bpf_attr *attr)
 	mutex_init(&map->freeze_mutex);
 
 	map->spin_lock_off = -EINVAL;
+	map->timer_off = -EINVAL;
 	if (attr->btf_key_type_id || attr->btf_value_type_id ||
 	    /* Even the map's value is a kernel's struct,
 	     * the bpf_prog.o must have BTF to begin with
@@ -1591,7 +1603,8 @@ static int map_freeze(const union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
+	    map_value_has_timer(map)) {
 		fdput(f);
 		return -ENOTSUPP;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e8645c819803..12b50f46a7c1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3241,6 +3241,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 	}
+	if (map_value_has_timer(map)) {
+		u32 t = map->timer_off;
+
+		if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
+		     t < reg->umax_value + off + size) {
+			verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
+			return -EACCES;
+		}
+	}
 	return err;
 }
 
@@ -4675,9 +4684,24 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
 			map->name);
 		return -EINVAL;
 	}
-	if (val) {
-		/* This restriction will be removed in the next patch */
-		verbose(env, "bpf_timer field can only be first in the map value element\n");
+	if (!map_value_has_timer(map)) {
+		if (map->timer_off == -E2BIG)
+			verbose(env,
+				"map '%s' has more than one 'struct bpf_timer'\n",
+				map->name);
+		else if (map->timer_off == -ENOENT)
+			verbose(env,
+				"map '%s' doesn't have 'struct bpf_timer'\n",
+				map->name);
+		else
+			verbose(env,
+				"map '%s' is not a struct type or bpf_timer is mangled\n",
+				map->name);
+		return -EINVAL;
+	}
+	if (map->timer_off != val + reg->off) {
+		verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
+			val + reg->off, map->timer_off);
 		return -EINVAL;
 	}
 	if (meta->map_ptr) {
-- 
cgit v1.2.3


From 3e8ce29850f1839d0603f925b30be9d8a4329917 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 14 Jul 2021 17:54:11 -0700
Subject: bpf: Prevent pointer mismatch in bpf_timer_init.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

bpf_timer_init() arguments are:
1. pointer to a timer (which is embedded in map element).
2. pointer to a map.
Make sure that pointer to a timer actually belongs to that map.

Use map_uid (which is unique id of inner map) to reject:
inner_map1 = bpf_map_lookup_elem(outer_map, key1)
inner_map2 = bpf_map_lookup_elem(outer_map, key2)
if (inner_map1 && inner_map2) {
    timer = bpf_map_lookup_elem(inner_map1);
    if (timer)
        // mismatch would have been allowed
        bpf_timer_init(timer, inner_map2);
}

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210715005417.78572-6-alexei.starovoitov@gmail.com
---
 include/linux/bpf_verifier.h |  9 ++++++++-
 kernel/bpf/verifier.c        | 31 ++++++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e774ecc1cd1f..5d3169b57e6e 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -53,7 +53,14 @@ struct bpf_reg_state {
 		/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
 		 *   PTR_TO_MAP_VALUE_OR_NULL
 		 */
-		struct bpf_map *map_ptr;
+		struct {
+			struct bpf_map *map_ptr;
+			/* To distinguish map lookups from outer map
+			 * the map_uid is non-zero for registers
+			 * pointing to inner maps.
+			 */
+			u32 map_uid;
+		};
 
 		/* for PTR_TO_BTF_ID */
 		struct {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 12b50f46a7c1..8df2671c3d33 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -255,6 +255,7 @@ struct bpf_call_arg_meta {
 	int mem_size;
 	u64 msize_max_value;
 	int ref_obj_id;
+	int map_uid;
 	int func_id;
 	struct btf *btf;
 	u32 btf_id;
@@ -1135,6 +1136,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
 		if (map->inner_map_meta) {
 			reg->type = CONST_PTR_TO_MAP;
 			reg->map_ptr = map->inner_map_meta;
+			/* transfer reg's id which is unique for every map_lookup_elem
+			 * as UID of the inner map.
+			 */
+			reg->map_uid = reg->id;
 		} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
 			reg->type = PTR_TO_XDP_SOCK;
 		} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -4708,6 +4713,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
 		verbose(env, "verifier bug. Two map pointers in a timer helper\n");
 		return -EFAULT;
 	}
+	meta->map_uid = reg->map_uid;
 	meta->map_ptr = map;
 	return 0;
 }
@@ -5006,11 +5012,29 @@ skip_type_check:
 
 	if (arg_type == ARG_CONST_MAP_PTR) {
 		/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
-		if (meta->map_ptr && meta->map_ptr != reg->map_ptr) {
-			verbose(env, "Map pointer doesn't match bpf_timer.\n");
-			return -EINVAL;
+		if (meta->map_ptr) {
+			/* Use map_uid (which is unique id of inner map) to reject:
+			 * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+			 * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+			 * if (inner_map1 && inner_map2) {
+			 *     timer = bpf_map_lookup_elem(inner_map1);
+			 *     if (timer)
+			 *         // mismatch would have been allowed
+			 *         bpf_timer_init(timer, inner_map2);
+			 * }
+			 *
+			 * Comparing map_ptr is enough to distinguish normal and outer maps.
+			 */
+			if (meta->map_ptr != reg->map_ptr ||
+			    meta->map_uid != reg->map_uid) {
+				verbose(env,
+					"timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+					meta->map_uid, reg->map_uid);
+				return -EINVAL;
+			}
 		}
 		meta->map_ptr = reg->map_ptr;
+		meta->map_uid = reg->map_uid;
 	} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
 		/* bpf_map_xxx(..., map_ptr, ..., key) call:
 		 * check that [key, key + map->key_size) are within
@@ -6204,6 +6228,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 		}
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
+		regs[BPF_REG_0].map_uid = meta.map_uid;
 		if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
 			regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
 			if (map_value_has_spin_lock(meta.map_ptr))
-- 
cgit v1.2.3


From bfc6bb74e4f16ab264fa73398a7a79d7d2afac2e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 14 Jul 2021 17:54:14 -0700
Subject: bpf: Implement verifier support for validation of async callbacks.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

bpf_for_each_map_elem() and bpf_timer_set_callback() helpers are relying on
PTR_TO_FUNC infra in the verifier to validate addresses to subprograms
and pass them into the helpers as function callbacks.
In case of bpf_for_each_map_elem() the callback is invoked synchronously
and the verifier treats it as a normal subprogram call by adding another
bpf_func_state and new frame in __check_func_call().
bpf_timer_set_callback() doesn't invoke the callback directly.
The subprogram will be called asynchronously from bpf_timer_cb().
Teach the verifier to validate such async callbacks as special kind
of jump by pushing verifier state into stack and let pop_stack() process it.

Special care needs to be taken during state pruning.
The call insn doing bpf_timer_set_callback has to be a prune_point.
Otherwise short timer callbacks might not have prune points in front of
bpf_timer_set_callback() which means is_state_visited() will be called
after this call insn is processed in __check_func_call(). Which means that
another async_cb state will be pushed to be walked later and the verifier
will eventually hit BPF_COMPLEXITY_LIMIT_JMP_SEQ limit.
Since push_async_cb() looks like another push_stack() branch the
infinite loop detection will trigger false positive. To recognize
this case mark such states as in_async_callback_fn.
To distinguish infinite loop in async callback vs the same callback called
with different arguments for different map and timer add async_entry_cnt
to bpf_func_state.

Enforce return zero from async callbacks.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210715005417.78572-9-alexei.starovoitov@gmail.com
---
 include/linux/bpf_verifier.h |   9 +++-
 kernel/bpf/helpers.c         |   8 ++-
 kernel/bpf/verifier.c        | 123 +++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 131 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5d3169b57e6e..242d0b1a0772 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -208,12 +208,19 @@ struct bpf_func_state {
 	 * zero == main subprog
 	 */
 	u32 subprogno;
+	/* Every bpf_timer_start will increment async_entry_cnt.
+	 * It's used to distinguish:
+	 * void foo(void) { for(;;); }
+	 * void foo(void) { bpf_timer_set_callback(,foo); }
+	 */
+	u32 async_entry_cnt;
+	bool in_callback_fn;
+	bool in_async_callback_fn;
 
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
 	struct bpf_reference_state *refs;
 	int allocated_stack;
-	bool in_callback_fn;
 	struct bpf_stack_state *stack;
 };
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 74b16593983d..9fe846ec6bd1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1043,7 +1043,6 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
 	void *callback_fn;
 	void *key;
 	u32 idx;
-	int ret;
 
 	callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
 	if (!callback_fn)
@@ -1066,10 +1065,9 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
 		key = value - round_up(map->key_size, 8);
 	}
 
-	ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
-					 (u64)(long)key,
-					 (u64)(long)value, 0, 0);
-	WARN_ON(ret != 0); /* Next patch moves this check into the verifier */
+	BPF_CAST_CALL(callback_fn)((u64)(long)map, (u64)(long)key,
+				   (u64)(long)value, 0, 0);
+	/* The verifier checked that return value is zero. */
 
 	this_cpu_write(hrtimer_running, NULL);
 out:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1cb1b35e69b7..ab06256bf6c8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -735,6 +735,10 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			if (state->refs[i].id)
 				verbose(env, ",%d", state->refs[i].id);
 	}
+	if (state->in_callback_fn)
+		verbose(env, " cb");
+	if (state->in_async_callback_fn)
+		verbose(env, " async_cb");
 	verbose(env, "\n");
 }
 
@@ -1527,6 +1531,54 @@ static void init_func_state(struct bpf_verifier_env *env,
 	init_reg_state(env, state);
 }
 
+/* Similar to push_stack(), but for async callbacks */
+static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
+						int insn_idx, int prev_insn_idx,
+						int subprog)
+{
+	struct bpf_verifier_stack_elem *elem;
+	struct bpf_func_state *frame;
+
+	elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+	if (!elem)
+		goto err;
+
+	elem->insn_idx = insn_idx;
+	elem->prev_insn_idx = prev_insn_idx;
+	elem->next = env->head;
+	elem->log_pos = env->log.len_used;
+	env->head = elem;
+	env->stack_size++;
+	if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+		verbose(env,
+			"The sequence of %d jumps is too complex for async cb.\n",
+			env->stack_size);
+		goto err;
+	}
+	/* Unlike push_stack() do not copy_verifier_state().
+	 * The caller state doesn't matter.
+	 * This is async callback. It starts in a fresh stack.
+	 * Initialize it similar to do_check_common().
+	 */
+	elem->st.branches = 1;
+	frame = kzalloc(sizeof(*frame), GFP_KERNEL);
+	if (!frame)
+		goto err;
+	init_func_state(env, frame,
+			BPF_MAIN_FUNC /* callsite */,
+			0 /* frameno within this callchain */,
+			subprog /* subprog number within this prog */);
+	elem->st.frame[0] = frame;
+	return &elem->st;
+err:
+	free_verifier_state(env->cur_state, true);
+	env->cur_state = NULL;
+	/* pop all elements and return */
+	while (!pop_stack(env, NULL, NULL, false));
+	return NULL;
+}
+
+
 enum reg_arg_type {
 	SRC_OP,		/* register is used as source operand */
 	DST_OP,		/* register is used as destination operand */
@@ -5704,6 +5756,30 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		}
 	}
 
+	if (insn->code == (BPF_JMP | BPF_CALL) &&
+	    insn->imm == BPF_FUNC_timer_set_callback) {
+		struct bpf_verifier_state *async_cb;
+
+		/* there is no real recursion here. timer callbacks are async */
+		async_cb = push_async_cb(env, env->subprog_info[subprog].start,
+					 *insn_idx, subprog);
+		if (!async_cb)
+			return -EFAULT;
+		callee = async_cb->frame[0];
+		callee->async_entry_cnt = caller->async_entry_cnt + 1;
+
+		/* Convert bpf_timer_set_callback() args into timer callback args */
+		err = set_callee_state_cb(env, caller, callee, *insn_idx);
+		if (err)
+			return err;
+
+		clear_caller_saved_regs(env, caller->regs);
+		mark_reg_unknown(env, caller->regs, BPF_REG_0);
+		caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+		/* continue with next insn after call */
+		return 0;
+	}
+
 	callee = kzalloc(sizeof(*callee), GFP_KERNEL);
 	if (!callee)
 		return -ENOMEM;
@@ -5856,6 +5932,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
 	/* unused */
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+	callee->in_async_callback_fn = true;
 	return 0;
 }
 
@@ -9224,7 +9301,8 @@ static int check_return_code(struct bpf_verifier_env *env)
 	struct tnum range = tnum_range(0, 1);
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	int err;
-	const bool is_subprog = env->cur_state->frame[0]->subprogno;
+	struct bpf_func_state *frame = env->cur_state->frame[0];
+	const bool is_subprog = frame->subprogno;
 
 	/* LSM and struct_ops func-ptr's return type could be "void" */
 	if (!is_subprog &&
@@ -9249,6 +9327,22 @@ static int check_return_code(struct bpf_verifier_env *env)
 	}
 
 	reg = cur_regs(env) + BPF_REG_0;
+
+	if (frame->in_async_callback_fn) {
+		/* enforce return zero from async callbacks like timer */
+		if (reg->type != SCALAR_VALUE) {
+			verbose(env, "In async callback the register R0 is not a known value (%s)\n",
+				reg_type_str[reg->type]);
+			return -EINVAL;
+		}
+
+		if (!tnum_in(tnum_const(0), reg->var_off)) {
+			verbose_invalid_scalar(env, reg, &range, "async callback", "R0");
+			return -EINVAL;
+		}
+		return 0;
+	}
+
 	if (is_subprog) {
 		if (reg->type != SCALAR_VALUE) {
 			verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
@@ -9496,6 +9590,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
 		return DONE_EXPLORING;
 
 	case BPF_CALL:
+		if (insns[t].imm == BPF_FUNC_timer_set_callback)
+			/* Mark this call insn to trigger is_state_visited() check
+			 * before call itself is processed by __check_func_call().
+			 * Otherwise new async state will be pushed for further
+			 * exploration.
+			 */
+			init_explored_state(env, t);
 		return visit_func_call_insn(t, insn_cnt, insns, env,
 					    insns[t].src_reg == BPF_PSEUDO_CALL);
 
@@ -10503,9 +10604,25 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		states_cnt++;
 		if (sl->state.insn_idx != insn_idx)
 			goto next;
+
 		if (sl->state.branches) {
-			if (states_maybe_looping(&sl->state, cur) &&
-			    states_equal(env, &sl->state, cur)) {
+			struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
+
+			if (frame->in_async_callback_fn &&
+			    frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
+				/* Different async_entry_cnt means that the verifier is
+				 * processing another entry into async callback.
+				 * Seeing the same state is not an indication of infinite
+				 * loop or infinite recursion.
+				 * But finding the same state doesn't mean that it's safe
+				 * to stop processing the current state. The previous state
+				 * hasn't yet reached bpf_exit, since state.branches > 0.
+				 * Checking in_async_callback_fn alone is not enough either.
+				 * Since the verifier still needs to catch infinite loops
+				 * inside async callbacks.
+				 */
+			} else if (states_maybe_looping(&sl->state, cur) &&
+				   states_equal(env, &sl->state, cur)) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
 				return -EINVAL;
-- 
cgit v1.2.3


From 7ddc80a476c2d599246028af5808d15f9e24c109 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 14 Jul 2021 17:54:15 -0700
Subject: bpf: Teach stack depth check about async callbacks.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Teach max stack depth checking algorithm about async callbacks
that don't increase bpf program stack size.
Also add sanity check that bpf_tail_call didn't sneak into async cb.
It's impossible, since PTR_TO_CTX is not available in async cb,
hence the program cannot contain bpf_tail_call(ctx,...);

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210715005417.78572-10-alexei.starovoitov@gmail.com
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 18 +++++++++++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 242d0b1a0772..b847e1ccd10f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -406,6 +406,7 @@ struct bpf_subprog_info {
 	bool has_tail_call;
 	bool tail_call_reachable;
 	bool has_ld_abs;
+	bool is_async_cb;
 };
 
 /* single container for all structs
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ab06256bf6c8..344ee67265cc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3709,6 +3709,8 @@ process_func:
 continue_func:
 	subprog_end = subprog[idx + 1].start;
 	for (; i < subprog_end; i++) {
+		int next_insn;
+
 		if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
 			continue;
 		/* remember insn and function to return to */
@@ -3716,13 +3718,22 @@ continue_func:
 		ret_prog[frame] = idx;
 
 		/* find the callee */
-		i = i + insn[i].imm + 1;
-		idx = find_subprog(env, i);
+		next_insn = i + insn[i].imm + 1;
+		idx = find_subprog(env, next_insn);
 		if (idx < 0) {
 			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
-				  i);
+				  next_insn);
 			return -EFAULT;
 		}
+		if (subprog[idx].is_async_cb) {
+			if (subprog[idx].has_tail_call) {
+				verbose(env, "verifier bug. subprog has tail_call and async cb\n");
+				return -EFAULT;
+			}
+			 /* async callbacks don't increase bpf prog stack size */
+			continue;
+		}
+		i = next_insn;
 
 		if (subprog[idx].has_tail_call)
 			tail_call_reachable = true;
@@ -5761,6 +5772,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		struct bpf_verifier_state *async_cb;
 
 		/* there is no real recursion here. timer callbacks are async */
+		env->subprog_info[subprog].is_async_cb = true;
 		async_cb = push_async_cb(env, env->subprog_info[subprog].start,
 					 *insn_idx, subprog);
 		if (!async_cb)
-- 
cgit v1.2.3


From 7e6f3cd89f04a0a577002d5696288b482109d25c Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 14 Jul 2021 11:43:53 +0200
Subject: bpf, x86: Store caller's ip in trampoline stack

Storing caller's ip in trampoline's stack. Trampoline programs
can reach the IP in (ctx - 8) address, so there's no change in
program's arguments interface.

The IP address is takes from [fp + 8], which is return address
from the initial 'call fentry' call to trampoline.

This IP address will be returned via bpf_get_func_ip helper
helper, which is added in following patches.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210714094400.396467-2-jolsa@kernel.org
---
 arch/x86/net/bpf_jit_comp.c | 19 +++++++++++++++++++
 include/linux/bpf.h         |  5 +++++
 2 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index e835164189f1..c320b3ce7b58 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1951,6 +1951,9 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	if (flags & BPF_TRAMP_F_CALL_ORIG)
 		stack_size += 8; /* room for return value of orig_call */
 
+	if (flags & BPF_TRAMP_F_IP_ARG)
+		stack_size += 8; /* room for IP address argument */
+
 	if (flags & BPF_TRAMP_F_SKIP_FRAME)
 		/* skip patched call instruction and point orig_call to actual
 		 * body of the kernel function.
@@ -1964,6 +1967,22 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
 	EMIT1(0x53);		 /* push rbx */
 
+	if (flags & BPF_TRAMP_F_IP_ARG) {
+		/* Store IP address of the traced function:
+		 * mov rax, QWORD PTR [rbp + 8]
+		 * sub rax, X86_PATCH_SIZE
+		 * mov QWORD PTR [rbp - stack_size], rax
+		 */
+		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
+		EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE);
+		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -stack_size);
+
+		/* Continue with stack_size for regs storage, stack will
+		 * be correctly restored with 'leave' instruction.
+		 */
+		stack_size -= 8;
+	}
+
 	save_regs(m, &prog, nr_args, stack_size);
 
 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a9a4a480a6d0..94d77dc7ce35 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -579,6 +579,11 @@ struct btf_func_model {
  */
 #define BPF_TRAMP_F_SKIP_FRAME		BIT(2)
 
+/* Store IP address of the caller on the trampoline stack,
+ * so it's available for trampoline's programs.
+ */
+#define BPF_TRAMP_F_IP_ARG		BIT(3)
+
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.  Pick a number to fit into BPF_IMAGE_SIZE / 2
  */
-- 
cgit v1.2.3


From 1e37392cccdea94da635e3c6d16b21865806f619 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 14 Jul 2021 11:43:54 +0200
Subject: bpf: Enable BPF_TRAMP_F_IP_ARG for trampolines with call_get_func_ip

Enabling BPF_TRAMP_F_IP_ARG for trampolines that actually need it.

The BPF_TRAMP_F_IP_ARG adds extra 3 instructions to trampoline code
and is used only by programs with bpf_get_func_ip helper, which is
added in following patch and sets call_get_func_ip bit.

This patch ensures that BPF_TRAMP_F_IP_ARG flag is used only for
trampolines that have programs with call_get_func_ip set.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210714094400.396467-3-jolsa@kernel.org
---
 include/linux/filter.h  |  3 ++-
 kernel/bpf/trampoline.c | 12 +++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 472f97074da0..ba36989f711a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -559,7 +559,8 @@ struct bpf_prog {
 				kprobe_override:1, /* Do we override a kprobe? */
 				has_callchain_buf:1, /* callchain buffer allocated? */
 				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
-				call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */
+				call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
+				call_get_func_ip:1; /* Do we call get_func_ip() */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 28a3630c48ee..b2535acfe9db 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -172,7 +172,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 }
 
 static struct bpf_tramp_progs *
-bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
+bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
 {
 	const struct bpf_prog_aux *aux;
 	struct bpf_tramp_progs *tprogs;
@@ -189,8 +189,10 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
 		*total += tr->progs_cnt[kind];
 		progs = tprogs[kind].progs;
 
-		hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
+		hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) {
+			*ip_arg |= aux->prog->call_get_func_ip;
 			*progs++ = aux->prog;
+		}
 	}
 	return tprogs;
 }
@@ -333,9 +335,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 	struct bpf_tramp_image *im;
 	struct bpf_tramp_progs *tprogs;
 	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
+	bool ip_arg = false;
 	int err, total;
 
-	tprogs = bpf_trampoline_get_progs(tr, &total);
+	tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg);
 	if (IS_ERR(tprogs))
 		return PTR_ERR(tprogs);
 
@@ -357,6 +360,9 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 	    tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
 		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
 
+	if (ip_arg)
+		flags |= BPF_TRAMP_F_IP_ARG;
+
 	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
 					  &tr->func.model, flags, tprogs,
 					  tr->func.addr);
-- 
cgit v1.2.3


From 9b99edcae5c80c8fb9f8e7149bae528c9e610a72 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 14 Jul 2021 11:43:55 +0200
Subject: bpf: Add bpf_get_func_ip helper for tracing programs

Adding bpf_get_func_ip helper for BPF_PROG_TYPE_TRACING programs,
specifically for all trampoline attach types.

The trampoline's caller IP address is stored in (ctx - 8) address.
so there's no reason to actually call the helper, but rather fixup
the call instruction and return [ctx - 8] value directly.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210714094400.396467-4-jolsa@kernel.org
---
 include/uapi/linux/bpf.h       |  7 +++++++
 kernel/bpf/verifier.c          | 43 ++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c       | 15 +++++++++++++++
 tools/include/uapi/linux/bpf.h |  7 +++++++
 4 files changed, 72 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3544ec5234f0..89688f16ad60 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4841,6 +4841,12 @@ union bpf_attr {
  *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
  *		**-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its
  *		own timer which would have led to a deadlock otherwise.
+ *
+ * u64 bpf_get_func_ip(void *ctx)
+ * 	Description
+ * 		Get address of the traced function (for tracing programs).
+ * 	Return
+ * 		Address of the traced function.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5016,6 +5022,7 @@ union bpf_attr {
 	FN(timer_set_callback),		\
 	FN(timer_start),		\
 	FN(timer_cancel),		\
+	FN(get_func_ip),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 344ee67265cc..ceef190514e4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6161,6 +6161,27 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
 	return err;
 }
 
+static int check_get_func_ip(struct bpf_verifier_env *env)
+{
+	enum bpf_attach_type eatype = env->prog->expected_attach_type;
+	enum bpf_prog_type type = resolve_prog_type(env->prog);
+	int func_id = BPF_FUNC_get_func_ip;
+
+	if (type == BPF_PROG_TYPE_TRACING) {
+		if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT &&
+		    eatype != BPF_MODIFY_RETURN) {
+			verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
+				func_id_name(func_id), func_id);
+			return -ENOTSUPP;
+		}
+		return 0;
+	}
+
+	verbose(env, "func %s#%d not supported for program type %d\n",
+		func_id_name(func_id), func_id, type);
+	return -ENOTSUPP;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			     int *insn_idx_p)
 {
@@ -6439,6 +6460,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
 		env->prog->call_get_stack = true;
 
+	if (func_id == BPF_FUNC_get_func_ip) {
+		if (check_get_func_ip(env))
+			return -ENOTSUPP;
+		env->prog->call_get_func_ip = true;
+	}
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
@@ -12632,6 +12659,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 {
 	struct bpf_prog *prog = env->prog;
 	bool expect_blinding = bpf_jit_blinding_enabled(prog);
+	enum bpf_prog_type prog_type = resolve_prog_type(prog);
 	struct bpf_insn *insn = prog->insnsi;
 	const struct bpf_func_proto *fn;
 	const int insn_cnt = prog->len;
@@ -12998,6 +13026,21 @@ patch_map_ops_generic:
 			continue;
 		}
 
+		/* Implement bpf_get_func_ip inline. */
+		if (prog_type == BPF_PROG_TYPE_TRACING &&
+		    insn->imm == BPF_FUNC_get_func_ip) {
+			/* Load IP address from ctx - 8 */
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+			if (!new_prog)
+				return -ENOMEM;
+
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
 patch_call_imm:
 		fn = env->ops->get_func_proto(insn->imm, env->prog);
 		/* all functions that have prototype and verifier allowed
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6c77d25137e0..3e71503eeb23 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -948,6 +948,19 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx)
+{
+	/* This helper call is inlined by verifier. */
+	return ((u64 *)ctx)[-1];
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
+	.func		= bpf_get_func_ip_tracing,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
 const struct bpf_func_proto *
 bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -1058,6 +1071,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_for_each_map_elem_proto;
 	case BPF_FUNC_snprintf:
 		return &bpf_snprintf_proto;
+	case BPF_FUNC_get_func_ip:
+		return &bpf_get_func_ip_proto_tracing;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3544ec5234f0..89688f16ad60 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4841,6 +4841,12 @@ union bpf_attr {
  *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
  *		**-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its
  *		own timer which would have led to a deadlock otherwise.
+ *
+ * u64 bpf_get_func_ip(void *ctx)
+ * 	Description
+ * 		Get address of the traced function (for tracing programs).
+ * 	Return
+ * 		Address of the traced function.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5016,6 +5022,7 @@ union bpf_attr {
 	FN(timer_set_callback),		\
 	FN(timer_start),		\
 	FN(timer_cancel),		\
+	FN(get_func_ip),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From 9ffd9f3ff7193933dae171740ab70a103d460065 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 14 Jul 2021 11:43:56 +0200
Subject: bpf: Add bpf_get_func_ip helper for kprobe programs

Adding bpf_get_func_ip helper for BPF_PROG_TYPE_KPROBE programs,
so it's now possible to call bpf_get_func_ip from both kprobe and
kretprobe programs.

Taking the caller's address from 'struct kprobe::addr', which is
defined for both kprobe and kretprobe.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Link: https://lore.kernel.org/bpf/20210714094400.396467-5-jolsa@kernel.org
---
 include/uapi/linux/bpf.h       |  2 +-
 kernel/bpf/verifier.c          |  2 ++
 kernel/trace/bpf_trace.c       | 16 ++++++++++++++++
 tools/include/uapi/linux/bpf.h |  2 +-
 4 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 89688f16ad60..2db6925e04f4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4844,7 +4844,7 @@ union bpf_attr {
  *
  * u64 bpf_get_func_ip(void *ctx)
  * 	Description
- * 		Get address of the traced function (for tracing programs).
+ * 		Get address of the traced function (for tracing and kprobe programs).
  * 	Return
  * 		Address of the traced function.
  */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ceef190514e4..97216f799ba8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6175,6 +6175,8 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
 			return -ENOTSUPP;
 		}
 		return 0;
+	} else if (type == BPF_PROG_TYPE_KPROBE) {
+		return 0;
 	}
 
 	verbose(env, "func %s#%d not supported for program type %d\n",
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3e71503eeb23..0b113716bc7a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -961,6 +961,20 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
+{
+	struct kprobe *kp = kprobe_running();
+
+	return kp ? (u64) kp->addr : 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
+	.func		= bpf_get_func_ip_kprobe,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
 const struct bpf_func_proto *
 bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -1092,6 +1106,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_override_return:
 		return &bpf_override_return_proto;
 #endif
+	case BPF_FUNC_get_func_ip:
+		return &bpf_get_func_ip_proto_kprobe;
 	default:
 		return bpf_tracing_func_proto(func_id, prog);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 89688f16ad60..2db6925e04f4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4844,7 +4844,7 @@ union bpf_attr {
  *
  * u64 bpf_get_func_ip(void *ctx)
  * 	Description
- * 		Get address of the traced function (for tracing programs).
+ * 		Get address of the traced function (for tracing and kprobe programs).
  * 	Return
  * 		Address of the traced function.
  */
-- 
cgit v1.2.3


From 17edea21b38d047a10c189296c58aea9875d0d0a Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Sun, 4 Jul 2021 12:02:42 -0700
Subject: sock_map: Relax config dependency to CONFIG_NET

Currently sock_map still has Kconfig dependency on CONFIG_INET,
but there is no actual functional dependency on it after we
introduce ->psock_update_sk_prot().

We have to extend it to CONFIG_NET now as we are going to
support AF_UNIX.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210704190252.11866-2-xiyou.wangcong@gmail.com
---
 include/linux/bpf.h | 38 ++++++++++++++++++++------------------
 kernel/bpf/Kconfig  |  2 +-
 net/core/Makefile   |  2 --
 3 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 94d77dc7ce35..d25c16c365e5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1887,6 +1887,12 @@ void bpf_map_offload_map_free(struct bpf_map *map);
 int bpf_prog_test_run_syscall(struct bpf_prog *prog,
 			      const union bpf_attr *kattr,
 			      union bpf_attr __user *uattr);
+
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
+int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
+int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
+void sock_map_unhash(struct sock *sk);
+void sock_map_close(struct sock *sk, long timeout);
 #else
 static inline int bpf_prog_offload_init(struct bpf_prog *prog,
 					union bpf_attr *attr)
@@ -1919,24 +1925,6 @@ static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog,
 {
 	return -ENOTSUPP;
 }
-#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
-
-#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
-int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
-int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
-int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
-void sock_map_unhash(struct sock *sk);
-void sock_map_close(struct sock *sk, long timeout);
-
-void bpf_sk_reuseport_detach(struct sock *sk);
-int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
-				       void *value);
-int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
-				       void *value, u64 map_flags);
-#else
-static inline void bpf_sk_reuseport_detach(struct sock *sk)
-{
-}
 
 #ifdef CONFIG_BPF_SYSCALL
 static inline int sock_map_get_from_fd(const union bpf_attr *attr,
@@ -1956,7 +1944,21 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void
 {
 	return -EOPNOTSUPP;
 }
+#endif /* CONFIG_BPF_SYSCALL */
+#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
+#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
+void bpf_sk_reuseport_detach(struct sock *sk);
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+				       void *value);
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags);
+#else
+static inline void bpf_sk_reuseport_detach(struct sock *sk)
+{
+}
+
+#ifdef CONFIG_BPF_SYSCALL
 static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
 						     void *key, void *value)
 {
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index bd04f4a44c01..a82d6de86522 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -29,7 +29,7 @@ config BPF_SYSCALL
 	select IRQ_WORK
 	select TASKS_TRACE_RCU
 	select BINARY_PRINTF
-	select NET_SOCK_MSG if INET
+	select NET_SOCK_MSG if NET
 	default n
 	help
 	  Enable the bpf() system call that allows to manipulate BPF programs
diff --git a/net/core/Makefile b/net/core/Makefile
index f7f16650fe9e..35ced6201814 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -33,8 +33,6 @@ obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
 obj-$(CONFIG_GRO_CELLS) += gro_cells.o
 obj-$(CONFIG_FAILOVER) += failover.o
-ifeq ($(CONFIG_INET),y)
 obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
 obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
-endif
 obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
-- 
cgit v1.2.3


From c63829182c37c2d6d0608976d15fa61ebebe9e6b Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Sun, 4 Jul 2021 12:02:47 -0700
Subject: af_unix: Implement ->psock_update_sk_prot()

Now we can implement unix_bpf_update_proto() to update
sk_prot, especially prot->close().

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210704190252.11866-7-xiyou.wangcong@gmail.com
---
 MAINTAINERS           |  1 +
 include/net/af_unix.h | 10 ++++++++++
 net/core/sock_map.c   |  1 +
 net/unix/Makefile     |  1 +
 net/unix/af_unix.c    |  6 +++++-
 net/unix/unix_bpf.c   | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 net/unix/unix_bpf.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 88449b7a4c95..2c793df1d873 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10277,6 +10277,7 @@ F:	net/core/skmsg.c
 F:	net/core/sock_map.c
 F:	net/ipv4/tcp_bpf.c
 F:	net/ipv4/udp_bpf.c
+F:	net/unix/unix_bpf.c
 
 LANDLOCK SECURITY MODULE
 M:	Mickaël Salaün <mic@digikod.net>
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index f42fdddecd41..cca645846af1 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -89,4 +89,14 @@ void unix_sysctl_unregister(struct net *net);
 static inline int unix_sysctl_register(struct net *net) { return 0; }
 static inline void unix_sysctl_unregister(struct net *net) {}
 #endif
+
+#ifdef CONFIG_BPF_SYSCALL
+extern struct proto unix_proto;
+
+int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
+void __init unix_bpf_build_proto(void);
+#else
+static inline void __init unix_bpf_build_proto(void)
+{}
+#endif
 #endif
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 3c427e7e6df9..ae5fa4338d9c 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1517,6 +1517,7 @@ void sock_map_close(struct sock *sk, long timeout)
 	release_sock(sk);
 	saved_close(sk, timeout);
 }
+EXPORT_SYMBOL_GPL(sock_map_close);
 
 static int sock_map_iter_attach_target(struct bpf_prog *prog,
 				       union bpf_iter_link_info *linfo,
diff --git a/net/unix/Makefile b/net/unix/Makefile
index 54e58cc4f945..20491825b4d0 100644
--- a/net/unix/Makefile
+++ b/net/unix/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_UNIX)	+= unix.o
 
 unix-y			:= af_unix.o garbage.o
 unix-$(CONFIG_SYSCTL)	+= sysctl_net_unix.o
+unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o
 
 obj-$(CONFIG_UNIX_DIAG)	+= unix_diag.o
 unix_diag-y		:= diag.o
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 875eeaaddc07..573253c5b5c2 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -788,11 +788,14 @@ static void unix_close(struct sock *sk, long timeout)
 	 */
 }
 
-static struct proto unix_proto = {
+struct proto unix_proto = {
 	.name			= "UNIX",
 	.owner			= THIS_MODULE,
 	.obj_size		= sizeof(struct unix_sock),
 	.close			= unix_close,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= unix_bpf_update_proto,
+#endif
 };
 
 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
@@ -2973,6 +2976,7 @@ static int __init af_unix_init(void)
 
 	sock_register(&unix_family_ops);
 	register_pernet_subsys(&unix_net_ops);
+	unix_bpf_build_proto();
 out:
 	return rc;
 }
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
new file mode 100644
index 000000000000..b1582a659427
--- /dev/null
+++ b/net/unix/unix_bpf.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */
+
+#include <linux/skmsg.h>
+#include <linux/bpf.h>
+#include <net/sock.h>
+#include <net/af_unix.h>
+
+static struct proto *unix_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(unix_prot_lock);
+static struct proto unix_bpf_prot;
+
+static void unix_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+{
+	*prot        = *base;
+	prot->close  = sock_map_close;
+}
+
+static void unix_bpf_check_needs_rebuild(struct proto *ops)
+{
+	if (unlikely(ops != smp_load_acquire(&unix_prot_saved))) {
+		spin_lock_bh(&unix_prot_lock);
+		if (likely(ops != unix_prot_saved)) {
+			unix_bpf_rebuild_protos(&unix_bpf_prot, ops);
+			smp_store_release(&unix_prot_saved, ops);
+		}
+		spin_unlock_bh(&unix_prot_lock);
+	}
+}
+
+int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+	if (restore) {
+		sk->sk_write_space = psock->saved_write_space;
+		WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+		return 0;
+	}
+
+	unix_bpf_check_needs_rebuild(psock->sk_proto);
+	WRITE_ONCE(sk->sk_prot, &unix_bpf_prot);
+	return 0;
+}
+
+void __init unix_bpf_build_proto(void)
+{
+	unix_bpf_rebuild_protos(&unix_bpf_prot, &unix_proto);
+}
-- 
cgit v1.2.3


From 9825d866ce0d11009513e06824885340062c166b Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Sun, 4 Jul 2021 12:02:48 -0700
Subject: af_unix: Implement unix_dgram_bpf_recvmsg()

We have to implement unix_dgram_bpf_recvmsg() to replace the
original ->recvmsg() to retrieve skmsg from ingress_msg.

AF_UNIX is again special here because the lack of
sk_prot->recvmsg(). I simply add a special case inside
unix_dgram_recvmsg() to call sk->sk_prot->recvmsg() directly.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210704190252.11866-8-xiyou.wangcong@gmail.com
---
 include/net/af_unix.h |  2 ++
 net/unix/af_unix.c    | 19 ++++++++++---
 net/unix/unix_bpf.c   | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index cca645846af1..435a2c3d5a6f 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -82,6 +82,8 @@ static inline struct unix_sock *unix_sk(const struct sock *sk)
 long unix_inq_len(struct sock *sk);
 long unix_outq_len(struct sock *sk);
 
+int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
+			 int flags);
 #ifdef CONFIG_SYSCTL
 int unix_sysctl_register(struct net *net);
 void unix_sysctl_unregister(struct net *net);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 573253c5b5c2..89927678c0dc 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2098,11 +2098,11 @@ static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
 	}
 }
 
-static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
-			      size_t size, int flags)
+int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
+			 int flags)
 {
 	struct scm_cookie scm;
-	struct sock *sk = sock->sk;
+	struct socket *sock = sk->sk_socket;
 	struct unix_sock *u = unix_sk(sk);
 	struct sk_buff *skb, *last;
 	long timeo;
@@ -2205,6 +2205,19 @@ out:
 	return err;
 }
 
+static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+			      int flags)
+{
+	struct sock *sk = sock->sk;
+
+#ifdef CONFIG_BPF_SYSCALL
+	if (sk->sk_prot != &unix_proto)
+		return sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+					    flags & ~MSG_DONTWAIT, NULL);
+#endif
+	return __unix_dgram_recvmsg(sk, msg, size, flags);
+}
+
 static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
 			  sk_read_actor_t recv_actor)
 {
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
index b1582a659427..db0cda29fb2f 100644
--- a/net/unix/unix_bpf.c
+++ b/net/unix/unix_bpf.c
@@ -6,6 +6,80 @@
 #include <net/sock.h>
 #include <net/af_unix.h>
 
+#define unix_sk_has_data(__sk, __psock)					\
+		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\
+			!skb_queue_empty(&__psock->ingress_skb) ||	\
+			!list_empty(&__psock->ingress_msg);		\
+		})
+
+static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock,
+			      long timeo)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct unix_sock *u = unix_sk(sk);
+	int ret = 0;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return 1;
+
+	if (!timeo)
+		return ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	if (!unix_sk_has_data(sk, psock)) {
+		mutex_unlock(&u->iolock);
+		wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		mutex_lock(&u->iolock);
+		ret = unix_sk_has_data(sk, psock);
+	}
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
+static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
+				  size_t len, int nonblock, int flags,
+				  int *addr_len)
+{
+	struct unix_sock *u = unix_sk(sk);
+	struct sk_psock *psock;
+	int copied, ret;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return __unix_dgram_recvmsg(sk, msg, len, flags);
+
+	mutex_lock(&u->iolock);
+	if (!skb_queue_empty(&sk->sk_receive_queue) &&
+	    sk_psock_queue_empty(psock)) {
+		ret = __unix_dgram_recvmsg(sk, msg, len, flags);
+		goto out;
+	}
+
+msg_bytes_ready:
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	if (!copied) {
+		long timeo;
+		int data;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = unix_msg_wait_data(sk, psock, timeo);
+		if (data) {
+			if (!sk_psock_queue_empty(psock))
+				goto msg_bytes_ready;
+			ret = __unix_dgram_recvmsg(sk, msg, len, flags);
+			goto out;
+		}
+		copied = -EAGAIN;
+	}
+	ret = copied;
+out:
+	mutex_unlock(&u->iolock);
+	sk_psock_put(sk, psock);
+	return ret;
+}
+
 static struct proto *unix_prot_saved __read_mostly;
 static DEFINE_SPINLOCK(unix_prot_lock);
 static struct proto unix_bpf_prot;
@@ -14,6 +88,7 @@ static void unix_bpf_rebuild_protos(struct proto *prot, const struct proto *base
 {
 	*prot        = *base;
 	prot->close  = sock_map_close;
+	prot->recvmsg = unix_dgram_bpf_recvmsg;
 }
 
 static void unix_bpf_check_needs_rebuild(struct proto *ops)
-- 
cgit v1.2.3


From b83d23a2a38b1770da0491257ae81d52307f7816 Mon Sep 17 00:00:00 2001
From: Mark Gray <mark.d.gray@redhat.com>
Date: Thu, 15 Jul 2021 08:27:54 -0400
Subject: openvswitch: Introduce per-cpu upcall dispatch

The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.

This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:

* On systems with a large number of vports, there is a correspondingly
large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=1834444)

This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.

In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:

a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Ordering per-flow is maintained as packets are distributed to
CPUs based on mechanisms such as RSS and flows are distributed
to a single user space thread.
c) Packets from a flow can only wake up one user space thread.

The corresponding user space code can be found at:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html

Bugzilla: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  8 +++++
 net/openvswitch/actions.c        |  6 +++-
 net/openvswitch/datapath.c       | 72 ++++++++++++++++++++++++++++++++++++++--
 net/openvswitch/datapath.h       | 20 +++++++++++
 4 files changed, 103 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 8d16744edc31..6571b57b2268 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -70,6 +70,8 @@ enum ovs_datapath_cmd {
  * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
  * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
  * not be sent.
+ * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
+ * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
  * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
  * datapath.  Always present in notifications.
  * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the
@@ -87,6 +89,9 @@ enum ovs_datapath_attr {
 	OVS_DP_ATTR_USER_FEATURES,	/* OVS_DP_F_*  */
 	OVS_DP_ATTR_PAD,
 	OVS_DP_ATTR_MASKS_CACHE_SIZE,
+	OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls in per-cpu
+				     * dispatch mode
+				     */
 	__OVS_DP_ATTR_MAX
 };
 
@@ -127,6 +132,9 @@ struct ovs_vport_stats {
 /* Allow tc offload recirc sharing */
 #define OVS_DP_F_TC_RECIRC_SHARING	(1 << 2)
 
+/* Allow per-cpu dispatch of upcalls */
+#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU	(1 << 3)
+
 /* Fixed logical ports. */
 #define OVSP_LOCAL      ((__u32)0)
 
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index ef15d9eb4774..f79679746c62 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -924,7 +924,11 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 			break;
 
 		case OVS_USERSPACE_ATTR_PID:
-			upcall.portid = nla_get_u32(a);
+			if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
+				upcall.portid =
+				   ovs_dp_get_upcall_portid(dp, smp_processor_id());
+			else
+				upcall.portid = nla_get_u32(a);
 			break;
 
 		case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: {
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index bc164b35e67d..7a4edafdc685 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -133,6 +133,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
 
 static void ovs_dp_masks_rebalance(struct work_struct *work);
 
+static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *);
+
 /* Must be called with rcu_read_lock or ovs_mutex. */
 const char *ovs_dp_name(const struct datapath *dp)
 {
@@ -166,6 +168,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
 	free_percpu(dp->stats_percpu);
 	kfree(dp->ports);
 	ovs_meters_exit(dp);
+	kfree(dp->upcall_portids);
 	kfree(dp);
 }
 
@@ -239,7 +242,12 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
 
 		memset(&upcall, 0, sizeof(upcall));
 		upcall.cmd = OVS_PACKET_CMD_MISS;
-		upcall.portid = ovs_vport_find_upcall_portid(p, skb);
+
+		if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
+			upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id());
+		else
+			upcall.portid = ovs_vport_find_upcall_portid(p, skb);
+
 		upcall.mru = OVS_CB(skb)->mru;
 		error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
 		if (unlikely(error))
@@ -1594,16 +1602,67 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb,
 
 DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support);
 
+static int ovs_dp_set_upcall_portids(struct datapath *dp,
+			      const struct nlattr *ids)
+{
+	struct dp_nlsk_pids *old, *dp_nlsk_pids;
+
+	if (!nla_len(ids) || nla_len(ids) % sizeof(u32))
+		return -EINVAL;
+
+	old = ovsl_dereference(dp->upcall_portids);
+
+	dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids),
+			       GFP_KERNEL);
+	if (!dp_nlsk_pids)
+		return -ENOMEM;
+
+	dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32);
+	nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids));
+
+	rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids);
+
+	kfree_rcu(old, rcu);
+
+	return 0;
+}
+
+u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id)
+{
+	struct dp_nlsk_pids *dp_nlsk_pids;
+
+	dp_nlsk_pids = rcu_dereference(dp->upcall_portids);
+
+	if (dp_nlsk_pids) {
+		if (cpu_id < dp_nlsk_pids->n_pids) {
+			return dp_nlsk_pids->pids[cpu_id];
+		} else if (dp_nlsk_pids->n_pids > 0 && cpu_id >= dp_nlsk_pids->n_pids) {
+			/* If the number of netlink PIDs is mismatched with the number of
+			 * CPUs as seen by the kernel, log this and send the upcall to an
+			 * arbitrary socket (0) in order to not drop packets
+			 */
+			pr_info_ratelimited("cpu_id mismatch with handler threads");
+			return dp_nlsk_pids->pids[cpu_id % dp_nlsk_pids->n_pids];
+		} else {
+			return 0;
+		}
+	} else {
+		return 0;
+	}
+}
+
 static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
 {
 	u32 user_features = 0;
+	int err;
 
 	if (a[OVS_DP_ATTR_USER_FEATURES]) {
 		user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
 
 		if (user_features & ~(OVS_DP_F_VPORT_PIDS |
 				      OVS_DP_F_UNALIGNED |
-				      OVS_DP_F_TC_RECIRC_SHARING))
+				      OVS_DP_F_TC_RECIRC_SHARING |
+				      OVS_DP_F_DISPATCH_UPCALL_PER_CPU))
 			return -EOPNOTSUPP;
 
 #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
@@ -1624,6 +1683,15 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
 
 	dp->user_features = user_features;
 
+	if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU &&
+	    a[OVS_DP_ATTR_PER_CPU_PIDS]) {
+		/* Upcall Netlink Port IDs have been updated */
+		err = ovs_dp_set_upcall_portids(dp,
+						a[OVS_DP_ATTR_PER_CPU_PIDS]);
+		if (err)
+			return err;
+	}
+
 	if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
 		static_branch_enable(&tc_recirc_sharing_support);
 	else
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 38f7d3e66ca6..fcfe6cb46441 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -50,6 +50,21 @@ struct dp_stats_percpu {
 	struct u64_stats_sync syncp;
 };
 
+/**
+ * struct dp_nlsk_pids - array of netlink portids of for a datapath.
+ *                       This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU
+ *                       is enabled and must be protected by rcu.
+ * @rcu: RCU callback head for deferred destruction.
+ * @n_pids: Size of @pids array.
+ * @pids: Array storing the Netlink socket PIDs indexed by CPU ID for packets
+ *       that miss the flow table.
+ */
+struct dp_nlsk_pids {
+	struct rcu_head rcu;
+	u32 n_pids;
+	u32 pids[];
+};
+
 /**
  * struct datapath - datapath for flow-based packet switching
  * @rcu: RCU callback head for deferred destruction.
@@ -61,6 +76,7 @@ struct dp_stats_percpu {
  * @net: Reference to net namespace.
  * @max_headroom: the maximum headroom of all vports in this datapath; it will
  * be used by all the internal vports in this dp.
+ * @upcall_portids: RCU protected 'struct dp_nlsk_pids'.
  *
  * Context: See the comment on locking at the top of datapath.c for additional
  * locking information.
@@ -87,6 +103,8 @@ struct datapath {
 
 	/* Switch meters. */
 	struct dp_meter_table meter_tbl;
+
+	struct dp_nlsk_pids __rcu *upcall_portids;
 };
 
 /**
@@ -243,6 +261,8 @@ int ovs_dp_upcall(struct datapath *, struct sk_buff *,
 		  const struct sw_flow_key *, const struct dp_upcall_info *,
 		  uint32_t cutlen);
 
+u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id);
+
 const char *ovs_dp_name(const struct datapath *dp);
 struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net,
 					 u32 portid, u32 seq, u8 cmd);
-- 
cgit v1.2.3


From c7603cfa04e7c3a435b31d065f7cbdc829428f6e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 12 Jul 2021 16:06:15 -0700
Subject: bpf: Add ambient BPF runtime context stored in current

b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage()
helper") fixed the problem with cgroup-local storage use in BPF by
pre-allocating per-CPU array of 8 cgroup storage pointers to accommodate
possible BPF program preemptions and nested executions.

While this seems to work good in practice, it introduces new and unnecessary
failure mode in which not all BPF programs might be executed if we fail to
find an unused slot for cgroup storage, however unlikely it is. It might also
not be so unlikely when/if we allow sleepable cgroup BPF programs in the
future.

Further, the way that cgroup storage is implemented as ambiently-available
property during entire BPF program execution is a convenient way to pass extra
information to BPF program and helpers without requiring user code to pass
around extra arguments explicitly. So it would be good to have a generic
solution that can allow implementing this without arbitrary restrictions.
Ideally, such solution would work for both preemptable and sleepable BPF
programs in exactly the same way.

This patch introduces such solution, bpf_run_ctx. It adds one pointer field
(bpf_ctx) to task_struct. This field is maintained by BPF_PROG_RUN family of
macros in such a way that it always stays valid throughout BPF program
execution. BPF program preemption is handled by remembering previous
current->bpf_ctx value locally while executing nested BPF program and
restoring old value after nested BPF program finishes. This is handled by two
helper functions, bpf_set_run_ctx() and bpf_reset_run_ctx(), which are
supposed to be used before and after BPF program runs, respectively.

Restoring old value of the pointer handles preemption, while bpf_run_ctx
pointer being a property of current task_struct naturally solves this problem
for sleepable BPF programs by "following" BPF program execution as it is
scheduled in and out of CPU. It would even allow CPU migration of BPF
programs, even though it's not currently allowed by BPF infra.

This patch cleans up cgroup local storage handling as a first application. The
design itself is generic, though, with bpf_run_ctx being an empty struct that
is supposed to be embedded into a specific struct for a given BPF program type
(bpf_cg_run_ctx in this case). Follow up patches are planned that will expand
this mechanism for other uses within tracing BPF programs.

To verify that this change doesn't revert the fix to the original cgroup
storage issue, I ran the same repro as in the original report ([0]) and didn't
get any problems. Replacing bpf_reset_run_ctx(old_run_ctx) with
bpf_reset_run_ctx(NULL) triggers the issue pretty quickly (so repro does work).

  [0] https://lore.kernel.org/bpf/YEEvBUiJl2pJkxTd@krava/

Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210712230615.3525979-1-andrii@kernel.org
---
 include/linux/bpf-cgroup.h | 54 ----------------------------------------------
 include/linux/bpf.h        | 54 +++++++++++++++++++++++++++++-----------------
 include/linux/sched.h      |  3 +++
 kernel/bpf/helpers.c       | 16 +++++---------
 kernel/bpf/local_storage.c |  3 ---
 kernel/fork.c              |  1 +
 net/bpf/test_run.c         | 23 ++++++++++----------
 7 files changed, 54 insertions(+), 100 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8b77d08d4b47..a74cd1c3bd87 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -27,19 +27,6 @@ struct task_struct;
 extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE];
 #define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type])
 
-#define BPF_CGROUP_STORAGE_NEST_MAX	8
-
-struct bpf_cgroup_storage_info {
-	struct task_struct *task;
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
-};
-
-/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks
- * to use bpf cgroup storage simultaneously.
- */
-DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
-		bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
-
 #define for_each_cgroup_storage_type(stype) \
 	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
 
@@ -172,44 +159,6 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	return BPF_CGROUP_STORAGE_SHARED;
 }
 
-static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage
-					 *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
-{
-	enum bpf_cgroup_storage_type stype;
-	int i, err = 0;
-
-	preempt_disable();
-	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
-		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL))
-			continue;
-
-		this_cpu_write(bpf_cgroup_storage_info[i].task, current);
-		for_each_cgroup_storage_type(stype)
-			this_cpu_write(bpf_cgroup_storage_info[i].storage[stype],
-				       storage[stype]);
-		goto out;
-	}
-	err = -EBUSY;
-	WARN_ON_ONCE(1);
-
-out:
-	preempt_enable();
-	return err;
-}
-
-static inline void bpf_cgroup_storage_unset(void)
-{
-	int i;
-
-	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
-		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
-			continue;
-
-		this_cpu_write(bpf_cgroup_storage_info[i].task, NULL);
-		return;
-	}
-}
-
 struct bpf_cgroup_storage *
 cgroup_storage_lookup(struct bpf_cgroup_storage_map *map,
 		      void *key, bool locked);
@@ -487,9 +436,6 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
-static inline int bpf_cgroup_storage_set(
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; }
-static inline void bpf_cgroup_storage_unset(void) {}
 static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux,
 					    struct bpf_map *map) { return 0; }
 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0edff8f5177e..978ebd16ae60 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1142,38 +1142,40 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *include_prog,
 			struct bpf_prog_array **new_array);
 
+struct bpf_run_ctx {};
+
+struct bpf_cg_run_ctx {
+	struct bpf_run_ctx run_ctx;
+	struct bpf_prog_array_item *prog_item;
+};
+
 /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */
 #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE			(1 << 0)
 /* BPF program asks to set CN on the packet. */
 #define BPF_RET_SET_CN						(1 << 0)
 
-/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY,
- * if bpf_cgroup_storage_set() failed, the rest of programs
- * will not execute. This should be a really rare scenario
- * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of
- * preemptions all between bpf_cgroup_storage_set() and
- * bpf_cgroup_storage_unset() on the same cpu.
- */
 #define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags)		\
 	({								\
 		struct bpf_prog_array_item *_item;			\
 		struct bpf_prog *_prog;					\
 		struct bpf_prog_array *_array;				\
+		struct bpf_run_ctx *old_run_ctx;			\
+		struct bpf_cg_run_ctx run_ctx;				\
 		u32 _ret = 1;						\
 		u32 func_ret;						\
 		migrate_disable();					\
 		rcu_read_lock();					\
 		_array = rcu_dereference(array);			\
 		_item = &_array->items[0];				\
+		old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);	\
 		while ((_prog = READ_ONCE(_item->prog))) {		\
-			if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage)))	\
-				break;					\
+			run_ctx.prog_item = _item;			\
 			func_ret = func(_prog, ctx);			\
 			_ret &= (func_ret & 1);				\
-			*(ret_flags) |= (func_ret >> 1);			\
-			bpf_cgroup_storage_unset();			\
+			*(ret_flags) |= (func_ret >> 1);		\
 			_item++;					\
 		}							\
+		bpf_reset_run_ctx(old_run_ctx);				\
 		rcu_read_unlock();					\
 		migrate_enable();					\
 		_ret;							\
@@ -1184,6 +1186,8 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 		struct bpf_prog_array_item *_item;	\
 		struct bpf_prog *_prog;			\
 		struct bpf_prog_array *_array;		\
+		struct bpf_run_ctx *old_run_ctx;	\
+		struct bpf_cg_run_ctx run_ctx;		\
 		u32 _ret = 1;				\
 		migrate_disable();			\
 		rcu_read_lock();			\
@@ -1191,17 +1195,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 		if (unlikely(check_non_null && !_array))\
 			goto _out;			\
 		_item = &_array->items[0];		\
-		while ((_prog = READ_ONCE(_item->prog))) {		\
-			if (!set_cg_storage) {			\
-				_ret &= func(_prog, ctx);	\
-			} else {				\
-				if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage)))	\
-					break;			\
-				_ret &= func(_prog, ctx);	\
-				bpf_cgroup_storage_unset();	\
-			}				\
+		old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\
+		while ((_prog = READ_ONCE(_item->prog))) {	\
+			run_ctx.prog_item = _item;	\
+			_ret &= func(_prog, ctx);	\
 			_item++;			\
 		}					\
+		bpf_reset_run_ctx(old_run_ctx);		\
 _out:							\
 		rcu_read_unlock();			\
 		migrate_enable();			\
@@ -1284,6 +1284,20 @@ static inline void bpf_enable_instrumentation(void)
 	migrate_enable();
 }
 
+static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
+{
+	struct bpf_run_ctx *old_ctx;
+
+	old_ctx = current->bpf_ctx;
+	current->bpf_ctx = new_ctx;
+	return old_ctx;
+}
+
+static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx)
+{
+	current->bpf_ctx = old_ctx;
+}
+
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ec8d07d88641..c64119aa2e60 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -42,6 +42,7 @@ struct backing_dev_info;
 struct bio_list;
 struct blk_plug;
 struct bpf_local_storage;
+struct bpf_run_ctx;
 struct capture_control;
 struct cfs_rq;
 struct fs_struct;
@@ -1379,6 +1380,8 @@ struct task_struct {
 #ifdef CONFIG_BPF_SYSCALL
 	/* Used by BPF task local storage */
 	struct bpf_local_storage __rcu	*bpf_storage;
+	/* Used for BPF run context */
+	struct bpf_run_ctx		*bpf_ctx;
 #endif
 
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9fe846ec6bd1..15746f779fe1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -393,8 +393,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
 };
 
 #ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
-		bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
 
 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 {
@@ -403,17 +401,13 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 * verifier checks that its value is correct.
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
-	struct bpf_cgroup_storage *storage = NULL;
+	struct bpf_cgroup_storage *storage;
+	struct bpf_cg_run_ctx *ctx;
 	void *ptr;
-	int i;
 
-	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
-		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
-			continue;
-
-		storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
-		break;
-	}
+	/* get current cgroup storage from BPF run context */
+	ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+	storage = ctx->prog_item->cgroup_storage[stype];
 
 	if (stype == BPF_CGROUP_STORAGE_SHARED)
 		ptr = &READ_ONCE(storage->buf)->data[0];
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 95d70a08325d..362e81481594 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -11,9 +11,6 @@
 
 #ifdef CONFIG_CGROUP_BPF
 
-DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
-	       bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
-
 #include "../cgroup/cgroup-internal.h"
 
 #define LOCAL_STORAGE_CREATE_FLAG_MASK					\
diff --git a/kernel/fork.c b/kernel/fork.c
index bc94b2cc5995..e8b41e212110 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2083,6 +2083,7 @@ static __latent_entropy struct task_struct *copy_process(
 #endif
 #ifdef CONFIG_BPF_SYSCALL
 	RCU_INIT_POINTER(p->bpf_storage, NULL);
+	p->bpf_ctx = NULL;
 #endif
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index cda8375bbbaf..8d46e2962786 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -88,17 +88,19 @@ reset:
 static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 			u32 *retval, u32 *time, bool xdp)
 {
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL };
+	struct bpf_prog_array_item item = {.prog = prog};
+	struct bpf_run_ctx *old_ctx;
+	struct bpf_cg_run_ctx run_ctx;
 	struct bpf_test_timer t = { NO_MIGRATE };
 	enum bpf_cgroup_storage_type stype;
 	int ret;
 
 	for_each_cgroup_storage_type(stype) {
-		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
-		if (IS_ERR(storage[stype])) {
-			storage[stype] = NULL;
+		item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+		if (IS_ERR(item.cgroup_storage[stype])) {
+			item.cgroup_storage[stype] = NULL;
 			for_each_cgroup_storage_type(stype)
-				bpf_cgroup_storage_free(storage[stype]);
+				bpf_cgroup_storage_free(item.cgroup_storage[stype]);
 			return -ENOMEM;
 		}
 	}
@@ -107,22 +109,19 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 		repeat = 1;
 
 	bpf_test_timer_enter(&t);
+	old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	do {
-		ret = bpf_cgroup_storage_set(storage);
-		if (ret)
-			break;
-
+		run_ctx.prog_item = &item;
 		if (xdp)
 			*retval = bpf_prog_run_xdp(prog, ctx);
 		else
 			*retval = BPF_PROG_RUN(prog, ctx);
-
-		bpf_cgroup_storage_unset();
 	} while (bpf_test_timer_continue(&t, repeat, &ret, time));
+	bpf_reset_run_ctx(old_ctx);
 	bpf_test_timer_leave(&t);
 
 	for_each_cgroup_storage_type(stype)
-		bpf_cgroup_storage_free(storage[stype]);
+		bpf_cgroup_storage_free(item.cgroup_storage[stype]);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 96cd2dd65bb0b94c908f2df32bba7350fc1b954e Mon Sep 17 00:00:00 2001
From: Lior Nahmanson <liorna@nvidia.com>
Date: Mon, 28 Dec 2020 10:38:12 +0200
Subject: net/mlx5: Add DCS caps & fields support

This fields will be needed when adding a support for DCS offload

max_dci_stream_channels - maximum DCI stream channels supported per DCI.
max_dci_errored_streams - maximum DCI error stream channels
supported per DCI before a DCI move to error state.

Signed-off-by: Lior Nahmanson <liorna@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index b0009aa3647f..3dd6641e942c 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1651,7 +1651,13 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         max_geneve_tlv_option_data_len[0x5];
 	u8         reserved_at_570[0x10];
 
-	u8         reserved_at_580[0x33];
+	u8	   reserved_at_580[0xb];
+	u8	   log_max_dci_stream_channels[0x5];
+	u8	   reserved_at_590[0x3];
+	u8	   log_max_dci_errored_streams[0x5];
+	u8	   reserved_at_598[0x8];
+
+	u8         reserved_at_5a0[0x13];
 	u8         log_max_dek[0x5];
 	u8         reserved_at_5b8[0x4];
 	u8         mini_cqe_resp_stride_index[0x1];
@@ -3020,10 +3026,12 @@ struct mlx5_ifc_qpc_bits {
 	u8         reserved_at_3c0[0x8];
 	u8         next_send_psn[0x18];
 
-	u8         reserved_at_3e0[0x8];
+	u8         reserved_at_3e0[0x3];
+	u8	   log_num_dci_stream_channels[0x5];
 	u8         cqn_snd[0x18];
 
-	u8         reserved_at_400[0x8];
+	u8         reserved_at_400[0x3];
+	u8	   log_num_dci_errored_streams[0x5];
 	u8         deth_sqpn[0x18];
 
 	u8         reserved_at_420[0x20];
-- 
cgit v1.2.3


From e93abb840a2c356ed2809c31fcedb058601ac2e4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 19 Jul 2021 03:11:07 -0700
Subject: net/tcp_fastopen: remove tcp_fastopen_ctx_lock

Remove the (per netns) spinlock in favor of xchg() atomic operations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Wei Wang <weiwan@google.com>
Link: https://lore.kernel.org/r/20210719101107.3203943-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netns/ipv4.h |  1 -
 net/ipv4/tcp_fastopen.c  | 17 +++--------------
 net/ipv4/tcp_ipv4.c      |  1 -
 3 files changed, 3 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index b8620519eace..2f65701a43c9 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -174,7 +174,6 @@ struct netns_ipv4 {
 	int sysctl_tcp_fastopen;
 	const struct tcp_congestion_ops __rcu  *tcp_congestion_control;
 	struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
-	spinlock_t tcp_fastopen_ctx_lock;
 	unsigned int sysctl_tcp_fastopen_blackhole_timeout;
 	atomic_t tfo_active_disable_times;
 	unsigned long tfo_active_disable_stamp;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 47c32604d38f..1a9fbd5448a7 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -55,12 +55,7 @@ void tcp_fastopen_ctx_destroy(struct net *net)
 {
 	struct tcp_fastopen_context *ctxt;
 
-	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
-
-	ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
-				lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
-	rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL);
-	spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
+	ctxt = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, NULL);
 
 	if (ctxt)
 		call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
@@ -89,18 +84,12 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
 		ctx->num = 1;
 	}
 
-	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
 	if (sk) {
 		q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
-		octx = rcu_dereference_protected(q->ctx,
-			lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
-		rcu_assign_pointer(q->ctx, ctx);
+		octx = xchg((__force struct tcp_fastopen_context **)&q->ctx, ctx);
 	} else {
-		octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
-			lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
-		rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx);
+		octx = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, ctx);
 	}
-	spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
 
 	if (octx)
 		call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b9dc2d6197be..e9321dd39cdb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2964,7 +2964,6 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
-	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
 
-- 
cgit v1.2.3


From f4b7002a7076f025dce59647a77c8251175d2b34 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Mon, 19 Jul 2021 20:06:28 +0300
Subject: net: bridge: add vlan mcast snooping knob

Add a global knob that controls if vlan multicast snooping is enabled.
The proper contexts (vlan or bridge-wide) will be chosen based on the knob
when processing packets and changing bridge device state. Note that
vlans have their individual mcast snooping enabled by default, but this
knob is needed to turn on bridge vlan snooping. It is disabled by
default. To enable the knob vlan filtering must also be enabled, it
doesn't make sense to have vlan mcast snooping without vlan filtering
since that would lead to inconsistencies. Disabling vlan filtering will
also automatically disable vlan mcast snooping.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |   2 +
 net/bridge/br.c                |   9 ++-
 net/bridge/br_device.c         |   7 +-
 net/bridge/br_input.c          |   5 +-
 net/bridge/br_multicast.c      | 143 ++++++++++++++++++++++++++++++++---------
 net/bridge/br_private.h        |  37 +++++++++--
 net/bridge/br_vlan.c           |  20 ++++--
 7 files changed, 175 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 6b56a7549531..7927ad80ee86 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -720,12 +720,14 @@ struct br_mcast_stats {
 
 /* bridge boolean options
  * BR_BOOLOPT_NO_LL_LEARN - disable learning from link-local packets
+ * BR_BOOLOPT_MCAST_VLAN_SNOOPING - control vlan multicast snooping
  *
  * IMPORTANT: if adding a new option do not forget to handle
  *            it in br_boolopt_toggle/get and bridge sysfs
  */
 enum br_boolopt_id {
 	BR_BOOLOPT_NO_LL_LEARN,
+	BR_BOOLOPT_MCAST_VLAN_SNOOPING,
 	BR_BOOLOPT_MAX
 };
 
diff --git a/net/bridge/br.c b/net/bridge/br.c
index ef743f94254d..51f2e25c4cd6 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -214,17 +214,22 @@ static struct notifier_block br_switchdev_notifier = {
 int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
 		      struct netlink_ext_ack *extack)
 {
+	int err = 0;
+
 	switch (opt) {
 	case BR_BOOLOPT_NO_LL_LEARN:
 		br_opt_toggle(br, BROPT_NO_LL_LEARN, on);
 		break;
+	case BR_BOOLOPT_MCAST_VLAN_SNOOPING:
+		err = br_multicast_toggle_vlan_snooping(br, on, extack);
+		break;
 	default:
 		/* shouldn't be called with unsupported options */
 		WARN_ON(1);
 		break;
 	}
 
-	return 0;
+	return err;
 }
 
 int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt)
@@ -232,6 +237,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt)
 	switch (opt) {
 	case BR_BOOLOPT_NO_LL_LEARN:
 		return br_opt_get(br, BROPT_NO_LL_LEARN);
+	case BR_BOOLOPT_MCAST_VLAN_SNOOPING:
+		return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED);
 	default:
 		/* shouldn't be called with unsupported options */
 		WARN_ON(1);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index e815bf4f9f24..00daf35f54d5 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -27,12 +27,14 @@ EXPORT_SYMBOL_GPL(nf_br_ops);
 /* net device transmit always called with BH disabled */
 netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+	struct net_bridge_mcast_port *pmctx_null = NULL;
 	struct net_bridge *br = netdev_priv(dev);
 	struct net_bridge_mcast *brmctx = &br->multicast_ctx;
 	struct net_bridge_fdb_entry *dst;
 	struct net_bridge_mdb_entry *mdst;
 	const struct nf_br_ops *nf_ops;
 	u8 state = BR_STATE_FORWARDING;
+	struct net_bridge_vlan *vlan;
 	const unsigned char *dest;
 	u16 vid = 0;
 
@@ -54,7 +56,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb_reset_mac_header(skb);
 	skb_pull(skb, ETH_HLEN);
 
-	if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state))
+	if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid,
+				&state, &vlan))
 		goto out;
 
 	if (IS_ENABLED(CONFIG_INET) &&
@@ -83,7 +86,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
 			goto out;
 		}
-		if (br_multicast_rcv(brmctx, NULL, skb, vid)) {
+		if (br_multicast_rcv(&brmctx, &pmctx_null, vlan, skb, vid)) {
 			kfree_skb(skb);
 			goto out;
 		}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index bb2036dd4934..8a0c0cc55cb4 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -73,6 +73,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 	struct net_bridge_mdb_entry *mdst;
 	bool local_rcv, mcast_hit = false;
 	struct net_bridge_mcast *brmctx;
+	struct net_bridge_vlan *vlan;
 	struct net_bridge *br;
 	u16 vid = 0;
 	u8 state;
@@ -84,7 +85,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 	pmctx = &p->multicast_ctx;
 	state = p->state;
 	if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid,
-				&state))
+				&state, &vlan))
 		goto out;
 
 	nbp_switchdev_frame_mark(p, skb);
@@ -102,7 +103,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 			local_rcv = true;
 		} else {
 			pkt_type = BR_PKT_MULTICAST;
-			if (br_multicast_rcv(brmctx, pmctx, skb, vid))
+			if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid))
 				goto drop;
 		}
 	}
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index ef4e7de3f18d..b71772828b23 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1797,9 +1797,9 @@ void br_multicast_enable_port(struct net_bridge_port *port)
 {
 	struct net_bridge *br = port->br;
 
-	spin_lock(&br->multicast_lock);
+	spin_lock_bh(&br->multicast_lock);
 	__br_multicast_enable_port_ctx(&port->multicast_ctx);
-	spin_unlock(&br->multicast_lock);
+	spin_unlock_bh(&br->multicast_lock);
 }
 
 static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx)
@@ -1827,9 +1827,9 @@ static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx)
 
 void br_multicast_disable_port(struct net_bridge_port *port)
 {
-	spin_lock(&port->br->multicast_lock);
+	spin_lock_bh(&port->br->multicast_lock);
 	__br_multicast_disable_port_ctx(&port->multicast_ctx);
-	spin_unlock(&port->br->multicast_lock);
+	spin_unlock_bh(&port->br->multicast_lock);
 }
 
 static int __grp_src_delete_marked(struct net_bridge_port_group *pg)
@@ -3510,8 +3510,9 @@ static int br_multicast_ipv6_rcv(struct net_bridge_mcast *brmctx,
 }
 #endif
 
-int br_multicast_rcv(struct net_bridge_mcast *brmctx,
-		     struct net_bridge_mcast_port *pmctx,
+int br_multicast_rcv(struct net_bridge_mcast **brmctx,
+		     struct net_bridge_mcast_port **pmctx,
+		     struct net_bridge_vlan *vlan,
 		     struct sk_buff *skb, u16 vid)
 {
 	int ret = 0;
@@ -3519,16 +3520,36 @@ int br_multicast_rcv(struct net_bridge_mcast *brmctx,
 	BR_INPUT_SKB_CB(skb)->igmp = 0;
 	BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
 
-	if (!br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED))
+	if (!br_opt_get((*brmctx)->br, BROPT_MULTICAST_ENABLED))
 		return 0;
 
+	if (br_opt_get((*brmctx)->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && vlan) {
+		const struct net_bridge_vlan *masterv;
+
+		/* the vlan has the master flag set only when transmitting
+		 * through the bridge device
+		 */
+		if (br_vlan_is_master(vlan)) {
+			masterv = vlan;
+			*brmctx = &vlan->br_mcast_ctx;
+			*pmctx = NULL;
+		} else {
+			masterv = vlan->brvlan;
+			*brmctx = &vlan->brvlan->br_mcast_ctx;
+			*pmctx = &vlan->port_mcast_ctx;
+		}
+
+		if (!(masterv->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED))
+			return 0;
+	}
+
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
-		ret = br_multicast_ipv4_rcv(brmctx, pmctx, skb, vid);
+		ret = br_multicast_ipv4_rcv(*brmctx, *pmctx, skb, vid);
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
 	case htons(ETH_P_IPV6):
-		ret = br_multicast_ipv6_rcv(brmctx, pmctx, skb, vid);
+		ret = br_multicast_ipv6_rcv(*brmctx, *pmctx, skb, vid);
 		break;
 #endif
 	}
@@ -3727,20 +3748,22 @@ static void __br_multicast_open(struct net_bridge_mcast *brmctx)
 
 void br_multicast_open(struct net_bridge *br)
 {
-	struct net_bridge_vlan_group *vg;
-	struct net_bridge_vlan *vlan;
-
 	ASSERT_RTNL();
 
-	vg = br_vlan_group(br);
-	if (vg) {
-		list_for_each_entry(vlan, &vg->vlan_list, vlist) {
-			struct net_bridge_mcast *brmctx;
-
-			brmctx = &vlan->br_mcast_ctx;
-			if (br_vlan_is_brentry(vlan) &&
-			    !br_multicast_ctx_vlan_disabled(brmctx))
-				__br_multicast_open(&vlan->br_mcast_ctx);
+	if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+		struct net_bridge_vlan_group *vg;
+		struct net_bridge_vlan *vlan;
+
+		vg = br_vlan_group(br);
+		if (vg) {
+			list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+				struct net_bridge_mcast *brmctx;
+
+				brmctx = &vlan->br_mcast_ctx;
+				if (br_vlan_is_brentry(vlan) &&
+				    !br_multicast_ctx_vlan_disabled(brmctx))
+					__br_multicast_open(&vlan->br_mcast_ctx);
+			}
 		}
 	}
 
@@ -3804,22 +3827,80 @@ void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on)
 	}
 }
 
-void br_multicast_stop(struct net_bridge *br)
+void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on)
+{
+	struct net_bridge_port *p;
+
+	if (WARN_ON_ONCE(!br_vlan_is_master(vlan)))
+		return;
+
+	list_for_each_entry(p, &vlan->br->port_list, list) {
+		struct net_bridge_vlan *vport;
+
+		vport = br_vlan_find(nbp_vlan_group(p), vlan->vid);
+		if (!vport)
+			continue;
+		br_multicast_toggle_one_vlan(vport, on);
+	}
+}
+
+int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
+				      struct netlink_ext_ack *extack)
 {
 	struct net_bridge_vlan_group *vg;
 	struct net_bridge_vlan *vlan;
+	struct net_bridge_port *p;
 
-	ASSERT_RTNL();
+	if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) == on)
+		return 0;
+
+	if (on && !br_opt_get(br, BROPT_VLAN_ENABLED)) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot enable multicast vlan snooping with vlan filtering disabled");
+		return -EINVAL;
+	}
 
 	vg = br_vlan_group(br);
-	if (vg) {
-		list_for_each_entry(vlan, &vg->vlan_list, vlist) {
-			struct net_bridge_mcast *brmctx;
-
-			brmctx = &vlan->br_mcast_ctx;
-			if (br_vlan_is_brentry(vlan) &&
-			    !br_multicast_ctx_vlan_disabled(brmctx))
-				__br_multicast_stop(&vlan->br_mcast_ctx);
+	if (!vg)
+		return 0;
+
+	br_opt_toggle(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED, on);
+
+	/* disable/enable non-vlan mcast contexts based on vlan snooping */
+	if (on)
+		__br_multicast_stop(&br->multicast_ctx);
+	else
+		__br_multicast_open(&br->multicast_ctx);
+	list_for_each_entry(p, &br->port_list, list) {
+		if (on)
+			br_multicast_disable_port(p);
+		else
+			br_multicast_enable_port(p);
+	}
+
+	list_for_each_entry(vlan, &vg->vlan_list, vlist)
+		br_multicast_toggle_vlan(vlan, on);
+
+	return 0;
+}
+
+void br_multicast_stop(struct net_bridge *br)
+{
+	ASSERT_RTNL();
+
+	if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+		struct net_bridge_vlan_group *vg;
+		struct net_bridge_vlan *vlan;
+
+		vg = br_vlan_group(br);
+		if (vg) {
+			list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+				struct net_bridge_mcast *brmctx;
+
+				brmctx = &vlan->br_mcast_ctx;
+				if (br_vlan_is_brentry(vlan) &&
+				    !br_multicast_ctx_vlan_disabled(brmctx))
+					__br_multicast_stop(&vlan->br_mcast_ctx);
+			}
 		}
 	}
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 5588f2d3546f..c3c2f19d3b71 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -433,6 +433,7 @@ enum net_bridge_opts {
 	BROPT_VLAN_STATS_PER_PORT,
 	BROPT_NO_LL_LEARN,
 	BROPT_VLAN_BRIDGE_BINDING,
+	BROPT_MCAST_VLAN_SNOOPING_ENABLED,
 };
 
 struct net_bridge {
@@ -829,8 +830,9 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
 
 /* br_multicast.c */
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
-int br_multicast_rcv(struct net_bridge_mcast *brmctx,
-		     struct net_bridge_mcast_port *pmctx,
+int br_multicast_rcv(struct net_bridge_mcast **brmctx,
+		     struct net_bridge_mcast_port **pmctx,
+		     struct net_bridge_vlan *vlan,
 		     struct sk_buff *skb, u16 vid);
 struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge_mcast *brmctx,
 					struct sk_buff *skb, u16 vid);
@@ -904,6 +906,9 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port,
 				struct net_bridge_mcast_port *pmctx);
 void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx);
 void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on);
+void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on);
+int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
+				      struct netlink_ext_ack *extack);
 
 static inline bool br_group_is_l2(const struct br_ip *group)
 {
@@ -1090,7 +1095,8 @@ br_multicast_port_ctx_get_global(const struct net_bridge_mcast_port *pmctx)
 static inline bool
 br_multicast_ctx_vlan_global_disabled(const struct net_bridge_mcast *brmctx)
 {
-	return br_multicast_ctx_is_vlan(brmctx) &&
+	return br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) &&
+	       br_multicast_ctx_is_vlan(brmctx) &&
 	       !(brmctx->vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED);
 }
 
@@ -1108,8 +1114,9 @@ br_multicast_port_ctx_vlan_disabled(const struct net_bridge_mcast_port *pmctx)
 	       !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED);
 }
 #else
-static inline int br_multicast_rcv(struct net_bridge_mcast *brmctx,
-				   struct net_bridge_mcast_port *pmctx,
+static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx,
+				   struct net_bridge_mcast_port **pmctx,
+				   struct net_bridge_vlan *vlan,
 				   struct sk_buff *skb,
 				   u16 vid)
 {
@@ -1245,13 +1252,26 @@ static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan,
 						bool on)
 {
 }
+
+static inline void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan,
+					    bool on)
+{
+}
+
+static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br,
+						    bool on,
+						    struct netlink_ext_ack *extack)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 /* br_vlan.c */
 #ifdef CONFIG_BRIDGE_VLAN_FILTERING
 bool br_allowed_ingress(const struct net_bridge *br,
 			struct net_bridge_vlan_group *vg, struct sk_buff *skb,
-			u16 *vid, u8 *state);
+			u16 *vid, u8 *state,
+			struct net_bridge_vlan **vlan);
 bool br_allowed_egress(struct net_bridge_vlan_group *vg,
 		       const struct sk_buff *skb);
 bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
@@ -1363,8 +1383,11 @@ static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid)
 static inline bool br_allowed_ingress(const struct net_bridge *br,
 				      struct net_bridge_vlan_group *vg,
 				      struct sk_buff *skb,
-				      u16 *vid, u8 *state)
+				      u16 *vid, u8 *state,
+				      struct net_bridge_vlan **vlan)
+
 {
+	*vlan = NULL;
 	return true;
 }
 
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 1a8cb2b1b762..ab4969a4a380 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -481,7 +481,8 @@ out:
 static bool __allowed_ingress(const struct net_bridge *br,
 			      struct net_bridge_vlan_group *vg,
 			      struct sk_buff *skb, u16 *vid,
-			      u8 *state)
+			      u8 *state,
+			      struct net_bridge_vlan **vlan)
 {
 	struct pcpu_sw_netstats *stats;
 	struct net_bridge_vlan *v;
@@ -546,8 +547,9 @@ static bool __allowed_ingress(const struct net_bridge *br,
 			 */
 			skb->vlan_tci |= pvid;
 
-		/* if stats are disabled we can avoid the lookup */
-		if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
+		/* if snooping and stats are disabled we can avoid the lookup */
+		if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) &&
+		    !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
 			if (*state == BR_STATE_FORWARDING) {
 				*state = br_vlan_get_pvid_state(vg);
 				return br_vlan_state_allowed(*state, true);
@@ -574,6 +576,8 @@ static bool __allowed_ingress(const struct net_bridge *br,
 		u64_stats_update_end(&stats->syncp);
 	}
 
+	*vlan = v;
+
 	return true;
 
 drop:
@@ -583,17 +587,19 @@ drop:
 
 bool br_allowed_ingress(const struct net_bridge *br,
 			struct net_bridge_vlan_group *vg, struct sk_buff *skb,
-			u16 *vid, u8 *state)
+			u16 *vid, u8 *state,
+			struct net_bridge_vlan **vlan)
 {
 	/* If VLAN filtering is disabled on the bridge, all packets are
 	 * permitted.
 	 */
+	*vlan = NULL;
 	if (!br_opt_get(br, BROPT_VLAN_ENABLED)) {
 		BR_INPUT_SKB_CB(skb)->vlan_filtered = false;
 		return true;
 	}
 
-	return __allowed_ingress(br, vg, skb, vid, state);
+	return __allowed_ingress(br, vg, skb, vid, state, vlan);
 }
 
 /* Called under RCU. */
@@ -834,6 +840,10 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val,
 	br_manage_promisc(br);
 	recalculate_group_addr(br);
 	br_recalculate_fwd_mask(br);
+	if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+		br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n");
+		br_multicast_toggle_vlan_snooping(br, false, NULL);
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 1e9ca45662d6bb65fb60d3fbb7737b081d9cffc9 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Mon, 19 Jul 2021 20:06:33 +0300
Subject: net: bridge: multicast: include router port vlan id in notifications

Use the port multicast context to check if the router port is a vlan and
in case it is include its vlan id in the notification.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_mdb.c            | 29 ++++++++++++++++++++++-------
 net/bridge/br_multicast.c      |  4 ++--
 net/bridge/br_private.h        |  2 +-
 4 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 7927ad80ee86..90ac9e11c15b 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -629,6 +629,7 @@ enum {
 	MDBA_ROUTER_PATTR_TYPE,
 	MDBA_ROUTER_PATTR_INET_TIMER,
 	MDBA_ROUTER_PATTR_INET6_TIMER,
+	MDBA_ROUTER_PATTR_VID,
 	__MDBA_ROUTER_PATTR_MAX
 };
 #define MDBA_ROUTER_PATTR_MAX (__MDBA_ROUTER_PATTR_MAX - 1)
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 5319587198eb..d3383a47a2f2 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -781,12 +781,12 @@ errout:
 
 static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
 				   struct net_device *dev,
-				   int ifindex, u32 pid,
+				   int ifindex, u16 vid, u32 pid,
 				   u32 seq, int type, unsigned int flags)
 {
+	struct nlattr *nest, *port_nest;
 	struct br_port_msg *bpm;
 	struct nlmsghdr *nlh;
-	struct nlattr *nest;
 
 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0);
 	if (!nlh)
@@ -800,8 +800,18 @@ static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
 	if (!nest)
 		goto cancel;
 
-	if (nla_put_u32(skb, MDBA_ROUTER_PORT, ifindex))
+	port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT);
+	if (!port_nest)
+		goto end;
+	if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) {
+		nla_nest_cancel(skb, port_nest);
 		goto end;
+	}
+	if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) {
+		nla_nest_cancel(skb, port_nest);
+		goto end;
+	}
+	nla_nest_end(skb, port_nest);
 
 	nla_nest_end(skb, nest);
 	nlmsg_end(skb, nlh);
@@ -817,23 +827,28 @@ cancel:
 static inline size_t rtnl_rtr_nlmsg_size(void)
 {
 	return NLMSG_ALIGN(sizeof(struct br_port_msg))
-		+ nla_total_size(sizeof(__u32));
+		+ nla_total_size(sizeof(__u32))
+		+ nla_total_size(sizeof(u16));
 }
 
-void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
+void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx,
 		   int type)
 {
 	struct net *net = dev_net(dev);
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
 	int ifindex;
+	u16 vid;
 
-	ifindex = port ? port->dev->ifindex : 0;
+	ifindex = pmctx ? pmctx->port->dev->ifindex : 0;
+	vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? pmctx->vlan->vid :
+							      0;
 	skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
-	err = nlmsg_populate_rtr_fill(skb, dev, ifindex, 0, 0, type, NTF_SELF);
+	err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type,
+				      NTF_SELF);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto errout;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 9d4a18a711e4..fb5e5df571fd 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -2979,7 +2979,7 @@ static void br_multicast_add_router(struct net_bridge_mcast *brmctx,
 	 * IPv4 or IPv6 multicast router.
 	 */
 	if (br_multicast_no_router_otherpf(pmctx, rlist)) {
-		br_rtr_notify(pmctx->port->br->dev, pmctx->port, RTM_NEWMDB);
+		br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_NEWMDB);
 		br_port_mc_router_state_change(pmctx->port, true);
 	}
 }
@@ -4078,7 +4078,7 @@ br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted)
 		return;
 #endif
 
-	br_rtr_notify(pmctx->port->br->dev, pmctx->port, RTM_DELMDB);
+	br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_DELMDB);
 	br_port_mc_router_state_change(pmctx->port, false);
 
 	/* don't allow timer refresh */
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 0d019b0b00e5..a18c27f581a3 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -872,7 +872,7 @@ int br_mdb_hash_init(struct net_bridge *br);
 void br_mdb_hash_fini(struct net_bridge *br);
 void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
 		   struct net_bridge_port_group *pg, int type);
-void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
+void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx,
 		   int type);
 void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
 			 struct net_bridge_port_group *pg,
-- 
cgit v1.2.3


From 47ecd2dbd8ec43125ea75d7d2e73c888cda8663f Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Mon, 19 Jul 2021 20:06:34 +0300
Subject: net: bridge: vlan: add support for global options

We can have two types of vlan options depending on context:
 - per-device vlan options (split in per-bridge and per-port)
 - global vlan options

The second type wasn't supported in the bridge until now, but we need
them for per-vlan multicast support, per-vlan STP support and other
options which require global vlan context. They are contained in the global
bridge vlan context even if the vlan is not configured on the bridge device
itself. This patch adds initial netlink attributes and support for setting
these global vlan options, they can only be set (RTM_NEWVLAN) and the
operation must use the bridge device. Since there are no such options yet
it shouldn't have any functional effect.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 13 +++++++
 net/bridge/br_private.h        |  4 ++
 net/bridge/br_vlan.c           | 16 ++++++--
 net/bridge/br_vlan_options.c   | 85 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 115 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 90ac9e11c15b..4ed57d1a5d89 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -485,10 +485,15 @@ enum {
  *     [BRIDGE_VLANDB_ENTRY_INFO]
  *     ...
  * }
+ * [BRIDGE_VLANDB_GLOBAL_OPTIONS] = {
+ *     [BRIDGE_VLANDB_GOPTS_ID]
+ *     ...
+ * }
  */
 enum {
 	BRIDGE_VLANDB_UNSPEC,
 	BRIDGE_VLANDB_ENTRY,
+	BRIDGE_VLANDB_GLOBAL_OPTIONS,
 	__BRIDGE_VLANDB_MAX,
 };
 #define BRIDGE_VLANDB_MAX (__BRIDGE_VLANDB_MAX - 1)
@@ -538,6 +543,14 @@ enum {
 };
 #define BRIDGE_VLANDB_STATS_MAX (__BRIDGE_VLANDB_STATS_MAX - 1)
 
+enum {
+	BRIDGE_VLANDB_GOPTS_UNSPEC,
+	BRIDGE_VLANDB_GOPTS_ID,
+	BRIDGE_VLANDB_GOPTS_RANGE,
+	__BRIDGE_VLANDB_GOPTS_MAX
+};
+#define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
+
 /* Bridge multicast database attributes
  * [MDBA_MDB] = {
  *     [MDBA_MDB_ENTRY] = {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a18c27f581a3..6a6ce233a999 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1592,6 +1592,10 @@ int br_vlan_process_options(const struct net_bridge *br,
 			    struct net_bridge_vlan *range_end,
 			    struct nlattr **tb,
 			    struct netlink_ext_ack *extack);
+int br_vlan_rtm_process_global_options(struct net_device *dev,
+				       const struct nlattr *attr,
+				       int cmd,
+				       struct netlink_ext_ack *extack);
 
 /* vlan state manipulation helpers using *_ONCE to annotate lock-free access */
 static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v)
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index ab4969a4a380..dcb5acf783d2 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -2203,12 +2203,22 @@ static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh,
 	}
 
 	nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) {
-		if (nla_type(attr) != BRIDGE_VLANDB_ENTRY)
+		switch (nla_type(attr)) {
+		case BRIDGE_VLANDB_ENTRY:
+			err = br_vlan_rtm_process_one(dev, attr,
+						      nlh->nlmsg_type,
+						      extack);
+			break;
+		case BRIDGE_VLANDB_GLOBAL_OPTIONS:
+			err = br_vlan_rtm_process_global_options(dev, attr,
+								 nlh->nlmsg_type,
+								 extack);
+			break;
+		default:
 			continue;
+		}
 
 		vlans++;
-		err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type,
-					      extack);
 		if (err)
 			break;
 	}
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index b4add9ea8964..a7d5a2334207 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -258,3 +258,88 @@ int br_vlan_process_options(const struct net_bridge *br,
 
 	return err;
 }
+
+static int br_vlan_process_global_one_opts(const struct net_bridge *br,
+					   struct net_bridge_vlan_group *vg,
+					   struct net_bridge_vlan *v,
+					   struct nlattr **tb,
+					   bool *changed,
+					   struct netlink_ext_ack *extack)
+{
+	*changed = false;
+	return 0;
+}
+
+static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
+	[BRIDGE_VLANDB_GOPTS_ID]	= { .type = NLA_U16 },
+	[BRIDGE_VLANDB_GOPTS_RANGE]	= { .type = NLA_U16 },
+};
+
+int br_vlan_rtm_process_global_options(struct net_device *dev,
+				       const struct nlattr *attr,
+				       int cmd,
+				       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[BRIDGE_VLANDB_GOPTS_MAX + 1];
+	struct net_bridge_vlan_group *vg;
+	struct net_bridge_vlan *v;
+	u16 vid, vid_range = 0;
+	struct net_bridge *br;
+	int err = 0;
+
+	if (cmd != RTM_NEWVLAN) {
+		NL_SET_ERR_MSG_MOD(extack, "Global vlan options support only set operation");
+		return -EINVAL;
+	}
+	if (!netif_is_bridge_master(dev)) {
+		NL_SET_ERR_MSG_MOD(extack, "Global vlan options can only be set on bridge device");
+		return -EINVAL;
+	}
+	br = netdev_priv(dev);
+	vg = br_vlan_group(br);
+	if (WARN_ON(!vg))
+		return -ENODEV;
+
+	err = nla_parse_nested(tb, BRIDGE_VLANDB_GOPTS_MAX, attr,
+			       br_vlan_db_gpol, extack);
+	if (err)
+		return err;
+
+	if (!tb[BRIDGE_VLANDB_GOPTS_ID]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry id");
+		return -EINVAL;
+	}
+	vid = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_ID]);
+	if (!br_vlan_valid_id(vid, extack))
+		return -EINVAL;
+
+	if (tb[BRIDGE_VLANDB_GOPTS_RANGE]) {
+		vid_range = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_RANGE]);
+		if (!br_vlan_valid_id(vid_range, extack))
+			return -EINVAL;
+		if (vid >= vid_range) {
+			NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id");
+			return -EINVAL;
+		}
+	} else {
+		vid_range = vid;
+	}
+
+	for (; vid <= vid_range; vid++) {
+		bool changed = false;
+
+		v = br_vlan_find(vg, vid);
+		if (!v) {
+			NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process global options");
+			err = -ENOENT;
+			break;
+		}
+
+		err = br_vlan_process_global_one_opts(br, vg, v, tb, &changed,
+						      extack);
+		if (err)
+			break;
+	}
+
+	return err;
+}
-- 
cgit v1.2.3


From 743a53d9636aad83da63a8638e8365e817ef6365 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Mon, 19 Jul 2021 20:06:35 +0300
Subject: net: bridge: vlan: add support for dumping global vlan options

Add a new vlan options dump flag which causes only global vlan options
to be dumped. The dumps are done only with bridge devices, ports are
ignored. They support vlan compression if the options in sequential
vlans are equal (currently always true).

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_private.h        |  4 ++++
 net/bridge/br_vlan.c           | 41 +++++++++++++++++++++++++++++++++--------
 net/bridge/br_vlan_options.c   | 31 +++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 4ed57d1a5d89..946ccf33dc53 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -479,6 +479,7 @@ enum {
 
 /* flags used in BRIDGE_VLANDB_DUMP_FLAGS attribute to affect dumps */
 #define BRIDGE_VLANDB_DUMPF_STATS	(1 << 0) /* Include stats in the dump */
+#define BRIDGE_VLANDB_DUMPF_GLOBAL	(1 << 1) /* Dump global vlan options only */
 
 /* Bridge vlan RTM attributes
  * [BRIDGE_VLANDB_ENTRY] = {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 6a6ce233a999..a19dbd63d670 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1596,6 +1596,10 @@ int br_vlan_rtm_process_global_options(struct net_device *dev,
 				       const struct nlattr *attr,
 				       int cmd,
 				       struct netlink_ext_ack *extack);
+bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
+					 const struct net_bridge_vlan *r_end);
+bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
+			      const struct net_bridge_vlan *v_opts);
 
 /* vlan state manipulation helpers using *_ONCE to annotate lock-free access */
 static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v)
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index dcb5acf783d2..e66b004df763 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1919,6 +1919,7 @@ static int br_vlan_dump_dev(const struct net_device *dev,
 			    u32 dump_flags)
 {
 	struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL;
+	bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL);
 	bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS);
 	struct net_bridge_vlan_group *vg;
 	int idx = 0, s_idx = cb->args[1];
@@ -1937,6 +1938,10 @@ static int br_vlan_dump_dev(const struct net_device *dev,
 		vg = br_vlan_group_rcu(br);
 		p = NULL;
 	} else {
+		/* global options are dumped only for bridge devices */
+		if (dump_global)
+			return 0;
+
 		p = br_port_get_rcu(dev);
 		if (WARN_ON(!p))
 			return -EINVAL;
@@ -1959,7 +1964,7 @@ static int br_vlan_dump_dev(const struct net_device *dev,
 
 	/* idx must stay at range's beginning until it is filled in */
 	list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
-		if (!br_vlan_should_use(v))
+		if (!dump_global && !br_vlan_should_use(v))
 			continue;
 		if (idx < s_idx) {
 			idx++;
@@ -1972,8 +1977,21 @@ static int br_vlan_dump_dev(const struct net_device *dev,
 			continue;
 		}
 
-		if (dump_stats || v->vid == pvid ||
-		    !br_vlan_can_enter_range(v, range_end)) {
+		if (dump_global) {
+			if (br_vlan_global_opts_can_enter_range(v, range_end))
+				continue;
+			if (!br_vlan_global_opts_fill(skb, range_start->vid,
+						      range_end->vid,
+						      range_start)) {
+				err = -EMSGSIZE;
+				break;
+			}
+			/* advance number of filled vlans */
+			idx += range_end->vid - range_start->vid + 1;
+
+			range_start = v;
+		} else if (dump_stats || v->vid == pvid ||
+			   !br_vlan_can_enter_range(v, range_end)) {
 			u16 vlan_flags = br_vlan_flags(range_start, pvid);
 
 			if (!br_vlan_fill_vids(skb, range_start->vid,
@@ -1995,11 +2013,18 @@ static int br_vlan_dump_dev(const struct net_device *dev,
 	 * - last vlan (range_start == range_end, not in range)
 	 * - last vlan range (range_start != range_end, in range)
 	 */
-	if (!err && range_start &&
-	    !br_vlan_fill_vids(skb, range_start->vid, range_end->vid,
-			       range_start, br_vlan_flags(range_start, pvid),
-			       dump_stats))
-		err = -EMSGSIZE;
+	if (!err && range_start) {
+		if (dump_global &&
+		    !br_vlan_global_opts_fill(skb, range_start->vid,
+					      range_end->vid, range_start))
+			err = -EMSGSIZE;
+		else if (!dump_global &&
+			 !br_vlan_fill_vids(skb, range_start->vid,
+					    range_end->vid, range_start,
+					    br_vlan_flags(range_start, pvid),
+					    dump_stats))
+			err = -EMSGSIZE;
+	}
 
 	cb->args[1] = err ? idx : 0;
 
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index a7d5a2334207..f290f5140547 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -259,6 +259,37 @@ int br_vlan_process_options(const struct net_bridge *br,
 	return err;
 }
 
+bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
+					 const struct net_bridge_vlan *r_end)
+{
+	return v_curr->vid - r_end->vid == 1;
+}
+
+bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
+			      const struct net_bridge_vlan *v_opts)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, BRIDGE_VLANDB_GLOBAL_OPTIONS);
+	if (!nest)
+		return false;
+
+	if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_ID, vid))
+		goto out_err;
+
+	if (vid_range && vid < vid_range &&
+	    nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_RANGE, vid_range))
+		goto out_err;
+
+	nla_nest_end(skb, nest);
+
+	return true;
+
+out_err:
+	nla_nest_cancel(skb, nest);
+	return false;
+}
+
 static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 					   struct net_bridge_vlan_group *vg,
 					   struct net_bridge_vlan *v,
-- 
cgit v1.2.3


From 9dee572c384846f4ece029ab5688faed0682e48a Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Mon, 19 Jul 2021 20:06:37 +0300
Subject: net: bridge: vlan: add mcast snooping control

Add a new global vlan option which controls whether multicast snooping
is enabled or disabled for a single vlan. It controls the vlan private
flag: BR_VLFLAG_GLOBAL_MCAST_ENABLED.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_multicast.c      | 16 ++++++++++++++++
 net/bridge/br_private.h        |  7 +++++++
 net/bridge/br_vlan_options.c   | 24 +++++++++++++++++++++++-
 4 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 946ccf33dc53..5aca85874447 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -548,6 +548,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_UNSPEC,
 	BRIDGE_VLANDB_GOPTS_ID,
 	BRIDGE_VLANDB_GOPTS_RANGE,
+	BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index fb5e5df571fd..976491951c82 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -3988,6 +3988,22 @@ int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
 	return 0;
 }
 
+bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on)
+{
+	ASSERT_RTNL();
+
+	/* BR_VLFLAG_GLOBAL_MCAST_ENABLED relies on eventual consistency and
+	 * requires only RTNL to change
+	 */
+	if (on == !!(vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED))
+		return false;
+
+	vlan->priv_flags ^= BR_VLFLAG_GLOBAL_MCAST_ENABLED;
+	br_multicast_toggle_vlan(vlan, on);
+
+	return true;
+}
+
 void br_multicast_stop(struct net_bridge *br)
 {
 	ASSERT_RTNL();
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a19dbd63d670..4681a4b6020f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -909,6 +909,7 @@ void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on);
 void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on);
 int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
 				      struct netlink_ext_ack *extack);
+bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on);
 
 static inline bool br_group_is_l2(const struct br_ip *group)
 {
@@ -1282,6 +1283,12 @@ static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br,
 {
 	return -EOPNOTSUPP;
 }
+
+static inline bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan,
+						   bool on)
+{
+	return false;
+}
 #endif
 
 /* br_vlan.c */
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index 827bfc319599..4ef975b20185 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -262,7 +262,9 @@ int br_vlan_process_options(const struct net_bridge *br,
 bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
 					 const struct net_bridge_vlan *r_end)
 {
-	return v_curr->vid - r_end->vid == 1;
+	return v_curr->vid - r_end->vid == 1 &&
+	       ((v_curr->priv_flags ^ r_end->priv_flags) &
+		BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0;
 }
 
 bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
@@ -281,6 +283,12 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 	    nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_RANGE, vid_range))
 		goto out_err;
 
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+	if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING,
+		       !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)))
+		goto out_err;
+#endif
+
 	nla_nest_end(skb, nest);
 
 	return true;
@@ -295,6 +303,9 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 	return NLMSG_ALIGN(sizeof(struct br_vlan_msg))
 		+ nla_total_size(0) /* BRIDGE_VLANDB_GLOBAL_OPTIONS */
 		+ nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_ID */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */
+#endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
 
@@ -349,12 +360,23 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 					   struct netlink_ext_ack *extack)
 {
 	*changed = false;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]) {
+		u8 mc_snooping;
+
+		mc_snooping = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]);
+		if (br_multicast_toggle_global_vlan(v, !!mc_snooping))
+			*changed = true;
+	}
+#endif
+
 	return 0;
 }
 
 static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_ID]	= { .type = NLA_U16 },
 	[BRIDGE_VLANDB_GOPTS_RANGE]	= { .type = NLA_U16 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]	= { .type = NLA_U8 },
 };
 
 int br_vlan_rtm_process_global_options(struct net_device *dev,
-- 
cgit v1.2.3


From 0fac6aa098edf91ba65370da03811d9aba5715a9 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 19 Jul 2021 20:14:42 +0300
Subject: net: dsa: sja1105: delete the best_effort_vlan_filtering mode

Simply put, the best-effort VLAN filtering mode relied on VLAN retagging
from a bridge VLAN towards a tag_8021q sub-VLAN in order to be able to
decode the source port in the tagger, but the VLAN retagging
implementation inside the sja1105 chips is not the best and we were
relying on marginal operating conditions.

The most notable limitation of the best-effort VLAN filtering mode is
its incapacity to treat this case properly:

ip link add br0 type bridge vlan_filtering 1
ip link set swp2 master br0
ip link set swp4 master br0
bridge vlan del dev swp4 vid 1
bridge vlan add dev swp4 vid 1 pvid

When sending an untagged packet through swp2, the expectation is for it
to be forwarded to swp4 as egress-tagged (so it will contain VLAN ID 1
on egress). But the switch will send it as egress-untagged.

There was an attempt to fix this here:
https://patchwork.kernel.org/project/netdevbpf/patch/20210407201452.1703261-2-olteanv@gmail.com/

but it failed miserably because it broke PTP RX timestamping, in a way
that cannot be corrected due to hardware issues related to VLAN
retagging.

So with either PTP broken or pushing VLAN headers on egress for untagged
packets being broken, the sad reality is that the best-effort VLAN
filtering code is broken. Delete it.

Note that this means there will be a temporary loss of functionality in
this driver until it is replaced with something better (network stack
RX/TX capability for "mode 2" as described in
Documentation/networking/dsa/sja1105.rst, the "port under VLAN-aware
bridge" case). We simply cannot keep this code until that driver rework
is done, it is super bloated and tangled with tag_8021q.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105.h         |  13 +-
 drivers/net/dsa/sja1105/sja1105_devlink.c | 114 +------
 drivers/net/dsa/sja1105/sja1105_main.c    | 482 ++----------------------------
 drivers/net/dsa/sja1105/sja1105_vl.c      |  14 +-
 include/linux/dsa/8021q.h                 |   9 +-
 include/linux/dsa/sja1105.h               |   1 -
 net/dsa/tag_8021q.c                       |  77 +----
 net/dsa/tag_ocelot_8021q.c                |   4 +-
 net/dsa/tag_sja1105.c                     |  28 +-
 9 files changed, 42 insertions(+), 700 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index 221c7abdef0e..869b19c08fc0 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -234,19 +234,13 @@ struct sja1105_bridge_vlan {
 	bool untagged;
 };
 
-enum sja1105_vlan_state {
-	SJA1105_VLAN_UNAWARE,
-	SJA1105_VLAN_BEST_EFFORT,
-	SJA1105_VLAN_FILTERING_FULL,
-};
-
 struct sja1105_private {
 	struct sja1105_static_config static_config;
 	bool rgmii_rx_delay[SJA1105_MAX_NUM_PORTS];
 	bool rgmii_tx_delay[SJA1105_MAX_NUM_PORTS];
 	phy_interface_t phy_mode[SJA1105_MAX_NUM_PORTS];
 	bool fixed_link[SJA1105_MAX_NUM_PORTS];
-	bool best_effort_vlan_filtering;
+	bool vlan_aware;
 	unsigned long learn_ena;
 	unsigned long ucast_egress_floods;
 	unsigned long bcast_egress_floods;
@@ -264,7 +258,6 @@ struct sja1105_private {
 	 */
 	struct mutex mgmt_lock;
 	struct dsa_8021q_context *dsa_8021q_ctx;
-	enum sja1105_vlan_state vlan_state;
 	struct devlink_region **regions;
 	struct sja1105_cbs_entry *cbs;
 	struct mii_bus *mdio_base_t1;
@@ -311,10 +304,6 @@ int sja1110_pcs_mdio_write(struct mii_bus *bus, int phy, int reg, u16 val);
 /* From sja1105_devlink.c */
 int sja1105_devlink_setup(struct dsa_switch *ds);
 void sja1105_devlink_teardown(struct dsa_switch *ds);
-int sja1105_devlink_param_get(struct dsa_switch *ds, u32 id,
-			      struct devlink_param_gset_ctx *ctx);
-int sja1105_devlink_param_set(struct dsa_switch *ds, u32 id,
-			      struct devlink_param_gset_ctx *ctx);
 int sja1105_devlink_info_get(struct dsa_switch *ds,
 			     struct devlink_info_req *req,
 			     struct netlink_ext_ack *extack);
diff --git a/drivers/net/dsa/sja1105/sja1105_devlink.c b/drivers/net/dsa/sja1105/sja1105_devlink.c
index b6a4a16b8c7e..05c7f4ca3b1a 100644
--- a/drivers/net/dsa/sja1105/sja1105_devlink.c
+++ b/drivers/net/dsa/sja1105/sja1105_devlink.c
@@ -115,105 +115,6 @@ static void sja1105_teardown_devlink_regions(struct dsa_switch *ds)
 	kfree(priv->regions);
 }
 
-static int sja1105_best_effort_vlan_filtering_get(struct sja1105_private *priv,
-						  bool *be_vlan)
-{
-	*be_vlan = priv->best_effort_vlan_filtering;
-
-	return 0;
-}
-
-static int sja1105_best_effort_vlan_filtering_set(struct sja1105_private *priv,
-						  bool be_vlan)
-{
-	struct dsa_switch *ds = priv->ds;
-	bool vlan_filtering;
-	int port;
-	int rc;
-
-	priv->best_effort_vlan_filtering = be_vlan;
-
-	rtnl_lock();
-	for (port = 0; port < ds->num_ports; port++) {
-		struct dsa_port *dp;
-
-		if (!dsa_is_user_port(ds, port))
-			continue;
-
-		dp = dsa_to_port(ds, port);
-		vlan_filtering = dsa_port_is_vlan_filtering(dp);
-
-		rc = sja1105_vlan_filtering(ds, port, vlan_filtering, NULL);
-		if (rc)
-			break;
-	}
-	rtnl_unlock();
-
-	return rc;
-}
-
-enum sja1105_devlink_param_id {
-	SJA1105_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX,
-	SJA1105_DEVLINK_PARAM_ID_BEST_EFFORT_VLAN_FILTERING,
-};
-
-int sja1105_devlink_param_get(struct dsa_switch *ds, u32 id,
-			      struct devlink_param_gset_ctx *ctx)
-{
-	struct sja1105_private *priv = ds->priv;
-	int err;
-
-	switch (id) {
-	case SJA1105_DEVLINK_PARAM_ID_BEST_EFFORT_VLAN_FILTERING:
-		err = sja1105_best_effort_vlan_filtering_get(priv,
-							     &ctx->val.vbool);
-		break;
-	default:
-		err = -EOPNOTSUPP;
-		break;
-	}
-
-	return err;
-}
-
-int sja1105_devlink_param_set(struct dsa_switch *ds, u32 id,
-			      struct devlink_param_gset_ctx *ctx)
-{
-	struct sja1105_private *priv = ds->priv;
-	int err;
-
-	switch (id) {
-	case SJA1105_DEVLINK_PARAM_ID_BEST_EFFORT_VLAN_FILTERING:
-		err = sja1105_best_effort_vlan_filtering_set(priv,
-							     ctx->val.vbool);
-		break;
-	default:
-		err = -EOPNOTSUPP;
-		break;
-	}
-
-	return err;
-}
-
-static const struct devlink_param sja1105_devlink_params[] = {
-	DSA_DEVLINK_PARAM_DRIVER(SJA1105_DEVLINK_PARAM_ID_BEST_EFFORT_VLAN_FILTERING,
-				 "best_effort_vlan_filtering",
-				 DEVLINK_PARAM_TYPE_BOOL,
-				 BIT(DEVLINK_PARAM_CMODE_RUNTIME)),
-};
-
-static int sja1105_setup_devlink_params(struct dsa_switch *ds)
-{
-	return dsa_devlink_params_register(ds, sja1105_devlink_params,
-					   ARRAY_SIZE(sja1105_devlink_params));
-}
-
-static void sja1105_teardown_devlink_params(struct dsa_switch *ds)
-{
-	dsa_devlink_params_unregister(ds, sja1105_devlink_params,
-				      ARRAY_SIZE(sja1105_devlink_params));
-}
-
 int sja1105_devlink_info_get(struct dsa_switch *ds,
 			     struct devlink_info_req *req,
 			     struct netlink_ext_ack *extack)
@@ -233,23 +134,10 @@ int sja1105_devlink_info_get(struct dsa_switch *ds,
 
 int sja1105_devlink_setup(struct dsa_switch *ds)
 {
-	int rc;
-
-	rc = sja1105_setup_devlink_params(ds);
-	if (rc)
-		return rc;
-
-	rc = sja1105_setup_devlink_regions(ds);
-	if (rc < 0) {
-		sja1105_teardown_devlink_params(ds);
-		return rc;
-	}
-
-	return 0;
+	return sja1105_setup_devlink_regions(ds);
 }
 
 void sja1105_devlink_teardown(struct dsa_switch *ds)
 {
-	sja1105_teardown_devlink_params(ds);
 	sja1105_teardown_devlink_regions(ds);
 }
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index ced8c9cb29c2..4514ac468cc8 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -545,18 +545,11 @@ void sja1105_frame_memory_partitioning(struct sja1105_private *priv)
 {
 	struct sja1105_l2_forwarding_params_entry *l2_fwd_params;
 	struct sja1105_vl_forwarding_params_entry *vl_fwd_params;
-	int max_mem = priv->info->max_frame_mem;
 	struct sja1105_table *table;
 
-	/* VLAN retagging is implemented using a loopback port that consumes
-	 * frame buffers. That leaves less for us.
-	 */
-	if (priv->vlan_state == SJA1105_VLAN_BEST_EFFORT)
-		max_mem -= SJA1105_FRAME_MEMORY_RETAGGING_OVERHEAD;
-
 	table = &priv->static_config.tables[BLK_IDX_L2_FORWARDING_PARAMS];
 	l2_fwd_params = table->entries;
-	l2_fwd_params->part_spc[0] = max_mem;
+	l2_fwd_params->part_spc[0] = SJA1105_MAX_FRAME_MEMORY;
 
 	/* If we have any critical-traffic virtual links, we need to reserve
 	 * some frame buffer memory for them. At the moment, hardcode the value
@@ -1416,7 +1409,7 @@ int sja1105pqrs_fdb_add(struct dsa_switch *ds, int port,
 	l2_lookup.vlanid = vid;
 	l2_lookup.iotag = SJA1105_S_TAG;
 	l2_lookup.mask_macaddr = GENMASK_ULL(ETH_ALEN * 8 - 1, 0);
-	if (priv->vlan_state != SJA1105_VLAN_UNAWARE) {
+	if (priv->vlan_aware) {
 		l2_lookup.mask_vlanid = VLAN_VID_MASK;
 		l2_lookup.mask_iotag = BIT(0);
 	} else {
@@ -1479,7 +1472,7 @@ int sja1105pqrs_fdb_del(struct dsa_switch *ds, int port,
 	l2_lookup.vlanid = vid;
 	l2_lookup.iotag = SJA1105_S_TAG;
 	l2_lookup.mask_macaddr = GENMASK_ULL(ETH_ALEN * 8 - 1, 0);
-	if (priv->vlan_state != SJA1105_VLAN_UNAWARE) {
+	if (priv->vlan_aware) {
 		l2_lookup.mask_vlanid = VLAN_VID_MASK;
 		l2_lookup.mask_iotag = BIT(0);
 	} else {
@@ -1525,7 +1518,7 @@ static int sja1105_fdb_add(struct dsa_switch *ds, int port,
 	 * for what gets printed in 'bridge fdb show'.  In the case of zero,
 	 * no VID gets printed at all.
 	 */
-	if (priv->vlan_state != SJA1105_VLAN_FILTERING_FULL)
+	if (!priv->vlan_aware)
 		vid = 0;
 
 	return priv->info->fdb_add_cmd(ds, port, addr, vid);
@@ -1536,7 +1529,7 @@ static int sja1105_fdb_del(struct dsa_switch *ds, int port,
 {
 	struct sja1105_private *priv = ds->priv;
 
-	if (priv->vlan_state != SJA1105_VLAN_FILTERING_FULL)
+	if (!priv->vlan_aware)
 		vid = 0;
 
 	return priv->info->fdb_del_cmd(ds, port, addr, vid);
@@ -1581,7 +1574,7 @@ static int sja1105_fdb_dump(struct dsa_switch *ds, int port,
 		u64_to_ether_addr(l2_lookup.macaddr, macaddr);
 
 		/* We need to hide the dsa_8021q VLANs from the user. */
-		if (priv->vlan_state == SJA1105_VLAN_UNAWARE)
+		if (!priv->vlan_aware)
 			l2_lookup.vlanid = 0;
 		cb(macaddr, l2_lookup.vlanid, l2_lookup.lockeds, data);
 	}
@@ -2085,57 +2078,6 @@ sja1105_get_tag_protocol(struct dsa_switch *ds, int port,
 	return priv->info->tag_proto;
 }
 
-static int sja1105_find_free_subvlan(u16 *subvlan_map, bool pvid)
-{
-	int subvlan;
-
-	if (pvid)
-		return 0;
-
-	for (subvlan = 1; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++)
-		if (subvlan_map[subvlan] == VLAN_N_VID)
-			return subvlan;
-
-	return -1;
-}
-
-static int sja1105_find_subvlan(u16 *subvlan_map, u16 vid)
-{
-	int subvlan;
-
-	for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++)
-		if (subvlan_map[subvlan] == vid)
-			return subvlan;
-
-	return -1;
-}
-
-static int sja1105_find_committed_subvlan(struct sja1105_private *priv,
-					  int port, u16 vid)
-{
-	struct sja1105_port *sp = &priv->ports[port];
-
-	return sja1105_find_subvlan(sp->subvlan_map, vid);
-}
-
-static void sja1105_init_subvlan_map(u16 *subvlan_map)
-{
-	int subvlan;
-
-	for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++)
-		subvlan_map[subvlan] = VLAN_N_VID;
-}
-
-static void sja1105_commit_subvlan_map(struct sja1105_private *priv, int port,
-				       u16 *subvlan_map)
-{
-	struct sja1105_port *sp = &priv->ports[port];
-	int subvlan;
-
-	for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++)
-		sp->subvlan_map[subvlan] = subvlan_map[subvlan];
-}
-
 static int sja1105_is_vlan_configured(struct sja1105_private *priv, u16 vid)
 {
 	struct sja1105_vlan_lookup_entry *vlan;
@@ -2152,29 +2094,9 @@ static int sja1105_is_vlan_configured(struct sja1105_private *priv, u16 vid)
 	return -1;
 }
 
-static int
-sja1105_find_retagging_entry(struct sja1105_retagging_entry *retagging,
-			     int count, int from_port, u16 from_vid,
-			     u16 to_vid)
-{
-	int i;
-
-	for (i = 0; i < count; i++)
-		if (retagging[i].ing_port == BIT(from_port) &&
-		    retagging[i].vlan_ing == from_vid &&
-		    retagging[i].vlan_egr == to_vid)
-			return i;
-
-	/* Return an invalid entry index if not found */
-	return -1;
-}
-
 static int sja1105_commit_vlans(struct sja1105_private *priv,
-				struct sja1105_vlan_lookup_entry *new_vlan,
-				struct sja1105_retagging_entry *new_retagging,
-				int num_retagging)
+				struct sja1105_vlan_lookup_entry *new_vlan)
 {
-	struct sja1105_retagging_entry *retagging;
 	struct sja1105_vlan_lookup_entry *vlan;
 	struct sja1105_table *table;
 	int num_vlans = 0;
@@ -2234,50 +2156,9 @@ static int sja1105_commit_vlans(struct sja1105_private *priv,
 		vlan[k++] = new_vlan[i];
 	}
 
-	/* VLAN Retagging Table */
-	table = &priv->static_config.tables[BLK_IDX_RETAGGING];
-	retagging = table->entries;
-
-	for (i = 0; i < table->entry_count; i++) {
-		rc = sja1105_dynamic_config_write(priv, BLK_IDX_RETAGGING,
-						  i, &retagging[i], false);
-		if (rc)
-			return rc;
-	}
-
-	if (table->entry_count)
-		kfree(table->entries);
-
-	table->entries = kcalloc(num_retagging, table->ops->unpacked_entry_size,
-				 GFP_KERNEL);
-	if (!table->entries)
-		return -ENOMEM;
-
-	table->entry_count = num_retagging;
-	retagging = table->entries;
-
-	for (i = 0; i < num_retagging; i++) {
-		retagging[i] = new_retagging[i];
-
-		/* Update entry */
-		rc = sja1105_dynamic_config_write(priv, BLK_IDX_RETAGGING,
-						  i, &retagging[i], true);
-		if (rc < 0)
-			return rc;
-	}
-
 	return 0;
 }
 
-struct sja1105_crosschip_vlan {
-	struct list_head list;
-	u16 vid;
-	bool untagged;
-	int port;
-	int other_port;
-	struct dsa_8021q_context *other_ctx;
-};
-
 struct sja1105_crosschip_switch {
 	struct list_head list;
 	struct dsa_8021q_context *other_ctx;
@@ -2289,7 +2170,7 @@ static int sja1105_commit_pvid(struct sja1105_private *priv)
 	struct list_head *vlan_list;
 	int rc = 0;
 
-	if (priv->vlan_state == SJA1105_VLAN_FILTERING_FULL)
+	if (priv->vlan_aware)
 		vlan_list = &priv->bridge_vlans;
 	else
 		vlan_list = &priv->dsa_8021q_vlans;
@@ -2311,7 +2192,7 @@ sja1105_build_bridge_vlans(struct sja1105_private *priv,
 {
 	struct sja1105_bridge_vlan *v;
 
-	if (priv->vlan_state == SJA1105_VLAN_UNAWARE)
+	if (!priv->vlan_aware)
 		return 0;
 
 	list_for_each_entry(v, &priv->bridge_vlans, list) {
@@ -2334,9 +2215,6 @@ sja1105_build_dsa_8021q_vlans(struct sja1105_private *priv,
 {
 	struct sja1105_bridge_vlan *v;
 
-	if (priv->vlan_state == SJA1105_VLAN_FILTERING_FULL)
-		return 0;
-
 	list_for_each_entry(v, &priv->dsa_8021q_vlans, list) {
 		int match = v->vid;
 
@@ -2351,267 +2229,6 @@ sja1105_build_dsa_8021q_vlans(struct sja1105_private *priv,
 	return 0;
 }
 
-static int sja1105_build_subvlans(struct sja1105_private *priv,
-				  u16 subvlan_map[][DSA_8021Q_N_SUBVLAN],
-				  struct sja1105_vlan_lookup_entry *new_vlan,
-				  struct sja1105_retagging_entry *new_retagging,
-				  int *num_retagging)
-{
-	struct sja1105_bridge_vlan *v;
-	int k = *num_retagging;
-
-	if (priv->vlan_state != SJA1105_VLAN_BEST_EFFORT)
-		return 0;
-
-	list_for_each_entry(v, &priv->bridge_vlans, list) {
-		int upstream = dsa_upstream_port(priv->ds, v->port);
-		int match, subvlan;
-		u16 rx_vid;
-
-		/* Only sub-VLANs on user ports need to be applied.
-		 * Bridge VLANs also include VLANs added automatically
-		 * by DSA on the CPU port.
-		 */
-		if (!dsa_is_user_port(priv->ds, v->port))
-			continue;
-
-		subvlan = sja1105_find_subvlan(subvlan_map[v->port],
-					       v->vid);
-		if (subvlan < 0) {
-			subvlan = sja1105_find_free_subvlan(subvlan_map[v->port],
-							    v->pvid);
-			if (subvlan < 0) {
-				dev_err(priv->ds->dev, "No more free subvlans\n");
-				return -ENOSPC;
-			}
-		}
-
-		rx_vid = dsa_8021q_rx_vid_subvlan(priv->ds, v->port, subvlan);
-
-		/* @v->vid on @v->port needs to be retagged to @rx_vid
-		 * on @upstream. Assume @v->vid on @v->port and on
-		 * @upstream was already configured by the previous
-		 * iteration over bridge_vlans.
-		 */
-		match = rx_vid;
-		new_vlan[match].vlanid = rx_vid;
-		new_vlan[match].vmemb_port |= BIT(v->port);
-		new_vlan[match].vmemb_port |= BIT(upstream);
-		new_vlan[match].vlan_bc |= BIT(v->port);
-		new_vlan[match].vlan_bc |= BIT(upstream);
-		/* The "untagged" flag is set the same as for the
-		 * original VLAN
-		 */
-		if (!v->untagged)
-			new_vlan[match].tag_port |= BIT(v->port);
-		/* But it's always tagged towards the CPU */
-		new_vlan[match].tag_port |= BIT(upstream);
-		new_vlan[match].type_entry = SJA1110_VLAN_D_TAG;
-
-		/* The Retagging Table generates packet *clones* with
-		 * the new VLAN. This is a very odd hardware quirk
-		 * which we need to suppress by dropping the original
-		 * packet.
-		 * Deny egress of the original VLAN towards the CPU
-		 * port. This will force the switch to drop it, and
-		 * we'll see only the retagged packets.
-		 */
-		match = v->vid;
-		new_vlan[match].vlan_bc &= ~BIT(upstream);
-
-		/* And the retagging itself */
-		new_retagging[k].vlan_ing = v->vid;
-		new_retagging[k].vlan_egr = rx_vid;
-		new_retagging[k].ing_port = BIT(v->port);
-		new_retagging[k].egr_port = BIT(upstream);
-		if (k++ == SJA1105_MAX_RETAGGING_COUNT) {
-			dev_err(priv->ds->dev, "No more retagging rules\n");
-			return -ENOSPC;
-		}
-
-		subvlan_map[v->port][subvlan] = v->vid;
-	}
-
-	*num_retagging = k;
-
-	return 0;
-}
-
-/* Sadly, in crosschip scenarios where the CPU port is also the link to another
- * switch, we should retag backwards (the dsa_8021q vid to the original vid) on
- * the CPU port of neighbour switches.
- */
-static int
-sja1105_build_crosschip_subvlans(struct sja1105_private *priv,
-				 struct sja1105_vlan_lookup_entry *new_vlan,
-				 struct sja1105_retagging_entry *new_retagging,
-				 int *num_retagging)
-{
-	struct sja1105_crosschip_vlan *tmp, *pos;
-	struct dsa_8021q_crosschip_link *c;
-	struct sja1105_bridge_vlan *v, *w;
-	struct list_head crosschip_vlans;
-	int k = *num_retagging;
-	int rc = 0;
-
-	if (priv->vlan_state != SJA1105_VLAN_BEST_EFFORT)
-		return 0;
-
-	INIT_LIST_HEAD(&crosschip_vlans);
-
-	list_for_each_entry(c, &priv->dsa_8021q_ctx->crosschip_links, list) {
-		struct sja1105_private *other_priv = c->other_ctx->ds->priv;
-
-		if (other_priv->vlan_state == SJA1105_VLAN_FILTERING_FULL)
-			continue;
-
-		/* Crosschip links are also added to the CPU ports.
-		 * Ignore those.
-		 */
-		if (!dsa_is_user_port(priv->ds, c->port))
-			continue;
-		if (!dsa_is_user_port(c->other_ctx->ds, c->other_port))
-			continue;
-
-		/* Search for VLANs on the remote port */
-		list_for_each_entry(v, &other_priv->bridge_vlans, list) {
-			bool already_added = false;
-			bool we_have_it = false;
-
-			if (v->port != c->other_port)
-				continue;
-
-			/* If @v is a pvid on @other_ds, it does not need
-			 * re-retagging, because its SVL field is 0 and we
-			 * already allow that, via the dsa_8021q crosschip
-			 * links.
-			 */
-			if (v->pvid)
-				continue;
-
-			/* Search for the VLAN on our local port */
-			list_for_each_entry(w, &priv->bridge_vlans, list) {
-				if (w->port == c->port && w->vid == v->vid) {
-					we_have_it = true;
-					break;
-				}
-			}
-
-			if (!we_have_it)
-				continue;
-
-			list_for_each_entry(tmp, &crosschip_vlans, list) {
-				if (tmp->vid == v->vid &&
-				    tmp->untagged == v->untagged &&
-				    tmp->port == c->port &&
-				    tmp->other_port == v->port &&
-				    tmp->other_ctx == c->other_ctx) {
-					already_added = true;
-					break;
-				}
-			}
-
-			if (already_added)
-				continue;
-
-			tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
-			if (!tmp) {
-				dev_err(priv->ds->dev, "Failed to allocate memory\n");
-				rc = -ENOMEM;
-				goto out;
-			}
-			tmp->vid = v->vid;
-			tmp->port = c->port;
-			tmp->other_port = v->port;
-			tmp->other_ctx = c->other_ctx;
-			tmp->untagged = v->untagged;
-			list_add(&tmp->list, &crosschip_vlans);
-		}
-	}
-
-	list_for_each_entry(tmp, &crosschip_vlans, list) {
-		struct sja1105_private *other_priv = tmp->other_ctx->ds->priv;
-		int upstream = dsa_upstream_port(priv->ds, tmp->port);
-		int match, subvlan;
-		u16 rx_vid;
-
-		subvlan = sja1105_find_committed_subvlan(other_priv,
-							 tmp->other_port,
-							 tmp->vid);
-		/* If this happens, it's a bug. The neighbour switch does not
-		 * have a subvlan for tmp->vid on tmp->other_port, but it
-		 * should, since we already checked for its vlan_state.
-		 */
-		if (WARN_ON(subvlan < 0)) {
-			rc = -EINVAL;
-			goto out;
-		}
-
-		rx_vid = dsa_8021q_rx_vid_subvlan(tmp->other_ctx->ds,
-						  tmp->other_port,
-						  subvlan);
-
-		/* The @rx_vid retagged from @tmp->vid on
-		 * {@tmp->other_ds, @tmp->other_port} needs to be
-		 * re-retagged to @tmp->vid on the way back to us.
-		 *
-		 * Assume the original @tmp->vid is already configured
-		 * on this local switch, otherwise we wouldn't be
-		 * retagging its subvlan on the other switch in the
-		 * first place. We just need to add a reverse retagging
-		 * rule for @rx_vid and install @rx_vid on our ports.
-		 */
-		match = rx_vid;
-		new_vlan[match].vlanid = rx_vid;
-		new_vlan[match].vmemb_port |= BIT(tmp->port);
-		new_vlan[match].vmemb_port |= BIT(upstream);
-		/* The "untagged" flag is set the same as for the
-		 * original VLAN. And towards the CPU, it doesn't
-		 * really matter, because @rx_vid will only receive
-		 * traffic on that port. For consistency with other dsa_8021q
-		 * VLANs, we'll keep the CPU port tagged.
-		 */
-		if (!tmp->untagged)
-			new_vlan[match].tag_port |= BIT(tmp->port);
-		new_vlan[match].tag_port |= BIT(upstream);
-		new_vlan[match].type_entry = SJA1110_VLAN_D_TAG;
-		/* Deny egress of @rx_vid towards our front-panel port.
-		 * This will force the switch to drop it, and we'll see
-		 * only the re-retagged packets (having the original,
-		 * pre-initial-retagging, VLAN @tmp->vid).
-		 */
-		new_vlan[match].vlan_bc &= ~BIT(tmp->port);
-
-		/* On reverse retagging, the same ingress VLAN goes to multiple
-		 * ports. So we have an opportunity to create composite rules
-		 * to not waste the limited space in the retagging table.
-		 */
-		k = sja1105_find_retagging_entry(new_retagging, *num_retagging,
-						 upstream, rx_vid, tmp->vid);
-		if (k < 0) {
-			if (*num_retagging == SJA1105_MAX_RETAGGING_COUNT) {
-				dev_err(priv->ds->dev, "No more retagging rules\n");
-				rc = -ENOSPC;
-				goto out;
-			}
-			k = (*num_retagging)++;
-		}
-		/* And the retagging itself */
-		new_retagging[k].vlan_ing = rx_vid;
-		new_retagging[k].vlan_egr = tmp->vid;
-		new_retagging[k].ing_port = BIT(upstream);
-		new_retagging[k].egr_port |= BIT(tmp->port);
-	}
-
-out:
-	list_for_each_entry_safe(tmp, pos, &crosschip_vlans, list) {
-		list_del(&tmp->list);
-		kfree(tmp);
-	}
-
-	return rc;
-}
-
 static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify);
 
 static int sja1105_notify_crosschip_switches(struct sja1105_private *priv)
@@ -2665,12 +2282,9 @@ out:
 
 static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify)
 {
-	u16 subvlan_map[SJA1105_MAX_NUM_PORTS][DSA_8021Q_N_SUBVLAN];
-	struct sja1105_retagging_entry *new_retagging;
 	struct sja1105_vlan_lookup_entry *new_vlan;
 	struct sja1105_table *table;
-	int i, num_retagging = 0;
-	int rc;
+	int rc, i;
 
 	table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP];
 	new_vlan = kcalloc(VLAN_N_VID,
@@ -2679,22 +2293,10 @@ static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify)
 		return -ENOMEM;
 
 	table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP];
-	new_retagging = kcalloc(SJA1105_MAX_RETAGGING_COUNT,
-				table->ops->unpacked_entry_size, GFP_KERNEL);
-	if (!new_retagging) {
-		kfree(new_vlan);
-		return -ENOMEM;
-	}
 
 	for (i = 0; i < VLAN_N_VID; i++)
 		new_vlan[i].vlanid = VLAN_N_VID;
 
-	for (i = 0; i < SJA1105_MAX_RETAGGING_COUNT; i++)
-		new_retagging[i].vlan_ing = VLAN_N_VID;
-
-	for (i = 0; i < priv->ds->num_ports; i++)
-		sja1105_init_subvlan_map(subvlan_map[i]);
-
 	/* Bridge VLANs */
 	rc = sja1105_build_bridge_vlans(priv, new_vlan);
 	if (rc)
@@ -2709,22 +2311,7 @@ static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify)
 	if (rc)
 		goto out;
 
-	/* Private VLANs necessary for dsa_8021q operation, which we need to
-	 * determine on our own:
-	 * - Sub-VLANs
-	 * - Sub-VLANs of crosschip switches
-	 */
-	rc = sja1105_build_subvlans(priv, subvlan_map, new_vlan, new_retagging,
-				    &num_retagging);
-	if (rc)
-		goto out;
-
-	rc = sja1105_build_crosschip_subvlans(priv, new_vlan, new_retagging,
-					      &num_retagging);
-	if (rc)
-		goto out;
-
-	rc = sja1105_commit_vlans(priv, new_vlan, new_retagging, num_retagging);
+	rc = sja1105_commit_vlans(priv, new_vlan);
 	if (rc)
 		goto out;
 
@@ -2732,9 +2319,6 @@ static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify)
 	if (rc)
 		goto out;
 
-	for (i = 0; i < priv->ds->num_ports; i++)
-		sja1105_commit_subvlan_map(priv, i, subvlan_map[i]);
-
 	if (notify) {
 		rc = sja1105_notify_crosschip_switches(priv);
 		if (rc)
@@ -2743,7 +2327,6 @@ static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify)
 
 out:
 	kfree(new_vlan);
-	kfree(new_retagging);
 
 	return rc;
 }
@@ -2758,10 +2341,8 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled,
 	struct sja1105_l2_lookup_params_entry *l2_lookup_params;
 	struct sja1105_general_params_entry *general_params;
 	struct sja1105_private *priv = ds->priv;
-	enum sja1105_vlan_state state;
 	struct sja1105_table *table;
 	struct sja1105_rule *rule;
-	bool want_tagging;
 	u16 tpid, tpid2;
 	int rc;
 
@@ -2792,19 +2373,10 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled,
 			sp->xmit_tpid = ETH_P_SJA1105;
 	}
 
-	if (!enabled)
-		state = SJA1105_VLAN_UNAWARE;
-	else if (priv->best_effort_vlan_filtering)
-		state = SJA1105_VLAN_BEST_EFFORT;
-	else
-		state = SJA1105_VLAN_FILTERING_FULL;
-
-	if (priv->vlan_state == state)
+	if (priv->vlan_aware == enabled)
 		return 0;
 
-	priv->vlan_state = state;
-	want_tagging = (state == SJA1105_VLAN_UNAWARE ||
-			state == SJA1105_VLAN_BEST_EFFORT);
+	priv->vlan_aware = enabled;
 
 	table = &priv->static_config.tables[BLK_IDX_GENERAL_PARAMS];
 	general_params = table->entries;
@@ -2818,8 +2390,6 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled,
 	general_params->incl_srcpt1 = enabled;
 	general_params->incl_srcpt0 = enabled;
 
-	want_tagging = priv->best_effort_vlan_filtering || !enabled;
-
 	/* VLAN filtering => independent VLAN learning.
 	 * No VLAN filtering (or best effort) => shared VLAN learning.
 	 *
@@ -2840,9 +2410,7 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled,
 	 */
 	table = &priv->static_config.tables[BLK_IDX_L2_LOOKUP_PARAMS];
 	l2_lookup_params = table->entries;
-	l2_lookup_params->shared_learn = want_tagging;
-
-	sja1105_frame_memory_partitioning(priv);
+	l2_lookup_params->shared_learn = !priv->vlan_aware;
 
 	rc = sja1105_build_vlan_table(priv, false);
 	if (rc)
@@ -2852,12 +2420,7 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled,
 	if (rc)
 		NL_SET_ERR_MSG_MOD(extack, "Failed to change VLAN Ethertype");
 
-	/* Switch port identification based on 802.1Q is only passable
-	 * if we are not under a vlan_filtering bridge. So make sure
-	 * the two configurations are mutually exclusive (of course, the
-	 * user may know better, i.e. best_effort_vlan_filtering).
-	 */
-	return sja1105_setup_8021q_tagging(ds, want_tagging);
+	return rc;
 }
 
 /* Returns number of VLANs added (0 or 1) on success,
@@ -2927,12 +2490,9 @@ static int sja1105_vlan_add(struct dsa_switch *ds, int port,
 	bool vlan_table_changed = false;
 	int rc;
 
-	/* If the user wants best-effort VLAN filtering (aka vlan_filtering
-	 * bridge plus tagging), be sure to at least deny alterations to the
-	 * configuration done by dsa_8021q.
+	/* Be sure to deny alterations to the configuration done by tag_8021q.
 	 */
-	if (priv->vlan_state != SJA1105_VLAN_FILTERING_FULL &&
-	    vid_is_dsa_8021q(vlan->vid)) {
+	if (vid_is_dsa_8021q(vlan->vid)) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Range 1024-3071 reserved for dsa_8021q operation");
 		return -EBUSY;
@@ -3086,8 +2646,6 @@ static int sja1105_setup(struct dsa_switch *ds)
 
 	ds->mtu_enforcement_ingress = true;
 
-	priv->best_effort_vlan_filtering = true;
-
 	rc = sja1105_devlink_setup(ds);
 	if (rc < 0)
 		goto out_static_config_free;
@@ -3604,8 +3162,6 @@ static const struct dsa_switch_ops sja1105_switch_ops = {
 	.cls_flower_stats	= sja1105_cls_flower_stats,
 	.crosschip_bridge_join	= sja1105_crosschip_bridge_join,
 	.crosschip_bridge_leave	= sja1105_crosschip_bridge_leave,
-	.devlink_param_get	= sja1105_devlink_param_get,
-	.devlink_param_set	= sja1105_devlink_param_set,
 	.devlink_info_get	= sja1105_devlink_info_get,
 };
 
@@ -3785,7 +3341,6 @@ static int sja1105_probe(struct spi_device *spi)
 		struct sja1105_port *sp = &priv->ports[port];
 		struct dsa_port *dp = dsa_to_port(ds, port);
 		struct net_device *slave;
-		int subvlan;
 
 		if (!dsa_is_user_port(ds, port))
 			continue;
@@ -3806,9 +3361,6 @@ static int sja1105_probe(struct spi_device *spi)
 		}
 		skb_queue_head_init(&sp->xmit_queue);
 		sp->xmit_tpid = ETH_P_SJA1105;
-
-		for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++)
-			sp->subvlan_map[subvlan] = VLAN_N_VID;
 	}
 
 	return 0;
diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c
index f6e13e6c6a18..ec7b65daec20 100644
--- a/drivers/net/dsa/sja1105/sja1105_vl.c
+++ b/drivers/net/dsa/sja1105/sja1105_vl.c
@@ -496,14 +496,11 @@ int sja1105_vl_redirect(struct sja1105_private *priv, int port,
 	struct sja1105_rule *rule = sja1105_rule_find(priv, cookie);
 	int rc;
 
-	if (priv->vlan_state == SJA1105_VLAN_UNAWARE &&
-	    key->type != SJA1105_KEY_VLAN_UNAWARE_VL) {
+	if (!priv->vlan_aware && key->type != SJA1105_KEY_VLAN_UNAWARE_VL) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can only redirect based on DMAC");
 		return -EOPNOTSUPP;
-	} else if ((priv->vlan_state == SJA1105_VLAN_BEST_EFFORT ||
-		    priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) &&
-		   key->type != SJA1105_KEY_VLAN_AWARE_VL) {
+	} else if (priv->vlan_aware && key->type != SJA1105_KEY_VLAN_AWARE_VL) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can only redirect based on {DMAC, VID, PCP}");
 		return -EOPNOTSUPP;
@@ -595,14 +592,11 @@ int sja1105_vl_gate(struct sja1105_private *priv, int port,
 		return -ERANGE;
 	}
 
-	if (priv->vlan_state == SJA1105_VLAN_UNAWARE &&
-	    key->type != SJA1105_KEY_VLAN_UNAWARE_VL) {
+	if (!priv->vlan_aware && key->type != SJA1105_KEY_VLAN_UNAWARE_VL) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can only gate based on DMAC");
 		return -EOPNOTSUPP;
-	} else if ((priv->vlan_state == SJA1105_VLAN_BEST_EFFORT ||
-		    priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) &&
-		   key->type != SJA1105_KEY_VLAN_AWARE_VL) {
+	} else if (priv->vlan_aware && key->type != SJA1105_KEY_VLAN_AWARE_VL) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can only gate based on {DMAC, VID, PCP}");
 		return -EOPNOTSUPP;
diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 1587961f1a7b..608607f904a5 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -35,8 +35,6 @@ struct dsa_8021q_context {
 	__be16 proto;
 };
 
-#define DSA_8021Q_N_SUBVLAN			8
-
 int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled);
 
 int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port,
@@ -50,21 +48,16 @@ int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port,
 struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 			       u16 tpid, u16 tci);
 
-void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
-		   int *subvlan);
+void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id);
 
 u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port);
 
 u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port);
 
-u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan);
-
 int dsa_8021q_rx_switch_id(u16 vid);
 
 int dsa_8021q_rx_source_port(u16 vid);
 
-u16 dsa_8021q_rx_subvlan(u16 vid);
-
 bool vid_is_dsa_8021q_rxvlan(u16 vid);
 
 bool vid_is_dsa_8021q_txvlan(u16 vid);
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index b6089b88314c..0eadc7ac44ec 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -59,7 +59,6 @@ struct sja1105_skb_cb {
 	((struct sja1105_skb_cb *)((skb)->cb))
 
 struct sja1105_port {
-	u16 subvlan_map[DSA_8021Q_N_SUBVLAN];
 	struct kthread_worker *xmit_worker;
 	struct kthread_work xmit_work;
 	struct sk_buff_head xmit_queue;
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 4aa29f90ecea..d657864969d4 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -17,7 +17,7 @@
  *
  * | 11  | 10  |  9  |  8  |  7  |  6  |  5  |  4  |  3  |  2  |  1  |  0  |
  * +-----------+-----+-----------------+-----------+-----------------------+
- * |    DIR    | SVL |    SWITCH_ID    |  SUBVLAN  |          PORT         |
+ * |    DIR    | RSV |    SWITCH_ID    |    RSV    |          PORT         |
  * +-----------+-----+-----------------+-----------+-----------------------+
  *
  * DIR - VID[11:10]:
@@ -27,24 +27,13 @@
  *	These values make the special VIDs of 0, 1 and 4095 to be left
  *	unused by this coding scheme.
  *
- * SVL/SUBVLAN - { VID[9], VID[5:4] }:
- *	Sub-VLAN encoding. Valid only when DIR indicates an RX VLAN.
- *	* 0 (0b000): Field does not encode a sub-VLAN, either because
- *	received traffic is untagged, PVID-tagged or because a second
- *	VLAN tag is present after this tag and not inside of it.
- *	* 1 (0b001): Received traffic is tagged with a VID value private
- *	to the host. This field encodes the index in the host's lookup
- *	table through which the value of the ingress VLAN ID can be
- *	recovered.
- *	* 2 (0b010): Field encodes a sub-VLAN.
- *	...
- *	* 7 (0b111): Field encodes a sub-VLAN.
- *	When DIR indicates a TX VLAN, SUBVLAN must be transmitted as zero
- *	(by the host) and ignored on receive (by the switch).
- *
  * SWITCH_ID - VID[8:6]:
  *	Index of switch within DSA tree. Must be between 0 and 7.
  *
+ * RSV - VID[5:4]:
+ *	To be used for further expansion of PORT or for other purposes.
+ *	Must be transmitted as zero and ignored on receive.
+ *
  * PORT - VID[3:0]:
  *	Index of switch port. Must be between 0 and 15.
  */
@@ -61,18 +50,6 @@
 #define DSA_8021Q_SWITCH_ID(x)		(((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \
 						 DSA_8021Q_SWITCH_ID_MASK)
 
-#define DSA_8021Q_SUBVLAN_HI_SHIFT	9
-#define DSA_8021Q_SUBVLAN_HI_MASK	GENMASK(9, 9)
-#define DSA_8021Q_SUBVLAN_LO_SHIFT	4
-#define DSA_8021Q_SUBVLAN_LO_MASK	GENMASK(5, 4)
-#define DSA_8021Q_SUBVLAN_HI(x)		(((x) & GENMASK(2, 2)) >> 2)
-#define DSA_8021Q_SUBVLAN_LO(x)		((x) & GENMASK(1, 0))
-#define DSA_8021Q_SUBVLAN(x)		\
-		(((DSA_8021Q_SUBVLAN_LO(x) << DSA_8021Q_SUBVLAN_LO_SHIFT) & \
-		  DSA_8021Q_SUBVLAN_LO_MASK) | \
-		 ((DSA_8021Q_SUBVLAN_HI(x) << DSA_8021Q_SUBVLAN_HI_SHIFT) & \
-		  DSA_8021Q_SUBVLAN_HI_MASK))
-
 #define DSA_8021Q_PORT_SHIFT		0
 #define DSA_8021Q_PORT_MASK		GENMASK(3, 0)
 #define DSA_8021Q_PORT(x)		(((x) << DSA_8021Q_PORT_SHIFT) & \
@@ -98,13 +75,6 @@ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port)
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid);
 
-u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan)
-{
-	return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) |
-	       DSA_8021Q_PORT(port) | DSA_8021Q_SUBVLAN(subvlan);
-}
-EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid_subvlan);
-
 /* Returns the decoded switch ID from the RX VID. */
 int dsa_8021q_rx_switch_id(u16 vid)
 {
@@ -119,20 +89,6 @@ int dsa_8021q_rx_source_port(u16 vid)
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port);
 
-/* Returns the decoded subvlan from the RX VID. */
-u16 dsa_8021q_rx_subvlan(u16 vid)
-{
-	u16 svl_hi, svl_lo;
-
-	svl_hi = (vid & DSA_8021Q_SUBVLAN_HI_MASK) >>
-		 DSA_8021Q_SUBVLAN_HI_SHIFT;
-	svl_lo = (vid & DSA_8021Q_SUBVLAN_LO_MASK) >>
-		 DSA_8021Q_SUBVLAN_LO_SHIFT;
-
-	return (svl_hi << 2) | svl_lo;
-}
-EXPORT_SYMBOL_GPL(dsa_8021q_rx_subvlan);
-
 bool vid_is_dsa_8021q_rxvlan(u16 vid)
 {
 	return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX;
@@ -227,7 +183,7 @@ static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port,
 	u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port);
 	u16 tx_vid = dsa_8021q_tx_vid(ctx->ds, port);
 	struct net_device *master;
-	int i, err, subvlan;
+	int i, err;
 
 	/* The CPU port is implicitly configured by
 	 * configuring the front-panel ports
@@ -275,18 +231,11 @@ static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port,
 		return err;
 	}
 
-	/* Add to the master's RX filter not only @rx_vid, but in fact
-	 * the entire subvlan range, just in case this DSA switch might
-	 * want to use sub-VLANs.
-	 */
-	for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++) {
-		u16 vid = dsa_8021q_rx_vid_subvlan(ctx->ds, port, subvlan);
-
-		if (enabled)
-			vlan_vid_add(master, ctx->proto, vid);
-		else
-			vlan_vid_del(master, ctx->proto, vid);
-	}
+	/* Add @rx_vid to the master's RX filter. */
+	if (enabled)
+		vlan_vid_add(master, ctx->proto, rx_vid);
+	else
+		vlan_vid_del(master, ctx->proto, rx_vid);
 
 	/* Finally apply the TX VID on this port and on the CPU port */
 	err = dsa_8021q_vid_apply(ctx, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED,
@@ -471,8 +420,7 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
 
-void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
-		   int *subvlan)
+void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id)
 {
 	u16 vid, tci;
 
@@ -489,7 +437,6 @@ void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
 
 	*source_port = dsa_8021q_rx_source_port(vid);
 	*switch_id = dsa_8021q_rx_switch_id(vid);
-	*subvlan = dsa_8021q_rx_subvlan(vid);
 	skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_rcv);
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 85ac85c3af8c..d0781b058610 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -41,9 +41,9 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 				  struct net_device *netdev,
 				  struct packet_type *pt)
 {
-	int src_port, switch_id, subvlan;
+	int src_port, switch_id;
 
-	dsa_8021q_rcv(skb, &src_port, &switch_id, &subvlan);
+	dsa_8021q_rcv(skb, &src_port, &switch_id);
 
 	skb->dev = dsa_master_find_slave(netdev, switch_id, src_port);
 	if (!skb->dev)
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 9c2df9ece01b..7c92c329a092 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -358,20 +358,6 @@ static struct sk_buff
 	return skb;
 }
 
-static void sja1105_decode_subvlan(struct sk_buff *skb, u16 subvlan)
-{
-	struct dsa_port *dp = dsa_slave_to_port(skb->dev);
-	struct sja1105_port *sp = dp->priv;
-	u16 vid = sp->subvlan_map[subvlan];
-	u16 vlan_tci;
-
-	if (vid == VLAN_N_VID)
-		return;
-
-	vlan_tci = (skb->priority << VLAN_PRIO_SHIFT) | vid;
-	__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci);
-}
-
 static bool sja1105_skb_has_tag_8021q(const struct sk_buff *skb)
 {
 	u16 tpid = ntohs(eth_hdr(skb)->h_proto);
@@ -389,8 +375,8 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 				   struct net_device *netdev,
 				   struct packet_type *pt)
 {
-	int source_port, switch_id, subvlan = 0;
 	struct sja1105_meta meta = {0};
+	int source_port, switch_id;
 	struct ethhdr *hdr;
 	bool is_link_local;
 	bool is_meta;
@@ -403,7 +389,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 
 	if (sja1105_skb_has_tag_8021q(skb)) {
 		/* Normal traffic path. */
-		dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan);
+		dsa_8021q_rcv(skb, &source_port, &switch_id);
 	} else if (is_link_local) {
 		/* Management traffic path. Switch embeds the switch ID and
 		 * port ID into bytes of the destination MAC, courtesy of
@@ -428,9 +414,6 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 		return NULL;
 	}
 
-	if (subvlan)
-		sja1105_decode_subvlan(skb, subvlan);
-
 	return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local,
 					      is_meta);
 }
@@ -538,7 +521,7 @@ static struct sk_buff *sja1110_rcv(struct sk_buff *skb,
 				   struct net_device *netdev,
 				   struct packet_type *pt)
 {
-	int source_port = -1, switch_id = -1, subvlan = 0;
+	int source_port = -1, switch_id = -1;
 
 	skb->offload_fwd_mark = 1;
 
@@ -551,7 +534,7 @@ static struct sk_buff *sja1110_rcv(struct sk_buff *skb,
 
 	/* Packets with in-band control extensions might still have RX VLANs */
 	if (likely(sja1105_skb_has_tag_8021q(skb)))
-		dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan);
+		dsa_8021q_rcv(skb, &source_port, &switch_id);
 
 	skb->dev = dsa_master_find_slave(netdev, switch_id, source_port);
 	if (!skb->dev) {
@@ -561,9 +544,6 @@ static struct sk_buff *sja1110_rcv(struct sk_buff *skb,
 		return NULL;
 	}
 
-	if (subvlan)
-		sja1105_decode_subvlan(skb, subvlan);
-
 	return skb;
 }
 
-- 
cgit v1.2.3


From 8afbea187d31e4e9beb83b7a316d16b7879c2799 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 19 Jul 2021 20:14:45 +0300
Subject: net: dsa: tag_8021q: remove struct packet_type declaration

This is no longer necessary since tag_8021q doesn't register itself as a
full-blown tagger anymore.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/8021q.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 608607f904a5..5f01dea7d5b6 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -11,7 +11,6 @@
 struct dsa_switch;
 struct sk_buff;
 struct net_device;
-struct packet_type;
 struct dsa_8021q_context;
 
 struct dsa_8021q_crosschip_link {
-- 
cgit v1.2.3


From cedf467064b6b8764fdb2ee6b9e3d18bc81a9d8f Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 19 Jul 2021 20:14:46 +0300
Subject: net: dsa: tag_8021q: create dsa_tag_8021q_{register,unregister}
 helpers

In preparation of moving tag_8021q to core DSA, move all initialization
and teardown related to tag_8021q which is currently done by drivers in
2 functions called "register" and "unregister". These will gather more
functionality in future patches, which will better justify the chosen
naming scheme.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c         | 12 ++++--------
 drivers/net/dsa/sja1105/sja1105_main.c | 18 +++++++++---------
 include/linux/dsa/8021q.h              |  6 ++++++
 net/dsa/tag_8021q.c                    | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 52 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index a2a15919b960..b52cc381cdc1 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -425,15 +425,11 @@ static int felix_setup_tag_8021q(struct dsa_switch *ds, int cpu)
 	ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_MC);
 	ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_BC);
 
-	felix->dsa_8021q_ctx = kzalloc(sizeof(*felix->dsa_8021q_ctx),
-				       GFP_KERNEL);
+	felix->dsa_8021q_ctx = dsa_tag_8021q_register(ds, &felix_tag_8021q_ops,
+						      htons(ETH_P_8021AD));
 	if (!felix->dsa_8021q_ctx)
 		return -ENOMEM;
 
-	felix->dsa_8021q_ctx->ops = &felix_tag_8021q_ops;
-	felix->dsa_8021q_ctx->proto = htons(ETH_P_8021AD);
-	felix->dsa_8021q_ctx->ds = ds;
-
 	err = dsa_8021q_setup(felix->dsa_8021q_ctx, true);
 	if (err)
 		goto out_free_dsa_8021_ctx;
@@ -447,7 +443,7 @@ static int felix_setup_tag_8021q(struct dsa_switch *ds, int cpu)
 out_teardown_dsa_8021q:
 	dsa_8021q_setup(felix->dsa_8021q_ctx, false);
 out_free_dsa_8021_ctx:
-	kfree(felix->dsa_8021q_ctx);
+	dsa_tag_8021q_unregister(felix->dsa_8021q_ctx);
 	return err;
 }
 
@@ -466,7 +462,7 @@ static void felix_teardown_tag_8021q(struct dsa_switch *ds, int cpu)
 	if (err)
 		dev_err(ds->dev, "dsa_8021q_setup returned %d", err);
 
-	kfree(felix->dsa_8021q_ctx);
+	dsa_tag_8021q_unregister(felix->dsa_8021q_ctx);
 
 	for (port = 0; port < ds->num_ports; port++) {
 		if (dsa_is_unused_port(ds, port))
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 4514ac468cc8..689f46797d1c 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -3306,16 +3306,11 @@ static int sja1105_probe(struct spi_device *spi)
 	mutex_init(&priv->ptp_data.lock);
 	mutex_init(&priv->mgmt_lock);
 
-	priv->dsa_8021q_ctx = devm_kzalloc(dev, sizeof(*priv->dsa_8021q_ctx),
-					   GFP_KERNEL);
+	priv->dsa_8021q_ctx = dsa_tag_8021q_register(ds, &sja1105_dsa_8021q_ops,
+						     htons(ETH_P_8021Q));
 	if (!priv->dsa_8021q_ctx)
 		return -ENOMEM;
 
-	priv->dsa_8021q_ctx->ops = &sja1105_dsa_8021q_ops;
-	priv->dsa_8021q_ctx->proto = htons(ETH_P_8021Q);
-	priv->dsa_8021q_ctx->ds = ds;
-
-	INIT_LIST_HEAD(&priv->dsa_8021q_ctx->crosschip_links);
 	INIT_LIST_HEAD(&priv->bridge_vlans);
 	INIT_LIST_HEAD(&priv->dsa_8021q_vlans);
 
@@ -3324,7 +3319,7 @@ static int sja1105_probe(struct spi_device *spi)
 
 	rc = dsa_register_switch(priv->ds);
 	if (rc)
-		return rc;
+		goto out_tag_8021q_unregister;
 
 	if (IS_ENABLED(CONFIG_NET_SCH_CBS)) {
 		priv->cbs = devm_kcalloc(dev, priv->info->num_cbs_shapers,
@@ -3377,6 +3372,8 @@ out_destroy_workers:
 
 out_unregister_switch:
 	dsa_unregister_switch(ds);
+out_tag_8021q_unregister:
+	dsa_tag_8021q_unregister(priv->dsa_8021q_ctx);
 
 	return rc;
 }
@@ -3384,8 +3381,11 @@ out_unregister_switch:
 static int sja1105_remove(struct spi_device *spi)
 {
 	struct sja1105_private *priv = spi_get_drvdata(spi);
+	struct dsa_switch *ds = priv->ds;
+
+	dsa_unregister_switch(ds);
+	dsa_tag_8021q_unregister(priv->dsa_8021q_ctx);
 
-	dsa_unregister_switch(priv->ds);
 	return 0;
 }
 
diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 5f01dea7d5b6..9945898a90c3 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -34,6 +34,12 @@ struct dsa_8021q_context {
 	__be16 proto;
 };
 
+struct dsa_8021q_context *dsa_tag_8021q_register(struct dsa_switch *ds,
+						 const struct dsa_8021q_ops *ops,
+						 __be16 proto);
+
+void dsa_tag_8021q_unregister(struct dsa_8021q_context *ctx);
+
 int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled);
 
 int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port,
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 3a25b7b1ba50..73966ca23ac3 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -410,6 +410,39 @@ int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port,
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_leave);
 
+struct dsa_8021q_context *dsa_tag_8021q_register(struct dsa_switch *ds,
+						 const struct dsa_8021q_ops *ops,
+						 __be16 proto)
+{
+	struct dsa_8021q_context *ctx;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	ctx->ops = ops;
+	ctx->proto = proto;
+	ctx->ds = ds;
+
+	INIT_LIST_HEAD(&ctx->crosschip_links);
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_register);
+
+void dsa_tag_8021q_unregister(struct dsa_8021q_context *ctx)
+{
+	struct dsa_8021q_crosschip_link *c, *n;
+
+	list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) {
+		list_del(&c->list);
+		kfree(c);
+	}
+
+	kfree(ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_unregister);
+
 struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 			       u16 tpid, u16 tci)
 {
-- 
cgit v1.2.3


From d7b1fd520d5d4271f4ab9b1671afbdcd868039d3 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 19 Jul 2021 20:14:48 +0300
Subject: net: dsa: let the core manage the tag_8021q context

The basic problem description is as follows:

Be there 3 switches in a daisy chain topology:

                                             |
    sw0p0     sw0p1     sw0p2     sw0p3     sw0p4
 [  user ] [  user ] [  user ] [  dsa  ] [  cpu  ]
                                   |
                                   +---------+
                                             |
    sw1p0     sw1p1     sw1p2     sw1p3     sw1p4
 [  user ] [  user ] [  user ] [  dsa  ] [  dsa  ]
                                   |
                                   +---------+
                                             |
    sw2p0     sw2p1     sw2p2     sw2p3     sw2p4
 [  user ] [  user ] [  user ] [  user ] [  dsa  ]

The CPU will not be able to ping through the user ports of the
bottom-most switch (like for example sw2p0), simply because tag_8021q
was not coded up for this scenario - it has always assumed DSA switch
trees with a single switch.

To add support for the topology above, we must admit that the RX VLAN of
sw2p0 must be added on some ports of switches 0 and 1 as well. This is
in fact a textbook example of thing that can use the cross-chip notifier
framework that DSA has set up in switch.c.

There is only one problem: core DSA (switch.c) is not able right now to
make the connection between a struct dsa_switch *ds and a struct
dsa_8021q_context *ctx. Right now, it is drivers who call into
tag_8021q.c and always provide a struct dsa_8021q_context *ctx pointer,
and tag_8021q.c calls them back with the .tag_8021q_vlan_{add,del}
methods.

But with cross-chip notifiers, it is possible for tag_8021q to call
drivers without drivers having ever asked for anything. A good example
is right above: when sw2p0 wants to set itself up for tag_8021q,
the .tag_8021q_vlan_add method needs to be called for switches 1 and 0,
so that they transport sw2p0's VLANs towards the CPU without dropping
them.

So instead of letting drivers manage the tag_8021q context, add a
tag_8021q_ctx pointer inside of struct dsa_switch, which will be
populated when dsa_tag_8021q_register() returns success.

The patch is fairly long-winded because we are partly reverting commit
5899ee367ab3 ("net: dsa: tag_8021q: add a context structure") which made
the driver-facing tag_8021q API use "ctx" instead of "ds". Now that we
can access "ctx" directly from "ds", this is no longer needed.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c         |  22 +++----
 drivers/net/dsa/ocelot/felix.h         |   1 -
 drivers/net/dsa/sja1105/sja1105.h      |   1 -
 drivers/net/dsa/sja1105/sja1105_main.c |  40 +++++-------
 include/linux/dsa/8021q.h              |  18 +++---
 include/net/dsa.h                      |   3 +
 net/dsa/tag_8021q.c                    | 114 ++++++++++++++++++---------------
 7 files changed, 99 insertions(+), 100 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index b52cc381cdc1..9e4ae15aa4fb 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -425,14 +425,14 @@ static int felix_setup_tag_8021q(struct dsa_switch *ds, int cpu)
 	ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_MC);
 	ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_BC);
 
-	felix->dsa_8021q_ctx = dsa_tag_8021q_register(ds, &felix_tag_8021q_ops,
-						      htons(ETH_P_8021AD));
-	if (!felix->dsa_8021q_ctx)
-		return -ENOMEM;
+	err = dsa_tag_8021q_register(ds, &felix_tag_8021q_ops,
+				     htons(ETH_P_8021AD));
+	if (err)
+		return err;
 
-	err = dsa_8021q_setup(felix->dsa_8021q_ctx, true);
+	err = dsa_8021q_setup(ds, true);
 	if (err)
-		goto out_free_dsa_8021_ctx;
+		goto out_tag_8021q_unregister;
 
 	err = felix_setup_mmio_filtering(felix);
 	if (err)
@@ -441,9 +441,9 @@ static int felix_setup_tag_8021q(struct dsa_switch *ds, int cpu)
 	return 0;
 
 out_teardown_dsa_8021q:
-	dsa_8021q_setup(felix->dsa_8021q_ctx, false);
-out_free_dsa_8021_ctx:
-	dsa_tag_8021q_unregister(felix->dsa_8021q_ctx);
+	dsa_8021q_setup(ds, false);
+out_tag_8021q_unregister:
+	dsa_tag_8021q_unregister(ds);
 	return err;
 }
 
@@ -458,11 +458,11 @@ static void felix_teardown_tag_8021q(struct dsa_switch *ds, int cpu)
 		dev_err(ds->dev, "felix_teardown_mmio_filtering returned %d",
 			err);
 
-	err = dsa_8021q_setup(felix->dsa_8021q_ctx, false);
+	err = dsa_8021q_setup(ds, false);
 	if (err)
 		dev_err(ds->dev, "dsa_8021q_setup returned %d", err);
 
-	dsa_tag_8021q_unregister(felix->dsa_8021q_ctx);
+	dsa_tag_8021q_unregister(ds);
 
 	for (port = 0; port < ds->num_ports; port++) {
 		if (dsa_is_unused_port(ds, port))
diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h
index 4d96cad815d5..9da3c6a94c6e 100644
--- a/drivers/net/dsa/ocelot/felix.h
+++ b/drivers/net/dsa/ocelot/felix.h
@@ -60,7 +60,6 @@ struct felix {
 	struct lynx_pcs			**pcs;
 	resource_size_t			switch_base;
 	resource_size_t			imdio_base;
-	struct dsa_8021q_context	*dsa_8021q_ctx;
 	enum dsa_tag_protocol		tag_proto;
 };
 
diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index 869b19c08fc0..068be8afd322 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -257,7 +257,6 @@ struct sja1105_private {
 	 * the switch doesn't confuse them with one another.
 	 */
 	struct mutex mgmt_lock;
-	struct dsa_8021q_context *dsa_8021q_ctx;
 	struct devlink_region **regions;
 	struct sja1105_cbs_entry *cbs;
 	struct mii_bus *mdio_base_t1;
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 689f46797d1c..ac4254690a8d 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1995,8 +1995,6 @@ static int sja1105_crosschip_bridge_join(struct dsa_switch *ds,
 					 int other_port, struct net_device *br)
 {
 	struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index);
-	struct sja1105_private *other_priv = other_ds->priv;
-	struct sja1105_private *priv = ds->priv;
 	int port, rc;
 
 	if (other_ds->ops != &sja1105_switch_ops)
@@ -2008,17 +2006,13 @@ static int sja1105_crosschip_bridge_join(struct dsa_switch *ds,
 		if (dsa_to_port(ds, port)->bridge_dev != br)
 			continue;
 
-		rc = dsa_8021q_crosschip_bridge_join(priv->dsa_8021q_ctx,
-						     port,
-						     other_priv->dsa_8021q_ctx,
+		rc = dsa_8021q_crosschip_bridge_join(ds, port, other_ds,
 						     other_port);
 		if (rc)
 			return rc;
 
-		rc = dsa_8021q_crosschip_bridge_join(other_priv->dsa_8021q_ctx,
-						     other_port,
-						     priv->dsa_8021q_ctx,
-						     port);
+		rc = dsa_8021q_crosschip_bridge_join(other_ds, other_port,
+						     ds, port);
 		if (rc)
 			return rc;
 	}
@@ -2032,8 +2026,6 @@ static void sja1105_crosschip_bridge_leave(struct dsa_switch *ds,
 					   struct net_device *br)
 {
 	struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index);
-	struct sja1105_private *other_priv = other_ds->priv;
-	struct sja1105_private *priv = ds->priv;
 	int port;
 
 	if (other_ds->ops != &sja1105_switch_ops)
@@ -2045,22 +2037,19 @@ static void sja1105_crosschip_bridge_leave(struct dsa_switch *ds,
 		if (dsa_to_port(ds, port)->bridge_dev != br)
 			continue;
 
-		dsa_8021q_crosschip_bridge_leave(priv->dsa_8021q_ctx, port,
-						 other_priv->dsa_8021q_ctx,
+		dsa_8021q_crosschip_bridge_leave(ds, port, other_ds,
 						 other_port);
 
-		dsa_8021q_crosschip_bridge_leave(other_priv->dsa_8021q_ctx,
-						 other_port,
-						 priv->dsa_8021q_ctx, port);
+		dsa_8021q_crosschip_bridge_leave(other_ds, other_port,
+						 ds, port);
 	}
 }
 
 static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled)
 {
-	struct sja1105_private *priv = ds->priv;
 	int rc;
 
-	rc = dsa_8021q_setup(priv->dsa_8021q_ctx, enabled);
+	rc = dsa_8021q_setup(ds, enabled);
 	if (rc)
 		return rc;
 
@@ -2233,6 +2222,7 @@ static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify);
 
 static int sja1105_notify_crosschip_switches(struct sja1105_private *priv)
 {
+	struct dsa_8021q_context *ctx = priv->ds->tag_8021q_ctx;
 	struct sja1105_crosschip_switch *s, *pos;
 	struct list_head crosschip_switches;
 	struct dsa_8021q_crosschip_link *c;
@@ -2240,7 +2230,7 @@ static int sja1105_notify_crosschip_switches(struct sja1105_private *priv)
 
 	INIT_LIST_HEAD(&crosschip_switches);
 
-	list_for_each_entry(c, &priv->dsa_8021q_ctx->crosschip_links, list) {
+	list_for_each_entry(c, &ctx->crosschip_links, list) {
 		bool already_added = false;
 
 		list_for_each_entry(s, &crosschip_switches, list) {
@@ -3306,10 +3296,10 @@ static int sja1105_probe(struct spi_device *spi)
 	mutex_init(&priv->ptp_data.lock);
 	mutex_init(&priv->mgmt_lock);
 
-	priv->dsa_8021q_ctx = dsa_tag_8021q_register(ds, &sja1105_dsa_8021q_ops,
-						     htons(ETH_P_8021Q));
-	if (!priv->dsa_8021q_ctx)
-		return -ENOMEM;
+	rc = dsa_tag_8021q_register(ds, &sja1105_dsa_8021q_ops,
+				    htons(ETH_P_8021Q));
+	if (rc)
+		return rc;
 
 	INIT_LIST_HEAD(&priv->bridge_vlans);
 	INIT_LIST_HEAD(&priv->dsa_8021q_vlans);
@@ -3373,7 +3363,7 @@ out_destroy_workers:
 out_unregister_switch:
 	dsa_unregister_switch(ds);
 out_tag_8021q_unregister:
-	dsa_tag_8021q_unregister(priv->dsa_8021q_ctx);
+	dsa_tag_8021q_unregister(ds);
 
 	return rc;
 }
@@ -3384,7 +3374,7 @@ static int sja1105_remove(struct spi_device *spi)
 	struct dsa_switch *ds = priv->ds;
 
 	dsa_unregister_switch(ds);
-	dsa_tag_8021q_unregister(priv->dsa_8021q_ctx);
+	dsa_tag_8021q_unregister(ds);
 
 	return 0;
 }
diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 9945898a90c3..77939c0c8dd5 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -34,20 +34,20 @@ struct dsa_8021q_context {
 	__be16 proto;
 };
 
-struct dsa_8021q_context *dsa_tag_8021q_register(struct dsa_switch *ds,
-						 const struct dsa_8021q_ops *ops,
-						 __be16 proto);
+int dsa_tag_8021q_register(struct dsa_switch *ds,
+			   const struct dsa_8021q_ops *ops,
+			   __be16 proto);
 
-void dsa_tag_8021q_unregister(struct dsa_8021q_context *ctx);
+void dsa_tag_8021q_unregister(struct dsa_switch *ds);
 
-int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled);
+int dsa_8021q_setup(struct dsa_switch *ds, bool enabled);
 
-int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port,
-				    struct dsa_8021q_context *other_ctx,
+int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port,
+				    struct dsa_switch *other_ds,
 				    int other_port);
 
-int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port,
-				     struct dsa_8021q_context *other_ctx,
+int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port,
+				     struct dsa_switch *other_ds,
 				     int other_port);
 
 struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 33f40c1ec379..e213572f6341 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -352,6 +352,9 @@ struct dsa_switch {
 	unsigned int ageing_time_min;
 	unsigned int ageing_time_max;
 
+	/* Storage for drivers using tag_8021q */
+	struct dsa_8021q_context *tag_8021q_ctx;
+
 	/* devlink used to represent this switch device */
 	struct devlink		*devlink;
 
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 16eb2c7bcc8d..de46a551a486 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -113,10 +113,11 @@ EXPORT_SYMBOL_GPL(vid_is_dsa_8021q);
  * user explicitly configured this @vid through the bridge core, then the @vid
  * is installed again, but this time with the flags from the bridge layer.
  */
-static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid,
+static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid,
 			       u16 flags, bool enabled)
 {
-	struct dsa_port *dp = dsa_to_port(ctx->ds, port);
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+	struct dsa_port *dp = dsa_to_port(ds, port);
 
 	if (enabled)
 		return ctx->ops->vlan_add(ctx->ds, dp->index, vid, flags);
@@ -176,29 +177,29 @@ static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid,
  * +-+-----+-+-----+-+-----+-+-----+-+    +-+-----+-+-----+-+-----+-+-----+-+
  *   swp0    swp1    swp2    swp3           swp0    swp1    swp2    swp3
  */
-static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port,
-				bool enabled)
+static int dsa_8021q_setup_port(struct dsa_switch *ds, int port, bool enabled)
 {
-	int upstream = dsa_upstream_port(ctx->ds, port);
-	u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port);
-	u16 tx_vid = dsa_8021q_tx_vid(ctx->ds, port);
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+	int upstream = dsa_upstream_port(ds, port);
+	u16 rx_vid = dsa_8021q_rx_vid(ds, port);
+	u16 tx_vid = dsa_8021q_tx_vid(ds, port);
 	struct net_device *master;
 	int i, err;
 
 	/* The CPU port is implicitly configured by
 	 * configuring the front-panel ports
 	 */
-	if (!dsa_is_user_port(ctx->ds, port))
+	if (!dsa_is_user_port(ds, port))
 		return 0;
 
-	master = dsa_to_port(ctx->ds, port)->cpu_dp->master;
+	master = dsa_to_port(ds, port)->cpu_dp->master;
 
 	/* Add this user port's RX VID to the membership list of all others
 	 * (including itself). This is so that bridging will not be hindered.
 	 * L2 forwarding rules still take precedence when there are no VLAN
 	 * restrictions, so there are no concerns about leaking traffic.
 	 */
-	for (i = 0; i < ctx->ds->num_ports; i++) {
+	for (i = 0; i < ds->num_ports; i++) {
 		u16 flags;
 
 		if (i == upstream)
@@ -211,9 +212,9 @@ static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port,
 			/* The RX VID is a regular VLAN on all others */
 			flags = BRIDGE_VLAN_INFO_UNTAGGED;
 
-		err = dsa_8021q_vid_apply(ctx, i, rx_vid, flags, enabled);
+		err = dsa_8021q_vid_apply(ds, i, rx_vid, flags, enabled);
 		if (err) {
-			dev_err(ctx->ds->dev,
+			dev_err(ds->dev,
 				"Failed to apply RX VID %d to port %d: %pe\n",
 				rx_vid, port, ERR_PTR(err));
 			return err;
@@ -223,9 +224,9 @@ static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port,
 	/* CPU port needs to see this port's RX VID
 	 * as tagged egress.
 	 */
-	err = dsa_8021q_vid_apply(ctx, upstream, rx_vid, 0, enabled);
+	err = dsa_8021q_vid_apply(ds, upstream, rx_vid, 0, enabled);
 	if (err) {
-		dev_err(ctx->ds->dev,
+		dev_err(ds->dev,
 			"Failed to apply RX VID %d to port %d: %pe\n",
 			rx_vid, port, ERR_PTR(err));
 		return err;
@@ -238,17 +239,17 @@ static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port,
 		vlan_vid_del(master, ctx->proto, rx_vid);
 
 	/* Finally apply the TX VID on this port and on the CPU port */
-	err = dsa_8021q_vid_apply(ctx, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED,
+	err = dsa_8021q_vid_apply(ds, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED,
 				  enabled);
 	if (err) {
-		dev_err(ctx->ds->dev,
+		dev_err(ds->dev,
 			"Failed to apply TX VID %d on port %d: %pe\n",
 			tx_vid, port, ERR_PTR(err));
 		return err;
 	}
-	err = dsa_8021q_vid_apply(ctx, upstream, tx_vid, 0, enabled);
+	err = dsa_8021q_vid_apply(ds, upstream, tx_vid, 0, enabled);
 	if (err) {
-		dev_err(ctx->ds->dev,
+		dev_err(ds->dev,
 			"Failed to apply TX VID %d on port %d: %pe\n",
 			tx_vid, upstream, ERR_PTR(err));
 		return err;
@@ -257,16 +258,16 @@ static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port,
 	return err;
 }
 
-int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled)
+int dsa_8021q_setup(struct dsa_switch *ds, bool enabled)
 {
 	int err, port;
 
 	ASSERT_RTNL();
 
-	for (port = 0; port < ctx->ds->num_ports; port++) {
-		err = dsa_8021q_setup_port(ctx, port, enabled);
+	for (port = 0; port < ds->num_ports; port++) {
+		err = dsa_8021q_setup_port(ds, port, enabled);
 		if (err < 0) {
-			dev_err(ctx->ds->dev,
+			dev_err(ds->dev,
 				"Failed to setup VLAN tagging for port %d: %pe\n",
 				port, ERR_PTR(err));
 			return err;
@@ -277,24 +278,25 @@ int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled)
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_setup);
 
-static int dsa_8021q_crosschip_link_apply(struct dsa_8021q_context *ctx,
-					  int port,
-					  struct dsa_8021q_context *other_ctx,
+static int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port,
+					  struct dsa_switch *other_ds,
 					  int other_port, bool enabled)
 {
-	u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port);
+	u16 rx_vid = dsa_8021q_rx_vid(ds, port);
 
 	/* @rx_vid of local @ds port @port goes to @other_port of
 	 * @other_ds
 	 */
-	return dsa_8021q_vid_apply(other_ctx, other_port, rx_vid,
+	return dsa_8021q_vid_apply(other_ds, other_port, rx_vid,
 				   BRIDGE_VLAN_INFO_UNTAGGED, enabled);
 }
 
-static int dsa_8021q_crosschip_link_add(struct dsa_8021q_context *ctx, int port,
-					struct dsa_8021q_context *other_ctx,
+static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port,
+					struct dsa_switch *other_ds,
 					int other_port)
 {
+	struct dsa_8021q_context *other_ctx = other_ds->tag_8021q_ctx;
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
 	struct dsa_8021q_crosschip_link *c;
 
 	list_for_each_entry(c, &ctx->crosschip_links, list) {
@@ -305,9 +307,9 @@ static int dsa_8021q_crosschip_link_add(struct dsa_8021q_context *ctx, int port,
 		}
 	}
 
-	dev_dbg(ctx->ds->dev,
+	dev_dbg(ds->dev,
 		"adding crosschip link from port %d to %s port %d\n",
-		port, dev_name(other_ctx->ds->dev), other_port);
+		port, dev_name(other_ds->dev), other_port);
 
 	c = kzalloc(sizeof(*c), GFP_KERNEL);
 	if (!c)
@@ -323,7 +325,7 @@ static int dsa_8021q_crosschip_link_add(struct dsa_8021q_context *ctx, int port,
 	return 0;
 }
 
-static void dsa_8021q_crosschip_link_del(struct dsa_8021q_context *ctx,
+static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds,
 					 struct dsa_8021q_crosschip_link *c,
 					 bool *keep)
 {
@@ -332,7 +334,7 @@ static void dsa_8021q_crosschip_link_del(struct dsa_8021q_context *ctx,
 	if (*keep)
 		return;
 
-	dev_dbg(ctx->ds->dev,
+	dev_dbg(ds->dev,
 		"deleting crosschip link from port %d to %s port %d\n",
 		c->port, dev_name(c->other_ctx->ds->dev), c->other_port);
 
@@ -347,8 +349,8 @@ static void dsa_8021q_crosschip_link_del(struct dsa_8021q_context *ctx,
  * or untagged: it doesn't matter, since it should never egress a frame having
  * our @rx_vid.
  */
-int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port,
-				    struct dsa_8021q_context *other_ctx,
+int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port,
+				    struct dsa_switch *other_ds,
 				    int other_port)
 {
 	/* @other_upstream is how @other_ds reaches us. If we are part
@@ -356,49 +358,50 @@ int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port,
 	 * our CPU ports. If we're part of the same tree though, we should
 	 * probably use dsa_towards_port.
 	 */
-	int other_upstream = dsa_upstream_port(other_ctx->ds, other_port);
+	int other_upstream = dsa_upstream_port(other_ds, other_port);
 	int err;
 
-	err = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_port);
+	err = dsa_8021q_crosschip_link_add(ds, port, other_ds, other_port);
 	if (err)
 		return err;
 
-	err = dsa_8021q_crosschip_link_apply(ctx, port, other_ctx,
+	err = dsa_8021q_crosschip_link_apply(ds, port, other_ds,
 					     other_port, true);
 	if (err)
 		return err;
 
-	err = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_upstream);
+	err = dsa_8021q_crosschip_link_add(ds, port, other_ds, other_upstream);
 	if (err)
 		return err;
 
-	return dsa_8021q_crosschip_link_apply(ctx, port, other_ctx,
+	return dsa_8021q_crosschip_link_apply(ds, port, other_ds,
 					      other_upstream, true);
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join);
 
-int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port,
-				     struct dsa_8021q_context *other_ctx,
+int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port,
+				     struct dsa_switch *other_ds,
 				     int other_port)
 {
-	int other_upstream = dsa_upstream_port(other_ctx->ds, other_port);
+	struct dsa_8021q_context *other_ctx = other_ds->tag_8021q_ctx;
+	int other_upstream = dsa_upstream_port(other_ds, other_port);
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
 	struct dsa_8021q_crosschip_link *c, *n;
 
 	list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) {
 		if (c->port == port && c->other_ctx == other_ctx &&
 		    (c->other_port == other_port ||
 		     c->other_port == other_upstream)) {
-			struct dsa_8021q_context *other_ctx = c->other_ctx;
 			int other_port = c->other_port;
 			bool keep;
 			int err;
 
-			dsa_8021q_crosschip_link_del(ctx, c, &keep);
+			dsa_8021q_crosschip_link_del(ds, c, &keep);
 			if (keep)
 				continue;
 
-			err = dsa_8021q_crosschip_link_apply(ctx, port,
-							     other_ctx,
+			err = dsa_8021q_crosschip_link_apply(ds, port,
+							     other_ds,
 							     other_port,
 							     false);
 			if (err)
@@ -410,15 +413,15 @@ int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port,
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_leave);
 
-struct dsa_8021q_context *dsa_tag_8021q_register(struct dsa_switch *ds,
-						 const struct dsa_8021q_ops *ops,
-						 __be16 proto)
+int dsa_tag_8021q_register(struct dsa_switch *ds,
+			   const struct dsa_8021q_ops *ops,
+			   __be16 proto)
 {
 	struct dsa_8021q_context *ctx;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
-		return NULL;
+		return -ENOMEM;
 
 	ctx->ops = ops;
 	ctx->proto = proto;
@@ -426,12 +429,15 @@ struct dsa_8021q_context *dsa_tag_8021q_register(struct dsa_switch *ds,
 
 	INIT_LIST_HEAD(&ctx->crosschip_links);
 
-	return ctx;
+	ds->tag_8021q_ctx = ctx;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(dsa_tag_8021q_register);
 
-void dsa_tag_8021q_unregister(struct dsa_8021q_context *ctx)
+void dsa_tag_8021q_unregister(struct dsa_switch *ds)
 {
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
 	struct dsa_8021q_crosschip_link *c, *n;
 
 	list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) {
@@ -439,6 +445,8 @@ void dsa_tag_8021q_unregister(struct dsa_8021q_context *ctx)
 		kfree(c);
 	}
 
+	ds->tag_8021q_ctx = NULL;
+
 	kfree(ctx);
 }
 EXPORT_SYMBOL_GPL(dsa_tag_8021q_unregister);
-- 
cgit v1.2.3


From 5da11eb407340233a6111c563419e19685a062a4 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 19 Jul 2021 20:14:49 +0300
Subject: net: dsa: make tag_8021q operations part of the core

Make tag_8021q a more central element of DSA and move the 2 driver
specific operations outside of struct dsa_8021q_context (which is
supposed to hold dynamic data and not really constant function
pointers).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c         | 10 +++-------
 drivers/net/dsa/sja1105/sja1105_main.c | 10 +++-------
 include/linux/dsa/8021q.h              | 10 +---------
 include/net/dsa.h                      |  7 +++++++
 net/dsa/tag_8021q.c                    | 10 +++-------
 5 files changed, 17 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 9e4ae15aa4fb..b6ab28d2f155 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -231,11 +231,6 @@ static int felix_tag_8021q_vlan_del(struct dsa_switch *ds, int port, u16 vid)
 	return 0;
 }
 
-static const struct dsa_8021q_ops felix_tag_8021q_ops = {
-	.vlan_add	= felix_tag_8021q_vlan_add,
-	.vlan_del	= felix_tag_8021q_vlan_del,
-};
-
 /* Alternatively to using the NPI functionality, that same hardware MAC
  * connected internally to the enetc or fman DSA master can be configured to
  * use the software-defined tag_8021q frame format. As far as the hardware is
@@ -425,8 +420,7 @@ static int felix_setup_tag_8021q(struct dsa_switch *ds, int cpu)
 	ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_MC);
 	ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_BC);
 
-	err = dsa_tag_8021q_register(ds, &felix_tag_8021q_ops,
-				     htons(ETH_P_8021AD));
+	err = dsa_tag_8021q_register(ds, htons(ETH_P_8021AD));
 	if (err)
 		return err;
 
@@ -1675,6 +1669,8 @@ const struct dsa_switch_ops felix_switch_ops = {
 	.port_mrp_del			= felix_mrp_del,
 	.port_mrp_add_ring_role		= felix_mrp_add_ring_role,
 	.port_mrp_del_ring_role		= felix_mrp_del_ring_role,
+	.tag_8021q_vlan_add		= felix_tag_8021q_vlan_add,
+	.tag_8021q_vlan_del		= felix_tag_8021q_vlan_del,
 };
 
 struct net_device *felix_port_to_netdev(struct ocelot *ocelot, int port)
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index ac4254690a8d..0c04f6caccdf 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -2543,11 +2543,6 @@ static int sja1105_dsa_8021q_vlan_del(struct dsa_switch *ds, int port, u16 vid)
 	return sja1105_build_vlan_table(priv, true);
 }
 
-static const struct dsa_8021q_ops sja1105_dsa_8021q_ops = {
-	.vlan_add	= sja1105_dsa_8021q_vlan_add,
-	.vlan_del	= sja1105_dsa_8021q_vlan_del,
-};
-
 /* The programming model for the SJA1105 switch is "all-at-once" via static
  * configuration tables. Some of these can be dynamically modified at runtime,
  * but not the xMII mode parameters table.
@@ -3153,6 +3148,8 @@ static const struct dsa_switch_ops sja1105_switch_ops = {
 	.crosschip_bridge_join	= sja1105_crosschip_bridge_join,
 	.crosschip_bridge_leave	= sja1105_crosschip_bridge_leave,
 	.devlink_info_get	= sja1105_devlink_info_get,
+	.tag_8021q_vlan_add	= sja1105_dsa_8021q_vlan_add,
+	.tag_8021q_vlan_del	= sja1105_dsa_8021q_vlan_del,
 };
 
 static const struct of_device_id sja1105_dt_ids[];
@@ -3296,8 +3293,7 @@ static int sja1105_probe(struct spi_device *spi)
 	mutex_init(&priv->ptp_data.lock);
 	mutex_init(&priv->mgmt_lock);
 
-	rc = dsa_tag_8021q_register(ds, &sja1105_dsa_8021q_ops,
-				    htons(ETH_P_8021Q));
+	rc = dsa_tag_8021q_register(ds, htons(ETH_P_8021Q));
 	if (rc)
 		return rc;
 
diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 77939c0c8dd5..0bda08fb2f16 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -21,22 +21,14 @@ struct dsa_8021q_crosschip_link {
 	refcount_t refcount;
 };
 
-struct dsa_8021q_ops {
-	int (*vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags);
-	int (*vlan_del)(struct dsa_switch *ds, int port, u16 vid);
-};
-
 struct dsa_8021q_context {
-	const struct dsa_8021q_ops *ops;
 	struct dsa_switch *ds;
 	struct list_head crosschip_links;
 	/* EtherType of RX VID, used for filtering on master interface */
 	__be16 proto;
 };
 
-int dsa_tag_8021q_register(struct dsa_switch *ds,
-			   const struct dsa_8021q_ops *ops,
-			   __be16 proto);
+int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto);
 
 void dsa_tag_8021q_unregister(struct dsa_switch *ds);
 
diff --git a/include/net/dsa.h b/include/net/dsa.h
index e213572f6341..9e5593885357 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -872,6 +872,13 @@ struct dsa_switch_ops {
 					  const struct switchdev_obj_ring_role_mrp *mrp);
 	int	(*port_mrp_del_ring_role)(struct dsa_switch *ds, int port,
 					  const struct switchdev_obj_ring_role_mrp *mrp);
+
+	/*
+	 * tag_8021q operations
+	 */
+	int	(*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid,
+				      u16 flags);
+	int	(*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid);
 };
 
 #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index de46a551a486..4a11c5004783 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -116,13 +116,12 @@ EXPORT_SYMBOL_GPL(vid_is_dsa_8021q);
 static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid,
 			       u16 flags, bool enabled)
 {
-	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
 	struct dsa_port *dp = dsa_to_port(ds, port);
 
 	if (enabled)
-		return ctx->ops->vlan_add(ctx->ds, dp->index, vid, flags);
+		return ds->ops->tag_8021q_vlan_add(ds, dp->index, vid, flags);
 
-	return ctx->ops->vlan_del(ctx->ds, dp->index, vid);
+	return ds->ops->tag_8021q_vlan_del(ds, dp->index, vid);
 }
 
 /* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single
@@ -413,9 +412,7 @@ int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port,
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_leave);
 
-int dsa_tag_8021q_register(struct dsa_switch *ds,
-			   const struct dsa_8021q_ops *ops,
-			   __be16 proto)
+int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto)
 {
 	struct dsa_8021q_context *ctx;
 
@@ -423,7 +420,6 @@ int dsa_tag_8021q_register(struct dsa_switch *ds,
 	if (!ctx)
 		return -ENOMEM;
 
-	ctx->ops = ops;
 	ctx->proto = proto;
 	ctx->ds = ds;
 
-- 
cgit v1.2.3


From 328621f6131f667c5c328bb72d45442fd76efb81 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 19 Jul 2021 20:14:50 +0300
Subject: net: dsa: tag_8021q: absorb dsa_8021q_setup into
 dsa_tag_8021q_{,un}register

Right now, setting up tag_8021q is a 2-step operation for a driver,
first the context structure needs to be created, then the VLANs need to
be installed on the ports. A similar thing is true for teardown.

Merge the 2 steps into the register/unregister methods, to be as
transparent as possible for the driver as to what tag_8021q does behind
the scenes. This also gets rid of the funny "bool setup == true means
setup, == false means teardown" API that tag_8021q used to expose.

Note that dsa_tag_8021q_register() must be called at least in the
.setup() driver method and never earlier (like in the driver probe
function). This is because the DSA switch tree is not initialized at
probe time, and the cross-chip notifiers will not work.

For symmetry with .setup(), the unregister method should be put in
.teardown().

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c         | 12 +-----------
 drivers/net/dsa/sja1105/sja1105_main.c | 32 ++++++--------------------------
 include/linux/dsa/8021q.h              |  2 --
 net/dsa/tag_8021q.c                    | 11 ++++++++---
 4 files changed, 15 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index b6ab28d2f155..583a22d901b3 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -424,18 +424,12 @@ static int felix_setup_tag_8021q(struct dsa_switch *ds, int cpu)
 	if (err)
 		return err;
 
-	err = dsa_8021q_setup(ds, true);
-	if (err)
-		goto out_tag_8021q_unregister;
-
 	err = felix_setup_mmio_filtering(felix);
 	if (err)
-		goto out_teardown_dsa_8021q;
+		goto out_tag_8021q_unregister;
 
 	return 0;
 
-out_teardown_dsa_8021q:
-	dsa_8021q_setup(ds, false);
 out_tag_8021q_unregister:
 	dsa_tag_8021q_unregister(ds);
 	return err;
@@ -452,10 +446,6 @@ static void felix_teardown_tag_8021q(struct dsa_switch *ds, int cpu)
 		dev_err(ds->dev, "felix_teardown_mmio_filtering returned %d",
 			err);
 
-	err = dsa_8021q_setup(ds, false);
-	if (err)
-		dev_err(ds->dev, "dsa_8021q_setup returned %d", err);
-
 	dsa_tag_8021q_unregister(ds);
 
 	for (port = 0; port < ds->num_ports; port++) {
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 0c04f6caccdf..6b56c1ada3ee 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -2045,19 +2045,6 @@ static void sja1105_crosschip_bridge_leave(struct dsa_switch *ds,
 	}
 }
 
-static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled)
-{
-	int rc;
-
-	rc = dsa_8021q_setup(ds, enabled);
-	if (rc)
-		return rc;
-
-	dev_info(ds->dev, "%s switch tagging\n",
-		 enabled ? "Enabled" : "Disabled");
-	return 0;
-}
-
 static enum dsa_tag_protocol
 sja1105_get_tag_protocol(struct dsa_switch *ds, int port,
 			 enum dsa_tag_protocol mp)
@@ -2635,12 +2622,8 @@ static int sja1105_setup(struct dsa_switch *ds)
 	if (rc < 0)
 		goto out_static_config_free;
 
-	/* The DSA/switchdev model brings up switch ports in standalone mode by
-	 * default, and that means vlan_filtering is 0 since they're not under
-	 * a bridge, so it's safe to set up switch tagging at this time.
-	 */
 	rtnl_lock();
-	rc = sja1105_setup_8021q_tagging(ds, true);
+	rc = dsa_tag_8021q_register(ds, htons(ETH_P_8021Q));
 	rtnl_unlock();
 	if (rc)
 		goto out_devlink_teardown;
@@ -2665,6 +2648,10 @@ static void sja1105_teardown(struct dsa_switch *ds)
 	struct sja1105_bridge_vlan *v, *n;
 	int port;
 
+	rtnl_lock();
+	dsa_tag_8021q_unregister(ds);
+	rtnl_unlock();
+
 	for (port = 0; port < ds->num_ports; port++) {
 		struct sja1105_port *sp = &priv->ports[port];
 
@@ -3293,10 +3280,6 @@ static int sja1105_probe(struct spi_device *spi)
 	mutex_init(&priv->ptp_data.lock);
 	mutex_init(&priv->mgmt_lock);
 
-	rc = dsa_tag_8021q_register(ds, htons(ETH_P_8021Q));
-	if (rc)
-		return rc;
-
 	INIT_LIST_HEAD(&priv->bridge_vlans);
 	INIT_LIST_HEAD(&priv->dsa_8021q_vlans);
 
@@ -3305,7 +3288,7 @@ static int sja1105_probe(struct spi_device *spi)
 
 	rc = dsa_register_switch(priv->ds);
 	if (rc)
-		goto out_tag_8021q_unregister;
+		return rc;
 
 	if (IS_ENABLED(CONFIG_NET_SCH_CBS)) {
 		priv->cbs = devm_kcalloc(dev, priv->info->num_cbs_shapers,
@@ -3358,8 +3341,6 @@ out_destroy_workers:
 
 out_unregister_switch:
 	dsa_unregister_switch(ds);
-out_tag_8021q_unregister:
-	dsa_tag_8021q_unregister(ds);
 
 	return rc;
 }
@@ -3370,7 +3351,6 @@ static int sja1105_remove(struct spi_device *spi)
 	struct dsa_switch *ds = priv->ds;
 
 	dsa_unregister_switch(ds);
-	dsa_tag_8021q_unregister(ds);
 
 	return 0;
 }
diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 0bda08fb2f16..9cf2c99eb668 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -32,8 +32,6 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto);
 
 void dsa_tag_8021q_unregister(struct dsa_switch *ds);
 
-int dsa_8021q_setup(struct dsa_switch *ds, bool enabled);
-
 int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port,
 				    struct dsa_switch *other_ds,
 				    int other_port);
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 4a11c5004783..9785c8497039 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -257,7 +257,7 @@ static int dsa_8021q_setup_port(struct dsa_switch *ds, int port, bool enabled)
 	return err;
 }
 
-int dsa_8021q_setup(struct dsa_switch *ds, bool enabled)
+static int dsa_8021q_setup(struct dsa_switch *ds, bool enabled)
 {
 	int err, port;
 
@@ -275,7 +275,6 @@ int dsa_8021q_setup(struct dsa_switch *ds, bool enabled)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(dsa_8021q_setup);
 
 static int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port,
 					  struct dsa_switch *other_ds,
@@ -427,7 +426,7 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto)
 
 	ds->tag_8021q_ctx = ctx;
 
-	return 0;
+	return dsa_8021q_setup(ds, true);
 }
 EXPORT_SYMBOL_GPL(dsa_tag_8021q_register);
 
@@ -435,6 +434,12 @@ void dsa_tag_8021q_unregister(struct dsa_switch *ds)
 {
 	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
 	struct dsa_8021q_crosschip_link *c, *n;
+	int err;
+
+	err = dsa_8021q_setup(ds, false);
+	if (err)
+		dev_err(ds->dev, "failed to tear down tag_8021q VLANs: %pe\n",
+			ERR_PTR(err));
 
 	list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) {
 		list_del(&c->list);
-- 
cgit v1.2.3


From c64b9c05045a21a5258f6dbd81d94a2a22ff73a2 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 19 Jul 2021 20:14:52 +0300
Subject: net: dsa: tag_8021q: add proper cross-chip notifier support

The big problem which mandates cross-chip notifiers for tag_8021q is
this:

                                             |
    sw0p0     sw0p1     sw0p2     sw0p3     sw0p4
 [  user ] [  user ] [  user ] [  dsa  ] [  cpu  ]
                                   |
                                   +---------+
                                             |
    sw1p0     sw1p1     sw1p2     sw1p3     sw1p4
 [  user ] [  user ] [  user ] [  dsa  ] [  dsa  ]
                                   |
                                   +---------+
                                             |
    sw2p0     sw2p1     sw2p2     sw2p3     sw2p4
 [  user ] [  user ] [  user ] [  dsa  ] [  dsa  ]

When the user runs:

ip link add br0 type bridge
ip link set sw0p0 master br0
ip link set sw2p0 master br0

It doesn't work.

This is because dsa_8021q_crosschip_bridge_join() assumes that "ds" and
"other_ds" are at most 1 hop away from each other, so it is sufficient
to add the RX VLAN of {ds, port} into {other_ds, other_port} and vice
versa and presto, the cross-chip link works. When there is another
switch in the middle, such as in this case switch 1 with its DSA links
sw1p3 and sw1p4, somebody needs to tell it about these VLANs too.

Which is exactly why the problem is quadratic: when a port joins a
bridge, for each port in the tree that's already in that same bridge we
notify a tag_8021q VLAN addition of that port's RX VLAN to the entire
tree. It is a very complicated web of VLANs.

It must be mentioned that currently we install tag_8021q VLANs on too
many ports (DSA links - to be precise, on all of them). For example,
when sw2p0 joins br0, and assuming sw1p0 was part of br0 too, we add the
RX VLAN of sw2p0 on the DSA links of switch 0 too, even though there
isn't any port of switch 0 that is a member of br0 (at least yet).
In theory we could notify only the switches which sit in between the
port joining the bridge and the port reacting to that bridge_join event.
But in practice that is impossible, because of the way 'link' properties
are described in the device tree. The DSA bindings require DT writers to
list out not only the real/physical DSA links, but in fact the entire
routing table, like for example switch 0 above will have:

	sw0p3: port@3 {
		link = <&sw1p4 &sw2p4>;
	};

This was done because:

/* TODO: ideally DSA ports would have a single dp->link_dp member,
 * and no dst->rtable nor this struct dsa_link would be needed,
 * but this would require some more complex tree walking,
 * so keep it stupid at the moment and list them all.
 */

but it is a perfect example of a situation where too much information is
actively detrimential, because we are now in the position where we
cannot distinguish a real DSA link from one that is put there to avoid
the 'complex tree walking'. And because DT is ABI, there is not much we
can change.

And because we do not know which DSA links are real and which ones
aren't, we can't really know if DSA switch A is in the data path between
switches B and C, in the general case.

So this is why tag_8021q RX VLANs are added on all DSA links, and
probably why it will never change.

On the other hand, at least the number of additions/deletions is well
balanced, and this means that once we implement reference counting at
the cross-chip notifier level a la fdb/mdb, there is absolutely zero
need for a struct dsa_8021q_crosschip_link, it's all self-managing.

In fact, with the tag_8021q notifiers emitted from the bridge join
notifiers, it becomes so generic that sja1105 does not need to do
anything anymore, we can just delete its implementation of the
.crosschip_bridge_{join,leave} methods.

Among other things we can simply delete is the home-grown implementation
of sja1105_notify_crosschip_switches(). The reason why that is wrong is
because it is not quadratic - it only covers remote switches to which we
have a cross-chip bridging link and that does not cover in-between
switches. This deletion is part of the same patch because sja1105 used
to poke deep inside the guts of the tag_8021q context in order to do
that. Because the cross-chip links went away, so needs the sja1105 code.

Last but not least, dsa_8021q_setup_port() is simplified (and also
renamed). Because our TAG_8021Q_VLAN_ADD notifier is designed to react
on the CPU port too, the four dsa_8021q_vid_apply() calls:
- 1 for RX VLAN on user port
- 1 for the user port's RX VLAN on the CPU port
- 1 for TX VLAN on user port
- 1 for the user port's TX VLAN on the CPU port

now get squashed into only 2 notifier calls via
dsa_port_tag_8021q_vlan_add.

And because the notifiers to add and to delete a tag_8021q VLAN are
distinct, now we finally break up the port setup and teardown into
separate functions instead of relying on a "bool enabled" flag which
tells us what to do. Arguably it should have been this way from the
get go.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105_main.c | 132 +----------
 include/linux/dsa/8021q.h              |  16 +-
 net/dsa/dsa_priv.h                     |  16 ++
 net/dsa/port.c                         |  28 +++
 net/dsa/switch.c                       |   6 +
 net/dsa/tag_8021q.c                    | 398 ++++++++++++++++-----------------
 6 files changed, 256 insertions(+), 340 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 6b56c1ada3ee..6618abba23b3 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1990,61 +1990,6 @@ static int sja1105_pvid_apply(struct sja1105_private *priv, int port, u16 pvid)
 					   &mac[port], true);
 }
 
-static int sja1105_crosschip_bridge_join(struct dsa_switch *ds,
-					 int tree_index, int sw_index,
-					 int other_port, struct net_device *br)
-{
-	struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index);
-	int port, rc;
-
-	if (other_ds->ops != &sja1105_switch_ops)
-		return 0;
-
-	for (port = 0; port < ds->num_ports; port++) {
-		if (!dsa_is_user_port(ds, port))
-			continue;
-		if (dsa_to_port(ds, port)->bridge_dev != br)
-			continue;
-
-		rc = dsa_8021q_crosschip_bridge_join(ds, port, other_ds,
-						     other_port);
-		if (rc)
-			return rc;
-
-		rc = dsa_8021q_crosschip_bridge_join(other_ds, other_port,
-						     ds, port);
-		if (rc)
-			return rc;
-	}
-
-	return 0;
-}
-
-static void sja1105_crosschip_bridge_leave(struct dsa_switch *ds,
-					   int tree_index, int sw_index,
-					   int other_port,
-					   struct net_device *br)
-{
-	struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index);
-	int port;
-
-	if (other_ds->ops != &sja1105_switch_ops)
-		return;
-
-	for (port = 0; port < ds->num_ports; port++) {
-		if (!dsa_is_user_port(ds, port))
-			continue;
-		if (dsa_to_port(ds, port)->bridge_dev != br)
-			continue;
-
-		dsa_8021q_crosschip_bridge_leave(ds, port, other_ds,
-						 other_port);
-
-		dsa_8021q_crosschip_bridge_leave(other_ds, other_port,
-						 ds, port);
-	}
-}
-
 static enum dsa_tag_protocol
 sja1105_get_tag_protocol(struct dsa_switch *ds, int port,
 			 enum dsa_tag_protocol mp)
@@ -2135,11 +2080,6 @@ static int sja1105_commit_vlans(struct sja1105_private *priv,
 	return 0;
 }
 
-struct sja1105_crosschip_switch {
-	struct list_head list;
-	struct dsa_8021q_context *other_ctx;
-};
-
 static int sja1105_commit_pvid(struct sja1105_private *priv)
 {
 	struct sja1105_bridge_vlan *v;
@@ -2205,59 +2145,7 @@ sja1105_build_dsa_8021q_vlans(struct sja1105_private *priv,
 	return 0;
 }
 
-static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify);
-
-static int sja1105_notify_crosschip_switches(struct sja1105_private *priv)
-{
-	struct dsa_8021q_context *ctx = priv->ds->tag_8021q_ctx;
-	struct sja1105_crosschip_switch *s, *pos;
-	struct list_head crosschip_switches;
-	struct dsa_8021q_crosschip_link *c;
-	int rc = 0;
-
-	INIT_LIST_HEAD(&crosschip_switches);
-
-	list_for_each_entry(c, &ctx->crosschip_links, list) {
-		bool already_added = false;
-
-		list_for_each_entry(s, &crosschip_switches, list) {
-			if (s->other_ctx == c->other_ctx) {
-				already_added = true;
-				break;
-			}
-		}
-
-		if (already_added)
-			continue;
-
-		s = kzalloc(sizeof(*s), GFP_KERNEL);
-		if (!s) {
-			dev_err(priv->ds->dev, "Failed to allocate memory\n");
-			rc = -ENOMEM;
-			goto out;
-		}
-		s->other_ctx = c->other_ctx;
-		list_add(&s->list, &crosschip_switches);
-	}
-
-	list_for_each_entry(s, &crosschip_switches, list) {
-		struct sja1105_private *other_priv = s->other_ctx->ds->priv;
-
-		rc = sja1105_build_vlan_table(other_priv, false);
-		if (rc)
-			goto out;
-	}
-
-out:
-	list_for_each_entry_safe(s, pos, &crosschip_switches, list) {
-		list_del(&s->list);
-		kfree(s);
-	}
-
-	return rc;
-}
-
-static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify)
+static int sja1105_build_vlan_table(struct sja1105_private *priv)
 {
 	struct sja1105_vlan_lookup_entry *new_vlan;
 	struct sja1105_table *table;
@@ -2296,12 +2184,6 @@ static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify)
 	if (rc)
 		goto out;
 
-	if (notify) {
-		rc = sja1105_notify_crosschip_switches(priv);
-		if (rc)
-			goto out;
-	}
-
 out:
 	kfree(new_vlan);
 
@@ -2389,7 +2271,7 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled,
 	l2_lookup_params = table->entries;
 	l2_lookup_params->shared_learn = !priv->vlan_aware;
 
-	rc = sja1105_build_vlan_table(priv, false);
+	rc = sja1105_build_vlan_table(priv);
 	if (rc)
 		return rc;
 
@@ -2485,7 +2367,7 @@ static int sja1105_vlan_add(struct dsa_switch *ds, int port,
 	if (!vlan_table_changed)
 		return 0;
 
-	return sja1105_build_vlan_table(priv, true);
+	return sja1105_build_vlan_table(priv);
 }
 
 static int sja1105_vlan_del(struct dsa_switch *ds, int port,
@@ -2502,7 +2384,7 @@ static int sja1105_vlan_del(struct dsa_switch *ds, int port,
 	if (!vlan_table_changed)
 		return 0;
 
-	return sja1105_build_vlan_table(priv, true);
+	return sja1105_build_vlan_table(priv);
 }
 
 static int sja1105_dsa_8021q_vlan_add(struct dsa_switch *ds, int port, u16 vid,
@@ -2515,7 +2397,7 @@ static int sja1105_dsa_8021q_vlan_add(struct dsa_switch *ds, int port, u16 vid,
 	if (rc <= 0)
 		return rc;
 
-	return sja1105_build_vlan_table(priv, true);
+	return sja1105_build_vlan_table(priv);
 }
 
 static int sja1105_dsa_8021q_vlan_del(struct dsa_switch *ds, int port, u16 vid)
@@ -2527,7 +2409,7 @@ static int sja1105_dsa_8021q_vlan_del(struct dsa_switch *ds, int port, u16 vid)
 	if (!rc)
 		return 0;
 
-	return sja1105_build_vlan_table(priv, true);
+	return sja1105_build_vlan_table(priv);
 }
 
 /* The programming model for the SJA1105 switch is "all-at-once" via static
@@ -3132,8 +3014,6 @@ static const struct dsa_switch_ops sja1105_switch_ops = {
 	.cls_flower_add		= sja1105_cls_flower_add,
 	.cls_flower_del		= sja1105_cls_flower_del,
 	.cls_flower_stats	= sja1105_cls_flower_stats,
-	.crosschip_bridge_join	= sja1105_crosschip_bridge_join,
-	.crosschip_bridge_leave	= sja1105_crosschip_bridge_leave,
 	.devlink_info_get	= sja1105_devlink_info_get,
 	.tag_8021q_vlan_add	= sja1105_dsa_8021q_vlan_add,
 	.tag_8021q_vlan_del	= sja1105_dsa_8021q_vlan_del,
diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 9cf2c99eb668..ec5abfcdefd1 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -11,19 +11,17 @@
 struct dsa_switch;
 struct sk_buff;
 struct net_device;
-struct dsa_8021q_context;
 
-struct dsa_8021q_crosschip_link {
+struct dsa_tag_8021q_vlan {
 	struct list_head list;
 	int port;
-	struct dsa_8021q_context *other_ctx;
-	int other_port;
+	u16 vid;
 	refcount_t refcount;
 };
 
 struct dsa_8021q_context {
 	struct dsa_switch *ds;
-	struct list_head crosschip_links;
+	struct list_head vlans;
 	/* EtherType of RX VID, used for filtering on master interface */
 	__be16 proto;
 };
@@ -32,14 +30,6 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto);
 
 void dsa_tag_8021q_unregister(struct dsa_switch *ds);
 
-int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port,
-				    struct dsa_switch *other_ds,
-				    int other_port);
-
-int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port,
-				     struct dsa_switch *other_ds,
-				     int other_port);
-
 struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 			       u16 tpid, u16 tci);
 
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 28c4d1107b6d..efd6bca78d2f 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -39,6 +39,8 @@ enum {
 	DSA_NOTIFIER_MRP_DEL,
 	DSA_NOTIFIER_MRP_ADD_RING_ROLE,
 	DSA_NOTIFIER_MRP_DEL_RING_ROLE,
+	DSA_NOTIFIER_TAG_8021Q_VLAN_ADD,
+	DSA_NOTIFIER_TAG_8021Q_VLAN_DEL,
 };
 
 /* DSA_NOTIFIER_AGEING_TIME */
@@ -113,6 +115,14 @@ struct dsa_notifier_mrp_ring_role_info {
 	int port;
 };
 
+/* DSA_NOTIFIER_TAG_8021Q_VLAN_* */
+struct dsa_notifier_tag_8021q_vlan_info {
+	int tree_index;
+	int sw_index;
+	int port;
+	u16 vid;
+};
+
 struct dsa_switchdev_event_work {
 	struct dsa_switch *ds;
 	int port;
@@ -253,6 +263,8 @@ int dsa_port_link_register_of(struct dsa_port *dp);
 void dsa_port_link_unregister_of(struct dsa_port *dp);
 int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr);
 void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr);
+int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid);
+void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid);
 extern const struct phylink_mac_ops dsa_port_phylink_mac_ops;
 
 static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp,
@@ -391,6 +403,10 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds,
 			      struct dsa_notifier_bridge_info *info);
 int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds,
 			       struct dsa_notifier_bridge_info *info);
+int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds,
+				  struct dsa_notifier_tag_8021q_vlan_info *info);
+int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds,
+				  struct dsa_notifier_tag_8021q_vlan_info *info);
 
 extern struct list_head dsa_tree_list;
 
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 28b45b7e66df..982e18771d76 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -1217,3 +1217,31 @@ void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr)
 	if (err)
 		pr_err("DSA: failed to notify DSA_NOTIFIER_HSR_LEAVE\n");
 }
+
+int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid)
+{
+	struct dsa_notifier_tag_8021q_vlan_info info = {
+		.tree_index = dp->ds->dst->index,
+		.sw_index = dp->ds->index,
+		.port = dp->index,
+		.vid = vid,
+	};
+
+	return dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, &info);
+}
+
+void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid)
+{
+	struct dsa_notifier_tag_8021q_vlan_info info = {
+		.tree_index = dp->ds->dst->index,
+		.sw_index = dp->ds->index,
+		.port = dp->index,
+		.vid = vid,
+	};
+	int err;
+
+	err = dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, &info);
+	if (err)
+		pr_err("DSA: failed to notify tag_8021q VLAN deletion: %pe\n",
+		       ERR_PTR(err));
+}
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 38560de99b80..fd1a1c6bf9cf 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -734,6 +734,12 @@ static int dsa_switch_event(struct notifier_block *nb,
 	case DSA_NOTIFIER_MRP_DEL_RING_ROLE:
 		err = dsa_switch_mrp_del_ring_role(ds, info);
 		break;
+	case DSA_NOTIFIER_TAG_8021Q_VLAN_ADD:
+		err = dsa_switch_tag_8021q_vlan_add(ds, info);
+		break;
+	case DSA_NOTIFIER_TAG_8021Q_VLAN_DEL:
+		err = dsa_switch_tag_8021q_vlan_del(ds, info);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 0946169033a5..51dcde7db26b 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -107,21 +107,152 @@ bool vid_is_dsa_8021q(u16 vid)
 }
 EXPORT_SYMBOL_GPL(vid_is_dsa_8021q);
 
-/* If @enabled is true, installs @vid with @flags into the switch port's HW
- * filter.
- * If @enabled is false, deletes @vid (ignores @flags) from the port. Had the
- * user explicitly configured this @vid through the bridge core, then the @vid
- * is installed again, but this time with the flags from the bridge layer.
- */
-static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid,
-			       u16 flags, bool enabled)
+static struct dsa_tag_8021q_vlan *
+dsa_tag_8021q_vlan_find(struct dsa_8021q_context *ctx, int port, u16 vid)
+{
+	struct dsa_tag_8021q_vlan *v;
+
+	list_for_each_entry(v, &ctx->vlans, list)
+		if (v->vid == vid && v->port == port)
+			return v;
+
+	return NULL;
+}
+
+static int dsa_switch_do_tag_8021q_vlan_add(struct dsa_switch *ds, int port,
+					    u16 vid, u16 flags)
 {
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
 	struct dsa_port *dp = dsa_to_port(ds, port);
+	struct dsa_tag_8021q_vlan *v;
+	int err;
+
+	/* No need to bother with refcounting for user ports */
+	if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
+		return ds->ops->tag_8021q_vlan_add(ds, port, vid, flags);
+
+	v = dsa_tag_8021q_vlan_find(ctx, port, vid);
+	if (v) {
+		refcount_inc(&v->refcount);
+		return 0;
+	}
+
+	v = kzalloc(sizeof(*v), GFP_KERNEL);
+	if (!v)
+		return -ENOMEM;
+
+	err = ds->ops->tag_8021q_vlan_add(ds, port, vid, flags);
+	if (err) {
+		kfree(v);
+		return err;
+	}
+
+	v->vid = vid;
+	v->port = port;
+	refcount_set(&v->refcount, 1);
+	list_add_tail(&v->list, &ctx->vlans);
+
+	return 0;
+}
+
+static int dsa_switch_do_tag_8021q_vlan_del(struct dsa_switch *ds, int port,
+					    u16 vid)
+{
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+	struct dsa_port *dp = dsa_to_port(ds, port);
+	struct dsa_tag_8021q_vlan *v;
+	int err;
+
+	/* No need to bother with refcounting for user ports */
+	if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
+		return ds->ops->tag_8021q_vlan_del(ds, port, vid);
+
+	v = dsa_tag_8021q_vlan_find(ctx, port, vid);
+	if (!v)
+		return -ENOENT;
+
+	if (!refcount_dec_and_test(&v->refcount))
+		return 0;
+
+	err = ds->ops->tag_8021q_vlan_del(ds, port, vid);
+	if (err) {
+		refcount_inc(&v->refcount);
+		return err;
+	}
+
+	list_del(&v->list);
+	kfree(v);
+
+	return 0;
+}
 
-	if (enabled)
-		return ds->ops->tag_8021q_vlan_add(ds, dp->index, vid, flags);
+static bool
+dsa_switch_tag_8021q_vlan_match(struct dsa_switch *ds, int port,
+				struct dsa_notifier_tag_8021q_vlan_info *info)
+{
+	if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port))
+		return true;
+
+	if (ds->dst->index == info->tree_index && ds->index == info->sw_index)
+		return port == info->port;
+
+	return false;
+}
+
+int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds,
+				  struct dsa_notifier_tag_8021q_vlan_info *info)
+{
+	int port, err;
+
+	/* Since we use dsa_broadcast(), there might be other switches in other
+	 * trees which don't support tag_8021q, so don't return an error.
+	 * Or they might even support tag_8021q but have not registered yet to
+	 * use it (maybe they use another tagger currently).
+	 */
+	if (!ds->ops->tag_8021q_vlan_add || !ds->tag_8021q_ctx)
+		return 0;
 
-	return ds->ops->tag_8021q_vlan_del(ds, dp->index, vid);
+	for (port = 0; port < ds->num_ports; port++) {
+		if (dsa_switch_tag_8021q_vlan_match(ds, port, info)) {
+			u16 flags = 0;
+
+			if (dsa_is_user_port(ds, port))
+				flags |= BRIDGE_VLAN_INFO_UNTAGGED;
+
+			if (vid_is_dsa_8021q_rxvlan(info->vid) &&
+			    dsa_8021q_rx_switch_id(info->vid) == ds->index &&
+			    dsa_8021q_rx_source_port(info->vid) == port)
+				flags |= BRIDGE_VLAN_INFO_PVID;
+
+			err = dsa_switch_do_tag_8021q_vlan_add(ds, port,
+							       info->vid,
+							       flags);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds,
+				  struct dsa_notifier_tag_8021q_vlan_info *info)
+{
+	int port, err;
+
+	if (!ds->ops->tag_8021q_vlan_del || !ds->tag_8021q_ctx)
+		return 0;
+
+	for (port = 0; port < ds->num_ports; port++) {
+		if (dsa_switch_tag_8021q_vlan_match(ds, port, info)) {
+			err = dsa_switch_do_tag_8021q_vlan_del(ds, port,
+							       info->vid);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
 }
 
 /* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single
@@ -192,6 +323,7 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds,
 			      struct dsa_notifier_bridge_info *info)
 {
 	struct dsa_switch *targeted_ds;
+	struct dsa_port *targeted_dp;
 	u16 targeted_rx_vid;
 	int err, port;
 
@@ -199,23 +331,23 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds,
 		return 0;
 
 	targeted_ds = dsa_switch_find(info->tree_index, info->sw_index);
+	targeted_dp = dsa_to_port(targeted_ds, info->port);
 	targeted_rx_vid = dsa_8021q_rx_vid(targeted_ds, info->port);
 
 	for (port = 0; port < ds->num_ports; port++) {
+		struct dsa_port *dp = dsa_to_port(ds, port);
 		u16 rx_vid = dsa_8021q_rx_vid(ds, port);
 
 		if (!dsa_tag_8021q_bridge_match(ds, port, info))
 			continue;
 
 		/* Install the RX VID of the targeted port in our VLAN table */
-		err = dsa_8021q_vid_apply(ds, port, targeted_rx_vid,
-					  BRIDGE_VLAN_INFO_UNTAGGED, true);
+		err = dsa_port_tag_8021q_vlan_add(dp, targeted_rx_vid);
 		if (err)
 			return err;
 
 		/* Install our RX VID into the targeted port's VLAN table */
-		err = dsa_8021q_vid_apply(targeted_ds, info->port, rx_vid,
-					  BRIDGE_VLAN_INFO_UNTAGGED, true);
+		err = dsa_port_tag_8021q_vlan_add(targeted_dp, rx_vid);
 		if (err)
 			return err;
 	}
@@ -227,46 +359,39 @@ int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds,
 			       struct dsa_notifier_bridge_info *info)
 {
 	struct dsa_switch *targeted_ds;
+	struct dsa_port *targeted_dp;
 	u16 targeted_rx_vid;
-	int err, port;
+	int port;
 
 	if (!ds->tag_8021q_ctx)
 		return 0;
 
 	targeted_ds = dsa_switch_find(info->tree_index, info->sw_index);
+	targeted_dp = dsa_to_port(targeted_ds, info->port);
 	targeted_rx_vid = dsa_8021q_rx_vid(targeted_ds, info->port);
 
 	for (port = 0; port < ds->num_ports; port++) {
+		struct dsa_port *dp = dsa_to_port(ds, port);
 		u16 rx_vid = dsa_8021q_rx_vid(ds, port);
 
 		if (!dsa_tag_8021q_bridge_match(ds, port, info))
 			continue;
 
 		/* Remove the RX VID of the targeted port from our VLAN table */
-		err = dsa_8021q_vid_apply(ds, port, targeted_rx_vid,
-					  BRIDGE_VLAN_INFO_UNTAGGED, false);
-		if (err)
-			dev_err(ds->dev,
-				"port %d failed to delete tag_8021q VLAN: %pe\n",
-				port, ERR_PTR(err));
+		dsa_port_tag_8021q_vlan_del(dp, targeted_rx_vid);
 
 		/* Remove our RX VID from the targeted port's VLAN table */
-		err = dsa_8021q_vid_apply(targeted_ds, info->port, rx_vid,
-					  BRIDGE_VLAN_INFO_UNTAGGED, false);
-		if (err)
-			dev_err(targeted_ds->dev,
-				"port %d failed to delete tag_8021q VLAN: %pe\n",
-				info->port, ERR_PTR(err));
+		dsa_port_tag_8021q_vlan_del(targeted_dp, rx_vid);
 	}
 
 	return 0;
 }
 
 /* Set up a port's tag_8021q RX and TX VLAN for standalone mode operation */
-static int dsa_8021q_setup_port(struct dsa_switch *ds, int port, bool enabled)
+static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port)
 {
 	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
-	int upstream = dsa_upstream_port(ds, port);
+	struct dsa_port *dp = dsa_to_port(ds, port);
 	u16 rx_vid = dsa_8021q_rx_vid(ds, port);
 	u16 tx_vid = dsa_8021q_tx_vid(ds, port);
 	struct net_device *master;
@@ -275,29 +400,17 @@ static int dsa_8021q_setup_port(struct dsa_switch *ds, int port, bool enabled)
 	/* The CPU port is implicitly configured by
 	 * configuring the front-panel ports
 	 */
-	if (!dsa_is_user_port(ds, port))
+	if (!dsa_port_is_user(dp))
 		return 0;
 
-	master = dsa_to_port(ds, port)->cpu_dp->master;
+	master = dp->cpu_dp->master;
 
 	/* Add this user port's RX VID to the membership list of all others
 	 * (including itself). This is so that bridging will not be hindered.
 	 * L2 forwarding rules still take precedence when there are no VLAN
 	 * restrictions, so there are no concerns about leaking traffic.
 	 */
-	err = dsa_8021q_vid_apply(ds, port, rx_vid, BRIDGE_VLAN_INFO_UNTAGGED |
-				  BRIDGE_VLAN_INFO_PVID, enabled);
-	if (err) {
-		dev_err(ds->dev,
-			"Failed to apply RX VID %d to port %d: %pe\n",
-			rx_vid, port, ERR_PTR(err));
-		return err;
-	}
-
-	/* CPU port needs to see this port's RX VID
-	 * as tagged egress.
-	 */
-	err = dsa_8021q_vid_apply(ds, upstream, rx_vid, 0, enabled);
+	err = dsa_port_tag_8021q_vlan_add(dp, rx_vid);
 	if (err) {
 		dev_err(ds->dev,
 			"Failed to apply RX VID %d to port %d: %pe\n",
@@ -306,39 +419,51 @@ static int dsa_8021q_setup_port(struct dsa_switch *ds, int port, bool enabled)
 	}
 
 	/* Add @rx_vid to the master's RX filter. */
-	if (enabled)
-		vlan_vid_add(master, ctx->proto, rx_vid);
-	else
-		vlan_vid_del(master, ctx->proto, rx_vid);
+	vlan_vid_add(master, ctx->proto, rx_vid);
 
 	/* Finally apply the TX VID on this port and on the CPU port */
-	err = dsa_8021q_vid_apply(ds, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED,
-				  enabled);
+	err = dsa_port_tag_8021q_vlan_add(dp, tx_vid);
 	if (err) {
 		dev_err(ds->dev,
 			"Failed to apply TX VID %d on port %d: %pe\n",
 			tx_vid, port, ERR_PTR(err));
 		return err;
 	}
-	err = dsa_8021q_vid_apply(ds, upstream, tx_vid, 0, enabled);
-	if (err) {
-		dev_err(ds->dev,
-			"Failed to apply TX VID %d on port %d: %pe\n",
-			tx_vid, upstream, ERR_PTR(err));
-		return err;
-	}
 
 	return err;
 }
 
-static int dsa_8021q_setup(struct dsa_switch *ds, bool enabled)
+static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port)
+{
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+	struct dsa_port *dp = dsa_to_port(ds, port);
+	u16 rx_vid = dsa_8021q_rx_vid(ds, port);
+	u16 tx_vid = dsa_8021q_tx_vid(ds, port);
+	struct net_device *master;
+
+	/* The CPU port is implicitly configured by
+	 * configuring the front-panel ports
+	 */
+	if (!dsa_port_is_user(dp))
+		return;
+
+	master = dp->cpu_dp->master;
+
+	dsa_port_tag_8021q_vlan_del(dp, rx_vid);
+
+	vlan_vid_del(master, ctx->proto, rx_vid);
+
+	dsa_port_tag_8021q_vlan_del(dp, tx_vid);
+}
+
+static int dsa_tag_8021q_setup(struct dsa_switch *ds)
 {
 	int err, port;
 
 	ASSERT_RTNL();
 
 	for (port = 0; port < ds->num_ports; port++) {
-		err = dsa_8021q_setup_port(ds, port, enabled);
+		err = dsa_tag_8021q_port_setup(ds, port);
 		if (err < 0) {
 			dev_err(ds->dev,
 				"Failed to setup VLAN tagging for port %d: %pe\n",
@@ -350,140 +475,15 @@ static int dsa_8021q_setup(struct dsa_switch *ds, bool enabled)
 	return 0;
 }
 
-static int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port,
-					  struct dsa_switch *other_ds,
-					  int other_port, bool enabled)
+static void dsa_tag_8021q_teardown(struct dsa_switch *ds)
 {
-	u16 rx_vid = dsa_8021q_rx_vid(ds, port);
+	int port;
 
-	/* @rx_vid of local @ds port @port goes to @other_port of
-	 * @other_ds
-	 */
-	return dsa_8021q_vid_apply(other_ds, other_port, rx_vid,
-				   BRIDGE_VLAN_INFO_UNTAGGED, enabled);
-}
-
-static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port,
-					struct dsa_switch *other_ds,
-					int other_port)
-{
-	struct dsa_8021q_context *other_ctx = other_ds->tag_8021q_ctx;
-	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
-	struct dsa_8021q_crosschip_link *c;
-
-	list_for_each_entry(c, &ctx->crosschip_links, list) {
-		if (c->port == port && c->other_ctx == other_ctx &&
-		    c->other_port == other_port) {
-			refcount_inc(&c->refcount);
-			return 0;
-		}
-	}
-
-	dev_dbg(ds->dev,
-		"adding crosschip link from port %d to %s port %d\n",
-		port, dev_name(other_ds->dev), other_port);
-
-	c = kzalloc(sizeof(*c), GFP_KERNEL);
-	if (!c)
-		return -ENOMEM;
-
-	c->port = port;
-	c->other_ctx = other_ctx;
-	c->other_port = other_port;
-	refcount_set(&c->refcount, 1);
-
-	list_add(&c->list, &ctx->crosschip_links);
-
-	return 0;
-}
-
-static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds,
-					 struct dsa_8021q_crosschip_link *c,
-					 bool *keep)
-{
-	*keep = !refcount_dec_and_test(&c->refcount);
-
-	if (*keep)
-		return;
-
-	dev_dbg(ds->dev,
-		"deleting crosschip link from port %d to %s port %d\n",
-		c->port, dev_name(c->other_ctx->ds->dev), c->other_port);
-
-	list_del(&c->list);
-	kfree(c);
-}
-
-/* Make traffic from local port @port be received by remote port @other_port.
- * This means that our @rx_vid needs to be installed on @other_ds's upstream
- * and user ports. The user ports should be egress-untagged so that they can
- * pop the dsa_8021q VLAN. But the @other_upstream can be either egress-tagged
- * or untagged: it doesn't matter, since it should never egress a frame having
- * our @rx_vid.
- */
-int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port,
-				    struct dsa_switch *other_ds,
-				    int other_port)
-{
-	/* @other_upstream is how @other_ds reaches us. If we are part
-	 * of disjoint trees, then we are probably connected through
-	 * our CPU ports. If we're part of the same tree though, we should
-	 * probably use dsa_towards_port.
-	 */
-	int other_upstream = dsa_upstream_port(other_ds, other_port);
-	int err;
-
-	err = dsa_8021q_crosschip_link_add(ds, port, other_ds, other_port);
-	if (err)
-		return err;
-
-	err = dsa_8021q_crosschip_link_apply(ds, port, other_ds,
-					     other_port, true);
-	if (err)
-		return err;
-
-	err = dsa_8021q_crosschip_link_add(ds, port, other_ds, other_upstream);
-	if (err)
-		return err;
-
-	return dsa_8021q_crosschip_link_apply(ds, port, other_ds,
-					      other_upstream, true);
-}
-EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join);
-
-int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port,
-				     struct dsa_switch *other_ds,
-				     int other_port)
-{
-	struct dsa_8021q_context *other_ctx = other_ds->tag_8021q_ctx;
-	int other_upstream = dsa_upstream_port(other_ds, other_port);
-	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
-	struct dsa_8021q_crosschip_link *c, *n;
-
-	list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) {
-		if (c->port == port && c->other_ctx == other_ctx &&
-		    (c->other_port == other_port ||
-		     c->other_port == other_upstream)) {
-			int other_port = c->other_port;
-			bool keep;
-			int err;
-
-			dsa_8021q_crosschip_link_del(ds, c, &keep);
-			if (keep)
-				continue;
-
-			err = dsa_8021q_crosschip_link_apply(ds, port,
-							     other_ds,
-							     other_port,
-							     false);
-			if (err)
-				return err;
-		}
-	}
+	ASSERT_RTNL();
 
-	return 0;
+	for (port = 0; port < ds->num_ports; port++)
+		dsa_tag_8021q_port_teardown(ds, port);
 }
-EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_leave);
 
 int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto)
 {
@@ -496,28 +496,24 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto)
 	ctx->proto = proto;
 	ctx->ds = ds;
 
-	INIT_LIST_HEAD(&ctx->crosschip_links);
+	INIT_LIST_HEAD(&ctx->vlans);
 
 	ds->tag_8021q_ctx = ctx;
 
-	return dsa_8021q_setup(ds, true);
+	return dsa_tag_8021q_setup(ds);
 }
 EXPORT_SYMBOL_GPL(dsa_tag_8021q_register);
 
 void dsa_tag_8021q_unregister(struct dsa_switch *ds)
 {
 	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
-	struct dsa_8021q_crosschip_link *c, *n;
-	int err;
+	struct dsa_tag_8021q_vlan *v, *n;
 
-	err = dsa_8021q_setup(ds, false);
-	if (err)
-		dev_err(ds->dev, "failed to tear down tag_8021q VLANs: %pe\n",
-			ERR_PTR(err));
+	dsa_tag_8021q_teardown(ds);
 
-	list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) {
-		list_del(&c->list);
-		kfree(c);
+	list_for_each_entry_safe(v, n, &ctx->vlans, list) {
+		list_del(&v->list);
+		kfree(v);
 	}
 
 	ds->tag_8021q_ctx = NULL;
-- 
cgit v1.2.3


From 8b72b301b442907742c1af1b8fcb52e351a2aac1 Mon Sep 17 00:00:00 2001
From: Xu Liang <lxu@maxlinear.com>
Date: Mon, 19 Jul 2021 13:32:11 +0800
Subject: net: phy: add API to read 802.3-c45 IDs

Add API to read 802.3-c45 IDs so that C22/C45 mixed device can use
C45 APIs without failing ID checks.

Signed-off-by: Xu Liang <lxu@maxlinear.com>
Acked-by: Hauke Mehrtens <hmehrtens@maxlinear.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 14 ++++++++++++++
 include/linux/phy.h          |  1 +
 2 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 5d5f9a9ee768..107aa6d7bc6b 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -968,6 +968,20 @@ void phy_device_remove(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_device_remove);
 
+/**
+ * phy_get_c45_ids - Read 802.3-c45 IDs for phy device.
+ * @phydev: phy_device structure to read 802.3-c45 IDs
+ *
+ * Returns zero on success, %-EIO on bus access error, or %-ENODEV if
+ * the "devices in package" is invalid.
+ */
+int phy_get_c45_ids(struct phy_device *phydev)
+{
+	return get_phy_c45_ids(phydev->mdio.bus, phydev->mdio.addr,
+			       &phydev->c45_ids);
+}
+EXPORT_SYMBOL(phy_get_c45_ids);
+
 /**
  * phy_find_first - finds the first PHY device on the bus
  * @bus: the target MII bus
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3b80dc3ed68b..736e1d1a47c4 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1431,6 +1431,7 @@ static inline int phy_device_register(struct phy_device *phy)
 static inline void phy_device_free(struct phy_device *phydev) { }
 #endif /* CONFIG_PHYLIB */
 void phy_device_remove(struct phy_device *phydev);
+int phy_get_c45_ids(struct phy_device *phydev);
 int phy_init_hw(struct phy_device *phydev);
 int phy_suspend(struct phy_device *phydev);
 int phy_resume(struct phy_device *phydev);
-- 
cgit v1.2.3


From c6451cda100d4ebbc3f6819e1161ce0e38ce7746 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 19 Jul 2021 16:51:38 +0300
Subject: net: switchdev: introduce helper for checking dynamically learned FDB
 entries

It is a bit difficult to understand what DSA checks when it tries to
avoid installing dynamically learned addresses on foreign interfaces as
local host addresses, so create a generic switchdev helper that can be
reused and is generally more readable.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h | 6 ++++++
 net/dsa/slave.c         | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index e4cac9218ce1..745eb25fb8c4 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -238,6 +238,12 @@ switchdev_notifier_info_to_extack(const struct switchdev_notifier_info *info)
 	return info->extack;
 }
 
+static inline bool
+switchdev_fdb_is_dynamically_learned(const struct switchdev_notifier_fdb_info *fdb_info)
+{
+	return !fdb_info->added_by_user && !fdb_info->is_local;
+}
+
 #ifdef CONFIG_NET_SWITCHDEV
 
 void switchdev_deferred_process(void);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index ffbba1e71551..feb64f58faed 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2438,7 +2438,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
 			 * On the other hand, FDB entries for local termination
 			 * should always be installed.
 			 */
-			if (!fdb_info->added_by_user && !fdb_info->is_local &&
+			if (switchdev_fdb_is_dynamically_learned(fdb_info) &&
 			    !dp->ds->assisted_learning_on_cpu_port)
 				return NOTIFY_DONE;
 
-- 
cgit v1.2.3


From 8ca07176ab00a6d06a9b254dcbb2514b4d607e9c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 19 Jul 2021 16:51:39 +0300
Subject: net: switchdev: introduce a fanout helper for
 SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE

Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.

In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.

This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.

But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.

This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).

So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.

An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.

To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
  all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
  a LAG towards all the lower interfaces that pass the check_cb().

In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.

Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.

DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:

br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.

So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   |  56 ++++++++++++++
 net/switchdev/switchdev.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 246 insertions(+)

(limited to 'include')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 745eb25fb8c4..6f57eb2e89cc 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -272,6 +272,30 @@ void switchdev_port_fwd_mark_set(struct net_device *dev,
 				 struct net_device *group_dev,
 				 bool joining);
 
+int switchdev_handle_fdb_add_to_device(struct net_device *dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*add_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_add_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info));
+
+int switchdev_handle_fdb_del_to_device(struct net_device *dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*del_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_del_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info));
+
 int switchdev_handle_port_obj_add(struct net_device *dev,
 			struct switchdev_notifier_port_obj_info *port_obj_info,
 			bool (*check_cb)(const struct net_device *dev),
@@ -355,6 +379,38 @@ call_switchdev_blocking_notifiers(unsigned long val,
 	return NOTIFY_DONE;
 }
 
+static inline int
+switchdev_handle_fdb_add_to_device(struct net_device *dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*add_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_add_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info))
+{
+	return 0;
+}
+
+static inline int
+switchdev_handle_fdb_del_to_device(struct net_device *dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*del_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_del_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info));
+{
+	return 0;
+}
+
 static inline int
 switchdev_handle_port_obj_add(struct net_device *dev,
 			struct switchdev_notifier_port_obj_info *port_obj_info,
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 070698dd19bc..82dd4e4e86f5 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -378,6 +378,196 @@ int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers);
 
+static int __switchdev_handle_fdb_add_to_device(struct net_device *dev,
+		const struct net_device *orig_dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*add_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_add_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info))
+{
+	const struct switchdev_notifier_info *info = &fdb_info->info;
+	struct net_device *lower_dev;
+	struct list_head *iter;
+	int err = -EOPNOTSUPP;
+
+	if (check_cb(dev)) {
+		/* Handle FDB entries on foreign interfaces as FDB entries
+		 * towards the software bridge.
+		 */
+		if (foreign_dev_check_cb && foreign_dev_check_cb(dev, orig_dev)) {
+			struct net_device *br = netdev_master_upper_dev_get_rcu(dev);
+
+			if (!br || !netif_is_bridge_master(br))
+				return 0;
+
+			/* No point in handling FDB entries on a foreign bridge */
+			if (foreign_dev_check_cb(dev, br))
+				return 0;
+
+			return __switchdev_handle_fdb_add_to_device(br, orig_dev,
+								    fdb_info, check_cb,
+								    foreign_dev_check_cb,
+								    add_cb, lag_add_cb);
+		}
+
+		return add_cb(dev, orig_dev, info->ctx, fdb_info);
+	}
+
+	/* If we passed over the foreign check, it means that the LAG interface
+	 * is offloaded.
+	 */
+	if (netif_is_lag_master(dev)) {
+		if (!lag_add_cb)
+			return -EOPNOTSUPP;
+
+		return lag_add_cb(dev, orig_dev, info->ctx, fdb_info);
+	}
+
+	/* Recurse through lower interfaces in case the FDB entry is pointing
+	 * towards a bridge device.
+	 */
+	netdev_for_each_lower_dev(dev, lower_dev, iter) {
+		/* Do not propagate FDB entries across bridges */
+		if (netif_is_bridge_master(lower_dev))
+			continue;
+
+		err = __switchdev_handle_fdb_add_to_device(lower_dev, orig_dev,
+							   fdb_info, check_cb,
+							   foreign_dev_check_cb,
+							   add_cb, lag_add_cb);
+		if (err && err != -EOPNOTSUPP)
+			return err;
+	}
+
+	return err;
+}
+
+int switchdev_handle_fdb_add_to_device(struct net_device *dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*add_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_add_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info))
+{
+	int err;
+
+	err = __switchdev_handle_fdb_add_to_device(dev, dev, fdb_info,
+						   check_cb,
+						   foreign_dev_check_cb,
+						   add_cb, lag_add_cb);
+	if (err == -EOPNOTSUPP)
+		err = 0;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_handle_fdb_add_to_device);
+
+static int __switchdev_handle_fdb_del_to_device(struct net_device *dev,
+		const struct net_device *orig_dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*del_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_del_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info))
+{
+	const struct switchdev_notifier_info *info = &fdb_info->info;
+	struct net_device *lower_dev;
+	struct list_head *iter;
+	int err = -EOPNOTSUPP;
+
+	if (check_cb(dev)) {
+		/* Handle FDB entries on foreign interfaces as FDB entries
+		 * towards the software bridge.
+		 */
+		if (foreign_dev_check_cb && foreign_dev_check_cb(dev, orig_dev)) {
+			struct net_device *br = netdev_master_upper_dev_get_rcu(dev);
+
+			if (!br || !netif_is_bridge_master(br))
+				return 0;
+
+			/* No point in handling FDB entries on a foreign bridge */
+			if (foreign_dev_check_cb(dev, br))
+				return 0;
+
+			return __switchdev_handle_fdb_del_to_device(br, orig_dev,
+								    fdb_info, check_cb,
+								    foreign_dev_check_cb,
+								    del_cb, lag_del_cb);
+		}
+
+		return del_cb(dev, orig_dev, info->ctx, fdb_info);
+	}
+
+	/* If we passed over the foreign check, it means that the LAG interface
+	 * is offloaded.
+	 */
+	if (netif_is_lag_master(dev)) {
+		if (!lag_del_cb)
+			return -EOPNOTSUPP;
+
+		return lag_del_cb(dev, orig_dev, info->ctx, fdb_info);
+	}
+
+	/* Recurse through lower interfaces in case the FDB entry is pointing
+	 * towards a bridge device.
+	 */
+	netdev_for_each_lower_dev(dev, lower_dev, iter) {
+		/* Do not propagate FDB entries across bridges */
+		if (netif_is_bridge_master(lower_dev))
+			continue;
+
+		err = switchdev_handle_fdb_del_to_device(lower_dev, fdb_info,
+							 check_cb,
+							 foreign_dev_check_cb,
+							 del_cb, lag_del_cb);
+		if (err && err != -EOPNOTSUPP)
+			return err;
+	}
+
+	return err;
+}
+
+int switchdev_handle_fdb_del_to_device(struct net_device *dev,
+		const struct switchdev_notifier_fdb_info *fdb_info,
+		bool (*check_cb)(const struct net_device *dev),
+		bool (*foreign_dev_check_cb)(const struct net_device *dev,
+					     const struct net_device *foreign_dev),
+		int (*del_cb)(struct net_device *dev,
+			      const struct net_device *orig_dev, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info),
+		int (*lag_del_cb)(struct net_device *dev,
+				  const struct net_device *orig_dev, const void *ctx,
+				  const struct switchdev_notifier_fdb_info *fdb_info))
+{
+	int err;
+
+	err = __switchdev_handle_fdb_del_to_device(dev, dev, fdb_info,
+						   check_cb,
+						   foreign_dev_check_cb,
+						   del_cb, lag_del_cb);
+	if (err == -EOPNOTSUPP)
+		err = 0;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_handle_fdb_del_to_device);
+
 static int __switchdev_handle_port_obj_add(struct net_device *dev,
 			struct switchdev_notifier_port_obj_info *port_obj_info,
 			bool (*check_cb)(const struct net_device *dev),
-- 
cgit v1.2.3


From 2d151d39073aff498358543801fca0f670fea981 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Sun, 18 Jul 2021 09:11:06 +0200
Subject: xfrm: Add possibility to set the default to block if we have no
 policy

As the default we assume the traffic to pass, if we have no
matching IPsec policy. With this patch, we have a possibility to
change this default from allow to block. It can be configured
via netlink. Each direction (input/output/forward) can be
configured separately. With the default to block configuered,
we need allow policies for all packet flows we accept.
We do not use default policy lookup for the loopback device.

v1->v2
 - fix compiling when XFRM is disabled
 - Reported-by: kernel test robot <lkp@intel.com>

Co-developed-by: Christian Langrock <christian.langrock@secunet.com>
Signed-off-by: Christian Langrock <christian.langrock@secunet.com>
Co-developed-by: Antony Antony <antony.antony@secunet.com>
Signed-off-by: Antony Antony <antony.antony@secunet.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/netns/xfrm.h  |  7 +++++++
 include/net/xfrm.h        | 36 ++++++++++++++++++++++++++------
 include/uapi/linux/xfrm.h | 10 +++++++++
 net/xfrm/xfrm_policy.c    | 16 +++++++++++++++
 net/xfrm/xfrm_user.c      | 52 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 115 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index e946366e8ba5..88c647302977 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -65,6 +65,13 @@ struct netns_xfrm {
 	u32			sysctl_aevent_rseqth;
 	int			sysctl_larval_drop;
 	u32			sysctl_acq_expires;
+
+	u8			policy_default;
+#define XFRM_POL_DEFAULT_IN	1
+#define XFRM_POL_DEFAULT_OUT	2
+#define XFRM_POL_DEFAULT_FWD	4
+#define XFRM_POL_DEFAULT_MASK	7
+
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header	*sysctl_hdr;
 #endif
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index cbff7c2a9724..2308210793a0 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1075,6 +1075,22 @@ xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, un
 }
 
 #ifdef CONFIG_XFRM
+static inline bool
+xfrm_default_allow(struct net *net, int dir)
+{
+	u8 def = net->xfrm.policy_default;
+
+	switch (dir) {
+	case XFRM_POLICY_IN:
+		return def & XFRM_POL_DEFAULT_IN ? false : true;
+	case XFRM_POLICY_OUT:
+		return def & XFRM_POL_DEFAULT_OUT ? false : true;
+	case XFRM_POLICY_FWD:
+		return def & XFRM_POL_DEFAULT_FWD ? false : true;
+	}
+	return false;
+}
+
 int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb,
 			unsigned short family);
 
@@ -1088,9 +1104,13 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir,
 	if (sk && sk->sk_policy[XFRM_POLICY_IN])
 		return __xfrm_policy_check(sk, ndir, skb, family);
 
-	return	(!net->xfrm.policy_count[dir] && !secpath_exists(skb)) ||
-		(skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) ||
-		__xfrm_policy_check(sk, ndir, skb, family);
+	if (xfrm_default_allow(net, dir))
+		return (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) ||
+		       (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) ||
+		       __xfrm_policy_check(sk, ndir, skb, family);
+	else
+		return (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) ||
+		       __xfrm_policy_check(sk, ndir, skb, family);
 }
 
 static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family)
@@ -1142,9 +1162,13 @@ static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
 {
 	struct net *net = dev_net(skb->dev);
 
-	return	!net->xfrm.policy_count[XFRM_POLICY_OUT] ||
-		(skb_dst(skb)->flags & DST_NOXFRM) ||
-		__xfrm_route_forward(skb, family);
+	if (xfrm_default_allow(net, XFRM_POLICY_FWD))
+		return !net->xfrm.policy_count[XFRM_POLICY_OUT] ||
+			(skb_dst(skb)->flags & DST_NOXFRM) ||
+			__xfrm_route_forward(skb, family);
+	else
+		return (skb_dst(skb)->flags & DST_NOXFRM) ||
+			__xfrm_route_forward(skb, family);
 }
 
 static inline int xfrm4_route_forward(struct sk_buff *skb)
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index ffc6a5391bb7..6e8095106192 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -213,6 +213,11 @@ enum {
 	XFRM_MSG_GETSPDINFO,
 #define XFRM_MSG_GETSPDINFO XFRM_MSG_GETSPDINFO
 
+	XFRM_MSG_SETDEFAULT,
+#define XFRM_MSG_SETDEFAULT XFRM_MSG_SETDEFAULT
+	XFRM_MSG_GETDEFAULT,
+#define XFRM_MSG_GETDEFAULT XFRM_MSG_GETDEFAULT
+
 	XFRM_MSG_MAPPING,
 #define XFRM_MSG_MAPPING XFRM_MSG_MAPPING
 	__XFRM_MSG_MAX
@@ -508,6 +513,11 @@ struct xfrm_user_offload {
 #define XFRM_OFFLOAD_IPV6	1
 #define XFRM_OFFLOAD_INBOUND	2
 
+struct xfrm_userpolicy_default {
+	__u8				dirmask;
+	__u8				action;
+};
+
 #ifndef __KERNEL__
 /* backwards compatibility for userspace */
 #define XFRMGRP_ACQUIRE		1
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 827d84255021..d5cb082e11fc 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -3165,6 +3165,11 @@ ok:
 	return dst;
 
 nopol:
+	if (!(dst_orig->dev->flags & IFF_LOOPBACK) &&
+	    !xfrm_default_allow(net, dir)) {
+		err = -EPERM;
+		goto error;
+	}
 	if (!(flags & XFRM_LOOKUP_ICMP)) {
 		dst = dst_orig;
 		goto ok;
@@ -3553,6 +3558,11 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	}
 
 	if (!pol) {
+		if (!xfrm_default_allow(net, dir)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
+			return 0;
+		}
+
 		if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) {
 			xfrm_secpath_reject(xerr_idx, skb, &fl);
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
@@ -3607,6 +3617,12 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 				tpp[ti++] = &pols[pi]->xfrm_vec[i];
 		}
 		xfrm_nr = ti;
+
+		if (!xfrm_default_allow(net, dir) && !xfrm_nr) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
+			goto reject;
+		}
+
 		if (npols > 1) {
 			xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
 			tpp = stp;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index b47d613409b7..4eafd1130c3e 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1961,6 +1961,54 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb,
 	return skb;
 }
 
+static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh,
+			    struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_userpolicy_default *up = nlmsg_data(nlh);
+	u8 dirmask = (1 << up->dirmask) & XFRM_POL_DEFAULT_MASK;
+	u8 old_default = net->xfrm.policy_default;
+
+	net->xfrm.policy_default = (old_default & (0xff ^ dirmask))
+				    | (up->action << up->dirmask);
+
+	rt_genid_bump_all(net);
+
+	return 0;
+}
+
+static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh,
+			    struct nlattr **attrs)
+{
+	struct sk_buff *r_skb;
+	struct nlmsghdr *r_nlh;
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_userpolicy_default *r_up, *up;
+	int len = NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_default));
+	u32 portid = NETLINK_CB(skb).portid;
+	u32 seq = nlh->nlmsg_seq;
+
+	up = nlmsg_data(nlh);
+
+	r_skb = nlmsg_new(len, GFP_ATOMIC);
+	if (!r_skb)
+		return -ENOMEM;
+
+	r_nlh = nlmsg_put(r_skb, portid, seq, XFRM_MSG_GETDEFAULT, sizeof(*r_up), 0);
+	if (!r_nlh) {
+		kfree_skb(r_skb);
+		return -EMSGSIZE;
+	}
+
+	r_up = nlmsg_data(r_nlh);
+
+	r_up->action = ((net->xfrm.policy_default & (1 << up->dirmask)) >> up->dirmask);
+	r_up->dirmask = up->dirmask;
+	nlmsg_end(r_skb, r_nlh);
+
+	return nlmsg_unicast(net->xfrm.nlsk, r_skb, portid);
+}
+
 static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 		struct nlattr **attrs)
 {
@@ -2664,6 +2712,8 @@ const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
 	[XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = sizeof(u32),
 	[XFRM_MSG_NEWSPDINFO  - XFRM_MSG_BASE] = sizeof(u32),
 	[XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = sizeof(u32),
+	[XFRM_MSG_SETDEFAULT  - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default),
+	[XFRM_MSG_GETDEFAULT  - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default),
 };
 EXPORT_SYMBOL_GPL(xfrm_msg_min);
 
@@ -2743,6 +2793,8 @@ static const struct xfrm_link {
 						   .nla_pol = xfrma_spd_policy,
 						   .nla_max = XFRMA_SPD_MAX },
 	[XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo   },
+	[XFRM_MSG_SETDEFAULT  - XFRM_MSG_BASE] = { .doit = xfrm_set_default   },
+	[XFRM_MSG_GETDEFAULT  - XFRM_MSG_BASE] = { .doit = xfrm_get_default   },
 };
 
 static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
-- 
cgit v1.2.3


From 94111dfc18b8b8cb3c72006e0e7b31c038709ab4 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 20 Jul 2021 20:35:56 +0300
Subject: net: switchdev: remove stray semicolon in
 switchdev_handle_fdb_del_to_device shim

With the semicolon at the end, the compiler sees the shim function as a
declaration and not as a definition, and warns:

'switchdev_handle_fdb_del_to_device' declared 'static' but never defined

Reported-by: kernel test robot <lkp@intel.com>
Fixes: 8ca07176ab00 ("net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 6f57eb2e89cc..66468ff8cc0a 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -406,7 +406,7 @@ switchdev_handle_fdb_del_to_device(struct net_device *dev,
 			      const struct switchdev_notifier_fdb_info *fdb_info),
 		int (*lag_del_cb)(struct net_device *dev,
 				  const struct net_device *orig_dev, const void *ctx,
-				  const struct switchdev_notifier_fdb_info *fdb_info));
+				  const struct switchdev_notifier_fdb_info *fdb_info))
 {
 	return 0;
 }
-- 
cgit v1.2.3


From db67f219fc9365a0c456666ed7c134d43ab0be8a Mon Sep 17 00:00:00 2001
From: Justin Iurman <justin.iurman@uliege.be>
Date: Tue, 20 Jul 2021 21:42:56 +0200
Subject: uapi: IPv6 IOAM headers definition

This patch provides the IPv6 IOAM option header [1] as well as the IOAM
Trace header [2]. An IOAM option must be 4n-aligned. Here is an overview of
a Hop-by-Hop with an IOAM Trace option:

+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|  Next header  |  Hdr Ext Len  |    Padding    |    Padding    |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|  Option Type  |  Opt Data Len |    Reserved   |   IOAM Type   |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|         Namespace-ID          | NodeLen | Flags | RemainingLen|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|                IOAM-Trace-Type                |    Reserved   |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+<-+
|                                                               |  |
|                         node data [n]                         |  |
|                                                               |  |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+  D
|                                                               |  a
|                         node data [n-1]                       |  t
|                                                               |  a
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
~                             ...                               ~  S
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+  p
|                                                               |  a
|                         node data [1]                         |  c
|                                                               |  e
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+  |
|                                                               |  |
|                         node data [0]                         |  |
|                                                               |  |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+<-+

The IOAM option header starts at "Option Type" and ends after "IOAM
Type". The IOAM Trace header starts at "Namespace-ID" and ends after
"IOAM-Trace-Type/Reserved".

IOAM Type: either Pre-allocated Trace (=0), Incremental Trace (=1),
Proof-of-Transit (=2) or Edge-to-Edge (=3). Note that both the
Pre-allocated Trace and the Incremental Trace look the same. The two
others are not implemented.

Namespace-ID: IOAM namespace identifier, not to be confused with network
namespaces. It adds further context to IOAM options and associated data,
and allows devices which are IOAM capable to determine whether IOAM
options must be processed or ignored. It can also be used by an operator
to distinguish different operational domains or to identify different
sets of devices.

NodeLen: Length of data added by each node. It depends on the Trace
Type.

Flags: Only the Overflow (O) flag for now. The O flag is set by a
transit node when there are not enough octets left to record its data.

RemainingLen: Remaining free space to record data.

IOAM-Trace-Type: Bit field where each bit corresponds to a specific kind
of IOAM data. See [2] for a detailed list.

  [1] https://tools.ietf.org/html/draft-ietf-ippm-ioam-ipv6-options
  [2] https://tools.ietf.org/html/draft-ietf-ippm-ioam-data

Signed-off-by: Justin Iurman <justin.iurman@uliege.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ioam6.h | 123 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 include/uapi/linux/ioam6.h

(limited to 'include')

diff --git a/include/uapi/linux/ioam6.h b/include/uapi/linux/ioam6.h
new file mode 100644
index 000000000000..2177e4e49566
--- /dev/null
+++ b/include/uapi/linux/ioam6.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ *  IPv6 IOAM implementation
+ *
+ *  Author:
+ *  Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#ifndef _UAPI_LINUX_IOAM6_H
+#define _UAPI_LINUX_IOAM6_H
+
+#include <asm/byteorder.h>
+#include <linux/types.h>
+
+/*
+ * IPv6 IOAM Option Header
+ */
+struct ioam6_hdr {
+	__u8 opt_type;
+	__u8 opt_len;
+	__u8 :8;				/* reserved */
+#define IOAM6_TYPE_PREALLOC 0
+	__u8 type;
+} __attribute__((packed));
+
+/*
+ * IOAM Trace Header
+ */
+struct ioam6_trace_hdr {
+	__be16	namespace_id;
+
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+
+	__u8	:1,				/* unused */
+		:1,				/* unused */
+		overflow:1,
+		nodelen:5;
+
+	__u8	remlen:7,
+		:1;				/* unused */
+
+	union {
+		__be32 type_be32;
+
+		struct {
+			__u32	bit7:1,
+				bit6:1,
+				bit5:1,
+				bit4:1,
+				bit3:1,
+				bit2:1,
+				bit1:1,
+				bit0:1,
+				bit15:1,	/* unused */
+				bit14:1,	/* unused */
+				bit13:1,	/* unused */
+				bit12:1,	/* unused */
+				bit11:1,
+				bit10:1,
+				bit9:1,
+				bit8:1,
+				bit23:1,	/* reserved */
+				bit22:1,
+				bit21:1,	/* unused */
+				bit20:1,	/* unused */
+				bit19:1,	/* unused */
+				bit18:1,	/* unused */
+				bit17:1,	/* unused */
+				bit16:1,	/* unused */
+				:8;		/* reserved */
+		} type;
+	};
+
+#elif defined(__BIG_ENDIAN_BITFIELD)
+
+	__u8	nodelen:5,
+		overflow:1,
+		:1,				/* unused */
+		:1;				/* unused */
+
+	__u8	:1,				/* unused */
+		remlen:7;
+
+	union {
+		__be32 type_be32;
+
+		struct {
+			__u32	bit0:1,
+				bit1:1,
+				bit2:1,
+				bit3:1,
+				bit4:1,
+				bit5:1,
+				bit6:1,
+				bit7:1,
+				bit8:1,
+				bit9:1,
+				bit10:1,
+				bit11:1,
+				bit12:1,	/* unused */
+				bit13:1,	/* unused */
+				bit14:1,	/* unused */
+				bit15:1,	/* unused */
+				bit16:1,	/* unused */
+				bit17:1,	/* unused */
+				bit18:1,	/* unused */
+				bit19:1,	/* unused */
+				bit20:1,	/* unused */
+				bit21:1,	/* unused */
+				bit22:1,
+				bit23:1,	/* reserved */
+				:8;		/* reserved */
+		} type;
+	};
+
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+
+	__u8	data[0];
+} __attribute__((packed));
+
+#endif /* _UAPI_LINUX_IOAM6_H */
-- 
cgit v1.2.3


From 9ee11f0fff205b4b3df9750bff5e94f97c71b6a0 Mon Sep 17 00:00:00 2001
From: Justin Iurman <justin.iurman@uliege.be>
Date: Tue, 20 Jul 2021 21:42:57 +0200
Subject: ipv6: ioam: Data plane support for Pre-allocated Trace

Implement support for processing the IOAM Pre-allocated Trace with IPv6,
see [1] and [2]. Introduce a new IPv6 Hop-by-Hop TLV option, see IANA [3].

A new per-interface sysctl is introduced. The value is a boolean to accept (=1)
or ignore (=0, by default) IPv6 IOAM options on ingress for an interface:
 - net.ipv6.conf.XXX.ioam6_enabled

Two other sysctls are introduced to define IOAM IDs, represented by an integer.
They are respectively per-namespace and per-interface:
 - net.ipv6.ioam6_id
 - net.ipv6.conf.XXX.ioam6_id

The value of the first one represents the IOAM ID of the node itself (u32; max
and default value = U32_MAX>>8, due to hop limit concatenation) while the other
represents the IOAM ID of an interface (u16; max and default value = U16_MAX).

Each "ioam6_id" sysctl has a "_wide" equivalent:
 - net.ipv6.ioam6_id_wide
 - net.ipv6.conf.XXX.ioam6_id_wide

The value of the first one represents the wide IOAM ID of the node itself (u64;
max and default value = U64_MAX>>8, due to hop limit concatenation) while the
other represents the wide IOAM ID of an interface (u32; max and default value
= U32_MAX).

The use of short and wide equivalents is not exclusive, a deployment could
choose to leverage both. For example, net.ipv6.conf.XXX.ioam6_id (short format)
could be an identifier for a physical interface, whereas
net.ipv6.conf.XXX.ioam6_id_wide (wide format) could be an identifier for a
logical sub-interface. Documentation about new sysctls is provided at the end
of this patchset.

Two relativistic hash tables are used: one for IOAM namespaces, the other for
IOAM schemas. A namespace can only have a single active schema and a schema
can only be attached to a single namespace (1:1 relationship).

  [1] https://tools.ietf.org/html/draft-ietf-ippm-ioam-ipv6-options
  [2] https://tools.ietf.org/html/draft-ietf-ippm-ioam-data
  [3] https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml#ipv6-parameters-2

Signed-off-by: Justin Iurman <justin.iurman@uliege.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ioam6.h      |  13 ++
 include/linux/ipv6.h       |   3 +
 include/net/ioam6.h        |  64 +++++++++
 include/net/netns/ipv6.h   |   3 +
 include/uapi/linux/in6.h   |   1 +
 include/uapi/linux/ioam6.h |   9 ++
 include/uapi/linux/ipv6.h  |   3 +
 net/ipv6/Makefile          |   2 +-
 net/ipv6/addrconf.c        |  37 +++++
 net/ipv6/af_inet6.c        |  10 ++
 net/ipv6/exthdrs.c         |  61 +++++++++
 net/ipv6/ioam6.c           | 333 +++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/sysctl_net_ipv6.c |  19 +++
 13 files changed, 557 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/ioam6.h
 create mode 100644 include/net/ioam6.h
 create mode 100644 net/ipv6/ioam6.c

(limited to 'include')

diff --git a/include/linux/ioam6.h b/include/linux/ioam6.h
new file mode 100644
index 000000000000..94a24b36998f
--- /dev/null
+++ b/include/linux/ioam6.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ *  IPv6 IOAM
+ *
+ *  Author:
+ *  Justin Iurman <justin.iurman@uliege.be>
+ */
+#ifndef _LINUX_IOAM6_H
+#define _LINUX_IOAM6_H
+
+#include <uapi/linux/ioam6.h>
+
+#endif /* _LINUX_IOAM6_H */
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 70b2ad3b9884..ef4a69865737 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -76,6 +76,9 @@ struct ipv6_devconf {
 	__s32		disable_policy;
 	__s32           ndisc_tclass;
 	__s32		rpl_seg_enabled;
+	__u32		ioam6_id;
+	__u32		ioam6_id_wide;
+	__u8		ioam6_enabled;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/net/ioam6.h b/include/net/ioam6.h
new file mode 100644
index 000000000000..772b91ee2e87
--- /dev/null
+++ b/include/net/ioam6.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ *  IPv6 IOAM implementation
+ *
+ *  Author:
+ *  Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#ifndef _NET_IOAM6_H
+#define _NET_IOAM6_H
+
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/ioam6.h>
+#include <linux/rhashtable-types.h>
+
+struct ioam6_namespace {
+	struct rhash_head head;
+	struct rcu_head rcu;
+
+	struct ioam6_schema __rcu *schema;
+
+	__be16 id;
+	__be32 data;
+	__be64 data_wide;
+};
+
+struct ioam6_schema {
+	struct rhash_head head;
+	struct rcu_head rcu;
+
+	struct ioam6_namespace __rcu *ns;
+
+	u32 id;
+	int len;
+	__be32 hdr;
+
+	u8 data[0];
+};
+
+struct ioam6_pernet_data {
+	struct mutex lock;
+	struct rhashtable namespaces;
+	struct rhashtable schemas;
+};
+
+static inline struct ioam6_pernet_data *ioam6_pernet(struct net *net)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	return net->ipv6.ioam6_data;
+#else
+	return NULL;
+#endif
+}
+
+struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id);
+void ioam6_fill_trace_data(struct sk_buff *skb,
+			   struct ioam6_namespace *ns,
+			   struct ioam6_trace_hdr *trace);
+
+int ioam6_init(void);
+void ioam6_exit(void);
+
+#endif /* _NET_IOAM6_H */
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index bde0b7adb4a3..a4b550380316 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -51,6 +51,8 @@ struct netns_sysctl_ipv6 {
 	int max_dst_opts_len;
 	int max_hbh_opts_len;
 	int seg6_flowlabel;
+	u32 ioam6_id;
+	u64 ioam6_id_wide;
 	bool skip_notify_on_dev_down;
 	u8 fib_notify_on_flag_change;
 };
@@ -110,6 +112,7 @@ struct netns_ipv6 {
 		spinlock_t	lock;
 		u32		seq;
 	} ip6addrlbl_table;
+	struct ioam6_pernet_data *ioam6_data;
 };
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index 5ad396a57eb3..c4c53a9ab959 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -145,6 +145,7 @@ struct in6_flowlabel_req {
 #define IPV6_TLV_PADN		1
 #define IPV6_TLV_ROUTERALERT	5
 #define IPV6_TLV_CALIPSO	7	/* RFC 5570 */
+#define IPV6_TLV_IOAM		49	/* TEMPORARY IANA allocation for IOAM */
 #define IPV6_TLV_JUMBO		194
 #define IPV6_TLV_HAO		201	/* home address option */
 
diff --git a/include/uapi/linux/ioam6.h b/include/uapi/linux/ioam6.h
index 2177e4e49566..23ba6e85582f 100644
--- a/include/uapi/linux/ioam6.h
+++ b/include/uapi/linux/ioam6.h
@@ -12,6 +12,15 @@
 #include <asm/byteorder.h>
 #include <linux/types.h>
 
+#define IOAM6_U16_UNAVAILABLE U16_MAX
+#define IOAM6_U32_UNAVAILABLE U32_MAX
+#define IOAM6_U64_UNAVAILABLE U64_MAX
+
+#define IOAM6_DEFAULT_ID (IOAM6_U32_UNAVAILABLE >> 8)
+#define IOAM6_DEFAULT_ID_WIDE (IOAM6_U64_UNAVAILABLE >> 8)
+#define IOAM6_DEFAULT_IF_ID IOAM6_U16_UNAVAILABLE
+#define IOAM6_DEFAULT_IF_ID_WIDE IOAM6_U32_UNAVAILABLE
+
 /*
  * IPv6 IOAM Option Header
  */
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 70603775fe91..b243a53fa985 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -190,6 +190,9 @@ enum {
 	DEVCONF_NDISC_TCLASS,
 	DEVCONF_RPL_SEG_ENABLED,
 	DEVCONF_RA_DEFRTR_METRIC,
+	DEVCONF_IOAM6_ENABLED,
+	DEVCONF_IOAM6_ID,
+	DEVCONF_IOAM6_ID_WIDE,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index cf7b47bdb9b3..b7ef10d417d6 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -10,7 +10,7 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
 		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
 		raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
 		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
-		udp_offload.o seg6.o fib6_notifier.o rpl.o
+		udp_offload.o seg6.o fib6_notifier.o rpl.o ioam6.o
 
 ipv6-offload :=	ip6_offload.o tcpv6_offload.o exthdrs_offload.o
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index bc330fffb4a8..1802287977f1 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -89,12 +89,15 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/export.h>
+#include <linux/ioam6.h>
 
 #define	INFINITY_LIFE_TIME	0xFFFFFFFF
 
 #define IPV6_MAX_STRLEN \
 	sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255")
 
+static u32 ioam6_if_id_max = U16_MAX;
+
 static inline u32 cstamp_delta(unsigned long cstamp)
 {
 	return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
@@ -237,6 +240,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.addr_gen_mode		= IN6_ADDR_GEN_MODE_EUI64,
 	.disable_policy		= 0,
 	.rpl_seg_enabled	= 0,
+	.ioam6_enabled		= 0,
+	.ioam6_id               = IOAM6_DEFAULT_IF_ID,
+	.ioam6_id_wide		= IOAM6_DEFAULT_IF_ID_WIDE,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -293,6 +299,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.addr_gen_mode		= IN6_ADDR_GEN_MODE_EUI64,
 	.disable_policy		= 0,
 	.rpl_seg_enabled	= 0,
+	.ioam6_enabled		= 0,
+	.ioam6_id               = IOAM6_DEFAULT_IF_ID,
+	.ioam6_id_wide		= IOAM6_DEFAULT_IF_ID_WIDE,
 };
 
 /* Check if link is ready: is it up and is a valid qdisc available */
@@ -5524,6 +5533,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy;
 	array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass;
 	array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled;
+	array[DEVCONF_IOAM6_ENABLED] = cnf->ioam6_enabled;
+	array[DEVCONF_IOAM6_ID] = cnf->ioam6_id;
+	array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -6930,6 +6942,31 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "ioam6_enabled",
+		.data		= &ipv6_devconf.ioam6_enabled,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= (void *)SYSCTL_ZERO,
+		.extra2		= (void *)SYSCTL_ONE,
+	},
+	{
+		.procname	= "ioam6_id",
+		.data		= &ipv6_devconf.ioam6_id,
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= (void *)SYSCTL_ZERO,
+		.extra2		= (void *)&ioam6_if_id_max,
+	},
+	{
+		.procname	= "ioam6_id_wide",
+		.data		= &ipv6_devconf.ioam6_id_wide,
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec,
+	},
 	{
 		/* sentinel */
 	}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2389ff702f51..d92c90d97763 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -62,6 +62,7 @@
 #include <net/rpl.h>
 #include <net/compat.h>
 #include <net/xfrm.h>
+#include <net/ioam6.h>
 
 #include <linux/uaccess.h>
 #include <linux/mroute6.h>
@@ -961,6 +962,9 @@ static int __net_init inet6_net_init(struct net *net)
 	net->ipv6.sysctl.fib_notify_on_flag_change = 0;
 	atomic_set(&net->ipv6.fib6_sernum, 1);
 
+	net->ipv6.sysctl.ioam6_id = IOAM6_DEFAULT_ID;
+	net->ipv6.sysctl.ioam6_id_wide = IOAM6_DEFAULT_ID_WIDE;
+
 	err = ipv6_init_mibs(net);
 	if (err)
 		return err;
@@ -1191,6 +1195,10 @@ static int __init inet6_init(void)
 	if (err)
 		goto rpl_fail;
 
+	err = ioam6_init();
+	if (err)
+		goto ioam6_fail;
+
 	err = igmp6_late_init();
 	if (err)
 		goto igmp6_late_err;
@@ -1213,6 +1221,8 @@ sysctl_fail:
 	igmp6_late_cleanup();
 #endif
 igmp6_late_err:
+	ioam6_exit();
+ioam6_fail:
 	rpl_exit();
 rpl_fail:
 	seg6_exit();
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 26882e165c9e..d897faa4e9e6 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -49,6 +49,9 @@
 #include <net/seg6_hmac.h>
 #endif
 #include <net/rpl.h>
+#include <linux/ioam6.h>
+#include <net/ioam6.h>
+#include <net/dst_metadata.h>
 
 #include <linux/uaccess.h>
 
@@ -928,6 +931,60 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
 	return false;
 }
 
+/* IOAM */
+
+static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff)
+{
+	struct ioam6_trace_hdr *trace;
+	struct ioam6_namespace *ns;
+	struct ioam6_hdr *hdr;
+
+	/* Bad alignment (must be 4n-aligned) */
+	if (optoff & 3)
+		goto drop;
+
+	/* Ignore if IOAM is not enabled on ingress */
+	if (!__in6_dev_get(skb->dev)->cnf.ioam6_enabled)
+		goto ignore;
+
+	/* Truncated Option header */
+	hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff);
+	if (hdr->opt_len < 2)
+		goto drop;
+
+	switch (hdr->type) {
+	case IOAM6_TYPE_PREALLOC:
+		/* Truncated Pre-allocated Trace header */
+		if (hdr->opt_len < 2 + sizeof(*trace))
+			goto drop;
+
+		/* Malformed Pre-allocated Trace header */
+		trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr));
+		if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4)
+			goto drop;
+
+		/* Ignore if the IOAM namespace is unknown */
+		ns = ioam6_namespace(ipv6_skb_net(skb), trace->namespace_id);
+		if (!ns)
+			goto ignore;
+
+		if (!skb_valid_dst(skb))
+			ip6_route_input(skb);
+
+		ioam6_fill_trace_data(skb, ns, trace);
+		break;
+	default:
+		break;
+	}
+
+ignore:
+	return true;
+
+drop:
+	kfree_skb(skb);
+	return false;
+}
+
 /* Jumbo payload */
 
 static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
@@ -999,6 +1056,10 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = {
 		.type	= IPV6_TLV_ROUTERALERT,
 		.func	= ipv6_hop_ra,
 	},
+	{
+		.type	= IPV6_TLV_IOAM,
+		.func	= ipv6_hop_ioam,
+	},
 	{
 		.type	= IPV6_TLV_JUMBO,
 		.func	= ipv6_hop_jumbo,
diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c
new file mode 100644
index 000000000000..ba629e1b9408
--- /dev/null
+++ b/net/ipv6/ioam6.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ *  IPv6 IOAM implementation
+ *
+ *  Author:
+ *  Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/ioam6.h>
+#include <linux/rhashtable.h>
+
+#include <net/addrconf.h>
+#include <net/ioam6.h>
+
+static void ioam6_ns_release(struct ioam6_namespace *ns)
+{
+	kfree_rcu(ns, rcu);
+}
+
+static void ioam6_sc_release(struct ioam6_schema *sc)
+{
+	kfree_rcu(sc, rcu);
+}
+
+static void ioam6_free_ns(void *ptr, void *arg)
+{
+	struct ioam6_namespace *ns = (struct ioam6_namespace *)ptr;
+
+	if (ns)
+		ioam6_ns_release(ns);
+}
+
+static void ioam6_free_sc(void *ptr, void *arg)
+{
+	struct ioam6_schema *sc = (struct ioam6_schema *)ptr;
+
+	if (sc)
+		ioam6_sc_release(sc);
+}
+
+static int ioam6_ns_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	const struct ioam6_namespace *ns = obj;
+
+	return (ns->id != *(__be16 *)arg->key);
+}
+
+static int ioam6_sc_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	const struct ioam6_schema *sc = obj;
+
+	return (sc->id != *(u32 *)arg->key);
+}
+
+static const struct rhashtable_params rht_ns_params = {
+	.key_len		= sizeof(__be16),
+	.key_offset		= offsetof(struct ioam6_namespace, id),
+	.head_offset		= offsetof(struct ioam6_namespace, head),
+	.automatic_shrinking	= true,
+	.obj_cmpfn		= ioam6_ns_cmpfn,
+};
+
+static const struct rhashtable_params rht_sc_params = {
+	.key_len		= sizeof(u32),
+	.key_offset		= offsetof(struct ioam6_schema, id),
+	.head_offset		= offsetof(struct ioam6_schema, head),
+	.automatic_shrinking	= true,
+	.obj_cmpfn		= ioam6_sc_cmpfn,
+};
+
+struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id)
+{
+	struct ioam6_pernet_data *nsdata = ioam6_pernet(net);
+
+	return rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params);
+}
+
+static void __ioam6_fill_trace_data(struct sk_buff *skb,
+				    struct ioam6_namespace *ns,
+				    struct ioam6_trace_hdr *trace,
+				    struct ioam6_schema *sc,
+				    u8 sclen)
+{
+	struct __kernel_sock_timeval ts;
+	u64 raw64;
+	u32 raw32;
+	u16 raw16;
+	u8 *data;
+	u8 byte;
+
+	data = trace->data + trace->remlen * 4 - trace->nodelen * 4 - sclen * 4;
+
+	/* hop_lim and node_id */
+	if (trace->type.bit0) {
+		byte = ipv6_hdr(skb)->hop_limit;
+		if (skb->dev)
+			byte--;
+
+		raw32 = dev_net(skb->dev)->ipv6.sysctl.ioam6_id;
+
+		*(__be32 *)data = cpu_to_be32((byte << 24) | raw32);
+		data += sizeof(__be32);
+	}
+
+	/* ingress_if_id and egress_if_id */
+	if (trace->type.bit1) {
+		if (!skb->dev)
+			raw16 = IOAM6_U16_UNAVAILABLE;
+		else
+			raw16 = (__force u16)__in6_dev_get(skb->dev)->cnf.ioam6_id;
+
+		*(__be16 *)data = cpu_to_be16(raw16);
+		data += sizeof(__be16);
+
+		if (skb_dst(skb)->dev->flags & IFF_LOOPBACK)
+			raw16 = IOAM6_U16_UNAVAILABLE;
+		else
+			raw16 = (__force u16)__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id;
+
+		*(__be16 *)data = cpu_to_be16(raw16);
+		data += sizeof(__be16);
+	}
+
+	/* timestamp seconds */
+	if (trace->type.bit2) {
+		if (!skb->tstamp)
+			__net_timestamp(skb);
+
+		skb_get_new_timestamp(skb, &ts);
+
+		*(__be32 *)data = cpu_to_be32((u32)ts.tv_sec);
+		data += sizeof(__be32);
+	}
+
+	/* timestamp subseconds */
+	if (trace->type.bit3) {
+		if (!skb->tstamp)
+			__net_timestamp(skb);
+
+		if (!trace->type.bit2)
+			skb_get_new_timestamp(skb, &ts);
+
+		*(__be32 *)data = cpu_to_be32((u32)ts.tv_usec);
+		data += sizeof(__be32);
+	}
+
+	/* transit delay */
+	if (trace->type.bit4) {
+		*(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+		data += sizeof(__be32);
+	}
+
+	/* namespace data */
+	if (trace->type.bit5) {
+		*(__be32 *)data = ns->data;
+		data += sizeof(__be32);
+	}
+
+	/* queue depth */
+	if (trace->type.bit6) {
+		*(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+		data += sizeof(__be32);
+	}
+
+	/* checksum complement */
+	if (trace->type.bit7) {
+		*(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+		data += sizeof(__be32);
+	}
+
+	/* hop_lim and node_id (wide) */
+	if (trace->type.bit8) {
+		byte = ipv6_hdr(skb)->hop_limit;
+		if (skb->dev)
+			byte--;
+
+		raw64 = dev_net(skb->dev)->ipv6.sysctl.ioam6_id_wide;
+
+		*(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64);
+		data += sizeof(__be64);
+	}
+
+	/* ingress_if_id and egress_if_id (wide) */
+	if (trace->type.bit9) {
+		if (!skb->dev)
+			raw32 = IOAM6_U32_UNAVAILABLE;
+		else
+			raw32 = __in6_dev_get(skb->dev)->cnf.ioam6_id_wide;
+
+		*(__be32 *)data = cpu_to_be32(raw32);
+		data += sizeof(__be32);
+
+		if (skb_dst(skb)->dev->flags & IFF_LOOPBACK)
+			raw32 = IOAM6_U32_UNAVAILABLE;
+		else
+			raw32 = __in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide;
+
+		*(__be32 *)data = cpu_to_be32(raw32);
+		data += sizeof(__be32);
+	}
+
+	/* namespace data (wide) */
+	if (trace->type.bit10) {
+		*(__be64 *)data = ns->data_wide;
+		data += sizeof(__be64);
+	}
+
+	/* buffer occupancy */
+	if (trace->type.bit11) {
+		*(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+		data += sizeof(__be32);
+	}
+
+	/* opaque state snapshot */
+	if (trace->type.bit22) {
+		if (!sc) {
+			*(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE >> 8);
+		} else {
+			*(__be32 *)data = sc->hdr;
+			data += sizeof(__be32);
+
+			memcpy(data, sc->data, sc->len);
+		}
+	}
+}
+
+/* called with rcu_read_lock() */
+void ioam6_fill_trace_data(struct sk_buff *skb,
+			   struct ioam6_namespace *ns,
+			   struct ioam6_trace_hdr *trace)
+{
+	struct ioam6_schema *sc;
+	u8 sclen = 0;
+
+	/* Skip if Overflow flag is set OR
+	 * if an unknown type (bit 12-21) is set
+	 */
+	if (trace->overflow ||
+	    trace->type.bit12 | trace->type.bit13 | trace->type.bit14 |
+	    trace->type.bit15 | trace->type.bit16 | trace->type.bit17 |
+	    trace->type.bit18 | trace->type.bit19 | trace->type.bit20 |
+	    trace->type.bit21) {
+		return;
+	}
+
+	/* NodeLen does not include Opaque State Snapshot length. We need to
+	 * take it into account if the corresponding bit is set (bit 22) and
+	 * if the current IOAM namespace has an active schema attached to it
+	 */
+	sc = rcu_dereference(ns->schema);
+	if (trace->type.bit22) {
+		sclen = sizeof_field(struct ioam6_schema, hdr) / 4;
+
+		if (sc)
+			sclen += sc->len / 4;
+	}
+
+	/* If there is no space remaining, we set the Overflow flag and we
+	 * skip without filling the trace
+	 */
+	if (!trace->remlen || trace->remlen < trace->nodelen + sclen) {
+		trace->overflow = 1;
+		return;
+	}
+
+	__ioam6_fill_trace_data(skb, ns, trace, sc, sclen);
+	trace->remlen -= trace->nodelen + sclen;
+}
+
+static int __net_init ioam6_net_init(struct net *net)
+{
+	struct ioam6_pernet_data *nsdata;
+	int err = -ENOMEM;
+
+	nsdata = kzalloc(sizeof(*nsdata), GFP_KERNEL);
+	if (!nsdata)
+		goto out;
+
+	mutex_init(&nsdata->lock);
+	net->ipv6.ioam6_data = nsdata;
+
+	err = rhashtable_init(&nsdata->namespaces, &rht_ns_params);
+	if (err)
+		goto free_nsdata;
+
+	err = rhashtable_init(&nsdata->schemas, &rht_sc_params);
+	if (err)
+		goto free_rht_ns;
+
+out:
+	return err;
+free_rht_ns:
+	rhashtable_destroy(&nsdata->namespaces);
+free_nsdata:
+	kfree(nsdata);
+	net->ipv6.ioam6_data = NULL;
+	goto out;
+}
+
+static void __net_exit ioam6_net_exit(struct net *net)
+{
+	struct ioam6_pernet_data *nsdata = ioam6_pernet(net);
+
+	rhashtable_free_and_destroy(&nsdata->namespaces, ioam6_free_ns, NULL);
+	rhashtable_free_and_destroy(&nsdata->schemas, ioam6_free_sc, NULL);
+
+	kfree(nsdata);
+}
+
+static struct pernet_operations ioam6_net_ops = {
+	.init = ioam6_net_init,
+	.exit = ioam6_net_exit,
+};
+
+int __init ioam6_init(void)
+{
+	int err = register_pernet_subsys(&ioam6_net_ops);
+
+	if (err)
+		return err;
+
+	pr_info("In-situ OAM (IOAM) with IPv6\n");
+	return 0;
+}
+
+void ioam6_exit(void)
+{
+	unregister_pernet_subsys(&ioam6_net_ops);
+}
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index d7cf26f730d7..d53dd142bf87 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -21,6 +21,7 @@
 #ifdef CONFIG_NETLABEL
 #include <net/calipso.h>
 #endif
+#include <linux/ioam6.h>
 
 static int two = 2;
 static int three = 3;
@@ -28,6 +29,8 @@ static int flowlabel_reflect_max = 0x7;
 static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
 static u32 rt6_multipath_hash_fields_all_mask =
 	FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
+static u32 ioam6_id_max = IOAM6_DEFAULT_ID;
+static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE;
 
 static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write,
 					  void *buffer, size_t *lenp, loff_t *ppos)
@@ -196,6 +199,22 @@ static struct ctl_table ipv6_table_template[] = {
 		.extra1         = SYSCTL_ZERO,
 		.extra2         = &two,
 	},
+	{
+		.procname	= "ioam6_id",
+		.data		= &init_net.ipv6.sysctl.ioam6_id,
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra2		= &ioam6_id_max,
+	},
+	{
+		.procname	= "ioam6_id_wide",
+		.data		= &init_net.ipv6.sysctl.ioam6_id_wide,
+		.maxlen		= sizeof(u64),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra2		= &ioam6_id_wide_max,
+	},
 	{ }
 };
 
-- 
cgit v1.2.3


From 8c6f6fa6772696be0c047a711858084b38763728 Mon Sep 17 00:00:00 2001
From: Justin Iurman <justin.iurman@uliege.be>
Date: Tue, 20 Jul 2021 21:42:58 +0200
Subject: ipv6: ioam: IOAM Generic Netlink API

Add Generic Netlink commands to allow userspace to configure IOAM
namespaces and schemas. The target is iproute2 and the patch is ready.
It will be posted as soon as this patchset is merged. Here is an overview:

$ ip ioam
Usage:	ip ioam { COMMAND | help }
	ip ioam namespace show
	ip ioam namespace add ID [ data DATA32 ] [ wide DATA64 ]
	ip ioam namespace del ID
	ip ioam schema show
	ip ioam schema add ID DATA
	ip ioam schema del ID
	ip ioam namespace set ID schema { ID | none }

Signed-off-by: Justin Iurman <justin.iurman@uliege.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ioam6_genl.h      |  13 +
 include/uapi/linux/ioam6_genl.h |  52 ++++
 net/ipv6/ioam6.c                | 561 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 624 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/ioam6_genl.h
 create mode 100644 include/uapi/linux/ioam6_genl.h

(limited to 'include')

diff --git a/include/linux/ioam6_genl.h b/include/linux/ioam6_genl.h
new file mode 100644
index 000000000000..176e67919de3
--- /dev/null
+++ b/include/linux/ioam6_genl.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ *  IPv6 IOAM Generic Netlink API
+ *
+ *  Author:
+ *  Justin Iurman <justin.iurman@uliege.be>
+ */
+#ifndef _LINUX_IOAM6_GENL_H
+#define _LINUX_IOAM6_GENL_H
+
+#include <uapi/linux/ioam6_genl.h>
+
+#endif /* _LINUX_IOAM6_GENL_H */
diff --git a/include/uapi/linux/ioam6_genl.h b/include/uapi/linux/ioam6_genl.h
new file mode 100644
index 000000000000..ca4b22833754
--- /dev/null
+++ b/include/uapi/linux/ioam6_genl.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ *  IPv6 IOAM Generic Netlink API
+ *
+ *  Author:
+ *  Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#ifndef _UAPI_LINUX_IOAM6_GENL_H
+#define _UAPI_LINUX_IOAM6_GENL_H
+
+#define IOAM6_GENL_NAME "IOAM6"
+#define IOAM6_GENL_VERSION 0x1
+
+enum {
+	IOAM6_ATTR_UNSPEC,
+
+	IOAM6_ATTR_NS_ID,	/* u16 */
+	IOAM6_ATTR_NS_DATA,	/* u32 */
+	IOAM6_ATTR_NS_DATA_WIDE,/* u64 */
+
+#define IOAM6_MAX_SCHEMA_DATA_LEN (255 * 4)
+	IOAM6_ATTR_SC_ID,	/* u32 */
+	IOAM6_ATTR_SC_DATA,	/* Binary */
+	IOAM6_ATTR_SC_NONE,	/* Flag */
+
+	IOAM6_ATTR_PAD,
+
+	__IOAM6_ATTR_MAX,
+};
+
+#define IOAM6_ATTR_MAX (__IOAM6_ATTR_MAX - 1)
+
+enum {
+	IOAM6_CMD_UNSPEC,
+
+	IOAM6_CMD_ADD_NAMESPACE,
+	IOAM6_CMD_DEL_NAMESPACE,
+	IOAM6_CMD_DUMP_NAMESPACES,
+
+	IOAM6_CMD_ADD_SCHEMA,
+	IOAM6_CMD_DEL_SCHEMA,
+	IOAM6_CMD_DUMP_SCHEMAS,
+
+	IOAM6_CMD_NS_SET_SCHEMA,
+
+	__IOAM6_CMD_MAX,
+};
+
+#define IOAM6_CMD_MAX (__IOAM6_CMD_MAX - 1)
+
+#endif /* _UAPI_LINUX_IOAM6_GENL_H */
diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c
index ba629e1b9408..ba59671f32b8 100644
--- a/net/ipv6/ioam6.c
+++ b/net/ipv6/ioam6.c
@@ -11,9 +11,11 @@
 #include <linux/kernel.h>
 #include <linux/net.h>
 #include <linux/ioam6.h>
+#include <linux/ioam6_genl.h>
 #include <linux/rhashtable.h>
 
 #include <net/addrconf.h>
+#include <net/genetlink.h>
 #include <net/ioam6.h>
 
 static void ioam6_ns_release(struct ioam6_namespace *ns)
@@ -72,6 +74,552 @@ static const struct rhashtable_params rht_sc_params = {
 	.obj_cmpfn		= ioam6_sc_cmpfn,
 };
 
+static struct genl_family ioam6_genl_family;
+
+static const struct nla_policy ioam6_genl_policy_addns[] = {
+	[IOAM6_ATTR_NS_ID]	= { .type = NLA_U16 },
+	[IOAM6_ATTR_NS_DATA]	= { .type = NLA_U32 },
+	[IOAM6_ATTR_NS_DATA_WIDE] = { .type = NLA_U64 },
+};
+
+static const struct nla_policy ioam6_genl_policy_delns[] = {
+	[IOAM6_ATTR_NS_ID]	= { .type = NLA_U16 },
+};
+
+static const struct nla_policy ioam6_genl_policy_addsc[] = {
+	[IOAM6_ATTR_SC_ID]	= { .type = NLA_U32 },
+	[IOAM6_ATTR_SC_DATA]	= { .type = NLA_BINARY,
+				    .len = IOAM6_MAX_SCHEMA_DATA_LEN },
+};
+
+static const struct nla_policy ioam6_genl_policy_delsc[] = {
+	[IOAM6_ATTR_SC_ID]	= { .type = NLA_U32 },
+};
+
+static const struct nla_policy ioam6_genl_policy_ns_sc[] = {
+	[IOAM6_ATTR_NS_ID]	= { .type = NLA_U16 },
+	[IOAM6_ATTR_SC_ID]	= { .type = NLA_U32 },
+	[IOAM6_ATTR_SC_NONE]	= { .type = NLA_FLAG },
+};
+
+static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ioam6_pernet_data *nsdata;
+	struct ioam6_namespace *ns;
+	u64 data64;
+	u32 data32;
+	__be16 id;
+	int err;
+
+	if (!info->attrs[IOAM6_ATTR_NS_ID])
+		return -EINVAL;
+
+	id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID]));
+	nsdata = ioam6_pernet(genl_info_net(info));
+
+	mutex_lock(&nsdata->lock);
+
+	ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params);
+	if (ns) {
+		err = -EEXIST;
+		goto out_unlock;
+	}
+
+	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+	if (!ns) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	ns->id = id;
+
+	if (!info->attrs[IOAM6_ATTR_NS_DATA])
+		data32 = IOAM6_U32_UNAVAILABLE;
+	else
+		data32 = nla_get_u32(info->attrs[IOAM6_ATTR_NS_DATA]);
+
+	if (!info->attrs[IOAM6_ATTR_NS_DATA_WIDE])
+		data64 = IOAM6_U64_UNAVAILABLE;
+	else
+		data64 = nla_get_u64(info->attrs[IOAM6_ATTR_NS_DATA_WIDE]);
+
+	ns->data = cpu_to_be32(data32);
+	ns->data_wide = cpu_to_be64(data64);
+
+	err = rhashtable_lookup_insert_fast(&nsdata->namespaces, &ns->head,
+					    rht_ns_params);
+	if (err)
+		kfree(ns);
+
+out_unlock:
+	mutex_unlock(&nsdata->lock);
+	return err;
+}
+
+static int ioam6_genl_delns(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ioam6_pernet_data *nsdata;
+	struct ioam6_namespace *ns;
+	struct ioam6_schema *sc;
+	__be16 id;
+	int err;
+
+	if (!info->attrs[IOAM6_ATTR_NS_ID])
+		return -EINVAL;
+
+	id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID]));
+	nsdata = ioam6_pernet(genl_info_net(info));
+
+	mutex_lock(&nsdata->lock);
+
+	ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params);
+	if (!ns) {
+		err = -ENOENT;
+		goto out_unlock;
+	}
+
+	sc = rcu_dereference_protected(ns->schema,
+				       lockdep_is_held(&nsdata->lock));
+
+	err = rhashtable_remove_fast(&nsdata->namespaces, &ns->head,
+				     rht_ns_params);
+	if (err)
+		goto out_unlock;
+
+	if (sc)
+		rcu_assign_pointer(sc->ns, NULL);
+
+	ioam6_ns_release(ns);
+
+out_unlock:
+	mutex_unlock(&nsdata->lock);
+	return err;
+}
+
+static int __ioam6_genl_dumpns_element(struct ioam6_namespace *ns,
+				       u32 portid,
+				       u32 seq,
+				       u32 flags,
+				       struct sk_buff *skb,
+				       u8 cmd)
+{
+	struct ioam6_schema *sc;
+	u64 data64;
+	u32 data32;
+	void *hdr;
+
+	hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd);
+	if (!hdr)
+		return -ENOMEM;
+
+	data32 = be32_to_cpu(ns->data);
+	data64 = be64_to_cpu(ns->data_wide);
+
+	if (nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id)) ||
+	    (data32 != IOAM6_U32_UNAVAILABLE &&
+	     nla_put_u32(skb, IOAM6_ATTR_NS_DATA, data32)) ||
+	    (data64 != IOAM6_U64_UNAVAILABLE &&
+	     nla_put_u64_64bit(skb, IOAM6_ATTR_NS_DATA_WIDE,
+			       data64, IOAM6_ATTR_PAD)))
+		goto nla_put_failure;
+
+	rcu_read_lock();
+
+	sc = rcu_dereference(ns->schema);
+	if (sc && nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id)) {
+		rcu_read_unlock();
+		goto nla_put_failure;
+	}
+
+	rcu_read_unlock();
+
+	genlmsg_end(skb, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ioam6_genl_dumpns_start(struct netlink_callback *cb)
+{
+	struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk));
+	struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+	if (!iter) {
+		iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+		if (!iter)
+			return -ENOMEM;
+
+		cb->args[0] = (long)iter;
+	}
+
+	rhashtable_walk_enter(&nsdata->namespaces, iter);
+
+	return 0;
+}
+
+static int ioam6_genl_dumpns_done(struct netlink_callback *cb)
+{
+	struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+	rhashtable_walk_exit(iter);
+	kfree(iter);
+
+	return 0;
+}
+
+static int ioam6_genl_dumpns(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct rhashtable_iter *iter;
+	struct ioam6_namespace *ns;
+	int err;
+
+	iter = (struct rhashtable_iter *)cb->args[0];
+	rhashtable_walk_start(iter);
+
+	for (;;) {
+		ns = rhashtable_walk_next(iter);
+
+		if (IS_ERR(ns)) {
+			if (PTR_ERR(ns) == -EAGAIN)
+				continue;
+			err = PTR_ERR(ns);
+			goto done;
+		} else if (!ns) {
+			break;
+		}
+
+		err = __ioam6_genl_dumpns_element(ns,
+						  NETLINK_CB(cb->skb).portid,
+						  cb->nlh->nlmsg_seq,
+						  NLM_F_MULTI,
+						  skb,
+						  IOAM6_CMD_DUMP_NAMESPACES);
+		if (err)
+			goto done;
+	}
+
+	err = skb->len;
+
+done:
+	rhashtable_walk_stop(iter);
+	return err;
+}
+
+static int ioam6_genl_addsc(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ioam6_pernet_data *nsdata;
+	int len, len_aligned, err;
+	struct ioam6_schema *sc;
+	u32 id;
+
+	if (!info->attrs[IOAM6_ATTR_SC_ID] || !info->attrs[IOAM6_ATTR_SC_DATA])
+		return -EINVAL;
+
+	id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+	nsdata = ioam6_pernet(genl_info_net(info));
+
+	mutex_lock(&nsdata->lock);
+
+	sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params);
+	if (sc) {
+		err = -EEXIST;
+		goto out_unlock;
+	}
+
+	len = nla_len(info->attrs[IOAM6_ATTR_SC_DATA]);
+	len_aligned = ALIGN(len, 4);
+
+	sc = kzalloc(sizeof(*sc) + len_aligned, GFP_KERNEL);
+	if (!sc) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	sc->id = id;
+	sc->len = len_aligned;
+	sc->hdr = cpu_to_be32(sc->id | ((u8)(sc->len / 4) << 24));
+	nla_memcpy(sc->data, info->attrs[IOAM6_ATTR_SC_DATA], len);
+
+	err = rhashtable_lookup_insert_fast(&nsdata->schemas, &sc->head,
+					    rht_sc_params);
+	if (err)
+		goto free_sc;
+
+out_unlock:
+	mutex_unlock(&nsdata->lock);
+	return err;
+free_sc:
+	kfree(sc);
+	goto out_unlock;
+}
+
+static int ioam6_genl_delsc(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ioam6_pernet_data *nsdata;
+	struct ioam6_namespace *ns;
+	struct ioam6_schema *sc;
+	int err;
+	u32 id;
+
+	if (!info->attrs[IOAM6_ATTR_SC_ID])
+		return -EINVAL;
+
+	id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+	nsdata = ioam6_pernet(genl_info_net(info));
+
+	mutex_lock(&nsdata->lock);
+
+	sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params);
+	if (!sc) {
+		err = -ENOENT;
+		goto out_unlock;
+	}
+
+	ns = rcu_dereference_protected(sc->ns, lockdep_is_held(&nsdata->lock));
+
+	err = rhashtable_remove_fast(&nsdata->schemas, &sc->head,
+				     rht_sc_params);
+	if (err)
+		goto out_unlock;
+
+	if (ns)
+		rcu_assign_pointer(ns->schema, NULL);
+
+	ioam6_sc_release(sc);
+
+out_unlock:
+	mutex_unlock(&nsdata->lock);
+	return err;
+}
+
+static int __ioam6_genl_dumpsc_element(struct ioam6_schema *sc,
+				       u32 portid, u32 seq, u32 flags,
+				       struct sk_buff *skb, u8 cmd)
+{
+	struct ioam6_namespace *ns;
+	void *hdr;
+
+	hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd);
+	if (!hdr)
+		return -ENOMEM;
+
+	if (nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id) ||
+	    nla_put(skb, IOAM6_ATTR_SC_DATA, sc->len, sc->data))
+		goto nla_put_failure;
+
+	rcu_read_lock();
+
+	ns = rcu_dereference(sc->ns);
+	if (ns && nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id))) {
+		rcu_read_unlock();
+		goto nla_put_failure;
+	}
+
+	rcu_read_unlock();
+
+	genlmsg_end(skb, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ioam6_genl_dumpsc_start(struct netlink_callback *cb)
+{
+	struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk));
+	struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+	if (!iter) {
+		iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+		if (!iter)
+			return -ENOMEM;
+
+		cb->args[0] = (long)iter;
+	}
+
+	rhashtable_walk_enter(&nsdata->schemas, iter);
+
+	return 0;
+}
+
+static int ioam6_genl_dumpsc_done(struct netlink_callback *cb)
+{
+	struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+	rhashtable_walk_exit(iter);
+	kfree(iter);
+
+	return 0;
+}
+
+static int ioam6_genl_dumpsc(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct rhashtable_iter *iter;
+	struct ioam6_schema *sc;
+	int err;
+
+	iter = (struct rhashtable_iter *)cb->args[0];
+	rhashtable_walk_start(iter);
+
+	for (;;) {
+		sc = rhashtable_walk_next(iter);
+
+		if (IS_ERR(sc)) {
+			if (PTR_ERR(sc) == -EAGAIN)
+				continue;
+			err = PTR_ERR(sc);
+			goto done;
+		} else if (!sc) {
+			break;
+		}
+
+		err = __ioam6_genl_dumpsc_element(sc,
+						  NETLINK_CB(cb->skb).portid,
+						  cb->nlh->nlmsg_seq,
+						  NLM_F_MULTI,
+						  skb,
+						  IOAM6_CMD_DUMP_SCHEMAS);
+		if (err)
+			goto done;
+	}
+
+	err = skb->len;
+
+done:
+	rhashtable_walk_stop(iter);
+	return err;
+}
+
+static int ioam6_genl_ns_set_schema(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ioam6_namespace *ns, *ns_ref;
+	struct ioam6_schema *sc, *sc_ref;
+	struct ioam6_pernet_data *nsdata;
+	__be16 ns_id;
+	u32 sc_id;
+	int err;
+
+	if (!info->attrs[IOAM6_ATTR_NS_ID] ||
+	    (!info->attrs[IOAM6_ATTR_SC_ID] &&
+	     !info->attrs[IOAM6_ATTR_SC_NONE]))
+		return -EINVAL;
+
+	ns_id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID]));
+	nsdata = ioam6_pernet(genl_info_net(info));
+
+	mutex_lock(&nsdata->lock);
+
+	ns = rhashtable_lookup_fast(&nsdata->namespaces, &ns_id, rht_ns_params);
+	if (!ns) {
+		err = -ENOENT;
+		goto out_unlock;
+	}
+
+	if (info->attrs[IOAM6_ATTR_SC_NONE]) {
+		sc = NULL;
+	} else {
+		sc_id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+		sc = rhashtable_lookup_fast(&nsdata->schemas, &sc_id,
+					    rht_sc_params);
+		if (!sc) {
+			err = -ENOENT;
+			goto out_unlock;
+		}
+	}
+
+	sc_ref = rcu_dereference_protected(ns->schema,
+					   lockdep_is_held(&nsdata->lock));
+	if (sc_ref)
+		rcu_assign_pointer(sc_ref->ns, NULL);
+	rcu_assign_pointer(ns->schema, sc);
+
+	if (sc) {
+		ns_ref = rcu_dereference_protected(sc->ns,
+						   lockdep_is_held(&nsdata->lock));
+		if (ns_ref)
+			rcu_assign_pointer(ns_ref->schema, NULL);
+		rcu_assign_pointer(sc->ns, ns);
+	}
+
+	err = 0;
+
+out_unlock:
+	mutex_unlock(&nsdata->lock);
+	return err;
+}
+
+static const struct genl_ops ioam6_genl_ops[] = {
+	{
+		.cmd	= IOAM6_CMD_ADD_NAMESPACE,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit	= ioam6_genl_addns,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ioam6_genl_policy_addns,
+		.maxattr = ARRAY_SIZE(ioam6_genl_policy_addns) - 1,
+	},
+	{
+		.cmd	= IOAM6_CMD_DEL_NAMESPACE,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit	= ioam6_genl_delns,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ioam6_genl_policy_delns,
+		.maxattr = ARRAY_SIZE(ioam6_genl_policy_delns) - 1,
+	},
+	{
+		.cmd	= IOAM6_CMD_DUMP_NAMESPACES,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.start	= ioam6_genl_dumpns_start,
+		.dumpit	= ioam6_genl_dumpns,
+		.done	= ioam6_genl_dumpns_done,
+		.flags	= GENL_ADMIN_PERM,
+	},
+	{
+		.cmd	= IOAM6_CMD_ADD_SCHEMA,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit	= ioam6_genl_addsc,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ioam6_genl_policy_addsc,
+		.maxattr = ARRAY_SIZE(ioam6_genl_policy_addsc) - 1,
+	},
+	{
+		.cmd	= IOAM6_CMD_DEL_SCHEMA,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit	= ioam6_genl_delsc,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ioam6_genl_policy_delsc,
+		.maxattr = ARRAY_SIZE(ioam6_genl_policy_delsc) - 1,
+	},
+	{
+		.cmd	= IOAM6_CMD_DUMP_SCHEMAS,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.start	= ioam6_genl_dumpsc_start,
+		.dumpit	= ioam6_genl_dumpsc,
+		.done	= ioam6_genl_dumpsc_done,
+		.flags	= GENL_ADMIN_PERM,
+	},
+	{
+		.cmd	= IOAM6_CMD_NS_SET_SCHEMA,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit	= ioam6_genl_ns_set_schema,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ioam6_genl_policy_ns_sc,
+		.maxattr = ARRAY_SIZE(ioam6_genl_policy_ns_sc) - 1,
+	},
+};
+
+static struct genl_family ioam6_genl_family __ro_after_init = {
+	.name		= IOAM6_GENL_NAME,
+	.version	= IOAM6_GENL_VERSION,
+	.netnsok	= true,
+	.parallel_ops	= true,
+	.ops		= ioam6_genl_ops,
+	.n_ops		= ARRAY_SIZE(ioam6_genl_ops),
+	.module		= THIS_MODULE,
+};
+
 struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id)
 {
 	struct ioam6_pernet_data *nsdata = ioam6_pernet(net);
@@ -319,15 +867,24 @@ static struct pernet_operations ioam6_net_ops = {
 int __init ioam6_init(void)
 {
 	int err = register_pernet_subsys(&ioam6_net_ops);
+	if (err)
+		goto out;
 
+	err = genl_register_family(&ioam6_genl_family);
 	if (err)
-		return err;
+		goto out_unregister_pernet_subsys;
 
 	pr_info("In-situ OAM (IOAM) with IPv6\n");
-	return 0;
+
+out:
+	return err;
+out_unregister_pernet_subsys:
+	unregister_pernet_subsys(&ioam6_net_ops);
+	goto out;
 }
 
 void ioam6_exit(void)
 {
+	genl_unregister_family(&ioam6_genl_family);
 	unregister_pernet_subsys(&ioam6_net_ops);
 }
-- 
cgit v1.2.3


From 3edede08ff37c6a9370510508d5eeb54890baf47 Mon Sep 17 00:00:00 2001
From: Justin Iurman <justin.iurman@uliege.be>
Date: Tue, 20 Jul 2021 21:42:59 +0200
Subject: ipv6: ioam: Support for IOAM injection with lwtunnels

Add support for the IOAM inline insertion (only for the host-to-host use case)
which is per-route configured with lightweight tunnels. The target is iproute2
and the patch is ready. It will be posted as soon as this patchset is merged.
Here is an overview:

$ ip -6 ro ad fc00::1/128 encap ioam6 trace type 0x800000 ns 1 size 12 dev eth0

This example configures an IOAM Pre-allocated Trace option attached to the
fc00::1/128 prefix. The IOAM namespace (ns) is 1, the size of the pre-allocated
trace data block is 12 octets (size) and only the first IOAM data (bit 0:
hop_limit + node id) is included in the trace (type) represented as a bitfield.

The reason why the in-transit (IPv6-in-IPv6 encapsulation) use case is not
implemented is explained on the patchset cover.

Signed-off-by: Justin Iurman <justin.iurman@uliege.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ioam6_iptunnel.h      |  13 ++
 include/net/ioam6.h                 |   3 +
 include/uapi/linux/ioam6.h          |   1 +
 include/uapi/linux/ioam6_iptunnel.h |  20 +++
 include/uapi/linux/lwtunnel.h       |   1 +
 net/core/lwtunnel.c                 |   2 +
 net/ipv6/Kconfig                    |  11 ++
 net/ipv6/Makefile                   |   1 +
 net/ipv6/ioam6.c                    |  44 ++++--
 net/ipv6/ioam6_iptunnel.c           | 274 ++++++++++++++++++++++++++++++++++++
 10 files changed, 358 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/ioam6_iptunnel.h
 create mode 100644 include/uapi/linux/ioam6_iptunnel.h
 create mode 100644 net/ipv6/ioam6_iptunnel.c

(limited to 'include')

diff --git a/include/linux/ioam6_iptunnel.h b/include/linux/ioam6_iptunnel.h
new file mode 100644
index 000000000000..07d9dfedd29d
--- /dev/null
+++ b/include/linux/ioam6_iptunnel.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ *  IPv6 IOAM Lightweight Tunnel API
+ *
+ *  Author:
+ *  Justin Iurman <justin.iurman@uliege.be>
+ */
+#ifndef _LINUX_IOAM6_IPTUNNEL_H
+#define _LINUX_IOAM6_IPTUNNEL_H
+
+#include <uapi/linux/ioam6_iptunnel.h>
+
+#endif /* _LINUX_IOAM6_IPTUNNEL_H */
diff --git a/include/net/ioam6.h b/include/net/ioam6.h
index 772b91ee2e87..3c2993bc48c8 100644
--- a/include/net/ioam6.h
+++ b/include/net/ioam6.h
@@ -61,4 +61,7 @@ void ioam6_fill_trace_data(struct sk_buff *skb,
 int ioam6_init(void);
 void ioam6_exit(void);
 
+int ioam6_iptunnel_init(void);
+void ioam6_iptunnel_exit(void);
+
 #endif /* _NET_IOAM6_H */
diff --git a/include/uapi/linux/ioam6.h b/include/uapi/linux/ioam6.h
index 23ba6e85582f..ac4de376f0ce 100644
--- a/include/uapi/linux/ioam6.h
+++ b/include/uapi/linux/ioam6.h
@@ -126,6 +126,7 @@ struct ioam6_trace_hdr {
 #error "Please fix <asm/byteorder.h>"
 #endif
 
+#define IOAM6_TRACE_DATA_SIZE_MAX 244
 	__u8	data[0];
 } __attribute__((packed));
 
diff --git a/include/uapi/linux/ioam6_iptunnel.h b/include/uapi/linux/ioam6_iptunnel.h
new file mode 100644
index 000000000000..bae14636a8c8
--- /dev/null
+++ b/include/uapi/linux/ioam6_iptunnel.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ *  IPv6 IOAM Lightweight Tunnel API
+ *
+ *  Author:
+ *  Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#ifndef _UAPI_LINUX_IOAM6_IPTUNNEL_H
+#define _UAPI_LINUX_IOAM6_IPTUNNEL_H
+
+enum {
+	IOAM6_IPTUNNEL_UNSPEC,
+	IOAM6_IPTUNNEL_TRACE,		/* struct ioam6_trace_hdr */
+	__IOAM6_IPTUNNEL_MAX,
+};
+
+#define IOAM6_IPTUNNEL_MAX (__IOAM6_IPTUNNEL_MAX - 1)
+
+#endif /* _UAPI_LINUX_IOAM6_IPTUNNEL_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 568a4303ccce..2e206919125c 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -14,6 +14,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_BPF,
 	LWTUNNEL_ENCAP_SEG6_LOCAL,
 	LWTUNNEL_ENCAP_RPL,
+	LWTUNNEL_ENCAP_IOAM6,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 8ec7d13d2860..d0ae987d2de9 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -43,6 +43,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
 		return "SEG6LOCAL";
 	case LWTUNNEL_ENCAP_RPL:
 		return "RPL";
+	case LWTUNNEL_ENCAP_IOAM6:
+		return "IOAM6";
 	case LWTUNNEL_ENCAP_IP6:
 	case LWTUNNEL_ENCAP_IP:
 	case LWTUNNEL_ENCAP_NONE:
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 747f56e0c636..e504204bca92 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -328,4 +328,15 @@ config IPV6_RPL_LWTUNNEL
 
 	  If unsure, say N.
 
+config IPV6_IOAM6_LWTUNNEL
+	bool "IPv6: IOAM Pre-allocated Trace insertion support"
+	depends on IPV6
+	select LWTUNNEL
+	help
+	  Support for the inline insertion of IOAM Pre-allocated
+	  Trace Header (only on locally generated packets), using
+	  the lightweight tunnels mechanism.
+
+	  If unsure, say N.
+
 endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index b7ef10d417d6..1bc7e143217b 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -27,6 +27,7 @@ ipv6-$(CONFIG_NETLABEL) += calipso.o
 ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o
 ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
 ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o
+ipv6-$(CONFIG_IPV6_IOAM6_LWTUNNEL) += ioam6_iptunnel.o
 
 ipv6-objs += $(ipv6-y)
 
diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c
index ba59671f32b8..5e8961004832 100644
--- a/net/ipv6/ioam6.c
+++ b/net/ipv6/ioam6.c
@@ -648,7 +648,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb,
 		if (skb->dev)
 			byte--;
 
-		raw32 = dev_net(skb->dev)->ipv6.sysctl.ioam6_id;
+		raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id;
 
 		*(__be32 *)data = cpu_to_be32((byte << 24) | raw32);
 		data += sizeof(__be32);
@@ -675,24 +675,31 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb,
 
 	/* timestamp seconds */
 	if (trace->type.bit2) {
-		if (!skb->tstamp)
-			__net_timestamp(skb);
-
-		skb_get_new_timestamp(skb, &ts);
+		if (!skb->dev) {
+			*(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+		} else {
+			if (!skb->tstamp)
+				__net_timestamp(skb);
 
-		*(__be32 *)data = cpu_to_be32((u32)ts.tv_sec);
+			skb_get_new_timestamp(skb, &ts);
+			*(__be32 *)data = cpu_to_be32((u32)ts.tv_sec);
+		}
 		data += sizeof(__be32);
 	}
 
 	/* timestamp subseconds */
 	if (trace->type.bit3) {
-		if (!skb->tstamp)
-			__net_timestamp(skb);
+		if (!skb->dev) {
+			*(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+		} else {
+			if (!skb->tstamp)
+				__net_timestamp(skb);
 
-		if (!trace->type.bit2)
-			skb_get_new_timestamp(skb, &ts);
+			if (!trace->type.bit2)
+				skb_get_new_timestamp(skb, &ts);
 
-		*(__be32 *)data = cpu_to_be32((u32)ts.tv_usec);
+			*(__be32 *)data = cpu_to_be32((u32)ts.tv_usec);
+		}
 		data += sizeof(__be32);
 	}
 
@@ -726,7 +733,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb,
 		if (skb->dev)
 			byte--;
 
-		raw64 = dev_net(skb->dev)->ipv6.sysctl.ioam6_id_wide;
+		raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide;
 
 		*(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64);
 		data += sizeof(__be64);
@@ -874,10 +881,20 @@ int __init ioam6_init(void)
 	if (err)
 		goto out_unregister_pernet_subsys;
 
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+	err = ioam6_iptunnel_init();
+	if (err)
+		goto out_unregister_genl;
+#endif
+
 	pr_info("In-situ OAM (IOAM) with IPv6\n");
 
 out:
 	return err;
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+out_unregister_genl:
+	genl_unregister_family(&ioam6_genl_family);
+#endif
 out_unregister_pernet_subsys:
 	unregister_pernet_subsys(&ioam6_net_ops);
 	goto out;
@@ -885,6 +902,9 @@ out_unregister_pernet_subsys:
 
 void ioam6_exit(void)
 {
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+	ioam6_iptunnel_exit();
+#endif
 	genl_unregister_family(&ioam6_genl_family);
 	unregister_pernet_subsys(&ioam6_net_ops);
 }
diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c
new file mode 100644
index 000000000000..f9ee04541c17
--- /dev/null
+++ b/net/ipv6/ioam6_iptunnel.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ *  IPv6 IOAM Lightweight Tunnel implementation
+ *
+ *  Author:
+ *  Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/netlink.h>
+#include <linux/in6.h>
+#include <linux/ioam6.h>
+#include <linux/ioam6_iptunnel.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/lwtunnel.h>
+#include <net/ioam6.h>
+
+#define IOAM6_MASK_SHORT_FIELDS 0xff100000
+#define IOAM6_MASK_WIDE_FIELDS 0xe00000
+
+struct ioam6_lwt_encap {
+	struct ipv6_hopopt_hdr	eh;
+	u8			pad[2];	/* 2-octet padding for 4n-alignment */
+	struct ioam6_hdr	ioamh;
+	struct ioam6_trace_hdr	traceh;
+} __packed;
+
+struct ioam6_lwt {
+	struct ioam6_lwt_encap	tuninfo;
+};
+
+static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt)
+{
+	return (struct ioam6_lwt *)lwt->data;
+}
+
+static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt)
+{
+	return &ioam6_lwt_state(lwt)->tuninfo;
+}
+
+static struct ioam6_trace_hdr *ioam6_trace(struct lwtunnel_state *lwt)
+{
+	return &(ioam6_lwt_state(lwt)->tuninfo.traceh);
+}
+
+static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = {
+	[IOAM6_IPTUNNEL_TRACE]	= NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)),
+};
+
+static int nla_put_ioam6_trace(struct sk_buff *skb, int attrtype,
+			       struct ioam6_trace_hdr *trace)
+{
+	struct ioam6_trace_hdr *data;
+	struct nlattr *nla;
+	int len;
+
+	len = sizeof(*trace);
+
+	nla = nla_reserve(skb, attrtype, len);
+	if (!nla)
+		return -EMSGSIZE;
+
+	data = nla_data(nla);
+	memcpy(data, trace, len);
+
+	return 0;
+}
+
+static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace)
+{
+	u32 fields;
+
+	if (!trace->type_be32 || !trace->remlen ||
+	    trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4)
+		return false;
+
+	trace->nodelen = 0;
+	fields = be32_to_cpu(trace->type_be32);
+
+	trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS)
+				* (sizeof(__be32) / 4);
+	trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS)
+				* (sizeof(__be64) / 4);
+
+	return true;
+}
+
+static int ioam6_build_state(struct net *net, struct nlattr *nla,
+			     unsigned int family, const void *cfg,
+			     struct lwtunnel_state **ts,
+			     struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1];
+	struct ioam6_lwt_encap *tuninfo;
+	struct ioam6_trace_hdr *trace;
+	struct lwtunnel_state *s;
+	int len_aligned;
+	int len, err;
+
+	if (family != AF_INET6)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla,
+			       ioam6_iptunnel_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[IOAM6_IPTUNNEL_TRACE]) {
+		NL_SET_ERR_MSG(extack, "missing trace");
+		return -EINVAL;
+	}
+
+	trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]);
+	if (!ioam6_validate_trace_hdr(trace)) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE],
+				    "invalid trace validation");
+		return -EINVAL;
+	}
+
+	len = sizeof(*tuninfo) + trace->remlen * 4;
+	len_aligned = ALIGN(len, 8);
+
+	s = lwtunnel_state_alloc(len_aligned);
+	if (!s)
+		return -ENOMEM;
+
+	tuninfo = ioam6_lwt_info(s);
+	tuninfo->eh.hdrlen = (len_aligned >> 3) - 1;
+	tuninfo->pad[0] = IPV6_TLV_PADN;
+	tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC;
+	tuninfo->ioamh.opt_type = IPV6_TLV_IOAM;
+	tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace)
+					+ trace->remlen * 4;
+
+	memcpy(&tuninfo->traceh, trace, sizeof(*trace));
+
+	len = len_aligned - len;
+	if (len == 1) {
+		tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PAD1;
+	} else if (len > 0) {
+		tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN;
+		tuninfo->traceh.data[trace->remlen * 4 + 1] = len - 2;
+	}
+
+	s->type = LWTUNNEL_ENCAP_IOAM6;
+	s->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+	*ts = s;
+
+	return 0;
+}
+
+static int ioam6_do_inline(struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo)
+{
+	struct ioam6_trace_hdr *trace;
+	struct ipv6hdr *oldhdr, *hdr;
+	struct ioam6_namespace *ns;
+	int hdrlen, err;
+
+	hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
+
+	err = skb_cow_head(skb, hdrlen + skb->mac_len);
+	if (unlikely(err))
+		return err;
+
+	oldhdr = ipv6_hdr(skb);
+	skb_pull(skb, sizeof(*oldhdr));
+	skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr));
+
+	skb_push(skb, sizeof(*oldhdr) + hdrlen);
+	skb_reset_network_header(skb);
+	skb_mac_header_rebuild(skb);
+
+	hdr = ipv6_hdr(skb);
+	memmove(hdr, oldhdr, sizeof(*oldhdr));
+	tuninfo->eh.nexthdr = hdr->nexthdr;
+
+	skb_set_transport_header(skb, sizeof(*hdr));
+	skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen);
+
+	memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);
+
+	hdr->nexthdr = NEXTHDR_HOP;
+	hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
+
+	trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb)
+					   + sizeof(struct ipv6_hopopt_hdr) + 2
+					   + sizeof(struct ioam6_hdr));
+
+	ns = ioam6_namespace(dev_net(skb_dst(skb)->dev), trace->namespace_id);
+	if (ns)
+		ioam6_fill_trace_data(skb, ns, trace);
+
+	return 0;
+}
+
+static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct lwtunnel_state *lwt = skb_dst(skb)->lwtstate;
+	int err = -EINVAL;
+
+	if (skb->protocol != htons(ETH_P_IPV6))
+		goto drop;
+
+	/* Only for packets we send and
+	 * that do not contain a Hop-by-Hop yet
+	 */
+	if (skb->dev || ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP)
+		goto out;
+
+	err = ioam6_do_inline(skb, ioam6_lwt_info(lwt));
+	if (unlikely(err))
+		goto drop;
+
+	err = skb_cow_head(skb, LL_RESERVED_SPACE(skb_dst(skb)->dev));
+	if (unlikely(err))
+		goto drop;
+
+out:
+	return lwt->orig_output(net, sk, skb);
+
+drop:
+	kfree_skb(skb);
+	return err;
+}
+
+static int ioam6_fill_encap_info(struct sk_buff *skb,
+				 struct lwtunnel_state *lwtstate)
+{
+	struct ioam6_trace_hdr *trace = ioam6_trace(lwtstate);
+
+	if (nla_put_ioam6_trace(skb, IOAM6_IPTUNNEL_TRACE, trace))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	struct ioam6_trace_hdr *trace = ioam6_trace(lwtstate);
+
+	return nla_total_size(sizeof(*trace));
+}
+
+static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct ioam6_trace_hdr *a_hdr = ioam6_trace(a);
+	struct ioam6_trace_hdr *b_hdr = ioam6_trace(b);
+
+	return (a_hdr->namespace_id != b_hdr->namespace_id);
+}
+
+static const struct lwtunnel_encap_ops ioam6_iptun_ops = {
+	.build_state	= ioam6_build_state,
+	.output		= ioam6_output,
+	.fill_encap	= ioam6_fill_encap_info,
+	.get_encap_size	= ioam6_encap_nlsize,
+	.cmp_encap	= ioam6_encap_cmp,
+	.owner		= THIS_MODULE,
+};
+
+int __init ioam6_iptunnel_init(void)
+{
+	return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
+}
+
+void ioam6_iptunnel_exit(void)
+{
+	lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
+}
-- 
cgit v1.2.3


From 427faee167bce4ccb0b22f6815a9ac509e31d4c3 Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vfedorenko@novek.ru>
Date: Tue, 20 Jul 2021 23:06:27 +0300
Subject: net: ipv6: introduce ip6_dst_mtu_maybe_forward

Replace ip6_dst_mtu_forward with ip6_dst_mtu_maybe_forward and
reuse this code in ip6_mtu. Actually these two functions were
almost duplicates, this change will simplify the maintaince of
mtu calculation code.

Signed-off-by: Vadim Fedorenko <vfedorenko@novek.ru>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h            |  5 +++--
 net/ipv6/ip6_output.c              |  2 +-
 net/ipv6/route.c                   | 20 +-------------------
 net/netfilter/nf_flow_table_core.c |  2 +-
 4 files changed, 6 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 625a38ccb5d9..820eae3ea95f 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -316,12 +316,13 @@ static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *
 	       !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws);
 }
 
-static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
+static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst,
+						     bool forwarding)
 {
 	struct inet6_dev *idev;
 	unsigned int mtu;
 
-	if (dst_metric_locked(dst, RTAX_MTU)) {
+	if (!forwarding || dst_metric_locked(dst, RTAX_MTU)) {
 		mtu = dst_metric_raw(dst, RTAX_MTU);
 		if (mtu)
 			goto out;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 01bea76e3891..f6bc7828a480 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -607,7 +607,7 @@ int ip6_forward(struct sk_buff *skb)
 		}
 	}
 
-	mtu = ip6_dst_mtu_forward(dst);
+	mtu = ip6_dst_mtu_maybe_forward(dst, true);
 	if (mtu < IPV6_MIN_MTU)
 		mtu = IPV6_MIN_MTU;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5f7286acca33..21d5d0b4cde5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3201,25 +3201,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
 
 INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst)
 {
-	struct inet6_dev *idev;
-	unsigned int mtu;
-
-	mtu = dst_metric_raw(dst, RTAX_MTU);
-	if (mtu)
-		goto out;
-
-	mtu = IPV6_MIN_MTU;
-
-	rcu_read_lock();
-	idev = __in6_dev_get(dst->dev);
-	if (idev)
-		mtu = idev->cnf.mtu6;
-	rcu_read_unlock();
-
-out:
-	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
-
-	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
+	return ip6_dst_mtu_maybe_forward(dst, false);
 }
 EXPORT_INDIRECT_CALLABLE(ip6_mtu);
 
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 1e50908b1b7e..8fe024a0ae46 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -99,7 +99,7 @@ static int flow_offload_fill_route(struct flow_offload *flow,
 		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
 		break;
 	case NFPROTO_IPV6:
-		flow_tuple->mtu = ip6_dst_mtu_forward(dst);
+		flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true);
 		break;
 	}
 
-- 
cgit v1.2.3


From ac6627a28dbfb5d96736544a00c3938fa7ea6dfb Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vfedorenko@novek.ru>
Date: Tue, 20 Jul 2021 23:06:28 +0300
Subject: net: ipv4: Consolidate ipv4_mtu and ip_dst_mtu_maybe_forward

Consolidate IPv4 MTU code the same way it is done in IPv6 to have code
aligned in both address families

Signed-off-by: Vadim Fedorenko <vfedorenko@novek.ru>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 22 ++++++++++++++++++----
 net/ipv4/route.c | 21 +--------------------
 2 files changed, 19 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index d9683bef8684..9192444f2964 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -436,18 +436,32 @@ static inline bool ip_sk_ignore_df(const struct sock *sk)
 static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 						    bool forwarding)
 {
+	const struct rtable *rt = container_of(dst, struct rtable, dst);
 	struct net *net = dev_net(dst->dev);
 	unsigned int mtu;
 
 	if (net->ipv4.sysctl_ip_fwd_use_pmtu ||
 	    ip_mtu_locked(dst) ||
-	    !forwarding)
-		return dst_mtu(dst);
+	    !forwarding) {
+		mtu = rt->rt_pmtu;
+		if (mtu && time_before(jiffies, rt->dst.expires))
+			goto out;
+	}
 
 	/* 'forwarding = true' case should always honour route mtu */
 	mtu = dst_metric_raw(dst, RTAX_MTU);
-	if (!mtu)
-		mtu = min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU);
+	if (mtu)
+		goto out;
+
+	mtu = READ_ONCE(dst->dev->mtu);
+
+	if (unlikely(ip_mtu_locked(dst))) {
+		if (rt->rt_uses_gateway && mtu > 576)
+			mtu = 576;
+	}
+
+out:
+	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
 
 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 99c06944501a..04754d55b3c1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1299,26 +1299,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
 
 INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
 {
-	const struct rtable *rt = (const struct rtable *)dst;
-	unsigned int mtu = rt->rt_pmtu;
-
-	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
-		mtu = dst_metric_raw(dst, RTAX_MTU);
-
-	if (mtu)
-		goto out;
-
-	mtu = READ_ONCE(dst->dev->mtu);
-
-	if (unlikely(ip_mtu_locked(dst))) {
-		if (rt->rt_uses_gateway && mtu > 576)
-			mtu = 576;
-	}
-
-out:
-	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
-
-	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
+	return ip_dst_mtu_maybe_forward(dst, false);
 }
 EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
 
-- 
cgit v1.2.3


From 2f5dc00f7a3ea669fd387ce79ffca92bff361550 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 21 Jul 2021 19:24:01 +0300
Subject: net: bridge: switchdev: let drivers inform which bridge ports are
 offloaded

On reception of an skb, the bridge checks if it was marked as 'already
forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it
is, it assigns the source hardware domain of that skb based on the
hardware domain of the ingress port. Then during forwarding, it enforces
that the egress port must have a different hardware domain than the
ingress one (this is done in nbp_switchdev_allowed_egress).

Non-switchdev drivers don't report any physical switch id (neither
through devlink nor .ndo_get_port_parent_id), therefore the bridge
assigns them a hardware domain of 0, and packets coming from them will
always have skb->offload_fwd_mark = 0. So there aren't any restrictions.

Problems appear due to the fact that DSA would like to perform software
fallback for bonding and team interfaces that the physical switch cannot
offload.

       +-- br0 ---+
      / /   |      \
     / /    |       \
    /  |    |      bond0
   /   |    |     /    \
 swp0 swp1 swp2 swp3 swp4

There, it is desirable that the presence of swp3 and swp4 under a
non-offloaded LAG does not preclude us from doing hardware bridging
beteen swp0, swp1 and swp2. The bandwidth of the CPU is often times high
enough that software bridging between {swp0,swp1,swp2} and bond0 is not
impractical.

But this creates an impossible paradox given the current way in which
port hardware domains are assigned. When the driver receives a packet
from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to
something.

- If we set it to 0, then the bridge will forward it towards swp1, swp2
  and bond0. But the switch has already forwarded it towards swp1 and
  swp2 (not to bond0, remember, that isn't offloaded, so as far as the
  switch is concerned, ports swp3 and swp4 are not looking up the FDB,
  and the entire bond0 is a destination that is strictly behind the
  CPU). But we don't want duplicated traffic towards swp1 and swp2, so
  it's not ok to set skb->offload_fwd_mark = 0.

- If we set it to 1, then the bridge will not forward the skb towards
  the ports with the same switchdev mark, i.e. not to swp1, swp2 and
  bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should
  have forwarded the skb there.

So the real issue is that bond0 will be assigned the same hardware
domain as {swp0,swp1,swp2}, because the function that assigns hardware
domains to bridge ports, nbp_switchdev_add(), recurses through bond0's
lower interfaces until it finds something that implements devlink (calls
dev_get_port_parent_id with bool recurse = true). This is a problem
because the fact that bond0 can be offloaded by swp3 and swp4 in our
example is merely an assumption.

A solution is to give the bridge explicit hints as to what hardware
domain it should use for each port.

Currently, the bridging offload is very 'silent': a driver registers a
netdevice notifier, which is put on the netns's notifier chain, and
which sniffs around for NETDEV_CHANGEUPPER events where the upper is a
bridge, and the lower is an interface it knows about (one registered by
this driver, normally). Then, from within that notifier, it does a bunch
of stuff behind the bridge's back, without the bridge necessarily
knowing that there's somebody offloading that port. It looks like this:

     ip link set swp0 master br0
                  |
                  v
 br_add_if() calls netdev_master_upper_dev_link()
                  |
                  v
        call_netdevice_notifiers
                  |
                  v
       dsa_slave_netdevice_event
                  |
                  v
        oh, hey! it's for me!
                  |
                  v
           .port_bridge_join

What we do to solve the conundrum is to be less silent, and change the
switchdev drivers to present themselves to the bridge. Something like this:

     ip link set swp0 master br0
                  |
                  v
 br_add_if() calls netdev_master_upper_dev_link()
                  |
                  v                    bridge: Aye! I'll use this
        call_netdevice_notifiers           ^  ppid as the
                  |                        |  hardware domain for
                  v                        |  this port, and zero
       dsa_slave_netdevice_event           |  if I got nothing.
                  |                        |
                  v                        |
        oh, hey! it's for me!              |
                  |                        |
                  v                        |
           .port_bridge_join               |
                  |                        |
                  +------------------------+
             switchdev_bridge_port_offload(swp0, swp0)

Then stacked interfaces (like bond0 on top of swp3/swp4) would be
treated differently in DSA, depending on whether we can or cannot
offload them.

The offload case:

    ip link set bond0 master br0
                  |
                  v
 br_add_if() calls netdev_master_upper_dev_link()
                  |
                  v                    bridge: Aye! I'll use this
        call_netdevice_notifiers           ^  ppid as the
                  |                        |  switchdev mark for
                  v                        |        bond0.
       dsa_slave_netdevice_event           | Coincidentally (or not),
                  |                        | bond0 and swp0, swp1, swp2
                  v                        | all have the same switchdev
        hmm, it's not quite for me,        | mark now, since the ASIC
         but my driver has already         | is able to forward towards
           called .port_lag_join           | all these ports in hw.
          for it, because I have           |
      a port with dp->lag_dev == bond0.    |
                  |                        |
                  v                        |
           .port_bridge_join               |
           for swp3 and swp4               |
                  |                        |
                  +------------------------+
            switchdev_bridge_port_offload(bond0, swp3)
            switchdev_bridge_port_offload(bond0, swp4)

And the non-offload case:

    ip link set bond0 master br0
                  |
                  v
 br_add_if() calls netdev_master_upper_dev_link()
                  |
                  v                    bridge waiting:
        call_netdevice_notifiers           ^  huh, switchdev_bridge_port_offload
                  |                        |  wasn't called, okay, I'll use a
                  v                        |  hwdom of zero for this one.
       dsa_slave_netdevice_event           :  Then packets received on swp0 will
                  |                        :  not be software-forwarded towards
                  v                        :  swp1, but they will towards bond0.
         it's not for me, but
       bond0 is an upper of swp3
      and swp4, but their dp->lag_dev
       is NULL because they couldn't
            offload it.

Basically we can draw the conclusion that the lowers of a bridge port
can come and go, so depending on the configuration of lowers for a
bridge port, it can dynamically toggle between offloaded and unoffloaded.
Therefore, we need an equivalent switchdev_bridge_port_unoffload too.

This patch changes the way any switchdev driver interacts with the
bridge. From now on, everybody needs to call switchdev_bridge_port_offload
and switchdev_bridge_port_unoffload, otherwise the bridge will treat the
port as non-offloaded and allow software flooding to other ports from
the same ASIC.

Note that these functions lay the ground for a more complex handshake
between switchdev drivers and the bridge in the future.

For drivers that will request a replay of the switchdev objects when
they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we
place the call to switchdev_bridge_port_unoffload() strategically inside
the NETDEV_PRECHANGEUPPER notifier's code path, and not inside
NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers
need the netdev adjacency lists to be valid, and that is only true in
NETDEV_PRECHANGEUPPER.

Cc: Vadym Kochan <vkochan@marvell.com>
Cc: Taras Chornyi <tchornyi@marvell.com>
Cc: Ioana Ciornei <ioana.ciornei@nxp.com>
Cc: Lars Povlsen <lars.povlsen@microchip.com>
Cc: Steen Hegelund <Steen.Hegelund@microchip.com>
Cc: UNGLinuxDriver@microchip.com
Cc: Claudiu Manoil <claudiu.manoil@nxp.com>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression
Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch
Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/freescale/dpaa2/dpaa2-switch.c    | 13 ++++
 .../net/ethernet/marvell/prestera/prestera_main.c  |  3 +-
 .../ethernet/marvell/prestera/prestera_switchdev.c | 11 ++-
 .../ethernet/marvell/prestera/prestera_switchdev.h |  3 +-
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   | 24 +++++--
 .../ethernet/microchip/sparx5/sparx5_switchdev.c   | 23 +++++-
 drivers/net/ethernet/mscc/ocelot_net.c             | 71 +++++++++++++++++++
 drivers/net/ethernet/rocker/rocker.h               |  3 +-
 drivers/net/ethernet/rocker/rocker_main.c          |  9 ++-
 drivers/net/ethernet/rocker/rocker_ofdpa.c         | 18 +++--
 drivers/net/ethernet/ti/am65-cpsw-nuss.c           | 17 ++++-
 drivers/net/ethernet/ti/cpsw_new.c                 | 15 +++-
 include/linux/if_bridge.h                          | 21 ++++++
 net/bridge/br_if.c                                 | 13 +---
 net/bridge/br_private.h                            | 13 +---
 net/bridge/br_switchdev.c                          | 82 ++++++++++++++++++----
 net/dsa/port.c                                     | 16 ++++-
 17 files changed, 298 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index 23798feb40b2..9b090da3e460 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -1930,8 +1930,13 @@ static int dpaa2_switch_port_bridge_join(struct net_device *netdev,
 	if (err)
 		goto err_egress_flood;
 
+	err = switchdev_bridge_port_offload(netdev, netdev, extack);
+	if (err)
+		goto err_switchdev_offload;
+
 	return 0;
 
+err_switchdev_offload:
 err_egress_flood:
 	dpaa2_switch_port_set_fdb(port_priv, NULL);
 	return err;
@@ -1957,6 +1962,11 @@ static int dpaa2_switch_port_restore_rxvlan(struct net_device *vdev, int vid, vo
 	return dpaa2_switch_port_vlan_add(arg, vlan_proto, vid);
 }
 
+static void dpaa2_switch_port_pre_bridge_leave(struct net_device *netdev)
+{
+	switchdev_bridge_port_unoffload(netdev);
+}
+
 static int dpaa2_switch_port_bridge_leave(struct net_device *netdev)
 {
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
@@ -2078,6 +2088,9 @@ static int dpaa2_switch_port_netdevice_event(struct notifier_block *nb,
 		if (err)
 			goto out;
 
+		if (!info->linking)
+			dpaa2_switch_port_pre_bridge_leave(netdev);
+
 		break;
 	case NETDEV_CHANGEUPPER:
 		upper_dev = info->upper_dev;
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c
index 226f4ff29f6e..7c569c1abefc 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_main.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c
@@ -746,7 +746,8 @@ static int prestera_netdev_port_event(struct net_device *lower,
 	case NETDEV_CHANGEUPPER:
 		if (netif_is_bridge_master(upper)) {
 			if (info->linking)
-				return prestera_bridge_port_join(upper, port);
+				return prestera_bridge_port_join(upper, port,
+								 extack);
 			else
 				prestera_bridge_port_leave(upper, port);
 		} else if (netif_is_lag_master(upper)) {
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
index 0b3e8f2db294..8cf3fe3b7e58 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
@@ -480,7 +480,8 @@ err_port_flood_set:
 }
 
 int prestera_bridge_port_join(struct net_device *br_dev,
-			      struct prestera_port *port)
+			      struct prestera_port *port,
+			      struct netlink_ext_ack *extack)
 {
 	struct prestera_switchdev *swdev = port->sw->swdev;
 	struct prestera_bridge_port *br_port;
@@ -500,6 +501,10 @@ int prestera_bridge_port_join(struct net_device *br_dev,
 		goto err_brport_create;
 	}
 
+	err = switchdev_bridge_port_offload(br_port->dev, port->dev, extack);
+	if (err)
+		goto err_switchdev_offload;
+
 	if (bridge->vlan_enabled)
 		return 0;
 
@@ -510,6 +515,8 @@ int prestera_bridge_port_join(struct net_device *br_dev,
 	return 0;
 
 err_port_join:
+	switchdev_bridge_port_unoffload(br_port->dev);
+err_switchdev_offload:
 	prestera_bridge_port_put(br_port);
 err_brport_create:
 	prestera_bridge_put(bridge);
@@ -584,6 +591,8 @@ void prestera_bridge_port_leave(struct net_device *br_dev,
 	else
 		prestera_bridge_1d_port_leave(br_port);
 
+	switchdev_bridge_port_unoffload(br_port->dev);
+
 	prestera_hw_port_learning_set(port, false);
 	prestera_hw_port_flood_set(port, BR_FLOOD | BR_MCAST_FLOOD, 0);
 	prestera_port_vid_stp_set(port, PRESTERA_VID_ALL, BR_STATE_FORWARDING);
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.h b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.h
index a91bc35d235f..0e93fda3d9a5 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.h
@@ -8,7 +8,8 @@ int prestera_switchdev_init(struct prestera_switch *sw);
 void prestera_switchdev_fini(struct prestera_switch *sw);
 
 int prestera_bridge_port_join(struct net_device *br_dev,
-			      struct prestera_port *port);
+			      struct prestera_port *port,
+			      struct netlink_ext_ack *extack);
 
 void prestera_bridge_port_leave(struct net_device *br_dev,
 				struct prestera_port *port);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 61911fed6aeb..c52317de1f35 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -335,14 +335,16 @@ mlxsw_sp_bridge_port_find(struct mlxsw_sp_bridge *bridge,
 
 static struct mlxsw_sp_bridge_port *
 mlxsw_sp_bridge_port_create(struct mlxsw_sp_bridge_device *bridge_device,
-			    struct net_device *brport_dev)
+			    struct net_device *brport_dev,
+			    struct netlink_ext_ack *extack)
 {
 	struct mlxsw_sp_bridge_port *bridge_port;
 	struct mlxsw_sp_port *mlxsw_sp_port;
+	int err;
 
 	bridge_port = kzalloc(sizeof(*bridge_port), GFP_KERNEL);
 	if (!bridge_port)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	mlxsw_sp_port = mlxsw_sp_port_dev_lower_find(brport_dev);
 	bridge_port->lagged = mlxsw_sp_port->lagged;
@@ -359,12 +361,23 @@ mlxsw_sp_bridge_port_create(struct mlxsw_sp_bridge_device *bridge_device,
 	list_add(&bridge_port->list, &bridge_device->ports_list);
 	bridge_port->ref_count = 1;
 
+	err = switchdev_bridge_port_offload(brport_dev, mlxsw_sp_port->dev,
+					    extack);
+	if (err)
+		goto err_switchdev_offload;
+
 	return bridge_port;
+
+err_switchdev_offload:
+	list_del(&bridge_port->list);
+	kfree(bridge_port);
+	return ERR_PTR(err);
 }
 
 static void
 mlxsw_sp_bridge_port_destroy(struct mlxsw_sp_bridge_port *bridge_port)
 {
+	switchdev_bridge_port_unoffload(bridge_port->dev);
 	list_del(&bridge_port->list);
 	WARN_ON(!list_empty(&bridge_port->vlans_list));
 	kfree(bridge_port);
@@ -390,9 +403,10 @@ mlxsw_sp_bridge_port_get(struct mlxsw_sp_bridge *bridge,
 	if (IS_ERR(bridge_device))
 		return ERR_CAST(bridge_device);
 
-	bridge_port = mlxsw_sp_bridge_port_create(bridge_device, brport_dev);
-	if (!bridge_port) {
-		err = -ENOMEM;
+	bridge_port = mlxsw_sp_bridge_port_create(bridge_device, brport_dev,
+						  extack);
+	if (IS_ERR(bridge_port)) {
+		err = PTR_ERR(bridge_port);
 		goto err_bridge_port_create;
 	}
 
diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
index a72e3b3b596e..e4fb573563d0 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
@@ -93,9 +93,12 @@ static int sparx5_port_attr_set(struct net_device *dev, const void *ctx,
 }
 
 static int sparx5_port_bridge_join(struct sparx5_port *port,
-				   struct net_device *bridge)
+				   struct net_device *bridge,
+				   struct netlink_ext_ack *extack)
 {
 	struct sparx5 *sparx5 = port->sparx5;
+	struct net_device *ndev = port->ndev;
+	int err;
 
 	if (bitmap_empty(sparx5->bridge_mask, SPX5_PORTS))
 		/* First bridged port */
@@ -109,12 +112,20 @@ static int sparx5_port_bridge_join(struct sparx5_port *port,
 
 	set_bit(port->portno, sparx5->bridge_mask);
 
+	err = switchdev_bridge_port_offload(ndev, ndev, extack);
+	if (err)
+		goto err_switchdev_offload;
+
 	/* Port enters in bridge mode therefor don't need to copy to CPU
 	 * frames for multicast in case the bridge is not requesting them
 	 */
-	__dev_mc_unsync(port->ndev, sparx5_mc_unsync);
+	__dev_mc_unsync(ndev, sparx5_mc_unsync);
 
 	return 0;
+
+err_switchdev_offload:
+	clear_bit(port->portno, sparx5->bridge_mask);
+	return err;
 }
 
 static void sparx5_port_bridge_leave(struct sparx5_port *port,
@@ -122,6 +133,8 @@ static void sparx5_port_bridge_leave(struct sparx5_port *port,
 {
 	struct sparx5 *sparx5 = port->sparx5;
 
+	switchdev_bridge_port_unoffload(port->ndev);
+
 	clear_bit(port->portno, sparx5->bridge_mask);
 	if (bitmap_empty(sparx5->bridge_mask, SPX5_PORTS))
 		sparx5->hw_bridge_dev = NULL;
@@ -139,11 +152,15 @@ static int sparx5_port_changeupper(struct net_device *dev,
 				   struct netdev_notifier_changeupper_info *info)
 {
 	struct sparx5_port *port = netdev_priv(dev);
+	struct netlink_ext_ack *extack;
 	int err = 0;
 
+	extack = netdev_notifier_info_to_extack(&info->info);
+
 	if (netif_is_bridge_master(info->upper_dev)) {
 		if (info->linking)
-			err = sparx5_port_bridge_join(port, info->upper_dev);
+			err = sparx5_port_bridge_join(port, info->upper_dev,
+						      extack);
 		else
 			sparx5_port_bridge_leave(port, info->upper_dev);
 
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index e9d260d84bf3..76b7b9536bf7 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -1216,6 +1216,10 @@ static int ocelot_netdevice_bridge_join(struct net_device *dev,
 
 	ocelot_port_bridge_join(ocelot, port, bridge);
 
+	err = switchdev_bridge_port_offload(brport_dev, dev, extack);
+	if (err)
+		goto err_switchdev_offload;
+
 	err = ocelot_switchdev_sync(ocelot, port, brport_dev, bridge, extack);
 	if (err)
 		goto err_switchdev_sync;
@@ -1223,10 +1227,17 @@ static int ocelot_netdevice_bridge_join(struct net_device *dev,
 	return 0;
 
 err_switchdev_sync:
+	switchdev_bridge_port_unoffload(brport_dev);
+err_switchdev_offload:
 	ocelot_port_bridge_leave(ocelot, port, bridge);
 	return err;
 }
 
+static void ocelot_netdevice_pre_bridge_leave(struct net_device *brport_dev)
+{
+	switchdev_bridge_port_unoffload(brport_dev);
+}
+
 static int ocelot_netdevice_bridge_leave(struct net_device *dev,
 					 struct net_device *brport_dev,
 					 struct net_device *bridge)
@@ -1279,6 +1290,18 @@ err_bridge_join:
 	return err;
 }
 
+static void ocelot_netdevice_pre_lag_leave(struct net_device *dev,
+					   struct net_device *bond)
+{
+	struct net_device *bridge_dev;
+
+	bridge_dev = netdev_master_upper_dev_get(bond);
+	if (!bridge_dev || !netif_is_bridge_master(bridge_dev))
+		return;
+
+	ocelot_netdevice_pre_bridge_leave(bond);
+}
+
 static int ocelot_netdevice_lag_leave(struct net_device *dev,
 				      struct net_device *bond)
 {
@@ -1355,6 +1378,43 @@ ocelot_netdevice_lag_changeupper(struct net_device *dev,
 	return NOTIFY_DONE;
 }
 
+static int
+ocelot_netdevice_prechangeupper(struct net_device *dev,
+				struct net_device *brport_dev,
+				struct netdev_notifier_changeupper_info *info)
+{
+	if (netif_is_bridge_master(info->upper_dev) && !info->linking)
+		ocelot_netdevice_pre_bridge_leave(brport_dev);
+
+	if (netif_is_lag_master(info->upper_dev) && !info->linking)
+		ocelot_netdevice_pre_lag_leave(dev, info->upper_dev);
+
+	return NOTIFY_DONE;
+}
+
+static int
+ocelot_netdevice_lag_prechangeupper(struct net_device *dev,
+				    struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *lower;
+	struct list_head *iter;
+	int err = NOTIFY_DONE;
+
+	netdev_for_each_lower_dev(dev, lower, iter) {
+		struct ocelot_port_private *priv = netdev_priv(lower);
+		struct ocelot_port *ocelot_port = &priv->port;
+
+		if (ocelot_port->bond != dev)
+			return NOTIFY_OK;
+
+		err = ocelot_netdevice_prechangeupper(dev, lower, info);
+		if (err)
+			return err;
+	}
+
+	return NOTIFY_DONE;
+}
+
 static int
 ocelot_netdevice_changelowerstate(struct net_device *dev,
 				  struct netdev_lag_lower_state_info *info)
@@ -1382,6 +1442,17 @@ static int ocelot_netdevice_event(struct notifier_block *unused,
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
 	switch (event) {
+	case NETDEV_PRECHANGEUPPER: {
+		struct netdev_notifier_changeupper_info *info = ptr;
+
+		if (ocelot_netdevice_dev_check(dev))
+			return ocelot_netdevice_prechangeupper(dev, dev, info);
+
+		if (netif_is_lag_master(dev))
+			return ocelot_netdevice_lag_prechangeupper(dev, info);
+
+		break;
+	}
 	case NETDEV_CHANGEUPPER: {
 		struct netdev_notifier_changeupper_info *info = ptr;
 
diff --git a/drivers/net/ethernet/rocker/rocker.h b/drivers/net/ethernet/rocker/rocker.h
index 315a6e5c0f59..e75814a4654f 100644
--- a/drivers/net/ethernet/rocker/rocker.h
+++ b/drivers/net/ethernet/rocker/rocker.h
@@ -119,7 +119,8 @@ struct rocker_world_ops {
 	int (*port_obj_fdb_del)(struct rocker_port *rocker_port,
 				u16 vid, const unsigned char *addr);
 	int (*port_master_linked)(struct rocker_port *rocker_port,
-				  struct net_device *master);
+				  struct net_device *master,
+				  struct netlink_ext_ack *extack);
 	int (*port_master_unlinked)(struct rocker_port *rocker_port,
 				    struct net_device *master);
 	int (*port_neigh_update)(struct rocker_port *rocker_port,
diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index a46633606cae..53d407a5dbf7 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -1670,13 +1670,14 @@ rocker_world_port_fdb_del(struct rocker_port *rocker_port,
 }
 
 static int rocker_world_port_master_linked(struct rocker_port *rocker_port,
-					   struct net_device *master)
+					   struct net_device *master,
+					   struct netlink_ext_ack *extack)
 {
 	struct rocker_world_ops *wops = rocker_port->rocker->wops;
 
 	if (!wops->port_master_linked)
 		return -EOPNOTSUPP;
-	return wops->port_master_linked(rocker_port, master);
+	return wops->port_master_linked(rocker_port, master, extack);
 }
 
 static int rocker_world_port_master_unlinked(struct rocker_port *rocker_port,
@@ -3107,6 +3108,7 @@ struct rocker_port *rocker_port_dev_lower_find(struct net_device *dev,
 static int rocker_netdevice_event(struct notifier_block *unused,
 				  unsigned long event, void *ptr)
 {
+	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct netdev_notifier_changeupper_info *info;
 	struct rocker_port *rocker_port;
@@ -3123,7 +3125,8 @@ static int rocker_netdevice_event(struct notifier_block *unused,
 		rocker_port = netdev_priv(dev);
 		if (info->linking) {
 			err = rocker_world_port_master_linked(rocker_port,
-							      info->upper_dev);
+							      info->upper_dev,
+							      extack);
 			if (err)
 				netdev_warn(dev, "failed to reflect master linked (err %d)\n",
 					    err);
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 967a634ee9ac..84dcaf8687a0 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -2571,8 +2571,10 @@ static int ofdpa_port_obj_fdb_del(struct rocker_port *rocker_port,
 }
 
 static int ofdpa_port_bridge_join(struct ofdpa_port *ofdpa_port,
-				  struct net_device *bridge)
+				  struct net_device *bridge,
+				  struct netlink_ext_ack *extack)
 {
+	struct net_device *dev = ofdpa_port->dev;
 	int err;
 
 	/* Port is joining bridge, so the internal VLAN for the
@@ -2592,13 +2594,20 @@ static int ofdpa_port_bridge_join(struct ofdpa_port *ofdpa_port,
 
 	ofdpa_port->bridge_dev = bridge;
 
-	return ofdpa_port_vlan_add(ofdpa_port, OFDPA_UNTAGGED_VID, 0);
+	err = ofdpa_port_vlan_add(ofdpa_port, OFDPA_UNTAGGED_VID, 0);
+	if (err)
+		return err;
+
+	return switchdev_bridge_port_offload(dev, dev, extack);
 }
 
 static int ofdpa_port_bridge_leave(struct ofdpa_port *ofdpa_port)
 {
+	struct net_device *dev = ofdpa_port->dev;
 	int err;
 
+	switchdev_bridge_port_unoffload(dev);
+
 	err = ofdpa_port_vlan_del(ofdpa_port, OFDPA_UNTAGGED_VID, 0);
 	if (err)
 		return err;
@@ -2637,13 +2646,14 @@ static int ofdpa_port_ovs_changed(struct ofdpa_port *ofdpa_port,
 }
 
 static int ofdpa_port_master_linked(struct rocker_port *rocker_port,
-				    struct net_device *master)
+				    struct net_device *master,
+				    struct netlink_ext_ack *extack)
 {
 	struct ofdpa_port *ofdpa_port = rocker_port->wpriv;
 	int err = 0;
 
 	if (netif_is_bridge_master(master))
-		err = ofdpa_port_bridge_join(ofdpa_port, master);
+		err = ofdpa_port_bridge_join(ofdpa_port, master, extack);
 	else if (netif_is_ovs_master(master))
 		err = ofdpa_port_ovs_changed(ofdpa_port, master);
 	return err;
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index 718539cdd2f2..8b9596eb808e 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -7,6 +7,7 @@
 
 #include <linux/clk.h>
 #include <linux/etherdevice.h>
+#include <linux/if_bridge.h>
 #include <linux/if_vlan.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
@@ -2077,10 +2078,13 @@ bool am65_cpsw_port_dev_check(const struct net_device *ndev)
 	return false;
 }
 
-static int am65_cpsw_netdevice_port_link(struct net_device *ndev, struct net_device *br_ndev)
+static int am65_cpsw_netdevice_port_link(struct net_device *ndev,
+					 struct net_device *br_ndev,
+					 struct netlink_ext_ack *extack)
 {
 	struct am65_cpsw_common *common = am65_ndev_to_common(ndev);
 	struct am65_cpsw_ndev_priv *priv = am65_ndev_to_priv(ndev);
+	int err;
 
 	if (!common->br_members) {
 		common->hw_bridge_dev = br_ndev;
@@ -2092,6 +2096,10 @@ static int am65_cpsw_netdevice_port_link(struct net_device *ndev, struct net_dev
 			return -EOPNOTSUPP;
 	}
 
+	err = switchdev_bridge_port_offload(ndev, ndev, extack);
+	if (err)
+		return err;
+
 	common->br_members |= BIT(priv->port->port_id);
 
 	am65_cpsw_port_offload_fwd_mark_update(common);
@@ -2104,6 +2112,8 @@ static void am65_cpsw_netdevice_port_unlink(struct net_device *ndev)
 	struct am65_cpsw_common *common = am65_ndev_to_common(ndev);
 	struct am65_cpsw_ndev_priv *priv = am65_ndev_to_priv(ndev);
 
+	switchdev_bridge_port_unoffload(ndev);
+
 	common->br_members &= ~BIT(priv->port->port_id);
 
 	am65_cpsw_port_offload_fwd_mark_update(common);
@@ -2116,6 +2126,7 @@ static void am65_cpsw_netdevice_port_unlink(struct net_device *ndev)
 static int am65_cpsw_netdevice_event(struct notifier_block *unused,
 				     unsigned long event, void *ptr)
 {
+	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
 	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
 	struct netdev_notifier_changeupper_info *info;
 	int ret = NOTIFY_DONE;
@@ -2129,7 +2140,9 @@ static int am65_cpsw_netdevice_event(struct notifier_block *unused,
 
 		if (netif_is_bridge_master(info->upper_dev)) {
 			if (info->linking)
-				ret = am65_cpsw_netdevice_port_link(ndev, info->upper_dev);
+				ret = am65_cpsw_netdevice_port_link(ndev,
+								    info->upper_dev,
+								    extack);
 			else
 				am65_cpsw_netdevice_port_unlink(ndev);
 		}
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 57d279fdcc9f..bf9cadfb11b5 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/irqreturn.h>
 #include <linux/interrupt.h>
+#include <linux/if_bridge.h>
 #include <linux/if_ether.h>
 #include <linux/etherdevice.h>
 #include <linux/net_tstamp.h>
@@ -1499,10 +1500,12 @@ static void cpsw_port_offload_fwd_mark_update(struct cpsw_common *cpsw)
 }
 
 static int cpsw_netdevice_port_link(struct net_device *ndev,
-				    struct net_device *br_ndev)
+				    struct net_device *br_ndev,
+				    struct netlink_ext_ack *extack)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
+	int err;
 
 	if (!cpsw->br_members) {
 		cpsw->hw_bridge_dev = br_ndev;
@@ -1514,6 +1517,10 @@ static int cpsw_netdevice_port_link(struct net_device *ndev,
 			return -EOPNOTSUPP;
 	}
 
+	err = switchdev_bridge_port_offload(ndev, ndev, extack);
+	if (err)
+		return err;
+
 	cpsw->br_members |= BIT(priv->emac_port);
 
 	cpsw_port_offload_fwd_mark_update(cpsw);
@@ -1526,6 +1533,8 @@ static void cpsw_netdevice_port_unlink(struct net_device *ndev)
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
 
+	switchdev_bridge_port_unoffload(ndev);
+
 	cpsw->br_members &= ~BIT(priv->emac_port);
 
 	cpsw_port_offload_fwd_mark_update(cpsw);
@@ -1538,6 +1547,7 @@ static void cpsw_netdevice_port_unlink(struct net_device *ndev)
 static int cpsw_netdevice_event(struct notifier_block *unused,
 				unsigned long event, void *ptr)
 {
+	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
 	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
 	struct netdev_notifier_changeupper_info *info;
 	int ret = NOTIFY_DONE;
@@ -1552,7 +1562,8 @@ static int cpsw_netdevice_event(struct notifier_block *unused,
 		if (netif_is_bridge_master(info->upper_dev)) {
 			if (info->linking)
 				ret = cpsw_netdevice_port_link(ndev,
-							       info->upper_dev);
+							       info->upper_dev,
+							       extack);
 			else
 				cpsw_netdevice_port_unlink(ndev);
 		}
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index b651c5e32a28..ce413eca527e 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -206,4 +206,25 @@ static inline int br_fdb_replay(const struct net_device *br_dev,
 }
 #endif
 
+#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_NET_SWITCHDEV)
+
+int switchdev_bridge_port_offload(struct net_device *brport_dev,
+				  struct net_device *dev,
+				  struct netlink_ext_ack *extack);
+void switchdev_bridge_port_unoffload(struct net_device *brport_dev);
+
+#else
+
+static inline int switchdev_bridge_port_offload(struct net_device *brport_dev,
+						struct net_device *dev,
+						struct netlink_ext_ack *extack)
+{
+	return -EINVAL;
+}
+
+static inline void switchdev_bridge_port_unoffload(struct net_device *brport_dev)
+{
+}
+#endif
+
 #endif
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index c0df50e4abbb..86f6d7e93ea8 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -349,7 +349,6 @@ static void del_nbp(struct net_bridge_port *p)
 	nbp_backup_clear(p);
 
 	nbp_update_port_count(br);
-	nbp_switchdev_del(p);
 
 	netdev_upper_dev_unlink(dev, br->dev);
 
@@ -644,10 +643,6 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
 	if (err)
 		goto err5;
 
-	err = nbp_switchdev_add(p);
-	if (err)
-		goto err6;
-
 	dev_disable_lro(dev);
 
 	list_add_rcu(&p->list, &br->port_list);
@@ -685,13 +680,13 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
 		 */
 		err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack);
 		if (err)
-			goto err7;
+			goto err6;
 	}
 
 	err = nbp_vlan_init(p, extack);
 	if (err) {
 		netdev_err(dev, "failed to initialize vlan filtering on this port\n");
-		goto err7;
+		goto err6;
 	}
 
 	spin_lock_bh(&br->lock);
@@ -714,14 +709,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
 
 	return 0;
 
-err7:
+err6:
 	if (fdb_synced)
 		br_fdb_unsync_static(br, p);
 	list_del_rcu(&p->list);
 	br_fdb_delete_by_port(br, p, 0, 1);
 	nbp_update_port_count(br);
-	nbp_switchdev_del(p);
-err6:
 	netdev_upper_dev_unlink(dev, br->dev);
 err5:
 	dev->priv_flags &= ~IFF_BRIDGE_PORT;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index d0e6ae25a4e4..fbf73a5256d0 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -392,6 +392,8 @@ struct net_bridge_port {
 	 * hardware domain.
 	 */
 	int				hwdom;
+	int				offload_count;
+	struct netdev_phys_item_id	ppid;
 #endif
 	u16				group_fwd_mask;
 	u16				backup_redirected_cnt;
@@ -1857,8 +1859,6 @@ void br_switchdev_fdb_notify(struct net_bridge *br,
 int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags,
 			       struct netlink_ext_ack *extack);
 int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid);
-int nbp_switchdev_add(struct net_bridge_port *p);
-void nbp_switchdev_del(struct net_bridge_port *p);
 void br_switchdev_init(struct net_bridge *br);
 
 static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
@@ -1907,15 +1907,6 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
 {
 }
 
-static inline int nbp_switchdev_add(struct net_bridge_port *p)
-{
-	return 0;
-}
-
-static inline void nbp_switchdev_del(struct net_bridge_port *p)
-{
-}
-
 static inline void br_switchdev_init(struct net_bridge *br)
 {
 }
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index f3120f13c293..39f0787fde01 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -133,7 +133,7 @@ static int nbp_switchdev_hwdom_set(struct net_bridge_port *joining)
 
 	/* joining is yet to be added to the port list. */
 	list_for_each_entry(p, &br->port_list, list) {
-		if (netdev_port_same_parent_id(joining->dev, p->dev)) {
+		if (netdev_phys_item_id_same(&joining->ppid, &p->ppid)) {
 			joining->hwdom = p->hwdom;
 			return 0;
 		}
@@ -162,27 +162,85 @@ static void nbp_switchdev_hwdom_put(struct net_bridge_port *leaving)
 	clear_bit(leaving->hwdom, &br->busy_hwdoms);
 }
 
-int nbp_switchdev_add(struct net_bridge_port *p)
+static int nbp_switchdev_add(struct net_bridge_port *p,
+			     struct netdev_phys_item_id ppid,
+			     struct netlink_ext_ack *extack)
 {
-	struct netdev_phys_item_id ppid = { };
-	int err;
+	if (p->offload_count) {
+		/* Prevent unsupported configurations such as a bridge port
+		 * which is a bonding interface, and the member ports are from
+		 * different hardware switches.
+		 */
+		if (!netdev_phys_item_id_same(&p->ppid, &ppid)) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Same bridge port cannot be offloaded by two physical switches");
+			return -EBUSY;
+		}
 
-	ASSERT_RTNL();
+		/* Tolerate drivers that call switchdev_bridge_port_offload()
+		 * more than once for the same bridge port, such as when the
+		 * bridge port is an offloaded bonding/team interface.
+		 */
+		p->offload_count++;
 
-	err = dev_get_port_parent_id(p->dev, &ppid, true);
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			return 0;
-		return err;
+		return 0;
 	}
 
+	p->ppid = ppid;
+	p->offload_count = 1;
+
 	return nbp_switchdev_hwdom_set(p);
 }
 
-void nbp_switchdev_del(struct net_bridge_port *p)
+static void nbp_switchdev_del(struct net_bridge_port *p)
 {
-	ASSERT_RTNL();
+	if (WARN_ON(!p->offload_count))
+		return;
+
+	p->offload_count--;
+
+	if (p->offload_count)
+		return;
 
 	if (p->hwdom)
 		nbp_switchdev_hwdom_put(p);
 }
+
+/* Let the bridge know that this port is offloaded, so that it can assign a
+ * switchdev hardware domain to it.
+ */
+int switchdev_bridge_port_offload(struct net_device *brport_dev,
+				  struct net_device *dev,
+				  struct netlink_ext_ack *extack)
+{
+	struct netdev_phys_item_id ppid;
+	struct net_bridge_port *p;
+	int err;
+
+	ASSERT_RTNL();
+
+	p = br_port_get_rtnl(brport_dev);
+	if (!p)
+		return -ENODEV;
+
+	err = dev_get_port_parent_id(dev, &ppid, false);
+	if (err)
+		return err;
+
+	return nbp_switchdev_add(p, ppid, extack);
+}
+EXPORT_SYMBOL_GPL(switchdev_bridge_port_offload);
+
+void switchdev_bridge_port_unoffload(struct net_device *brport_dev)
+{
+	struct net_bridge_port *p;
+
+	ASSERT_RTNL();
+
+	p = br_port_get_rtnl(brport_dev);
+	if (!p)
+		return;
+
+	nbp_switchdev_del(p);
+}
+EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 982e18771d76..7accda066149 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -292,6 +292,8 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 		.port = dp->index,
 		.br = br,
 	};
+	struct net_device *dev = dp->slave;
+	struct net_device *brport_dev;
 	int err;
 
 	/* Here the interface is already bridged. Reflect the current
@@ -299,16 +301,24 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 	 */
 	dp->bridge_dev = br;
 
+	brport_dev = dsa_port_to_bridge_port(dp);
+
 	err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_JOIN, &info);
 	if (err)
 		goto out_rollback;
 
-	err = dsa_port_switchdev_sync(dp, extack);
+	err = switchdev_bridge_port_offload(brport_dev, dev, extack);
 	if (err)
 		goto out_rollback_unbridge;
 
+	err = dsa_port_switchdev_sync(dp, extack);
+	if (err)
+		goto out_rollback_unoffload;
+
 	return 0;
 
+out_rollback_unoffload:
+	switchdev_bridge_port_unoffload(brport_dev);
 out_rollback_unbridge:
 	dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info);
 out_rollback:
@@ -319,6 +329,10 @@ out_rollback:
 int dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br,
 			      struct netlink_ext_ack *extack)
 {
+	struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
+
+	switchdev_bridge_port_unoffload(brport_dev);
+
 	return dsa_port_switchdev_unsync_objs(dp, br, extack);
 }
 
-- 
cgit v1.2.3


From 4e51bf44a03af6fa19a39a36ea8fedfacb8ccadf Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 21 Jul 2021 19:24:03 +0300
Subject: net: bridge: move the switchdev object replay helpers to "push" mode

Starting with commit 4f2673b3a2b6 ("net: bridge: add helper to replay
port and host-joined mdb entries"), DSA has introduced some bridge
helpers that replay switchdev events (FDB/MDB/VLAN additions and
deletions) that can be lost by the switchdev drivers in a variety of
circumstances:

- an IP multicast group was host-joined on the bridge itself before any
  switchdev port joined the bridge, leading to the host MDB entries
  missing in the hardware database.
- during the bridge creation process, the MAC address of the bridge was
  added to the FDB as an entry pointing towards the bridge device
  itself, but with no switchdev ports being part of the bridge yet, this
  local FDB entry would remain unknown to the switchdev hardware
  database.
- a VLAN/FDB/MDB was added to a bridge port that is a LAG interface,
  before any switchdev port joined that LAG, leading to the hardware
  database missing those entries.
- a switchdev port left a LAG that is a bridge port, while the LAG
  remained part of the bridge, and all FDB/MDB/VLAN entries remained
  installed in the hardware database of the switchdev port.

Also, since commit 0d2cfbd41c4a ("net: bridge: ignore switchdev events
for LAG ports which didn't request replay"), DSA introduced a method,
based on a const void *ctx, to ensure that two switchdev ports under the
same LAG that is a bridge port do not see the same MDB/VLAN entry being
replayed twice by the bridge, once for every bridge port that joins the
LAG.

With so many ordering corner cases being possible, it seems unreasonable
to expect a switchdev driver writer to get it right from the first try.
Therefore, now that DSA has experimented with the bridge replay helpers
for a little bit, we can move the code to the bridge driver where it is
more readily available to all switchdev drivers.

To convert the switchdev object replay helpers from "pull mode" (where
the driver asks for them) to a "push mode" (where the bridge offers them
automatically), the biggest problem is that the bridge needs to be aware
when a switchdev port joins and leaves, even when the switchdev is only
indirectly a bridge port (for example when the bridge port is a LAG
upper of the switchdev).

Luckily, we already have a hook for that, in the form of the newly
introduced switchdev_bridge_port_offload() and
switchdev_bridge_port_unoffload() calls. These offer a natural place for
hooking the object addition and deletion replays.

Extend the above 2 functions with:
- pointers to the switchdev atomic notifier (for FDB replays) and the
  blocking notifier (for MDB and VLAN replays).
- the "const void *ctx" argument required for drivers to be able to
  disambiguate between which port is targeted, when multiple ports are
  lowers of the same LAG that is a bridge port. Most of the drivers pass
  NULL to this argument, except the ones that support LAG offload and have
  the proper context check already in place in the switchdev blocking
  notifier handler.

Also unexport the replay helpers, since nobody except the bridge calls
them directly now.

Note that:
(a) we abuse the terminology slightly, because FDB entries are not
    "switchdev objects", but we count them as objects nonetheless.
    With no direct way to prove it, I think they are not modeled as
    switchdev objects because those can only be installed by the bridge
    to the hardware (as opposed to FDB entries which can be propagated
    in the other direction too). This is merely an abuse of terms, FDB
    entries are replayed too, despite not being objects.
(b) the bridge does not attempt to sync port attributes to newly joined
    ports, just the countable stuff (the objects). The reason for this
    is simple: no universal and symmetric way to sync and unsync them is
    known. For example, VLAN filtering: what to do on unsync, disable or
    leave it enabled? Similarly, STP state, ageing timer, etc etc. What
    a switchdev port does when it becomes standalone again is not really
    up to the bridge's competence, and the driver should deal with it.
    On the other hand, replaying deletions of switchdev objects can be
    seen a matter of cleanup and therefore be treated by the bridge,
    hence this patch.

We make the replay helpers opt-in for drivers, because they might not
bring immediate benefits for them:

- nbp_vlan_init() is called _after_ netdev_master_upper_dev_link(),
  so br_vlan_replay() should not do anything for the new drivers on
  which we call it. The existing drivers where there was even a slight
  possibility for there to exist a VLAN on a bridge port before they
  join it are already guarded against this: mlxsw and prestera deny
  joining LAG interfaces that are members of a bridge.

- br_fdb_replay() should now notify of local FDB entries, but I patched
  all drivers except DSA to ignore these new entries in commit
  2c4eca3ef716 ("net: bridge: switchdev: include local flag in FDB
  notifications"). Driver authors can lift this restriction as they
  wish, and when they do, they can also opt into the FDB replay
  functionality.

- br_mdb_replay() should fix a real issue which is described in commit
  4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined
  mdb entries"). However most drivers do not offload the
  SWITCHDEV_OBJ_ID_HOST_MDB to see this issue: only cpsw and am65_cpsw
  offload this switchdev object, and I don't completely understand the
  way in which they offload this switchdev object anyway. So I'll leave
  it up to these drivers' respective maintainers to opt into
  br_mdb_replay().

So most of the drivers pass NULL notifier blocks for the replay helpers,
except:
- dpaa2-switch which was already acked/regression-tested with the
  helpers enabled (and there isn't much of a downside in having them)
- ocelot which already had replay logic in "pull" mode
- DSA which already had replay logic in "pull" mode

An important observation is that the drivers which don't currently
request bridge event replays don't even have the
switchdev_bridge_port_{offload,unoffload} calls placed in proper places
right now. This was done to avoid unnecessary rework for drivers which
might never even add support for this. For driver writers who wish to
add replay support, this can be used as a tentative placement guide:
https://patchwork.kernel.org/project/netdevbpf/patch/20210720134655.892334-11-vladimir.oltean@nxp.com/

Cc: Vadym Kochan <vkochan@marvell.com>
Cc: Taras Chornyi <tchornyi@marvell.com>
Cc: Ioana Ciornei <ioana.ciornei@nxp.com>
Cc: Lars Povlsen <lars.povlsen@microchip.com>
Cc: Steen Hegelund <Steen.Hegelund@microchip.com>
Cc: UNGLinuxDriver@microchip.com
Cc: Claudiu Manoil <claudiu.manoil@nxp.com>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/freescale/dpaa2/dpaa2-switch.c    | 12 +++-
 .../ethernet/marvell/prestera/prestera_switchdev.c |  7 +-
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   |  4 +-
 .../ethernet/microchip/sparx5/sparx5_switchdev.c   |  5 +-
 drivers/net/ethernet/mscc/ocelot_net.c             | 45 +++++-------
 drivers/net/ethernet/rocker/rocker_ofdpa.c         |  5 +-
 drivers/net/ethernet/ti/am65-cpsw-nuss.c           |  5 +-
 drivers/net/ethernet/ti/cpsw_new.c                 |  5 +-
 include/linux/if_bridge.h                          | 54 +++++---------
 net/bridge/br_fdb.c                                |  1 -
 net/bridge/br_mdb.c                                |  1 -
 net/bridge/br_private.h                            | 25 +++++++
 net/bridge/br_switchdev.c                          | 75 ++++++++++++++++++-
 net/bridge/br_vlan.c                               |  1 -
 net/dsa/dsa_priv.h                                 |  6 +-
 net/dsa/port.c                                     | 84 +++++-----------------
 net/dsa/slave.c                                    | 10 +--
 17 files changed, 182 insertions(+), 163 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index 9b090da3e460..2138239facfd 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -1889,6 +1889,9 @@ static int dpaa2_switch_port_attr_set_event(struct net_device *netdev,
 	return notifier_from_errno(err);
 }
 
+static struct notifier_block dpaa2_switch_port_switchdev_nb;
+static struct notifier_block dpaa2_switch_port_switchdev_blocking_nb;
+
 static int dpaa2_switch_port_bridge_join(struct net_device *netdev,
 					 struct net_device *upper_dev,
 					 struct netlink_ext_ack *extack)
@@ -1930,7 +1933,10 @@ static int dpaa2_switch_port_bridge_join(struct net_device *netdev,
 	if (err)
 		goto err_egress_flood;
 
-	err = switchdev_bridge_port_offload(netdev, netdev, extack);
+	err = switchdev_bridge_port_offload(netdev, netdev, NULL,
+					    &dpaa2_switch_port_switchdev_nb,
+					    &dpaa2_switch_port_switchdev_blocking_nb,
+					    extack);
 	if (err)
 		goto err_switchdev_offload;
 
@@ -1964,7 +1970,9 @@ static int dpaa2_switch_port_restore_rxvlan(struct net_device *vdev, int vid, vo
 
 static void dpaa2_switch_port_pre_bridge_leave(struct net_device *netdev)
 {
-	switchdev_bridge_port_unoffload(netdev);
+	switchdev_bridge_port_unoffload(netdev, NULL,
+					&dpaa2_switch_port_switchdev_nb,
+					&dpaa2_switch_port_switchdev_blocking_nb);
 }
 
 static int dpaa2_switch_port_bridge_leave(struct net_device *netdev)
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
index 8cf3fe3b7e58..7fe1287228e5 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
@@ -501,7 +501,8 @@ int prestera_bridge_port_join(struct net_device *br_dev,
 		goto err_brport_create;
 	}
 
-	err = switchdev_bridge_port_offload(br_port->dev, port->dev, extack);
+	err = switchdev_bridge_port_offload(br_port->dev, port->dev, NULL,
+					    NULL, NULL, extack);
 	if (err)
 		goto err_switchdev_offload;
 
@@ -515,7 +516,7 @@ int prestera_bridge_port_join(struct net_device *br_dev,
 	return 0;
 
 err_port_join:
-	switchdev_bridge_port_unoffload(br_port->dev);
+	switchdev_bridge_port_unoffload(br_port->dev, NULL, NULL, NULL);
 err_switchdev_offload:
 	prestera_bridge_port_put(br_port);
 err_brport_create:
@@ -591,7 +592,7 @@ void prestera_bridge_port_leave(struct net_device *br_dev,
 	else
 		prestera_bridge_1d_port_leave(br_port);
 
-	switchdev_bridge_port_unoffload(br_port->dev);
+	switchdev_bridge_port_unoffload(br_port->dev, NULL, NULL, NULL);
 
 	prestera_hw_port_learning_set(port, false);
 	prestera_hw_port_flood_set(port, BR_FLOOD | BR_MCAST_FLOOD, 0);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index c52317de1f35..0a53f1d8e7e1 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -362,7 +362,7 @@ mlxsw_sp_bridge_port_create(struct mlxsw_sp_bridge_device *bridge_device,
 	bridge_port->ref_count = 1;
 
 	err = switchdev_bridge_port_offload(brport_dev, mlxsw_sp_port->dev,
-					    extack);
+					    NULL, NULL, NULL, extack);
 	if (err)
 		goto err_switchdev_offload;
 
@@ -377,7 +377,7 @@ err_switchdev_offload:
 static void
 mlxsw_sp_bridge_port_destroy(struct mlxsw_sp_bridge_port *bridge_port)
 {
-	switchdev_bridge_port_unoffload(bridge_port->dev);
+	switchdev_bridge_port_unoffload(bridge_port->dev, NULL, NULL, NULL);
 	list_del(&bridge_port->list);
 	WARN_ON(!list_empty(&bridge_port->vlans_list));
 	kfree(bridge_port);
diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
index e4fb573563d0..807dc45cfae4 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
@@ -112,7 +112,8 @@ static int sparx5_port_bridge_join(struct sparx5_port *port,
 
 	set_bit(port->portno, sparx5->bridge_mask);
 
-	err = switchdev_bridge_port_offload(ndev, ndev, extack);
+	err = switchdev_bridge_port_offload(ndev, ndev, NULL, NULL, NULL,
+					    extack);
 	if (err)
 		goto err_switchdev_offload;
 
@@ -133,7 +134,7 @@ static void sparx5_port_bridge_leave(struct sparx5_port *port,
 {
 	struct sparx5 *sparx5 = port->sparx5;
 
-	switchdev_bridge_port_unoffload(port->ndev);
+	switchdev_bridge_port_unoffload(port->ndev, NULL, NULL, NULL);
 
 	clear_bit(port->portno, sparx5->bridge_mask);
 	if (bitmap_empty(sparx5->bridge_mask, SPX5_PORTS))
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 76b7b9536bf7..3558ee8d9212 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -1154,38 +1154,19 @@ static int ocelot_switchdev_sync(struct ocelot *ocelot, int port,
 				 struct net_device *bridge_dev,
 				 struct netlink_ext_ack *extack)
 {
-	struct ocelot_port *ocelot_port = ocelot->ports[port];
-	struct ocelot_port_private *priv;
 	clock_t ageing_time;
 	u8 stp_state;
-	int err;
-
-	priv = container_of(ocelot_port, struct ocelot_port_private, port);
 
 	ocelot_inherit_brport_flags(ocelot, port, brport_dev);
 
 	stp_state = br_port_get_stp_state(brport_dev);
 	ocelot_bridge_stp_state_set(ocelot, port, stp_state);
 
-	err = ocelot_port_vlan_filtering(ocelot, port,
-					 br_vlan_enabled(bridge_dev));
-	if (err)
-		return err;
-
 	ageing_time = br_get_ageing_time(bridge_dev);
 	ocelot_port_attr_ageing_set(ocelot, port, ageing_time);
 
-	err = br_mdb_replay(bridge_dev, brport_dev, priv, true,
-			    &ocelot_switchdev_blocking_nb, extack);
-	if (err && err != -EOPNOTSUPP)
-		return err;
-
-	err = br_vlan_replay(bridge_dev, brport_dev, priv, true,
-			     &ocelot_switchdev_blocking_nb, extack);
-	if (err && err != -EOPNOTSUPP)
-		return err;
-
-	return 0;
+	return ocelot_port_vlan_filtering(ocelot, port,
+					  br_vlan_enabled(bridge_dev));
 }
 
 static int ocelot_switchdev_unsync(struct ocelot *ocelot, int port)
@@ -1216,7 +1197,10 @@ static int ocelot_netdevice_bridge_join(struct net_device *dev,
 
 	ocelot_port_bridge_join(ocelot, port, bridge);
 
-	err = switchdev_bridge_port_offload(brport_dev, dev, extack);
+	err = switchdev_bridge_port_offload(brport_dev, dev, priv,
+					    &ocelot_netdevice_nb,
+					    &ocelot_switchdev_blocking_nb,
+					    extack);
 	if (err)
 		goto err_switchdev_offload;
 
@@ -1227,15 +1211,22 @@ static int ocelot_netdevice_bridge_join(struct net_device *dev,
 	return 0;
 
 err_switchdev_sync:
-	switchdev_bridge_port_unoffload(brport_dev);
+	switchdev_bridge_port_unoffload(brport_dev, priv,
+					&ocelot_netdevice_nb,
+					&ocelot_switchdev_blocking_nb);
 err_switchdev_offload:
 	ocelot_port_bridge_leave(ocelot, port, bridge);
 	return err;
 }
 
-static void ocelot_netdevice_pre_bridge_leave(struct net_device *brport_dev)
+static void ocelot_netdevice_pre_bridge_leave(struct net_device *dev,
+					      struct net_device *brport_dev)
 {
-	switchdev_bridge_port_unoffload(brport_dev);
+	struct ocelot_port_private *priv = netdev_priv(dev);
+
+	switchdev_bridge_port_unoffload(brport_dev, priv,
+					&ocelot_netdevice_nb,
+					&ocelot_switchdev_blocking_nb);
 }
 
 static int ocelot_netdevice_bridge_leave(struct net_device *dev,
@@ -1299,7 +1290,7 @@ static void ocelot_netdevice_pre_lag_leave(struct net_device *dev,
 	if (!bridge_dev || !netif_is_bridge_master(bridge_dev))
 		return;
 
-	ocelot_netdevice_pre_bridge_leave(bond);
+	ocelot_netdevice_pre_bridge_leave(dev, bond);
 }
 
 static int ocelot_netdevice_lag_leave(struct net_device *dev,
@@ -1384,7 +1375,7 @@ ocelot_netdevice_prechangeupper(struct net_device *dev,
 				struct netdev_notifier_changeupper_info *info)
 {
 	if (netif_is_bridge_master(info->upper_dev) && !info->linking)
-		ocelot_netdevice_pre_bridge_leave(brport_dev);
+		ocelot_netdevice_pre_bridge_leave(dev, brport_dev);
 
 	if (netif_is_lag_master(info->upper_dev) && !info->linking)
 		ocelot_netdevice_pre_lag_leave(dev, info->upper_dev);
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 84dcaf8687a0..03df6a24d0ba 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -2598,7 +2598,8 @@ static int ofdpa_port_bridge_join(struct ofdpa_port *ofdpa_port,
 	if (err)
 		return err;
 
-	return switchdev_bridge_port_offload(dev, dev, extack);
+	return switchdev_bridge_port_offload(dev, dev, NULL, NULL, NULL,
+					     extack);
 }
 
 static int ofdpa_port_bridge_leave(struct ofdpa_port *ofdpa_port)
@@ -2606,7 +2607,7 @@ static int ofdpa_port_bridge_leave(struct ofdpa_port *ofdpa_port)
 	struct net_device *dev = ofdpa_port->dev;
 	int err;
 
-	switchdev_bridge_port_unoffload(dev);
+	switchdev_bridge_port_unoffload(dev, NULL, NULL, NULL);
 
 	err = ofdpa_port_vlan_del(ofdpa_port, OFDPA_UNTAGGED_VID, 0);
 	if (err)
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index 8b9596eb808e..b285606f963d 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -2096,7 +2096,8 @@ static int am65_cpsw_netdevice_port_link(struct net_device *ndev,
 			return -EOPNOTSUPP;
 	}
 
-	err = switchdev_bridge_port_offload(ndev, ndev, extack);
+	err = switchdev_bridge_port_offload(ndev, ndev, NULL, NULL, NULL,
+					    extack);
 	if (err)
 		return err;
 
@@ -2112,7 +2113,7 @@ static void am65_cpsw_netdevice_port_unlink(struct net_device *ndev)
 	struct am65_cpsw_common *common = am65_ndev_to_common(ndev);
 	struct am65_cpsw_ndev_priv *priv = am65_ndev_to_priv(ndev);
 
-	switchdev_bridge_port_unoffload(ndev);
+	switchdev_bridge_port_unoffload(ndev, NULL, NULL, NULL);
 
 	common->br_members &= ~BIT(priv->port->port_id);
 
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index bf9cadfb11b5..31030f73840d 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -1517,7 +1517,8 @@ static int cpsw_netdevice_port_link(struct net_device *ndev,
 			return -EOPNOTSUPP;
 	}
 
-	err = switchdev_bridge_port_offload(ndev, ndev, extack);
+	err = switchdev_bridge_port_offload(ndev, ndev, NULL, NULL, NULL,
+					    extack);
 	if (err)
 		return err;
 
@@ -1533,7 +1534,7 @@ static void cpsw_netdevice_port_unlink(struct net_device *ndev)
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
 
-	switchdev_bridge_port_unoffload(ndev);
+	switchdev_bridge_port_unoffload(ndev, NULL, NULL, NULL);
 
 	cpsw->br_members &= ~BIT(priv->emac_port);
 
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index ce413eca527e..bbf680093823 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -70,9 +70,6 @@ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto);
 bool br_multicast_has_router_adjacent(struct net_device *dev, int proto);
 bool br_multicast_enabled(const struct net_device *dev);
 bool br_multicast_router(const struct net_device *dev);
-int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
-		  const void *ctx, bool adding, struct notifier_block *nb,
-		  struct netlink_ext_ack *extack);
 #else
 static inline int br_multicast_list_adjacent(struct net_device *dev,
 					     struct list_head *br_ip_list)
@@ -104,13 +101,6 @@ static inline bool br_multicast_router(const struct net_device *dev)
 {
 	return false;
 }
-static inline int br_mdb_replay(const struct net_device *br_dev,
-				const struct net_device *dev, const void *ctx,
-				bool adding, struct notifier_block *nb,
-				struct netlink_ext_ack *extack)
-{
-	return -EOPNOTSUPP;
-}
 #endif
 
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
@@ -120,9 +110,6 @@ int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid);
 int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto);
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo);
-int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
-		   const void *ctx, bool adding, struct notifier_block *nb,
-		   struct netlink_ext_ack *extack);
 #else
 static inline bool br_vlan_enabled(const struct net_device *dev)
 {
@@ -149,14 +136,6 @@ static inline int br_vlan_get_info(const struct net_device *dev, u16 vid,
 {
 	return -EINVAL;
 }
-
-static inline int br_vlan_replay(struct net_device *br_dev,
-				 struct net_device *dev, const void *ctx,
-				 bool adding, struct notifier_block *nb,
-				 struct netlink_ext_ack *extack)
-{
-	return -EOPNOTSUPP;
-}
 #endif
 
 #if IS_ENABLED(CONFIG_BRIDGE)
@@ -167,8 +146,6 @@ void br_fdb_clear_offload(const struct net_device *dev, u16 vid);
 bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag);
 u8 br_port_get_stp_state(const struct net_device *dev);
 clock_t br_get_ageing_time(const struct net_device *br_dev);
-int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev,
-		  const void *ctx, bool adding, struct notifier_block *nb);
 #else
 static inline struct net_device *
 br_fdb_find_port(const struct net_device *br_dev,
@@ -197,32 +174,37 @@ static inline clock_t br_get_ageing_time(const struct net_device *br_dev)
 {
 	return 0;
 }
-
-static inline int br_fdb_replay(const struct net_device *br_dev,
-				const struct net_device *dev, const void *ctx,
-				bool adding, struct notifier_block *nb)
-{
-	return -EOPNOTSUPP;
-}
 #endif
 
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_NET_SWITCHDEV)
 
 int switchdev_bridge_port_offload(struct net_device *brport_dev,
-				  struct net_device *dev,
+				  struct net_device *dev, const void *ctx,
+				  struct notifier_block *atomic_nb,
+				  struct notifier_block *blocking_nb,
 				  struct netlink_ext_ack *extack);
-void switchdev_bridge_port_unoffload(struct net_device *brport_dev);
+void switchdev_bridge_port_unoffload(struct net_device *brport_dev,
+				     const void *ctx,
+				     struct notifier_block *atomic_nb,
+				     struct notifier_block *blocking_nb);
 
 #else
 
-static inline int switchdev_bridge_port_offload(struct net_device *brport_dev,
-						struct net_device *dev,
-						struct netlink_ext_ack *extack)
+static inline int
+switchdev_bridge_port_offload(struct net_device *brport_dev,
+			      struct net_device *dev, const void *ctx,
+			      struct notifier_block *atomic_nb,
+			      struct notifier_block *blocking_nb,
+			      struct netlink_ext_ack *extack)
 {
 	return -EINVAL;
 }
 
-static inline void switchdev_bridge_port_unoffload(struct net_device *brport_dev)
+static inline void
+switchdev_bridge_port_unoffload(struct net_device *brport_dev,
+				const void *ctx,
+				struct notifier_block *atomic_nb,
+				struct notifier_block *blocking_nb)
 {
 }
 #endif
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 47f190b6bfa3..7747442e6572 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -795,7 +795,6 @@ int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev,
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(br_fdb_replay);
 
 static void fdb_notify(struct net_bridge *br,
 		       const struct net_bridge_fdb_entry *fdb, int type,
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index e58f1a4ac962..73a8915b0148 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -689,7 +689,6 @@ out_free_mdb:
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(br_mdb_replay);
 
 static void br_mdb_switchdev_host_port(struct net_device *dev,
 				       struct net_device *lower_dev,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index fbf73a5256d0..2f32d330b648 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -769,6 +769,8 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
 			      bool swdev_notify);
 void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
 			  const unsigned char *addr, u16 vid, bool offloaded);
+int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev,
+		  const void *ctx, bool adding, struct notifier_block *nb);
 
 /* br_forward.c */
 enum br_pkt_type {
@@ -929,6 +931,10 @@ int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
 				      struct netlink_ext_ack *extack);
 bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on);
 
+int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
+		  const void *ctx, bool adding, struct notifier_block *nb,
+		  struct netlink_ext_ack *extack);
+
 static inline bool br_group_is_l2(const struct br_ip *group)
 {
 	return group->proto == 0;
@@ -1307,6 +1313,14 @@ static inline bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan,
 {
 	return false;
 }
+
+static inline int br_mdb_replay(struct net_device *br_dev,
+				struct net_device *dev, const void *ctx,
+				bool adding, struct notifier_block *nb,
+				struct netlink_ext_ack *extack)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 /* br_vlan.c */
@@ -1358,6 +1372,9 @@ void br_vlan_notify(const struct net_bridge *br,
 		    const struct net_bridge_port *p,
 		    u16 vid, u16 vid_range,
 		    int cmd);
+int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
+		   const void *ctx, bool adding, struct notifier_block *nb,
+		   struct netlink_ext_ack *extack);
 bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
 			     const struct net_bridge_vlan *range_end);
 
@@ -1603,6 +1620,14 @@ static inline bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
 {
 	return true;
 }
+
+static inline int br_vlan_replay(struct net_device *br_dev,
+				 struct net_device *dev, const void *ctx,
+				 bool adding, struct notifier_block *nb,
+				 struct netlink_ext_ack *extack)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 /* br_vlan_options.c */
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 39f0787fde01..6bfff28ede23 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -206,11 +206,62 @@ static void nbp_switchdev_del(struct net_bridge_port *p)
 		nbp_switchdev_hwdom_put(p);
 }
 
+static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx,
+				   struct notifier_block *atomic_nb,
+				   struct notifier_block *blocking_nb,
+				   struct netlink_ext_ack *extack)
+{
+	struct net_device *br_dev = p->br->dev;
+	struct net_device *dev = p->dev;
+	int err;
+
+	err = br_vlan_replay(br_dev, dev, ctx, true, blocking_nb, extack);
+	if (err && err != -EOPNOTSUPP)
+		return err;
+
+	err = br_mdb_replay(br_dev, dev, ctx, true, blocking_nb, extack);
+	if (err && err != -EOPNOTSUPP)
+		return err;
+
+	/* Forwarding and termination FDB entries on the port */
+	err = br_fdb_replay(br_dev, dev, ctx, true, atomic_nb);
+	if (err && err != -EOPNOTSUPP)
+		return err;
+
+	/* Termination FDB entries on the bridge itself */
+	err = br_fdb_replay(br_dev, br_dev, ctx, true, atomic_nb);
+	if (err && err != -EOPNOTSUPP)
+		return err;
+
+	return 0;
+}
+
+static void nbp_switchdev_unsync_objs(struct net_bridge_port *p,
+				      const void *ctx,
+				      struct notifier_block *atomic_nb,
+				      struct notifier_block *blocking_nb)
+{
+	struct net_device *br_dev = p->br->dev;
+	struct net_device *dev = p->dev;
+
+	br_vlan_replay(br_dev, dev, ctx, false, blocking_nb, NULL);
+
+	br_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL);
+
+	/* Forwarding and termination FDB entries on the port */
+	br_fdb_replay(br_dev, dev, ctx, false, atomic_nb);
+
+	/* Termination FDB entries on the bridge itself */
+	br_fdb_replay(br_dev, br_dev, ctx, false, atomic_nb);
+}
+
 /* Let the bridge know that this port is offloaded, so that it can assign a
  * switchdev hardware domain to it.
  */
 int switchdev_bridge_port_offload(struct net_device *brport_dev,
-				  struct net_device *dev,
+				  struct net_device *dev, const void *ctx,
+				  struct notifier_block *atomic_nb,
+				  struct notifier_block *blocking_nb,
 				  struct netlink_ext_ack *extack)
 {
 	struct netdev_phys_item_id ppid;
@@ -227,11 +278,27 @@ int switchdev_bridge_port_offload(struct net_device *brport_dev,
 	if (err)
 		return err;
 
-	return nbp_switchdev_add(p, ppid, extack);
+	err = nbp_switchdev_add(p, ppid, extack);
+	if (err)
+		return err;
+
+	err = nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack);
+	if (err)
+		goto out_switchdev_del;
+
+	return 0;
+
+out_switchdev_del:
+	nbp_switchdev_del(p);
+
+	return err;
 }
 EXPORT_SYMBOL_GPL(switchdev_bridge_port_offload);
 
-void switchdev_bridge_port_unoffload(struct net_device *brport_dev)
+void switchdev_bridge_port_unoffload(struct net_device *brport_dev,
+				     const void *ctx,
+				     struct notifier_block *atomic_nb,
+				     struct notifier_block *blocking_nb)
 {
 	struct net_bridge_port *p;
 
@@ -241,6 +308,8 @@ void switchdev_bridge_port_unoffload(struct net_device *brport_dev)
 	if (!p)
 		return;
 
+	nbp_switchdev_unsync_objs(p, ctx, atomic_nb, blocking_nb);
+
 	nbp_switchdev_del(p);
 }
 EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload);
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 45ef07f682f1..382ab992badf 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1905,7 +1905,6 @@ int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(br_vlan_replay);
 
 /* check if v_curr can enter a range ending in range_end */
 bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 4d3ab9e6183a..78c70f5bdab5 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -204,16 +204,14 @@ void dsa_port_disable_rt(struct dsa_port *dp);
 void dsa_port_disable(struct dsa_port *dp);
 int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 			 struct netlink_ext_ack *extack);
-int dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br,
-			      struct netlink_ext_ack *extack);
+void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br);
 void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br);
 int dsa_port_lag_change(struct dsa_port *dp,
 			struct netdev_lag_lower_state_info *linfo);
 int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev,
 		      struct netdev_lag_upper_info *uinfo,
 		      struct netlink_ext_ack *extack);
-int dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev,
-			   struct netlink_ext_ack *extack);
+void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
 void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
 int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
 			    struct netlink_ext_ack *extack);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 7accda066149..d81c283b7358 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -167,8 +167,8 @@ static void dsa_port_clear_brport_flags(struct dsa_port *dp)
 	}
 }
 
-static int dsa_port_switchdev_sync(struct dsa_port *dp,
-				   struct netlink_ext_ack *extack)
+static int dsa_port_switchdev_sync_attrs(struct dsa_port *dp,
+					 struct netlink_ext_ack *extack)
 {
 	struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
 	struct net_device *br = dp->bridge_dev;
@@ -194,59 +194,6 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp,
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	err = br_mdb_replay(br, brport_dev, dp, true,
-			    &dsa_slave_switchdev_blocking_notifier, extack);
-	if (err && err != -EOPNOTSUPP)
-		return err;
-
-	/* Forwarding and termination FDB entries on the port */
-	err = br_fdb_replay(br, brport_dev, dp, true,
-			    &dsa_slave_switchdev_notifier);
-	if (err && err != -EOPNOTSUPP)
-		return err;
-
-	/* Termination FDB entries on the bridge itself */
-	err = br_fdb_replay(br, br, dp, true, &dsa_slave_switchdev_notifier);
-	if (err && err != -EOPNOTSUPP)
-		return err;
-
-	err = br_vlan_replay(br, brport_dev, dp, true,
-			     &dsa_slave_switchdev_blocking_notifier, extack);
-	if (err && err != -EOPNOTSUPP)
-		return err;
-
-	return 0;
-}
-
-static int dsa_port_switchdev_unsync_objs(struct dsa_port *dp,
-					  struct net_device *br,
-					  struct netlink_ext_ack *extack)
-{
-	struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
-	int err;
-
-	/* Delete the switchdev objects left on this port */
-	err = br_mdb_replay(br, brport_dev, dp, false,
-			    &dsa_slave_switchdev_blocking_notifier, extack);
-	if (err && err != -EOPNOTSUPP)
-		return err;
-
-	/* Forwarding and termination FDB entries on the port */
-	err = br_fdb_replay(br, brport_dev, dp, false,
-			    &dsa_slave_switchdev_notifier);
-	if (err && err != -EOPNOTSUPP)
-		return err;
-
-	/* Termination FDB entries on the bridge itself */
-	err = br_fdb_replay(br, br, dp, false, &dsa_slave_switchdev_notifier);
-	if (err && err != -EOPNOTSUPP)
-		return err;
-
-	err = br_vlan_replay(br, brport_dev, dp, false,
-			     &dsa_slave_switchdev_blocking_notifier, extack);
-	if (err && err != -EOPNOTSUPP)
-		return err;
-
 	return 0;
 }
 
@@ -307,18 +254,23 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 	if (err)
 		goto out_rollback;
 
-	err = switchdev_bridge_port_offload(brport_dev, dev, extack);
+	err = switchdev_bridge_port_offload(brport_dev, dev, dp,
+					    &dsa_slave_switchdev_notifier,
+					    &dsa_slave_switchdev_blocking_notifier,
+					    extack);
 	if (err)
 		goto out_rollback_unbridge;
 
-	err = dsa_port_switchdev_sync(dp, extack);
+	err = dsa_port_switchdev_sync_attrs(dp, extack);
 	if (err)
 		goto out_rollback_unoffload;
 
 	return 0;
 
 out_rollback_unoffload:
-	switchdev_bridge_port_unoffload(brport_dev);
+	switchdev_bridge_port_unoffload(brport_dev, dp,
+					&dsa_slave_switchdev_notifier,
+					&dsa_slave_switchdev_blocking_notifier);
 out_rollback_unbridge:
 	dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info);
 out_rollback:
@@ -326,14 +278,13 @@ out_rollback:
 	return err;
 }
 
-int dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br,
-			      struct netlink_ext_ack *extack)
+void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br)
 {
 	struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
 
-	switchdev_bridge_port_unoffload(brport_dev);
-
-	return dsa_port_switchdev_unsync_objs(dp, br, extack);
+	switchdev_bridge_port_unoffload(brport_dev, dp,
+					&dsa_slave_switchdev_notifier,
+					&dsa_slave_switchdev_blocking_notifier);
 }
 
 void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br)
@@ -423,13 +374,10 @@ err_lag_join:
 	return err;
 }
 
-int dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag,
-			   struct netlink_ext_ack *extack)
+void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag)
 {
 	if (dp->bridge_dev)
-		return dsa_port_pre_bridge_leave(dp, dp->bridge_dev, extack);
-
-	return 0;
+		dsa_port_pre_bridge_leave(dp, dp->bridge_dev);
 }
 
 void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag)
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 22ce11cd770e..8105f642572b 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2052,20 +2052,16 @@ static int dsa_slave_prechangeupper(struct net_device *dev,
 				    struct netdev_notifier_changeupper_info *info)
 {
 	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct netlink_ext_ack *extack;
-	int err = 0;
-
-	extack = netdev_notifier_info_to_extack(&info->info);
 
 	if (netif_is_bridge_master(info->upper_dev) && !info->linking)
-		err = dsa_port_pre_bridge_leave(dp, info->upper_dev, extack);
+		dsa_port_pre_bridge_leave(dp, info->upper_dev);
 	else if (netif_is_lag_master(info->upper_dev) && !info->linking)
-		err = dsa_port_pre_lag_leave(dp, info->upper_dev, extack);
+		dsa_port_pre_lag_leave(dp, info->upper_dev);
 	/* dsa_port_pre_hsr_leave is not yet necessary since hsr cannot be
 	 * meaningfully enslaved to a bridge yet
 	 */
 
-	return notifier_from_errno(err);
+	return NOTIFY_DONE;
 }
 
 static int
-- 
cgit v1.2.3


From 1a33b18b3bd9748c9c712a23e788bf1f1c4a7025 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 22 Jul 2021 16:28:58 +0200
Subject: compat: make linux/compat.h available everywhere

Parts of linux/compat.h are under an #ifdef, but we end up
using more of those over time, moving things around bit by
bit.

To get it over with once and for all, make all of this file
uncondititonal now so it can be accessed everywhere. There
are only a few types left that are in asm/compat.h but not
yet in the asm-generic version, so add those in the process.

This requires providing a few more types in asm-generic/compat.h
that were not already there. The only tricky one is
compat_sigset_t, which needs a little help on 32-bit architectures
and for x86.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm64/include/asm/compat.h   | 14 +++-----------
 arch/mips/include/asm/compat.h    | 24 +++++++++++-------------
 arch/parisc/include/asm/compat.h  | 14 +++-----------
 arch/powerpc/include/asm/compat.h | 11 -----------
 arch/s390/include/asm/compat.h    | 14 +++-----------
 arch/sparc/include/asm/compat.h   | 14 +++-----------
 arch/x86/include/asm/compat.h     | 14 +++-----------
 arch/x86/include/asm/signal.h     |  1 +
 include/asm-generic/compat.h      | 17 +++++++++++++++++
 include/linux/compat.h            | 32 +++++++++++++++-----------------
 10 files changed, 59 insertions(+), 96 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index 23a9fb73c04f..79c1a750e357 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -5,6 +5,9 @@
 #ifndef __ASM_COMPAT_H
 #define __ASM_COMPAT_H
 
+#define compat_mode_t compat_mode_t
+typedef u16		compat_mode_t;
+
 #include <asm-generic/compat.h>
 
 #ifdef CONFIG_COMPAT
@@ -27,13 +30,9 @@ typedef u16		__compat_uid_t;
 typedef u16		__compat_gid_t;
 typedef u16		__compat_uid16_t;
 typedef u16		__compat_gid16_t;
-typedef u32		__compat_uid32_t;
-typedef u32		__compat_gid32_t;
-typedef u16		compat_mode_t;
 typedef u32		compat_dev_t;
 typedef s32		compat_nlink_t;
 typedef u16		compat_ipc_pid_t;
-typedef u32		compat_caddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
 
 struct compat_stat {
@@ -103,13 +102,6 @@ struct compat_statfs {
 
 #define COMPAT_RLIM_INFINITY		0xffffffff
 
-typedef u32		compat_old_sigset_t;
-
-#define _COMPAT_NSIG		64
-#define _COMPAT_NSIG_BPW	32
-
-typedef u32		compat_sigset_word;
-
 #define COMPAT_OFF_T_MAX	0x7fffffff
 
 #define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current)))
diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h
index 65975712a22d..53f015a1b0a7 100644
--- a/arch/mips/include/asm/compat.h
+++ b/arch/mips/include/asm/compat.h
@@ -9,20 +9,25 @@
 #include <asm/page.h>
 #include <asm/ptrace.h>
 
+typedef s32		__compat_uid_t;
+typedef s32		__compat_gid_t;
+typedef __compat_uid_t	__compat_uid32_t;
+typedef __compat_gid_t	__compat_gid32_t;
+#define __compat_uid32_t __compat_uid32_t
+#define __compat_gid32_t __compat_gid32_t
+
+#define _COMPAT_NSIG		128		/* Don't ask !$@#% ...	*/
+#define _COMPAT_NSIG_BPW	32
+typedef u32		compat_sigset_word;
+
 #include <asm-generic/compat.h>
 
 #define COMPAT_USER_HZ		100
 #define COMPAT_UTS_MACHINE	"mips\0\0\0"
 
-typedef s32		__compat_uid_t;
-typedef s32		__compat_gid_t;
-typedef __compat_uid_t	__compat_uid32_t;
-typedef __compat_gid_t	__compat_gid32_t;
-typedef u32		compat_mode_t;
 typedef u32		compat_dev_t;
 typedef u32		compat_nlink_t;
 typedef s32		compat_ipc_pid_t;
-typedef s32		compat_caddr_t;
 typedef struct {
 	s32	val[2];
 } compat_fsid_t;
@@ -89,13 +94,6 @@ struct compat_statfs {
 
 #define COMPAT_RLIM_INFINITY	0x7fffffffUL
 
-typedef u32		compat_old_sigset_t;	/* at least 32 bits */
-
-#define _COMPAT_NSIG		128		/* Don't ask !$@#% ...	*/
-#define _COMPAT_NSIG_BPW	32
-
-typedef u32		compat_sigset_word;
-
 #define COMPAT_OFF_T_MAX	0x7fffffff
 
 static inline void __user *arch_compat_alloc_user_space(long len)
diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h
index 1a609d38f667..b5d90e82b65d 100644
--- a/arch/parisc/include/asm/compat.h
+++ b/arch/parisc/include/asm/compat.h
@@ -8,6 +8,9 @@
 #include <linux/sched.h>
 #include <linux/thread_info.h>
 
+#define compat_mode_t compat_mode_t
+typedef u16	compat_mode_t;
+
 #include <asm-generic/compat.h>
 
 #define COMPAT_USER_HZ 		100
@@ -15,13 +18,9 @@
 
 typedef u32	__compat_uid_t;
 typedef u32	__compat_gid_t;
-typedef u32	__compat_uid32_t;
-typedef u32	__compat_gid32_t;
-typedef u16	compat_mode_t;
 typedef u32	compat_dev_t;
 typedef u16	compat_nlink_t;
 typedef u16	compat_ipc_pid_t;
-typedef u32	compat_caddr_t;
 
 struct compat_stat {
 	compat_dev_t		st_dev;	/* dev_t is 32 bits on parisc */
@@ -96,13 +95,6 @@ struct compat_sigcontext {
 
 #define COMPAT_RLIM_INFINITY 0xffffffff
 
-typedef u32		compat_old_sigset_t;	/* at least 32 bits */
-
-#define _COMPAT_NSIG		64
-#define _COMPAT_NSIG_BPW	32
-
-typedef u32		compat_sigset_word;
-
 #define COMPAT_OFF_T_MAX	0x7fffffff
 
 struct compat_ipc64_perm {
diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h
index 9191fc29e6ed..e33dcf134cdd 100644
--- a/arch/powerpc/include/asm/compat.h
+++ b/arch/powerpc/include/asm/compat.h
@@ -19,13 +19,9 @@
 
 typedef u32		__compat_uid_t;
 typedef u32		__compat_gid_t;
-typedef u32		__compat_uid32_t;
-typedef u32		__compat_gid32_t;
-typedef u32		compat_mode_t;
 typedef u32		compat_dev_t;
 typedef s16		compat_nlink_t;
 typedef u16		compat_ipc_pid_t;
-typedef u32		compat_caddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
 
 struct compat_stat {
@@ -85,13 +81,6 @@ struct compat_statfs {
 
 #define COMPAT_RLIM_INFINITY		0xffffffff
 
-typedef u32		compat_old_sigset_t;
-
-#define _COMPAT_NSIG		64
-#define _COMPAT_NSIG_BPW	32
-
-typedef u32		compat_sigset_word;
-
 #define COMPAT_OFF_T_MAX	0x7fffffff
 
 static inline void __user *arch_compat_alloc_user_space(long len)
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h
index ea5b9c34b7be..8d49505b4a43 100644
--- a/arch/s390/include/asm/compat.h
+++ b/arch/s390/include/asm/compat.h
@@ -9,6 +9,9 @@
 #include <linux/sched/task_stack.h>
 #include <linux/thread_info.h>
 
+#define compat_mode_t	compat_mode_t
+typedef u16		compat_mode_t;
+
 #include <asm-generic/compat.h>
 
 #define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \
@@ -55,13 +58,9 @@
 
 typedef u16		__compat_uid_t;
 typedef u16		__compat_gid_t;
-typedef u32		__compat_uid32_t;
-typedef u32		__compat_gid32_t;
-typedef u16		compat_mode_t;
 typedef u16		compat_dev_t;
 typedef u16		compat_nlink_t;
 typedef u16		compat_ipc_pid_t;
-typedef u32		compat_caddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
 
 typedef struct {
@@ -155,13 +154,6 @@ struct compat_statfs64 {
 
 #define COMPAT_RLIM_INFINITY		0xffffffff
 
-typedef u32		compat_old_sigset_t;	/* at least 32 bits */
-
-#define _COMPAT_NSIG		64
-#define _COMPAT_NSIG_BPW	32
-
-typedef u32		compat_sigset_word;
-
 #define COMPAT_OFF_T_MAX	0x7fffffff
 
 /*
diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h
index b85842cda99f..8b63410e830f 100644
--- a/arch/sparc/include/asm/compat.h
+++ b/arch/sparc/include/asm/compat.h
@@ -6,6 +6,9 @@
  */
 #include <linux/types.h>
 
+#define compat_mode_t	compat_mode_t
+typedef u16		compat_mode_t;
+
 #include <asm-generic/compat.h>
 
 #define COMPAT_USER_HZ		100
@@ -13,13 +16,9 @@
 
 typedef u16		__compat_uid_t;
 typedef u16		__compat_gid_t;
-typedef u32		__compat_uid32_t;
-typedef u32		__compat_gid32_t;
-typedef u16		compat_mode_t;
 typedef u16		compat_dev_t;
 typedef s16		compat_nlink_t;
 typedef u16		compat_ipc_pid_t;
-typedef u32		compat_caddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
 
 struct compat_stat {
@@ -115,13 +114,6 @@ struct compat_statfs {
 
 #define COMPAT_RLIM_INFINITY 0x7fffffff
 
-typedef u32		compat_old_sigset_t;
-
-#define _COMPAT_NSIG		64
-#define _COMPAT_NSIG_BPW	32
-
-typedef u32		compat_sigset_word;
-
 #define COMPAT_OFF_T_MAX	0x7fffffff
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index be09c7eac89f..4ae01cdb99de 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -12,6 +12,9 @@
 #include <asm/user32.h>
 #include <asm/unistd.h>
 
+#define compat_mode_t	compat_mode_t
+typedef u16		compat_mode_t;
+
 #include <asm-generic/compat.h>
 
 #define COMPAT_USER_HZ		100
@@ -19,13 +22,9 @@
 
 typedef u16		__compat_uid_t;
 typedef u16		__compat_gid_t;
-typedef u32		__compat_uid32_t;
-typedef u32		__compat_gid32_t;
-typedef u16		compat_mode_t;
 typedef u16		compat_dev_t;
 typedef u16		compat_nlink_t;
 typedef u16		compat_ipc_pid_t;
-typedef u32		compat_caddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
 
 struct compat_stat {
@@ -92,13 +91,6 @@ struct compat_statfs {
 
 #define COMPAT_RLIM_INFINITY		0xffffffff
 
-typedef u32		compat_old_sigset_t;	/* at least 32 bits */
-
-#define _COMPAT_NSIG		64
-#define _COMPAT_NSIG_BPW	32
-
-typedef u32               compat_sigset_word;
-
 #define COMPAT_OFF_T_MAX	0x7fffffff
 
 struct compat_ipc64_perm {
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 6fd8410a3910..2dfb5fea13af 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -29,6 +29,7 @@ typedef struct {
 #define SA_X32_ABI	0x01000000u
 
 #ifndef CONFIG_COMPAT
+#define compat_sigset_t compat_sigset_t
 typedef sigset_t compat_sigset_t;
 #endif
 
diff --git a/include/asm-generic/compat.h b/include/asm-generic/compat.h
index 30f7b18a36f9..d46c0201cc34 100644
--- a/include/asm-generic/compat.h
+++ b/include/asm-generic/compat.h
@@ -20,7 +20,18 @@ typedef u16 compat_ushort_t;
 typedef u32 compat_uint_t;
 typedef u32 compat_ulong_t;
 typedef u32 compat_uptr_t;
+typedef u32 compat_caddr_t;
 typedef u32 compat_aio_context_t;
+typedef u32 compat_old_sigset_t;
+
+#ifndef __compat_uid32_t
+typedef u32 __compat_uid32_t;
+typedef u32 __compat_gid32_t;
+#endif
+
+#ifndef compat_mode_t
+typedef u32 compat_mode_t;
+#endif
 
 #ifdef CONFIG_COMPAT_FOR_U64_ALIGNMENT
 typedef s64 __attribute__((aligned(4))) compat_s64;
@@ -30,4 +41,10 @@ typedef s64 compat_s64;
 typedef u64 compat_u64;
 #endif
 
+#ifndef _COMPAT_NSIG
+typedef u32 compat_sigset_word;
+#define _COMPAT_NSIG _NSIG
+#define _COMPAT_NSIG_BPW 32
+#endif
+
 #endif
diff --git a/include/linux/compat.h b/include/linux/compat.h
index c270124e4402..8e0598c7d1d1 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -20,11 +20,8 @@
 #include <linux/unistd.h>
 
 #include <asm/compat.h>
-
-#ifdef CONFIG_COMPAT
 #include <asm/siginfo.h>
 #include <asm/signal.h>
-#endif
 
 #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
 /*
@@ -95,8 +92,6 @@ struct compat_iovec {
 	compat_size_t	iov_len;
 };
 
-#ifdef CONFIG_COMPAT
-
 #ifndef compat_user_stack_pointer
 #define compat_user_stack_pointer() current_user_stack_pointer()
 #endif
@@ -131,9 +126,11 @@ struct compat_tms {
 
 #define _COMPAT_NSIG_WORDS	(_COMPAT_NSIG / _COMPAT_NSIG_BPW)
 
+#ifndef compat_sigset_t
 typedef struct {
 	compat_sigset_word	sig[_COMPAT_NSIG_WORDS];
 } compat_sigset_t;
+#endif
 
 int set_compat_user_sigmask(const compat_sigset_t __user *umask,
 			    size_t sigsetsize);
@@ -384,6 +381,7 @@ struct compat_keyctl_kdf_params {
 	__u32 __spare[8];
 };
 
+struct compat_stat;
 struct compat_statfs;
 struct compat_statfs64;
 struct compat_old_linux_dirent;
@@ -428,7 +426,7 @@ put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
 		  unsigned int size)
 {
 	/* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
-#ifdef __BIG_ENDIAN
+#if defined(__BIG_ENDIAN) && defined(CONFIG_64BIT)
 	compat_sigset_t v;
 	switch (_NSIG_WORDS) {
 	case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
@@ -929,17 +927,6 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args);
 
 #endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */
 
-
-/*
- * For most but not all architectures, "am I in a compat syscall?" and
- * "am I a compat task?" are the same question.  For architectures on which
- * they aren't the same question, arch code can override in_compat_syscall.
- */
-
-#ifndef in_compat_syscall
-static inline bool in_compat_syscall(void) { return is_compat_task(); }
-#endif
-
 /**
  * ns_to_old_timeval32 - Compat version of ns_to_timeval
  * @nsec:	the nanoseconds value to be converted
@@ -969,6 +956,17 @@ int kcompat_sys_statfs64(const char __user * pathname, compat_size_t sz,
 int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz,
 			  struct compat_statfs64 __user * buf);
 
+#ifdef CONFIG_COMPAT
+
+/*
+ * For most but not all architectures, "am I in a compat syscall?" and
+ * "am I a compat task?" are the same question.  For architectures on which
+ * they aren't the same question, arch code can override in_compat_syscall.
+ */
+#ifndef in_compat_syscall
+static inline bool in_compat_syscall(void) { return is_compat_task(); }
+#endif
+
 #else /* !CONFIG_COMPAT */
 
 #define is_compat_task() (0)
-- 
cgit v1.2.3


From dd98d2895de6485c884a9cb42de69fed02826fa4 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 22 Jul 2021 16:28:59 +0200
Subject: ethtool: improve compat ioctl handling

The ethtool compat ioctl handling is hidden away in net/socket.c,
which introduces a couple of minor oddities:

- The implementation may end up diverging, as seen in the RXNFC
  extension in commit 84a1d9c48200 ("net: ethtool: extend RXNFC
  API to support RSS spreading of filter matches") that does not work
  in compat mode.

- Most architectures do not need the compat handling at all
  because u64 and compat_u64 have the same alignment.

- On x86, the conversion is done for both x32 and i386 user space,
  but it's actually wrong to do it for x32 and cannot work there.

- On 32-bit Arm, it never worked for compat oabi user space, since
  that needs to do the same conversion but does not.

- It would be nice to get rid of both compat_alloc_user_space()
  and copy_in_user() throughout the kernel.

None of these actually seems to be a serious problem that real
users are likely to encounter, but fixing all of them actually
leads to code that is both shorter and more readable.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |   4 --
 net/ethtool/ioctl.c     | 136 ++++++++++++++++++++++++++++++++++++++++++------
 net/socket.c            | 125 +-------------------------------------------
 3 files changed, 121 insertions(+), 144 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 232daaec56e4..4711b96dae0c 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -17,8 +17,6 @@
 #include <linux/compat.h>
 #include <uapi/linux/ethtool.h>
 
-#ifdef CONFIG_COMPAT
-
 struct compat_ethtool_rx_flow_spec {
 	u32		flow_type;
 	union ethtool_flow_union h_u;
@@ -38,8 +36,6 @@ struct compat_ethtool_rxnfc {
 	u32				rule_locs[];
 };
 
-#endif /* CONFIG_COMPAT */
-
 #include <linux/rculist.h>
 
 /**
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index baa5d10043cb..6134b180f59f 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -7,6 +7,7 @@
  * the information ethtool needs.
  */
 
+#include <linux/compat.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/capability.h>
@@ -807,6 +808,120 @@ out:
 	return ret;
 }
 
+static noinline_for_stack int
+ethtool_rxnfc_copy_from_compat(struct ethtool_rxnfc *rxnfc,
+			       const struct compat_ethtool_rxnfc __user *useraddr,
+			       size_t size)
+{
+	struct compat_ethtool_rxnfc crxnfc = {};
+
+	/* We expect there to be holes between fs.m_ext and
+	 * fs.ring_cookie and at the end of fs, but nowhere else.
+	 * On non-x86, no conversion should be needed.
+	 */
+	BUILD_BUG_ON(!IS_ENABLED(CONFIG_X86_64) &&
+		     sizeof(struct compat_ethtool_rxnfc) !=
+		     sizeof(struct ethtool_rxnfc));
+	BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
+		     sizeof(useraddr->fs.m_ext) !=
+		     offsetof(struct ethtool_rxnfc, fs.m_ext) +
+		     sizeof(rxnfc->fs.m_ext));
+	BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.location) -
+		     offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
+		     offsetof(struct ethtool_rxnfc, fs.location) -
+		     offsetof(struct ethtool_rxnfc, fs.ring_cookie));
+
+	if (copy_from_user(&crxnfc, useraddr, min(size, sizeof(crxnfc))))
+		return -EFAULT;
+
+	*rxnfc = (struct ethtool_rxnfc) {
+		.cmd		= crxnfc.cmd,
+		.flow_type	= crxnfc.flow_type,
+		.data		= crxnfc.data,
+		.fs		= {
+			.flow_type	= crxnfc.fs.flow_type,
+			.h_u		= crxnfc.fs.h_u,
+			.h_ext		= crxnfc.fs.h_ext,
+			.m_u		= crxnfc.fs.m_u,
+			.m_ext		= crxnfc.fs.m_ext,
+			.ring_cookie	= crxnfc.fs.ring_cookie,
+			.location	= crxnfc.fs.location,
+		},
+		.rule_cnt	= crxnfc.rule_cnt,
+	};
+
+	return 0;
+}
+
+static int ethtool_rxnfc_copy_from_user(struct ethtool_rxnfc *rxnfc,
+					const void __user *useraddr,
+					size_t size)
+{
+	if (compat_need_64bit_alignment_fixup())
+		return ethtool_rxnfc_copy_from_compat(rxnfc, useraddr, size);
+
+	if (copy_from_user(rxnfc, useraddr, size))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ethtool_rxnfc_copy_to_compat(void __user *useraddr,
+					const struct ethtool_rxnfc *rxnfc,
+					size_t size, const u32 *rule_buf)
+{
+	struct compat_ethtool_rxnfc crxnfc;
+
+	memset(&crxnfc, 0, sizeof(crxnfc));
+	crxnfc = (struct compat_ethtool_rxnfc) {
+		.cmd		= rxnfc->cmd,
+		.flow_type	= rxnfc->flow_type,
+		.data		= rxnfc->data,
+		.fs		= {
+			.flow_type	= rxnfc->fs.flow_type,
+			.h_u		= rxnfc->fs.h_u,
+			.h_ext		= rxnfc->fs.h_ext,
+			.m_u		= rxnfc->fs.m_u,
+			.m_ext		= rxnfc->fs.m_ext,
+			.ring_cookie	= rxnfc->fs.ring_cookie,
+			.location	= rxnfc->fs.location,
+		},
+		.rule_cnt	= rxnfc->rule_cnt,
+	};
+
+	if (copy_to_user(useraddr, &crxnfc, min(size, sizeof(crxnfc))))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ethtool_rxnfc_copy_to_user(void __user *useraddr,
+				      const struct ethtool_rxnfc *rxnfc,
+				      size_t size, const u32 *rule_buf)
+{
+	int ret;
+
+	if (compat_need_64bit_alignment_fixup()) {
+		ret = ethtool_rxnfc_copy_to_compat(useraddr, rxnfc, size,
+						   rule_buf);
+		useraddr += offsetof(struct compat_ethtool_rxnfc, rule_locs);
+	} else {
+		ret = copy_to_user(useraddr, &rxnfc, size);
+		useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
+	}
+
+	if (ret)
+		return -EFAULT;
+
+	if (rule_buf) {
+		if (copy_to_user(useraddr, rule_buf,
+				 rxnfc->rule_cnt * sizeof(u32)))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
 static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
 						u32 cmd, void __user *useraddr)
 {
@@ -825,7 +940,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
 		info_size = (offsetof(struct ethtool_rxnfc, data) +
 			     sizeof(info.data));
 
-	if (copy_from_user(&info, useraddr, info_size))
+	if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size))
 		return -EFAULT;
 
 	rc = dev->ethtool_ops->set_rxnfc(dev, &info);
@@ -833,7 +948,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
 		return rc;
 
 	if (cmd == ETHTOOL_SRXCLSRLINS &&
-	    copy_to_user(useraddr, &info, info_size))
+	    ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL))
 		return -EFAULT;
 
 	return 0;
@@ -859,7 +974,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
 		info_size = (offsetof(struct ethtool_rxnfc, data) +
 			     sizeof(info.data));
 
-	if (copy_from_user(&info, useraddr, info_size))
+	if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size))
 		return -EFAULT;
 
 	/* If FLOW_RSS was requested then user-space must be using the
@@ -867,7 +982,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
 	 */
 	if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) {
 		info_size = sizeof(info);
-		if (copy_from_user(&info, useraddr, info_size))
+		if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size))
 			return -EFAULT;
 		/* Since malicious users may modify the original data,
 		 * we need to check whether FLOW_RSS is still requested.
@@ -893,18 +1008,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
 	if (ret < 0)
 		goto err_out;
 
-	ret = -EFAULT;
-	if (copy_to_user(useraddr, &info, info_size))
-		goto err_out;
-
-	if (rule_buf) {
-		useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
-		if (copy_to_user(useraddr, rule_buf,
-				 info.rule_cnt * sizeof(u32)))
-			goto err_out;
-	}
-	ret = 0;
-
+	ret = ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf);
 err_out:
 	kfree(rule_buf);
 
diff --git a/net/socket.c b/net/socket.c
index 0b2dad3bdf7f..ec63cf6de33e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -3152,128 +3152,6 @@ static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc3
 	return 0;
 }
 
-static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
-{
-	struct compat_ethtool_rxnfc __user *compat_rxnfc;
-	bool convert_in = false, convert_out = false;
-	size_t buf_size = 0;
-	struct ethtool_rxnfc __user *rxnfc = NULL;
-	struct ifreq ifr;
-	u32 rule_cnt = 0, actual_rule_cnt;
-	u32 ethcmd;
-	u32 data;
-	int ret;
-
-	if (get_user(data, &ifr32->ifr_ifru.ifru_data))
-		return -EFAULT;
-
-	compat_rxnfc = compat_ptr(data);
-
-	if (get_user(ethcmd, &compat_rxnfc->cmd))
-		return -EFAULT;
-
-	/* Most ethtool structures are defined without padding.
-	 * Unfortunately struct ethtool_rxnfc is an exception.
-	 */
-	switch (ethcmd) {
-	default:
-		break;
-	case ETHTOOL_GRXCLSRLALL:
-		/* Buffer size is variable */
-		if (get_user(rule_cnt, &compat_rxnfc->rule_cnt))
-			return -EFAULT;
-		if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32))
-			return -ENOMEM;
-		buf_size += rule_cnt * sizeof(u32);
-		fallthrough;
-	case ETHTOOL_GRXRINGS:
-	case ETHTOOL_GRXCLSRLCNT:
-	case ETHTOOL_GRXCLSRULE:
-	case ETHTOOL_SRXCLSRLINS:
-		convert_out = true;
-		fallthrough;
-	case ETHTOOL_SRXCLSRLDEL:
-		buf_size += sizeof(struct ethtool_rxnfc);
-		convert_in = true;
-		rxnfc = compat_alloc_user_space(buf_size);
-		break;
-	}
-
-	if (copy_from_user(&ifr.ifr_name, &ifr32->ifr_name, IFNAMSIZ))
-		return -EFAULT;
-
-	ifr.ifr_data = convert_in ? rxnfc : (void __user *)compat_rxnfc;
-
-	if (convert_in) {
-		/* We expect there to be holes between fs.m_ext and
-		 * fs.ring_cookie and at the end of fs, but nowhere else.
-		 */
-		BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
-			     sizeof(compat_rxnfc->fs.m_ext) !=
-			     offsetof(struct ethtool_rxnfc, fs.m_ext) +
-			     sizeof(rxnfc->fs.m_ext));
-		BUILD_BUG_ON(
-			offsetof(struct compat_ethtool_rxnfc, fs.location) -
-			offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
-			offsetof(struct ethtool_rxnfc, fs.location) -
-			offsetof(struct ethtool_rxnfc, fs.ring_cookie));
-
-		if (copy_in_user(rxnfc, compat_rxnfc,
-				 (void __user *)(&rxnfc->fs.m_ext + 1) -
-				 (void __user *)rxnfc) ||
-		    copy_in_user(&rxnfc->fs.ring_cookie,
-				 &compat_rxnfc->fs.ring_cookie,
-				 (void __user *)(&rxnfc->fs.location + 1) -
-				 (void __user *)&rxnfc->fs.ring_cookie))
-			return -EFAULT;
-		if (ethcmd == ETHTOOL_GRXCLSRLALL) {
-			if (put_user(rule_cnt, &rxnfc->rule_cnt))
-				return -EFAULT;
-		} else if (copy_in_user(&rxnfc->rule_cnt,
-					&compat_rxnfc->rule_cnt,
-					sizeof(rxnfc->rule_cnt)))
-			return -EFAULT;
-	}
-
-	ret = dev_ioctl(net, SIOCETHTOOL, &ifr, NULL);
-	if (ret)
-		return ret;
-
-	if (convert_out) {
-		if (copy_in_user(compat_rxnfc, rxnfc,
-				 (const void __user *)(&rxnfc->fs.m_ext + 1) -
-				 (const void __user *)rxnfc) ||
-		    copy_in_user(&compat_rxnfc->fs.ring_cookie,
-				 &rxnfc->fs.ring_cookie,
-				 (const void __user *)(&rxnfc->fs.location + 1) -
-				 (const void __user *)&rxnfc->fs.ring_cookie) ||
-		    copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt,
-				 sizeof(rxnfc->rule_cnt)))
-			return -EFAULT;
-
-		if (ethcmd == ETHTOOL_GRXCLSRLALL) {
-			/* As an optimisation, we only copy the actual
-			 * number of rules that the underlying
-			 * function returned.  Since Mallory might
-			 * change the rule count in user memory, we
-			 * check that it is less than the rule count
-			 * originally given (as the user buffer size),
-			 * which has been range-checked.
-			 */
-			if (get_user(actual_rule_cnt, &rxnfc->rule_cnt))
-				return -EFAULT;
-			if (actual_rule_cnt < rule_cnt)
-				rule_cnt = actual_rule_cnt;
-			if (copy_in_user(&compat_rxnfc->rule_locs[0],
-					 &rxnfc->rule_locs[0],
-					 rule_cnt * sizeof(u32)))
-				return -EFAULT;
-		}
-	}
-
-	return 0;
-}
-
 static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
 {
 	compat_uptr_t uptr32;
@@ -3428,8 +3306,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 		return old_bridge_ioctl(argp);
 	case SIOCGIFCONF:
 		return compat_dev_ifconf(net, argp);
-	case SIOCETHTOOL:
-		return ethtool_ioctl(net, argp);
 	case SIOCWANDEV:
 		return compat_siocwandev(net, argp);
 	case SIOCGIFMAP:
@@ -3442,6 +3318,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 		return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD,
 					    !COMPAT_USE_64BIT_TIME);
 
+	case SIOCETHTOOL:
 	case SIOCBONDSLAVEINFOQUERY:
 	case SIOCBONDINFOQUERY:
 	case SIOCSHWTSTAMP:
-- 
cgit v1.2.3


From b0e99d03778b2418aec20db99d97d19d25d198b6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 22 Jul 2021 16:29:01 +0200
Subject: net: socket: remove register_gifconf

Since dynamic registration of the gifconf() helper is only used for
IPv4, and this can not be in a loadable module, this can be simplified
noticeably by turning it into a direct function call as a preparation
for cleaning up the compat handling.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h |  9 +++++++++
 include/linux/netdevice.h  |  8 --------
 net/core/dev_ioctl.c       | 43 ++++++++++---------------------------------
 net/ipv4/devinet.c         |  4 +---
 4 files changed, 20 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 53aa0343bf69..67e042932681 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -178,6 +178,15 @@ static inline struct net_device *ip_dev_find(struct net *net, __be32 addr)
 
 int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b);
 int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *);
+#ifdef CONFIG_INET
+int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size);
+#else
+static inline int inet_gifconf(struct net_device *dev, char __user *buf,
+			       int len, int size)
+{
+	return 0;
+}
+#endif
 void devinet_init(void);
 struct in_device *inetdev_by_index(struct net *, int);
 __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 42f6f866d5f3..6630a9f0b0f0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3289,14 +3289,6 @@ static inline bool dev_has_header(const struct net_device *dev)
 	return dev->header_ops && dev->header_ops->create;
 }
 
-typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr,
-			   int len, int size);
-int register_gifconf(unsigned int family, gifconf_func_t *gifconf);
-static inline int unregister_gifconf(unsigned int family)
-{
-	return register_gifconf(family, NULL);
-}
-
 #ifdef CONFIG_NET_FLOW_LIMIT
 #define FLOW_LIMIT_HISTORY	(1 << 7)  /* must be ^2 and !overflow buckets */
 struct sd_flow_limit {
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 62f45da7ecfe..c22c3dc15ce9 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/kmod.h>
 #include <linux/netdevice.h>
+#include <linux/inetdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/net_tstamp.h>
@@ -25,26 +26,6 @@ static int dev_ifname(struct net *net, struct ifreq *ifr)
 	return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex);
 }
 
-static gifconf_func_t *gifconf_list[NPROTO];
-
-/**
- *	register_gifconf	-	register a SIOCGIF handler
- *	@family: Address family
- *	@gifconf: Function handler
- *
- *	Register protocol dependent address dumping routines. The handler
- *	that is passed must not be freed or reused until it has been replaced
- *	by another handler.
- */
-int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
-{
-	if (family >= NPROTO)
-		return -EINVAL;
-	gifconf_list[family] = gifconf;
-	return 0;
-}
-EXPORT_SYMBOL(register_gifconf);
-
 /*
  *	Perform a SIOCGIFCONF call. This structure will change
  *	size eventually, and there is nothing I can do about it.
@@ -72,19 +53,15 @@ int dev_ifconf(struct net *net, struct ifconf *ifc, int size)
 
 	total = 0;
 	for_each_netdev(net, dev) {
-		for (i = 0; i < NPROTO; i++) {
-			if (gifconf_list[i]) {
-				int done;
-				if (!pos)
-					done = gifconf_list[i](dev, NULL, 0, size);
-				else
-					done = gifconf_list[i](dev, pos + total,
-							       len - total, size);
-				if (done < 0)
-					return -EFAULT;
-				total += done;
-			}
-		}
+		int done;
+		if (!pos)
+			done = inet_gifconf(dev, NULL, 0, size);
+		else
+			done = inet_gifconf(dev, pos + total,
+					    len - total, size);
+		if (done < 0)
+			return -EFAULT;
+		total += done;
 	}
 
 	/*
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 94b648d9eaff..c82aded8da7d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1243,7 +1243,7 @@ out:
 	return ret;
 }
 
-static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
+int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
 {
 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
 	const struct in_ifaddr *ifa;
@@ -2766,8 +2766,6 @@ void __init devinet_init(void)
 		INIT_HLIST_HEAD(&inet_addr_lst[i]);
 
 	register_pernet_subsys(&devinet_ops);
-
-	register_gifconf(PF_INET, inet_gifconf);
 	register_netdevice_notifier(&ip_netdev_notifier);
 
 	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
-- 
cgit v1.2.3


From 876f0bf9d0d5189dca9341c8e8e8686b09db8398 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 22 Jul 2021 16:29:02 +0200
Subject: net: socket: simplify dev_ifconf handling

The dev_ifconf() calling conventions make compat handling
more complicated than necessary, simplify this by moving
the in_compat_syscall() check into the function.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 +-
 net/core/dev_ioctl.c      | 55 ++++++++++++++++++++++---------------------
 net/socket.c              | 59 +++++++++++------------------------------------
 3 files changed, 44 insertions(+), 72 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6630a9f0b0f0..da2c273c7e0a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4008,7 +4008,7 @@ void netdev_rx_handler_unregister(struct net_device *dev);
 bool dev_valid_name(const char *name);
 int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
 		bool *need_copyout);
-int dev_ifconf(struct net *net, struct ifconf *, int);
+int dev_ifconf(struct net *net, struct ifconf __user *ifc);
 int dev_ethtool(struct net *net, struct ifreq *);
 unsigned int dev_get_flags(const struct net_device *);
 int __dev_change_flags(struct net_device *dev, unsigned int flags,
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index c22c3dc15ce9..950e2fe5d56a 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -31,48 +31,51 @@ static int dev_ifname(struct net *net, struct ifreq *ifr)
  *	size eventually, and there is nothing I can do about it.
  *	Thus we will need a 'compatibility mode'.
  */
-
-int dev_ifconf(struct net *net, struct ifconf *ifc, int size)
+int dev_ifconf(struct net *net, struct ifconf __user *uifc)
 {
 	struct net_device *dev;
-	char __user *pos;
-	int len;
-	int total;
-	int i;
+	void __user *pos;
+	size_t size;
+	int len, total = 0, done;
 
-	/*
-	 *	Fetch the caller's info block.
-	 */
+	/* both the ifconf and the ifreq structures are slightly different */
+	if (in_compat_syscall()) {
+		struct compat_ifconf ifc32;
+
+		if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf)))
+			return -EFAULT;
 
-	pos = ifc->ifc_buf;
-	len = ifc->ifc_len;
+		pos = compat_ptr(ifc32.ifcbuf);
+		len = ifc32.ifc_len;
+		size = sizeof(struct compat_ifreq);
+	} else {
+		struct ifconf ifc;
 
-	/*
-	 *	Loop over the interfaces, and write an info block for each.
-	 */
+		if (copy_from_user(&ifc, uifc, sizeof(struct ifconf)))
+			return -EFAULT;
 
-	total = 0;
+		pos = ifc.ifc_buf;
+		len = ifc.ifc_len;
+		size = sizeof(struct ifreq);
+	}
+
+	/* Loop over the interfaces, and write an info block for each. */
+	rtnl_lock();
 	for_each_netdev(net, dev) {
-		int done;
 		if (!pos)
 			done = inet_gifconf(dev, NULL, 0, size);
 		else
 			done = inet_gifconf(dev, pos + total,
 					    len - total, size);
-		if (done < 0)
+		if (done < 0) {
+			rtnl_unlock();
 			return -EFAULT;
+		}
 		total += done;
 	}
+	rtnl_unlock();
 
-	/*
-	 *	All done.  Write the updated control block back to the caller.
-	 */
-	ifc->ifc_len = total;
-
-	/*
-	 * 	Both BSD and Solaris return 0 here, so we do too.
-	 */
-	return 0;
+	return put_user(total, &uifc->ifc_len);
 }
 
 static int dev_getifmap(struct net_device *dev, struct ifreq *ifr)
diff --git a/net/socket.c b/net/socket.c
index 62005a12ec70..ecdb7913a3bd 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1088,6 +1088,8 @@ EXPORT_SYMBOL(vlan_ioctl_set);
 static long sock_do_ioctl(struct net *net, struct socket *sock,
 			  unsigned int cmd, unsigned long arg)
 {
+	struct ifreq ifr;
+	bool need_copyout;
 	int err;
 	void __user *argp = (void __user *)arg;
 
@@ -1100,25 +1102,13 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
 	if (err != -ENOIOCTLCMD)
 		return err;
 
-	if (cmd == SIOCGIFCONF) {
-		struct ifconf ifc;
-		if (copy_from_user(&ifc, argp, sizeof(struct ifconf)))
-			return -EFAULT;
-		rtnl_lock();
-		err = dev_ifconf(net, &ifc, sizeof(struct ifreq));
-		rtnl_unlock();
-		if (!err && copy_to_user(argp, &ifc, sizeof(struct ifconf)))
-			err = -EFAULT;
-	} else {
-		struct ifreq ifr;
-		bool need_copyout;
-		if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
+	if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
+		return -EFAULT;
+	err = dev_ioctl(net, cmd, &ifr, &need_copyout);
+	if (!err && need_copyout)
+		if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
 			return -EFAULT;
-		err = dev_ioctl(net, cmd, &ifr, &need_copyout);
-		if (!err && need_copyout)
-			if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
-				return -EFAULT;
-	}
+
 	return err;
 }
 
@@ -1217,6 +1207,11 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 						   cmd == SIOCGSTAMP_NEW,
 						   false);
 			break;
+
+		case SIOCGIFCONF:
+			err = dev_ifconf(net, argp);
+			break;
+
 		default:
 			err = sock_do_ioctl(net, sock, cmd, arg);
 			break;
@@ -3127,31 +3122,6 @@ void socket_seq_show(struct seq_file *seq)
 #endif				/* CONFIG_PROC_FS */
 
 #ifdef CONFIG_COMPAT
-static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
-{
-	struct compat_ifconf ifc32;
-	struct ifconf ifc;
-	int err;
-
-	if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf)))
-		return -EFAULT;
-
-	ifc.ifc_len = ifc32.ifc_len;
-	ifc.ifc_req = compat_ptr(ifc32.ifcbuf);
-
-	rtnl_lock();
-	err = dev_ifconf(net, &ifc, sizeof(struct compat_ifreq));
-	rtnl_unlock();
-	if (err)
-		return err;
-
-	ifc32.ifc_len = ifc.ifc_len;
-	if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf)))
-		return -EFAULT;
-
-	return 0;
-}
-
 static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
 {
 	compat_uptr_t uptr32;
@@ -3270,8 +3240,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCSIFBR:
 	case SIOCGIFBR:
 		return old_bridge_ioctl(argp);
-	case SIOCGIFCONF:
-		return compat_dev_ifconf(net, argp);
 	case SIOCWANDEV:
 		return compat_siocwandev(net, argp);
 	case SIOCGSTAMP_OLD:
@@ -3299,6 +3267,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCGSKNS:
 	case SIOCGSTAMP_NEW:
 	case SIOCGSTAMPNS_NEW:
+	case SIOCGIFCONF:
 		return sock_ioctl(file, cmd, arg);
 
 	case SIOCGIFFLAGS:
-- 
cgit v1.2.3


From 29c4964822aad42c960d9edf67fb8209f1886baa Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 22 Jul 2021 16:29:03 +0200
Subject: net: socket: rework compat_ifreq_ioctl()

compat_ifreq_ioctl() is one of the last users of copy_in_user() and
compat_alloc_user_space(), as it attempts to convert the 'struct ifreq'
arguments from 32-bit to 64-bit format as used by dev_ioctl() and a
couple of socket family specific interpretations.

The current implementation works correctly when calling dev_ioctl(),
inet_ioctl(), ieee802154_sock_ioctl(), atalk_ioctl(), qrtr_ioctl()
and packet_ioctl(). The ioctl handlers for x25, netrom, rose and x25 do
not interpret the arguments and only block the corresponding commands,
so they do not care.

For af_inet6 and af_decnet however, the compat conversion is slightly
incorrect, as it will copy more data than the native handler accesses,
both of them use a structure that is shorter than ifreq.

Replace the copy_in_user() conversion with a pair of accessor functions
to read and write the ifreq data in place with the correct length where
needed, while leaving the other ones to copy the (already compatible)
structures directly.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   2 +
 net/appletalk/ddp.c       |   4 +-
 net/ieee802154/socket.c   |   4 +-
 net/ipv4/af_inet.c        |   6 +--
 net/qrtr/qrtr.c           |   4 +-
 net/socket.c              | 103 +++++++++++++++++++++++++++++-----------------
 6 files changed, 76 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index da2c273c7e0a..c871dc223dfa 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4006,6 +4006,8 @@ int netdev_rx_handler_register(struct net_device *dev,
 void netdev_rx_handler_unregister(struct net_device *dev);
 
 bool dev_valid_name(const char *name);
+int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg);
+int put_user_ifreq(struct ifreq *ifr, void __user *arg);
 int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
 		bool *need_copyout);
 int dev_ifconf(struct net *net, struct ifconf __user *ifc);
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 8ade5a4ceaf5..bf5736c1d458 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -666,7 +666,7 @@ static int atif_ioctl(int cmd, void __user *arg)
 	struct rtentry rtdef;
 	int add_route;
 
-	if (copy_from_user(&atreq, arg, sizeof(atreq)))
+	if (get_user_ifreq(&atreq, NULL, arg))
 		return -EFAULT;
 
 	dev = __dev_get_by_name(&init_net, atreq.ifr_name);
@@ -865,7 +865,7 @@ static int atif_ioctl(int cmd, void __user *arg)
 		return 0;
 	}
 
-	return copy_to_user(arg, &atreq, sizeof(atreq)) ? -EFAULT : 0;
+	return put_user_ifreq(&atreq, arg);
 }
 
 static int atrtr_ioctl_addrt(struct rtentry *rt)
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index a45a0401adc5..f5077de3619e 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -129,7 +129,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg,
 	int ret = -ENOIOCTLCMD;
 	struct net_device *dev;
 
-	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+	if (get_user_ifreq(&ifr, NULL, arg))
 		return -EFAULT;
 
 	ifr.ifr_name[IFNAMSIZ-1] = 0;
@@ -143,7 +143,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg,
 	if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl)
 		ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd);
 
-	if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))
+	if (!ret && put_user_ifreq(&ifr, arg))
 		ret = -EFAULT;
 	dev_put(dev);
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 54648181dd56..0e4d758c2585 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -953,10 +953,10 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case SIOCGIFNETMASK:
 	case SIOCGIFDSTADDR:
 	case SIOCGIFPFLAGS:
-		if (copy_from_user(&ifr, p, sizeof(struct ifreq)))
+		if (get_user_ifreq(&ifr, NULL, p))
 			return -EFAULT;
 		err = devinet_ioctl(net, cmd, &ifr);
-		if (!err && copy_to_user(p, &ifr, sizeof(struct ifreq)))
+		if (!err && put_user_ifreq(&ifr, p))
 			err = -EFAULT;
 		break;
 
@@ -966,7 +966,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case SIOCSIFDSTADDR:
 	case SIOCSIFPFLAGS:
 	case SIOCSIFFLAGS:
-		if (copy_from_user(&ifr, p, sizeof(struct ifreq)))
+		if (get_user_ifreq(&ifr, NULL, p))
 			return -EFAULT;
 		err = devinet_ioctl(net, cmd, &ifr);
 		break;
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index e6f4a6202f82..e71847877248 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1153,14 +1153,14 @@ static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		rc = put_user(len, (int __user *)argp);
 		break;
 	case SIOCGIFADDR:
-		if (copy_from_user(&ifr, argp, sizeof(ifr))) {
+		if (get_user_ifreq(&ifr, NULL, argp)) {
 			rc = -EFAULT;
 			break;
 		}
 
 		sq = (struct sockaddr_qrtr *)&ifr.ifr_addr;
 		*sq = ipc->us;
-		if (copy_to_user(argp, &ifr, sizeof(ifr))) {
+		if (put_user_ifreq(&ifr, argp)) {
 			rc = -EFAULT;
 			break;
 		}
diff --git a/net/socket.c b/net/socket.c
index ecdb7913a3bd..84de89c1ee9d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -3121,6 +3121,54 @@ void socket_seq_show(struct seq_file *seq)
 }
 #endif				/* CONFIG_PROC_FS */
 
+/* Handle the fact that while struct ifreq has the same *layout* on
+ * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data,
+ * which are handled elsewhere, it still has different *size* due to
+ * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit,
+ * resulting in struct ifreq being 32 and 40 bytes respectively).
+ * As a result, if the struct happens to be at the end of a page and
+ * the next page isn't readable/writable, we get a fault. To prevent
+ * that, copy back and forth to the full size.
+ */
+int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg)
+{
+	if (in_compat_syscall()) {
+		struct compat_ifreq *ifr32 = (struct compat_ifreq *)ifr;
+
+		memset(ifr, 0, sizeof(*ifr));
+		if (copy_from_user(ifr32, arg, sizeof(*ifr32)))
+			return -EFAULT;
+
+		if (ifrdata)
+			*ifrdata = compat_ptr(ifr32->ifr_data);
+
+		return 0;
+	}
+
+	if (copy_from_user(ifr, arg, sizeof(*ifr)))
+		return -EFAULT;
+
+	if (ifrdata)
+		*ifrdata = ifr->ifr_data;
+
+	return 0;
+}
+EXPORT_SYMBOL(get_user_ifreq);
+
+int put_user_ifreq(struct ifreq *ifr, void __user *arg)
+{
+	size_t size = sizeof(*ifr);
+
+	if (in_compat_syscall())
+		size = sizeof(struct compat_ifreq);
+
+	if (copy_to_user(arg, ifr, size))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL(put_user_ifreq);
+
 #ifdef CONFIG_COMPAT
 static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
 {
@@ -3129,7 +3177,7 @@ static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32
 	void __user *saved;
 	int err;
 
-	if (copy_from_user(&ifr, uifr32, sizeof(struct compat_ifreq)))
+	if (get_user_ifreq(&ifr, NULL, uifr32))
 		return -EFAULT;
 
 	if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu))
@@ -3141,7 +3189,7 @@ static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32
 	err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL);
 	if (!err) {
 		ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved;
-		if (copy_to_user(uifr32, &ifr, sizeof(struct compat_ifreq)))
+		if (put_user_ifreq(&ifr, uifr32))
 			err = -EFAULT;
 	}
 	return err;
@@ -3165,49 +3213,28 @@ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
 
 static int compat_ifreq_ioctl(struct net *net, struct socket *sock,
 			      unsigned int cmd,
+			      unsigned long arg,
 			      struct compat_ifreq __user *uifr32)
 {
-	struct ifreq __user *uifr;
+	struct ifreq ifr;
+	bool need_copyout;
 	int err;
 
-	/* Handle the fact that while struct ifreq has the same *layout* on
-	 * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data,
-	 * which are handled elsewhere, it still has different *size* due to
-	 * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit,
-	 * resulting in struct ifreq being 32 and 40 bytes respectively).
-	 * As a result, if the struct happens to be at the end of a page and
-	 * the next page isn't readable/writable, we get a fault. To prevent
-	 * that, copy back and forth to the full size.
+	err = sock->ops->ioctl(sock, cmd, arg);
+
+	/* If this ioctl is unknown try to hand it down
+	 * to the NIC driver.
 	 */
+	if (err != -ENOIOCTLCMD)
+		return err;
 
-	uifr = compat_alloc_user_space(sizeof(*uifr));
-	if (copy_in_user(uifr, uifr32, sizeof(*uifr32)))
+	if (get_user_ifreq(&ifr, NULL, uifr32))
 		return -EFAULT;
+	err = dev_ioctl(net, cmd, &ifr, &need_copyout);
+	if (!err && need_copyout)
+		if (put_user_ifreq(&ifr, uifr32))
+			return -EFAULT;
 
-	err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr);
-
-	if (!err) {
-		switch (cmd) {
-		case SIOCGIFFLAGS:
-		case SIOCGIFMETRIC:
-		case SIOCGIFMTU:
-		case SIOCGIFMEM:
-		case SIOCGIFHWADDR:
-		case SIOCGIFINDEX:
-		case SIOCGIFADDR:
-		case SIOCGIFBRDADDR:
-		case SIOCGIFDSTADDR:
-		case SIOCGIFNETMASK:
-		case SIOCGIFPFLAGS:
-		case SIOCGIFTXQLEN:
-		case SIOCGMIIPHY:
-		case SIOCGMIIREG:
-		case SIOCGIFNAME:
-			if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
-				err = -EFAULT;
-			break;
-		}
-	}
 	return err;
 }
 
@@ -3310,7 +3337,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCBONDRELEASE:
 	case SIOCBONDSETHWADDR:
 	case SIOCBONDCHANGEACTIVE:
-		return compat_ifreq_ioctl(net, sock, cmd, argp);
+		return compat_ifreq_ioctl(net, sock, cmd, arg, argp);
 
 	case SIOCSARP:
 	case SIOCGARP:
-- 
cgit v1.2.3


From 472111920f1c5fbe103022a4b05bfb37128a2a29 Mon Sep 17 00:00:00 2001
From: Tobias Waldekranz <tobias@waldekranz.com>
Date: Thu, 22 Jul 2021 18:55:38 +0300
Subject: net: bridge: switchdev: allow the TX data plane forwarding to be
 offloaded

Allow switchdevs to forward frames from the CPU in accordance with the
bridge configuration in the same way as is done between bridge
ports. This means that the bridge will only send a single skb towards
one of the ports under the switchdev's control, and expects the driver
to deliver the packet to all eligible ports in its domain.

Primarily this improves the performance of multicast flows with
multiple subscribers, as it allows the hardware to perform the frame
replication.

The basic flow between the driver and the bridge is as follows:

- When joining a bridge port, the switchdev driver calls
  switchdev_bridge_port_offload() with tx_fwd_offload = true.

- The bridge sends offloadable skbs to one of the ports under the
  switchdev's control using skb->offload_fwd_mark = true.

- The switchdev driver checks the skb->offload_fwd_mark field and lets
  its FDB lookup select the destination port mask for this packet.

v1->v2:
- convert br_input_skb_cb::fwd_hwdoms to a plain unsigned long
- introduce a static key "br_switchdev_fwd_offload_used" to minimize the
  impact of the newly introduced feature on all the setups which don't
  have hardware that can make use of it
- introduce a check for nbp->flags & BR_FWD_OFFLOAD to optimize cache
  line access
- reorder nbp_switchdev_frame_mark_accel() and br_handle_vlan() in
  __br_forward()
- do not strip VLAN on egress if forwarding offload on VLAN-aware bridge
  is being used
- propagate errors from .ndo_dfwd_add_station() if not EOPNOTSUPP

v2->v3:
- replace the solution based on .ndo_dfwd_add_station with a solution
  based on switchdev_bridge_port_offload
- rename BR_FWD_OFFLOAD to BR_TX_FWD_OFFLOAD
v3->v4: rebase
v4->v5:
- make sure the static key is decremented on bridge port unoffload
- more function and variable renaming and comments for them:
  br_switchdev_fwd_offload_used to br_switchdev_tx_fwd_offload
  br_switchdev_accels_skb to br_switchdev_frame_uses_tx_fwd_offload
  nbp_switchdev_frame_mark_tx_fwd to nbp_switchdev_frame_mark_tx_fwd_to_hwdom
  nbp_switchdev_frame_mark_accel to nbp_switchdev_frame_mark_tx_fwd_offload
  fwd_accel to tx_fwd_offload

Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/freescale/dpaa2/dpaa2-switch.c    |  2 +-
 .../ethernet/marvell/prestera/prestera_switchdev.c |  2 +-
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   |  2 +-
 .../ethernet/microchip/sparx5/sparx5_switchdev.c   |  2 +-
 drivers/net/ethernet/mscc/ocelot_net.c             |  2 +-
 drivers/net/ethernet/rocker/rocker_ofdpa.c         |  2 +-
 drivers/net/ethernet/ti/am65-cpsw-nuss.c           |  2 +-
 drivers/net/ethernet/ti/cpsw_new.c                 |  2 +-
 include/linux/if_bridge.h                          |  3 +
 net/bridge/br_forward.c                            |  9 +++
 net/bridge/br_private.h                            | 31 ++++++++++
 net/bridge/br_switchdev.c                          | 68 ++++++++++++++++++++--
 net/bridge/br_vlan.c                               | 10 +++-
 net/dsa/port.c                                     |  2 +-
 14 files changed, 125 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index 3d021edb78e6..c233e8786e19 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -1936,7 +1936,7 @@ static int dpaa2_switch_port_bridge_join(struct net_device *netdev,
 	err = switchdev_bridge_port_offload(netdev, netdev, NULL,
 					    &dpaa2_switch_port_switchdev_nb,
 					    &dpaa2_switch_port_switchdev_blocking_nb,
-					    extack);
+					    false, extack);
 	if (err)
 		goto err_switchdev_offload;
 
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
index 7fe1287228e5..be01ec8284e6 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
@@ -502,7 +502,7 @@ int prestera_bridge_port_join(struct net_device *br_dev,
 	}
 
 	err = switchdev_bridge_port_offload(br_port->dev, port->dev, NULL,
-					    NULL, NULL, extack);
+					    NULL, NULL, false, extack);
 	if (err)
 		goto err_switchdev_offload;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 0a53f1d8e7e1..f5d0d392efbf 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -362,7 +362,7 @@ mlxsw_sp_bridge_port_create(struct mlxsw_sp_bridge_device *bridge_device,
 	bridge_port->ref_count = 1;
 
 	err = switchdev_bridge_port_offload(brport_dev, mlxsw_sp_port->dev,
-					    NULL, NULL, NULL, extack);
+					    NULL, NULL, NULL, false, extack);
 	if (err)
 		goto err_switchdev_offload;
 
diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
index 807dc45cfae4..649ca609884a 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
@@ -113,7 +113,7 @@ static int sparx5_port_bridge_join(struct sparx5_port *port,
 	set_bit(port->portno, sparx5->bridge_mask);
 
 	err = switchdev_bridge_port_offload(ndev, ndev, NULL, NULL, NULL,
-					    extack);
+					    false, extack);
 	if (err)
 		goto err_switchdev_offload;
 
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 3558ee8d9212..c52f175df389 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -1200,7 +1200,7 @@ static int ocelot_netdevice_bridge_join(struct net_device *dev,
 	err = switchdev_bridge_port_offload(brport_dev, dev, priv,
 					    &ocelot_netdevice_nb,
 					    &ocelot_switchdev_blocking_nb,
-					    extack);
+					    false, extack);
 	if (err)
 		goto err_switchdev_offload;
 
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 03df6a24d0ba..b82e169b7836 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -2599,7 +2599,7 @@ static int ofdpa_port_bridge_join(struct ofdpa_port *ofdpa_port,
 		return err;
 
 	return switchdev_bridge_port_offload(dev, dev, NULL, NULL, NULL,
-					     extack);
+					     false, extack);
 }
 
 static int ofdpa_port_bridge_leave(struct ofdpa_port *ofdpa_port)
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index b285606f963d..229e2f09d605 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -2097,7 +2097,7 @@ static int am65_cpsw_netdevice_port_link(struct net_device *ndev,
 	}
 
 	err = switchdev_bridge_port_offload(ndev, ndev, NULL, NULL, NULL,
-					    extack);
+					    false, extack);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 31030f73840d..4448a91cce54 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -1518,7 +1518,7 @@ static int cpsw_netdevice_port_link(struct net_device *ndev,
 	}
 
 	err = switchdev_bridge_port_offload(ndev, ndev, NULL, NULL, NULL,
-					    extack);
+					    false, extack);
 	if (err)
 		return err;
 
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index bbf680093823..f0b4ffbd8582 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -57,6 +57,7 @@ struct br_ip_list {
 #define BR_MRP_AWARE		BIT(17)
 #define BR_MRP_LOST_CONT	BIT(18)
 #define BR_MRP_LOST_IN_CONT	BIT(19)
+#define BR_TX_FWD_OFFLOAD	BIT(20)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
@@ -182,6 +183,7 @@ int switchdev_bridge_port_offload(struct net_device *brport_dev,
 				  struct net_device *dev, const void *ctx,
 				  struct notifier_block *atomic_nb,
 				  struct notifier_block *blocking_nb,
+				  bool tx_fwd_offload,
 				  struct netlink_ext_ack *extack);
 void switchdev_bridge_port_unoffload(struct net_device *brport_dev,
 				     const void *ctx,
@@ -195,6 +197,7 @@ switchdev_bridge_port_offload(struct net_device *brport_dev,
 			      struct net_device *dev, const void *ctx,
 			      struct notifier_block *atomic_nb,
 			      struct notifier_block *blocking_nb,
+			      bool tx_fwd_offload,
 			      struct netlink_ext_ack *extack)
 {
 	return -EINVAL;
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index bfdbaf3015b9..bc14b1b384e9 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -48,6 +48,8 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb
 		skb_set_network_header(skb, depth);
 	}
 
+	skb->offload_fwd_mark = br_switchdev_frame_uses_tx_fwd_offload(skb);
+
 	dev_queue_xmit(skb);
 
 	return 0;
@@ -76,6 +78,11 @@ static void __br_forward(const struct net_bridge_port *to,
 	struct net *net;
 	int br_hook;
 
+	/* Mark the skb for forwarding offload early so that br_handle_vlan()
+	 * can know whether to pop the VLAN header on egress or keep it.
+	 */
+	nbp_switchdev_frame_mark_tx_fwd_offload(to, skb);
+
 	vg = nbp_vlan_group_rcu(to);
 	skb = br_handle_vlan(to->br, to, vg, skb);
 	if (!skb)
@@ -174,6 +181,8 @@ static struct net_bridge_port *maybe_deliver(
 	if (!should_deliver(p, skb))
 		return prev;
 
+	nbp_switchdev_frame_mark_tx_fwd_to_hwdom(p, skb);
+
 	if (!prev)
 		goto out;
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 2f32d330b648..86ca617fec7a 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -552,12 +552,20 @@ struct br_input_skb_cb {
 #endif
 
 #ifdef CONFIG_NET_SWITCHDEV
+	/* Set if TX data plane offloading is used towards at least one
+	 * hardware domain.
+	 */
+	u8 tx_fwd_offload:1;
 	/* The switchdev hardware domain from which this packet was received.
 	 * If skb->offload_fwd_mark was set, then this packet was already
 	 * forwarded by hardware to the other ports in the source hardware
 	 * domain, otherwise it wasn't.
 	 */
 	int src_hwdom;
+	/* Bit mask of hardware domains towards this packet has already been
+	 * transmitted using the TX data plane offload.
+	 */
+	unsigned long fwd_hwdoms;
 #endif
 };
 
@@ -1871,6 +1879,12 @@ static inline void br_sysfs_delbr(struct net_device *dev) { return; }
 
 /* br_switchdev.c */
 #ifdef CONFIG_NET_SWITCHDEV
+bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb);
+
+void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p,
+					     struct sk_buff *skb);
+void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p,
+					      struct sk_buff *skb);
 void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
 			      struct sk_buff *skb);
 bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
@@ -1891,6 +1905,23 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
 	skb->offload_fwd_mark = 0;
 }
 #else
+static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb)
+{
+	return false;
+}
+
+static inline void
+nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p,
+					struct sk_buff *skb)
+{
+}
+
+static inline void
+nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p,
+					 struct sk_buff *skb)
+{
+}
+
 static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
 					    struct sk_buff *skb)
 {
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 6bfff28ede23..96ce069d0c8c 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -8,6 +8,46 @@
 
 #include "br_private.h"
 
+static struct static_key_false br_switchdev_tx_fwd_offload;
+
+static bool nbp_switchdev_can_offload_tx_fwd(const struct net_bridge_port *p,
+					     const struct sk_buff *skb)
+{
+	if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload))
+		return false;
+
+	return (p->flags & BR_TX_FWD_OFFLOAD) &&
+	       (p->hwdom != BR_INPUT_SKB_CB(skb)->src_hwdom);
+}
+
+bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb)
+{
+	if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload))
+		return false;
+
+	return BR_INPUT_SKB_CB(skb)->tx_fwd_offload;
+}
+
+/* Mark the frame for TX forwarding offload if this egress port supports it */
+void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p,
+					     struct sk_buff *skb)
+{
+	if (nbp_switchdev_can_offload_tx_fwd(p, skb))
+		BR_INPUT_SKB_CB(skb)->tx_fwd_offload = true;
+}
+
+/* Lazily adds the hwdom of the egress bridge port to the bit mask of hwdoms
+ * that the skb has been already forwarded to, to avoid further cloning to
+ * other ports in the same hwdom by making nbp_switchdev_allowed_egress()
+ * return false.
+ */
+void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p,
+					      struct sk_buff *skb)
+{
+	if (nbp_switchdev_can_offload_tx_fwd(p, skb))
+		set_bit(p->hwdom, &BR_INPUT_SKB_CB(skb)->fwd_hwdoms);
+}
+
 void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
 			      struct sk_buff *skb)
 {
@@ -18,8 +58,10 @@ void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
 bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
 				  const struct sk_buff *skb)
 {
-	return !skb->offload_fwd_mark ||
-	       BR_INPUT_SKB_CB(skb)->src_hwdom != p->hwdom;
+	struct br_input_skb_cb *cb = BR_INPUT_SKB_CB(skb);
+
+	return !test_bit(p->hwdom, &cb->fwd_hwdoms) &&
+		(!skb->offload_fwd_mark || cb->src_hwdom != p->hwdom);
 }
 
 /* Flags that can be offloaded to hardware */
@@ -164,8 +206,11 @@ static void nbp_switchdev_hwdom_put(struct net_bridge_port *leaving)
 
 static int nbp_switchdev_add(struct net_bridge_port *p,
 			     struct netdev_phys_item_id ppid,
+			     bool tx_fwd_offload,
 			     struct netlink_ext_ack *extack)
 {
+	int err;
+
 	if (p->offload_count) {
 		/* Prevent unsupported configurations such as a bridge port
 		 * which is a bonding interface, and the member ports are from
@@ -189,7 +234,16 @@ static int nbp_switchdev_add(struct net_bridge_port *p,
 	p->ppid = ppid;
 	p->offload_count = 1;
 
-	return nbp_switchdev_hwdom_set(p);
+	err = nbp_switchdev_hwdom_set(p);
+	if (err)
+		return err;
+
+	if (tx_fwd_offload) {
+		p->flags |= BR_TX_FWD_OFFLOAD;
+		static_branch_inc(&br_switchdev_tx_fwd_offload);
+	}
+
+	return 0;
 }
 
 static void nbp_switchdev_del(struct net_bridge_port *p)
@@ -204,6 +258,11 @@ static void nbp_switchdev_del(struct net_bridge_port *p)
 
 	if (p->hwdom)
 		nbp_switchdev_hwdom_put(p);
+
+	if (p->flags & BR_TX_FWD_OFFLOAD) {
+		p->flags &= ~BR_TX_FWD_OFFLOAD;
+		static_branch_dec(&br_switchdev_tx_fwd_offload);
+	}
 }
 
 static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx,
@@ -262,6 +321,7 @@ int switchdev_bridge_port_offload(struct net_device *brport_dev,
 				  struct net_device *dev, const void *ctx,
 				  struct notifier_block *atomic_nb,
 				  struct notifier_block *blocking_nb,
+				  bool tx_fwd_offload,
 				  struct netlink_ext_ack *extack)
 {
 	struct netdev_phys_item_id ppid;
@@ -278,7 +338,7 @@ int switchdev_bridge_port_offload(struct net_device *brport_dev,
 	if (err)
 		return err;
 
-	err = nbp_switchdev_add(p, ppid, extack);
+	err = nbp_switchdev_add(p, ppid, tx_fwd_offload, extack);
 	if (err)
 		return err;
 
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 382ab992badf..325600361487 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -465,7 +465,15 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
 		u64_stats_update_end(&stats->syncp);
 	}
 
-	if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
+	/* If the skb will be sent using forwarding offload, the assumption is
+	 * that the switchdev will inject the packet into hardware together
+	 * with the bridge VLAN, so that it can be forwarded according to that
+	 * VLAN. The switchdev should deal with popping the VLAN header in
+	 * hardware on each egress port as appropriate. So only strip the VLAN
+	 * header if forwarding offload is not being used.
+	 */
+	if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED &&
+	    !br_switchdev_frame_uses_tx_fwd_offload(skb))
 		__vlan_hwaccel_clear_tag(skb);
 
 	if (p && (p->flags & BR_VLAN_TUNNEL) &&
diff --git a/net/dsa/port.c b/net/dsa/port.c
index d81c283b7358..f2704f101ccf 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -257,7 +257,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 	err = switchdev_bridge_port_offload(brport_dev, dev, dp,
 					    &dsa_slave_switchdev_notifier,
 					    &dsa_slave_switchdev_blocking_notifier,
-					    extack);
+					    false, extack);
 	if (err)
 		goto out_rollback_unbridge;
 
-- 
cgit v1.2.3


From 5b22d3669f2fa6e762c5302fc4b6051a92b81617 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 22 Jul 2021 18:55:39 +0300
Subject: net: dsa: track the number of switches in a tree

In preparation of supporting data plane forwarding on behalf of a
software bridge, some drivers might need to view bridges as virtual
switches behind the CPU port in a cross-chip topology.

Give them some help and let them know how many physical switches there
are in the tree, so that they can count the virtual switches starting
from that number on.

Note that the first dsa_switch_ops method where this information is
reliably available is .setup(). This is because of how DSA works:
in a tree with 3 switches, each calling dsa_register_switch(), the first
2 will advance until dsa_tree_setup() -> dsa_tree_setup_routing_table()
and exit with error code 0 because the topology is not complete. Since
probing is parallel at this point, one switch does not know about the
existence of the other. Then the third switch comes, and for it,
dsa_tree_setup_routing_table() returns complete = true. This switch goes
ahead and calls dsa_tree_setup_switches() for everybody else, calling
their .setup() methods too. This acts as the synchronization point.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 3 +++
 net/dsa/dsa2.c    | 3 +++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 9e5593885357..929bcaec4d41 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -159,6 +159,9 @@ struct dsa_switch_tree {
 	 */
 	struct net_device **lags;
 	unsigned int lags_len;
+
+	/* Track the largest switch index within a tree */
+	unsigned int last_switch;
 };
 
 #define dsa_lags_foreach_id(_id, _dst)				\
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 185629f27f80..de5e93ba2a9d 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -1265,6 +1265,9 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds,
 		return -EEXIST;
 	}
 
+	if (ds->dst->last_switch < ds->index)
+		ds->dst->last_switch = ds->index;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 123abc06e74f49d9b173a93cb2b797fb85f50ba3 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 22 Jul 2021 18:55:40 +0300
Subject: net: dsa: add support for bridge TX forwarding offload

For a DSA switch, to offload the forwarding process of a bridge device
means to send the packets coming from the software bridge as data plane
packets. This is contrary to everything that DSA has done so far,
because the current taggers only know to send control packets (ones that
target a specific destination port), whereas data plane packets are
supposed to be forwarded according to the FDB lookup, much like packets
ingressing on any regular ingress port. If the FDB lookup process
returns multiple destination ports (flooding, multicast), then
replication is also handled by the switch hardware - the bridge only
sends a single packet and avoids the skb_clone().

DSA keeps for each bridge port a zero-based index (the number of the
bridge). Multiple ports performing TX forwarding offload to the same
bridge have the same dp->bridge_num value, and ports not offloading the
TX data plane of a bridge have dp->bridge_num = -1.

The tagger can check if the packet that is being transmitted on has
skb->offload_fwd_mark = true or not. If it does, it can be sure that the
packet belongs to the data plane of a bridge, further information about
which can be obtained based on dp->bridge_dev and dp->bridge_num.
It can then compose a DSA tag for injecting a data plane packet into
that bridge number.

For the switch driver side, we offer two new dsa_switch_ops methods,
called .port_bridge_fwd_offload_{add,del}, which are modeled after
.port_bridge_{join,leave}.
These methods are provided in case the driver needs to configure the
hardware to treat packets coming from that bridge software interface as
data plane packets. The switchdev <-> bridge interaction happens during
the netdev_master_upper_dev_link() call, so to switch drivers, the
effect is that the .port_bridge_fwd_offload_add() method is called
immediately after .port_bridge_join().

If the bridge number exceeds the number of bridges for which the switch
driver can offload the TX data plane (and this includes the case where
the driver can offload none), DSA falls back to simply returning
tx_fwd_offload = false in the switchdev_bridge_port_offload() call.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h  | 18 ++++++++++++
 net/dsa/dsa2.c     |  1 +
 net/dsa/dsa_priv.h |  2 ++
 net/dsa/port.c     | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 104 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 929bcaec4d41..f8eb2dc3fbef 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -162,6 +162,9 @@ struct dsa_switch_tree {
 
 	/* Track the largest switch index within a tree */
 	unsigned int last_switch;
+
+	/* Track the bridges with forwarding offload enabled */
+	unsigned long fwd_offloading_bridges;
 };
 
 #define dsa_lags_foreach_id(_id, _dst)				\
@@ -262,6 +265,7 @@ struct dsa_port {
 	bool			vlan_filtering;
 	u8			stp_state;
 	struct net_device	*bridge_dev;
+	int			bridge_num;
 	struct devlink_port	devlink_port;
 	bool			devlink_port_setup;
 	struct phylink		*pl;
@@ -413,6 +417,12 @@ struct dsa_switch {
 	 */
 	unsigned int		num_lag_ids;
 
+	/* Drivers that support bridge forwarding offload should set this to
+	 * the maximum number of bridges spanning the same switch tree that can
+	 * be offloaded.
+	 */
+	unsigned int		num_fwd_offloading_bridges;
+
 	size_t num_ports;
 };
 
@@ -696,6 +706,14 @@ struct dsa_switch_ops {
 				    struct net_device *bridge);
 	void	(*port_bridge_leave)(struct dsa_switch *ds, int port,
 				     struct net_device *bridge);
+	/* Called right after .port_bridge_join() */
+	int	(*port_bridge_tx_fwd_offload)(struct dsa_switch *ds, int port,
+					      struct net_device *bridge,
+					      int bridge_num);
+	/* Called right before .port_bridge_leave() */
+	void	(*port_bridge_tx_fwd_unoffload)(struct dsa_switch *ds, int port,
+						struct net_device *bridge,
+						int bridge_num);
 	void	(*port_stp_state_set)(struct dsa_switch *ds, int port,
 				      u8 state);
 	void	(*port_fast_age)(struct dsa_switch *ds, int port);
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index de5e93ba2a9d..c7fa85fb3086 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -1044,6 +1044,7 @@ static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
 
 	dp->ds = ds;
 	dp->index = index;
+	dp->bridge_num = -1;
 
 	INIT_LIST_HEAD(&dp->list);
 	list_add_tail(&dp->list, &dst->ports);
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 78c70f5bdab5..b1d9aa4d313c 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -14,6 +14,8 @@
 #include <net/dsa.h>
 #include <net/gro_cells.h>
 
+#define DSA_MAX_NUM_OFFLOADING_BRIDGES		BITS_PER_LONG
+
 enum {
 	DSA_NOTIFIER_AGEING_TIME,
 	DSA_NOTIFIER_BRIDGE_JOIN,
diff --git a/net/dsa/port.c b/net/dsa/port.c
index f2704f101ccf..7b9bf45a76b6 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -230,6 +230,83 @@ static void dsa_port_switchdev_unsync_attrs(struct dsa_port *dp)
 	 */
 }
 
+static int dsa_tree_find_bridge_num(struct dsa_switch_tree *dst,
+				    struct net_device *bridge_dev)
+{
+	struct dsa_port *dp;
+
+	/* When preparing the offload for a port, it will have a valid
+	 * dp->bridge_dev pointer but a not yet valid dp->bridge_num.
+	 * However there might be other ports having the same dp->bridge_dev
+	 * and a valid dp->bridge_num, so just ignore this port.
+	 */
+	list_for_each_entry(dp, &dst->ports, list)
+		if (dp->bridge_dev == bridge_dev && dp->bridge_num != -1)
+			return dp->bridge_num;
+
+	return -1;
+}
+
+static void dsa_port_bridge_tx_fwd_unoffload(struct dsa_port *dp,
+					     struct net_device *bridge_dev)
+{
+	struct dsa_switch_tree *dst = dp->ds->dst;
+	int bridge_num = dp->bridge_num;
+	struct dsa_switch *ds = dp->ds;
+
+	/* No bridge TX forwarding offload => do nothing */
+	if (!ds->ops->port_bridge_tx_fwd_unoffload || dp->bridge_num == -1)
+		return;
+
+	dp->bridge_num = -1;
+
+	/* Check if the bridge is still in use, otherwise it is time
+	 * to clean it up so we can reuse this bridge_num later.
+	 */
+	if (!dsa_tree_find_bridge_num(dst, bridge_dev))
+		clear_bit(bridge_num, &dst->fwd_offloading_bridges);
+
+	/* Notify the chips only once the offload has been deactivated, so
+	 * that they can update their configuration accordingly.
+	 */
+	ds->ops->port_bridge_tx_fwd_unoffload(ds, dp->index, bridge_dev,
+					      bridge_num);
+}
+
+static bool dsa_port_bridge_tx_fwd_offload(struct dsa_port *dp,
+					   struct net_device *bridge_dev)
+{
+	struct dsa_switch_tree *dst = dp->ds->dst;
+	struct dsa_switch *ds = dp->ds;
+	int bridge_num, err;
+
+	if (!ds->ops->port_bridge_tx_fwd_offload)
+		return false;
+
+	bridge_num = dsa_tree_find_bridge_num(dst, bridge_dev);
+	if (bridge_num < 0) {
+		/* First port that offloads TX forwarding for this bridge */
+		bridge_num = find_first_zero_bit(&dst->fwd_offloading_bridges,
+						 DSA_MAX_NUM_OFFLOADING_BRIDGES);
+		if (bridge_num >= ds->num_fwd_offloading_bridges)
+			return false;
+
+		set_bit(bridge_num, &dst->fwd_offloading_bridges);
+	}
+
+	dp->bridge_num = bridge_num;
+
+	/* Notify the driver */
+	err = ds->ops->port_bridge_tx_fwd_offload(ds, dp->index, bridge_dev,
+						  bridge_num);
+	if (err) {
+		dsa_port_bridge_tx_fwd_unoffload(dp, bridge_dev);
+		return false;
+	}
+
+	return true;
+}
+
 int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 			 struct netlink_ext_ack *extack)
 {
@@ -241,6 +318,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 	};
 	struct net_device *dev = dp->slave;
 	struct net_device *brport_dev;
+	bool tx_fwd_offload;
 	int err;
 
 	/* Here the interface is already bridged. Reflect the current
@@ -254,10 +332,12 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 	if (err)
 		goto out_rollback;
 
+	tx_fwd_offload = dsa_port_bridge_tx_fwd_offload(dp, br);
+
 	err = switchdev_bridge_port_offload(brport_dev, dev, dp,
 					    &dsa_slave_switchdev_notifier,
 					    &dsa_slave_switchdev_blocking_notifier,
-					    false, extack);
+					    tx_fwd_offload, extack);
 	if (err)
 		goto out_rollback_unbridge;
 
@@ -302,6 +382,8 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br)
 	 */
 	dp->bridge_dev = NULL;
 
+	dsa_port_bridge_tx_fwd_unoffload(dp, br);
+
 	err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info);
 	if (err)
 		pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n");
-- 
cgit v1.2.3


From 62001372c2b6cdf2346afb2cf94ed3d950eee64c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 1 Jul 2021 13:05:54 -0700
Subject: bpf: tcp: seq_file: Remove bpf_seq_afinfo from tcp_iter_state

A following patch will create a separate struct to store extra
bpf_iter state and it will embed the existing tcp_iter_state like this:
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	/* More bpf_iter specific states here ... */
}

As a prep work, this patch removes the
"struct tcp_seq_afinfo *bpf_seq_afinfo" where its purpose is
to tell if it is iterating from bpf_iter instead of proc fs.
Currently, if "*bpf_seq_afinfo" is not NULL, it is iterating from
bpf_iter.  The kernel should not filter by the addr family and
leave this filtering decision to the bpf prog.

Instead of adding a "*bpf_seq_afinfo" pointer, this patch uses the
"seq->op == &bpf_iter_tcp_seq_ops" test to tell if it is iterating
from the bpf iter.

The bpf_iter_(init|fini)_tcp() is left here to prepare for
the change of a following patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210701200554.1034982-1-kafai@fb.com
---
 include/net/tcp.h   |  1 -
 net/ipv4/tcp_ipv4.c | 25 +++++--------------------
 2 files changed, 5 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 17df9b047ee4..ba3034123e1d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1959,7 +1959,6 @@ struct tcp_iter_state {
 	struct seq_net_private	p;
 	enum tcp_seq_states	state;
 	struct sock		*syn_wait_sk;
-	struct tcp_seq_afinfo	*bpf_seq_afinfo;
 	int			bucket, offset, sbucket, num;
 	loff_t			last_pos;
 };
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f2583c4699fd..665f99d14436 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2735,12 +2735,13 @@ static const struct seq_operations bpf_iter_tcp_seq_ops = {
 #endif
 static unsigned short seq_file_family(const struct seq_file *seq)
 {
-	const struct tcp_iter_state *st = seq->private;
-	const struct tcp_seq_afinfo *afinfo = st->bpf_seq_afinfo;
+	const struct tcp_seq_afinfo *afinfo;
 
+#ifdef CONFIG_BPF_SYSCALL
 	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
-	if (afinfo)
+	if (seq->op == &bpf_iter_tcp_seq_ops)
 		return AF_UNSPEC;
+#endif
 
 	/* Iterated from proc fs */
 	afinfo = PDE_DATA(file_inode(seq->file));
@@ -2998,27 +2999,11 @@ DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
 
 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
 {
-	struct tcp_iter_state *st = priv_data;
-	struct tcp_seq_afinfo *afinfo;
-	int ret;
-
-	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
-	if (!afinfo)
-		return -ENOMEM;
-
-	afinfo->family = AF_UNSPEC;
-	st->bpf_seq_afinfo = afinfo;
-	ret = bpf_iter_init_seq_net(priv_data, aux);
-	if (ret)
-		kfree(afinfo);
-	return ret;
+	return bpf_iter_init_seq_net(priv_data, aux);
 }
 
 static void bpf_iter_fini_tcp(void *priv_data)
 {
-	struct tcp_iter_state *st = priv_data;
-
-	kfree(st->bpf_seq_afinfo);
 	bpf_iter_fini_seq_net(priv_data);
 }
 
-- 
cgit v1.2.3


From 05c0b35709c58b83d4dc515d2ac52e9c0f197d03 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 1 Jul 2021 13:06:06 -0700
Subject: tcp: seq_file: Replace listening_hash with lhash2

This patch moves the tcp seq_file iteration on listeners
from the port only listening_hash to the port+addr lhash2.

When iterating from the bpf iter, the next patch will need to
lock the socket such that the bpf iter can call setsockopt (e.g. to
change the TCP_CONGESTION).  To avoid locking the bucket and then locking
the sock, the bpf iter will first batch some sockets from the same bucket
and then unlock the bucket.  If the bucket size is small (which
usually is), it is easier to batch the whole bucket such that it is less
likely to miss a setsockopt on a socket due to changes in the bucket.

However, the port only listening_hash could have many listeners
hashed to a bucket (e.g. many individual VIP(s):443 and also
multiple by the number of SO_REUSEPORT).  We have seen bucket size in
tens of thousands range.  Also, the chance of having changes
in some popular port buckets (e.g. 443) is also high.

The port+addr lhash2 was introduced to solve this large listener bucket
issue.  Also, the listening_hash usage has already been replaced with
lhash2 in the fast path inet[6]_lookup_listener().  This patch follows
the same direction on moving to lhash2 and iterates the lhash2
instead of listening_hash.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210701200606.1035783-1-kafai@fb.com
---
 include/net/inet_hashtables.h |  6 ++++++
 net/ipv4/tcp_ipv4.c           | 35 ++++++++++++++++++-----------------
 2 files changed, 24 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index ca6a3ea9057e..f72ec113ae56 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -160,6 +160,12 @@ struct inet_hashinfo {
 					____cacheline_aligned_in_smp;
 };
 
+#define inet_lhash2_for_each_icsk_continue(__icsk) \
+	hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node)
+
+#define inet_lhash2_for_each_icsk(__icsk, list) \
+	hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node)
+
 #define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
 	hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 48a0a3873c7a..d38b4379dca4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2296,21 +2296,22 @@ static void *listening_get_first(struct seq_file *seq)
 	struct tcp_iter_state *st = seq->private;
 
 	st->offset = 0;
-	for (; st->bucket < INET_LHTABLE_SIZE; st->bucket++) {
-		struct inet_listen_hashbucket *ilb;
-		struct hlist_nulls_node *node;
+	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
+		struct inet_listen_hashbucket *ilb2;
+		struct inet_connection_sock *icsk;
 		struct sock *sk;
 
-		ilb = &tcp_hashinfo.listening_hash[st->bucket];
-		if (hlist_nulls_empty(&ilb->nulls_head))
+		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
+		if (hlist_empty(&ilb2->head))
 			continue;
 
-		spin_lock(&ilb->lock);
-		sk_nulls_for_each(sk, node, &ilb->nulls_head) {
+		spin_lock(&ilb2->lock);
+		inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
+			sk = (struct sock *)icsk;
 			if (seq_sk_match(seq, sk))
 				return sk;
 		}
-		spin_unlock(&ilb->lock);
+		spin_unlock(&ilb2->lock);
 	}
 
 	return NULL;
@@ -2324,22 +2325,22 @@ static void *listening_get_first(struct seq_file *seq)
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
 	struct tcp_iter_state *st = seq->private;
-	struct inet_listen_hashbucket *ilb;
-	struct hlist_nulls_node *node;
+	struct inet_listen_hashbucket *ilb2;
+	struct inet_connection_sock *icsk;
 	struct sock *sk = cur;
 
 	++st->num;
 	++st->offset;
 
-	sk = sk_nulls_next(sk);
-
-	sk_nulls_for_each_from(sk, node) {
+	icsk = inet_csk(sk);
+	inet_lhash2_for_each_icsk_continue(icsk) {
+		sk = (struct sock *)icsk;
 		if (seq_sk_match(seq, sk))
 			return sk;
 	}
 
-	ilb = &tcp_hashinfo.listening_hash[st->bucket];
-	spin_unlock(&ilb->lock);
+	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
+	spin_unlock(&ilb2->lock);
 	++st->bucket;
 	return listening_get_first(seq);
 }
@@ -2456,7 +2457,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 
 	switch (st->state) {
 	case TCP_SEQ_STATE_LISTENING:
-		if (st->bucket >= INET_LHTABLE_SIZE)
+		if (st->bucket > tcp_hashinfo.lhash2_mask)
 			break;
 		st->state = TCP_SEQ_STATE_LISTENING;
 		rc = listening_get_first(seq);
@@ -2541,7 +2542,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v)
 	switch (st->state) {
 	case TCP_SEQ_STATE_LISTENING:
 		if (v != SEQ_START_TOKEN)
-			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
+			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
 		break;
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (v)
-- 
cgit v1.2.3


From 3cee6fb8e69ecd79be891c89a94974c48a25a437 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 1 Jul 2021 13:06:19 -0700
Subject: bpf: tcp: Support bpf_(get|set)sockopt in bpf tcp iter

This patch allows bpf tcp iter to call bpf_(get|set)sockopt.
To allow a specific bpf iter (tcp here) to call a set of helpers,
get_func_proto function pointer is added to bpf_iter_reg.
The bpf iter is a tracing prog which currently requires
CAP_PERFMON or CAP_SYS_ADMIN, so this patch does not
impose other capability checks for bpf_(get|set)sockopt.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210701200619.1036715-1-kafai@fb.com
---
 include/linux/bpf.h      |  8 ++++++++
 kernel/bpf/bpf_iter.c    | 22 ++++++++++++++++++++++
 kernel/trace/bpf_trace.c |  7 ++++++-
 net/core/filter.c        | 34 ++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      | 15 +++++++++++++++
 5 files changed, 85 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 978ebd16ae60..c8cc09013210 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1442,6 +1442,9 @@ typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux,
 					struct seq_file *seq);
 typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux,
 					 struct bpf_link_info *info);
+typedef const struct bpf_func_proto *
+(*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id,
+			     const struct bpf_prog *prog);
 
 enum bpf_iter_feature {
 	BPF_ITER_RESCHED	= BIT(0),
@@ -1454,6 +1457,7 @@ struct bpf_iter_reg {
 	bpf_iter_detach_target_t detach_target;
 	bpf_iter_show_fdinfo_t show_fdinfo;
 	bpf_iter_fill_link_info_t fill_link_info;
+	bpf_iter_get_func_proto_t get_func_proto;
 	u32 ctx_arg_info_size;
 	u32 feature;
 	struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
@@ -1476,6 +1480,8 @@ struct bpf_iter__bpf_map_elem {
 int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info);
 void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
+const struct bpf_func_proto *
+bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
 int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog);
 int bpf_iter_new_fd(struct bpf_link *link);
 bool bpf_link_is_iter(struct bpf_link *link);
@@ -2050,6 +2056,8 @@ extern const struct bpf_func_proto bpf_task_storage_get_proto;
 extern const struct bpf_func_proto bpf_task_storage_delete_proto;
 extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
 extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
+extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
+extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 2d4fbdbb194e..2e9d47bb40ff 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -360,6 +360,28 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
 	return supported;
 }
 
+const struct bpf_func_proto *
+bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	const struct bpf_iter_target_info *tinfo;
+	const struct bpf_func_proto *fn = NULL;
+
+	mutex_lock(&targets_mutex);
+	list_for_each_entry(tinfo, &targets, list) {
+		if (tinfo->btf_id == prog->aux->attach_btf_id) {
+			const struct bpf_iter_reg *reg_info;
+
+			reg_info = tinfo->reg_info;
+			if (reg_info->get_func_proto)
+				fn = reg_info->get_func_proto(func_id, prog);
+			break;
+		}
+	}
+	mutex_unlock(&targets_mutex);
+
+	return fn;
+}
+
 static void bpf_iter_link_release(struct bpf_link *link)
 {
 	struct bpf_iter_link *iter_link =
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1f22ce1fa971..c5e0b6a64091 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1461,6 +1461,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 const struct bpf_func_proto *
 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
+	const struct bpf_func_proto *fn;
+
 	switch (func_id) {
 #ifdef CONFIG_NET
 	case BPF_FUNC_skb_output:
@@ -1501,7 +1503,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_d_path:
 		return &bpf_d_path_proto;
 	default:
-		return raw_tp_prog_func_proto(func_id, prog);
+		fn = raw_tp_prog_func_proto(func_id, prog);
+		if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
+			fn = bpf_iter_get_func_proto(func_id, prog);
+		return fn;
 	}
 }
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 3b4986e96e9c..faf29fd82276 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5016,6 +5016,40 @@ err_clear:
 	return -EINVAL;
 }
 
+BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
+	   int, optname, char *, optval, int, optlen)
+{
+	return _bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_setsockopt_proto = {
+	.func		= bpf_sk_setsockopt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
+	   int, optname, char *, optval, int, optlen)
+{
+	return _bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_getsockopt_proto = {
+	.func		= bpf_sk_getsockopt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
+};
+
 BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
 	   int, level, int, optname, char *, optval, int, optlen)
 {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 84ac0135d389..f9c6e47141fd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3259,6 +3259,20 @@ static const struct bpf_iter_seq_info tcp_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
 };
 
+static const struct bpf_func_proto *
+bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
+			    const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_setsockopt:
+		return &bpf_sk_setsockopt_proto;
+	case BPF_FUNC_getsockopt:
+		return &bpf_sk_getsockopt_proto;
+	default:
+		return NULL;
+	}
+}
+
 static struct bpf_iter_reg tcp_reg_info = {
 	.target			= "tcp",
 	.ctx_arg_info_size	= 1,
@@ -3266,6 +3280,7 @@ static struct bpf_iter_reg tcp_reg_info = {
 		{ offsetof(struct bpf_iter__tcp, sk_common),
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
+	.get_func_proto		= bpf_iter_tcp_get_func_proto,
 	.seq_info		= &tcp_seq_info,
 };
 
-- 
cgit v1.2.3


From 48d5440393d328cfe18b26e3609c84c0f41d6515 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Sat, 24 Jul 2021 23:47:32 +0200
Subject: nfc: constify payload argument in nci_send_cmd()

The nci_send_cmd() payload argument is passed directly to skb_put_data()
which already accepts a pointer to const, so make it const as well for
correctness and safety.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nfc/nci_core.h | 2 +-
 net/nfc/nci/core.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h
index 1df0f8074c9d..bf573eca07ca 100644
--- a/include/net/nfc/nci_core.h
+++ b/include/net/nfc/nci_core.h
@@ -360,7 +360,7 @@ int nci_core_rsp_packet(struct nci_dev *ndev, __u16 opcode,
 int nci_core_ntf_packet(struct nci_dev *ndev, __u16 opcode,
 			struct sk_buff *skb);
 void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb);
-int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload);
+int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, const void *payload);
 int nci_send_data(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb);
 int nci_conn_max_data_pkt_payload_size(struct nci_dev *ndev, __u8 conn_id);
 void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb,
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index da7fe9db1b00..09967b836361 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -1332,7 +1332,7 @@ int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb)
 EXPORT_SYMBOL(nci_send_frame);
 
 /* Send NCI command */
-int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload)
+int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, const void *payload)
 {
 	struct nci_ctrl_hdr *hdr;
 	struct sk_buff *skb;
-- 
cgit v1.2.3


From b9c28286d8f1822a10f702fcefe0b8a1db4e6917 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Sat, 24 Jul 2021 23:47:33 +0200
Subject: nfc: constify nci_ops

The struct nci_ops is modified by NFC core in only one case:
nci_allocate_device() receives too many proprietary commands (prop_ops)
to configure.  This is a build time known constrain, so a graceful
handling of such case is not necessary.

Instead, fail the nci_allocate_device() and add BUILD_BUG_ON() to places
which set these.

This allows to constify the struct nci_ops (consisting of function
pointers) for correctness and safety.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/nfc/fdp/fdp.c        | 3 ++-
 drivers/nfc/nfcmrvl/main.c   | 2 +-
 drivers/nfc/nxp-nci/core.c   | 2 +-
 drivers/nfc/st-nci/core.c    | 3 ++-
 drivers/nfc/virtual_ncidev.c | 2 +-
 include/net/nfc/nci_core.h   | 4 ++--
 net/nfc/nci/core.c           | 5 ++---
 7 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/nfc/fdp/fdp.c b/drivers/nfc/fdp/fdp.c
index 528745862738..73f51848a693 100644
--- a/drivers/nfc/fdp/fdp.c
+++ b/drivers/nfc/fdp/fdp.c
@@ -675,7 +675,7 @@ static struct nci_driver_ops fdp_prop_ops[] = {
 	},
 };
 
-static struct nci_ops nci_ops = {
+static const struct nci_ops nci_ops = {
 	.open = fdp_nci_open,
 	.close = fdp_nci_close,
 	.send = fdp_nci_send,
@@ -718,6 +718,7 @@ int fdp_nci_probe(struct fdp_i2c_phy *phy, struct nfc_phy_ops *phy_ops,
 		    NFC_PROTO_NFC_DEP_MASK |
 		    NFC_PROTO_ISO15693_MASK;
 
+	BUILD_BUG_ON(ARRAY_SIZE(fdp_prop_ops) > NCI_MAX_PROPRIETARY_CMD);
 	ndev = nci_allocate_device(&nci_ops, protocols, tx_headroom,
 				   tx_tailroom);
 	if (!ndev) {
diff --git a/drivers/nfc/nfcmrvl/main.c b/drivers/nfc/nfcmrvl/main.c
index a4620b480c4f..6e9e7ce8792c 100644
--- a/drivers/nfc/nfcmrvl/main.c
+++ b/drivers/nfc/nfcmrvl/main.c
@@ -81,7 +81,7 @@ static int nfcmrvl_nci_fw_download(struct nci_dev *ndev,
 	return nfcmrvl_fw_dnld_start(ndev, firmware_name);
 }
 
-static struct nci_ops nfcmrvl_nci_ops = {
+static const struct nci_ops nfcmrvl_nci_ops = {
 	.open = nfcmrvl_nci_open,
 	.close = nfcmrvl_nci_close,
 	.send = nfcmrvl_nci_send,
diff --git a/drivers/nfc/nxp-nci/core.c b/drivers/nfc/nxp-nci/core.c
index 2b0c7232e91f..518e2afb43a8 100644
--- a/drivers/nfc/nxp-nci/core.c
+++ b/drivers/nfc/nxp-nci/core.c
@@ -83,7 +83,7 @@ static int nxp_nci_send(struct nci_dev *ndev, struct sk_buff *skb)
 	return r;
 }
 
-static struct nci_ops nxp_nci_ops = {
+static const struct nci_ops nxp_nci_ops = {
 	.open = nxp_nci_open,
 	.close = nxp_nci_close,
 	.send = nxp_nci_send,
diff --git a/drivers/nfc/st-nci/core.c b/drivers/nfc/st-nci/core.c
index 110ff1281e5f..f6fce34a77da 100644
--- a/drivers/nfc/st-nci/core.c
+++ b/drivers/nfc/st-nci/core.c
@@ -94,7 +94,7 @@ static struct nci_driver_ops st_nci_prop_ops[] = {
 	},
 };
 
-static struct nci_ops st_nci_ops = {
+static const struct nci_ops st_nci_ops = {
 	.init = st_nci_init,
 	.open = st_nci_open,
 	.close = st_nci_close,
@@ -131,6 +131,7 @@ int st_nci_probe(struct llt_ndlc *ndlc, int phy_headroom,
 		| NFC_PROTO_ISO15693_MASK
 		| NFC_PROTO_NFC_DEP_MASK;
 
+	BUILD_BUG_ON(ARRAY_SIZE(st_nci_prop_ops) > NCI_MAX_PROPRIETARY_CMD);
 	ndlc->ndev = nci_allocate_device(&st_nci_ops, protocols,
 					phy_headroom, phy_tailroom);
 	if (!ndlc->ndev) {
diff --git a/drivers/nfc/virtual_ncidev.c b/drivers/nfc/virtual_ncidev.c
index f73ee0bf3593..b914ab2c2109 100644
--- a/drivers/nfc/virtual_ncidev.c
+++ b/drivers/nfc/virtual_ncidev.c
@@ -65,7 +65,7 @@ static int virtual_nci_send(struct nci_dev *ndev, struct sk_buff *skb)
 	return 0;
 }
 
-static struct nci_ops virtual_nci_ops = {
+static const struct nci_ops virtual_nci_ops = {
 	.open = virtual_nci_open,
 	.close = virtual_nci_close,
 	.send = virtual_nci_send
diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h
index bf573eca07ca..5dae7e2cbc49 100644
--- a/include/net/nfc/nci_core.h
+++ b/include/net/nfc/nci_core.h
@@ -194,7 +194,7 @@ struct nci_hci_dev {
 /* NCI Core structures */
 struct nci_dev {
 	struct nfc_dev		*nfc_dev;
-	struct nci_ops		*ops;
+	const struct nci_ops	*ops;
 	struct nci_hci_dev	*hci_dev;
 
 	int			tx_headroom;
@@ -267,7 +267,7 @@ struct nci_dev {
 };
 
 /* ----- NCI Devices ----- */
-struct nci_dev *nci_allocate_device(struct nci_ops *ops,
+struct nci_dev *nci_allocate_device(const struct nci_ops *ops,
 				    __u32 supported_protocols,
 				    int tx_headroom,
 				    int tx_tailroom);
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 09967b836361..a7d26f2791b0 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -1129,7 +1129,7 @@ static struct nfc_ops nci_nfc_ops = {
  * @tx_headroom: Reserved space at beginning of skb
  * @tx_tailroom: Reserved space at end of skb
  */
-struct nci_dev *nci_allocate_device(struct nci_ops *ops,
+struct nci_dev *nci_allocate_device(const struct nci_ops *ops,
 				    __u32 supported_protocols,
 				    int tx_headroom, int tx_tailroom)
 {
@@ -1152,8 +1152,7 @@ struct nci_dev *nci_allocate_device(struct nci_ops *ops,
 	if (ops->n_prop_ops > NCI_MAX_PROPRIETARY_CMD) {
 		pr_err("Too many proprietary commands: %zd\n",
 		       ops->n_prop_ops);
-		ops->prop_ops = NULL;
-		ops->n_prop_ops = 0;
+		goto free_nci;
 	}
 
 	ndev->tx_headroom = tx_headroom;
-- 
cgit v1.2.3


From cb8caa3c6c04c18027258ab34e09bcf87e726f22 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Sat, 24 Jul 2021 23:47:35 +0200
Subject: nfc: constify nci_driver_ops (prop_ops and core_ops)

Neither the core nor the drivers modify the passed pointer to struct
nci_driver_ops (consisting of function pointers), so make it a pointer
to const for correctness and safety.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/nfc/fdp/fdp.c      |  4 ++--
 drivers/nfc/s3fwrn5/nci.c  |  2 +-
 drivers/nfc/s3fwrn5/nci.h  |  2 +-
 drivers/nfc/st-nci/core.c  |  2 +-
 include/net/nfc/nci_core.h |  4 ++--
 net/nfc/nci/core.c         | 16 ++++++++--------
 6 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/nfc/fdp/fdp.c b/drivers/nfc/fdp/fdp.c
index 73f51848a693..4d88a617d0e8 100644
--- a/drivers/nfc/fdp/fdp.c
+++ b/drivers/nfc/fdp/fdp.c
@@ -651,7 +651,7 @@ static int fdp_nci_core_get_config_rsp_packet(struct nci_dev *ndev,
 	return 0;
 }
 
-static struct nci_driver_ops fdp_core_ops[] = {
+static const struct nci_driver_ops fdp_core_ops[] = {
 	{
 		.opcode = NCI_OP_CORE_GET_CONFIG_RSP,
 		.rsp = fdp_nci_core_get_config_rsp_packet,
@@ -662,7 +662,7 @@ static struct nci_driver_ops fdp_core_ops[] = {
 	},
 };
 
-static struct nci_driver_ops fdp_prop_ops[] = {
+static const struct nci_driver_ops fdp_prop_ops[] = {
 	{
 		.opcode = nci_opcode_pack(NCI_GID_PROP, NCI_OP_PROP_PATCH_OID),
 		.rsp = fdp_nci_prop_patch_rsp_packet,
diff --git a/drivers/nfc/s3fwrn5/nci.c b/drivers/nfc/s3fwrn5/nci.c
index 819e3474a437..e374e670b36b 100644
--- a/drivers/nfc/s3fwrn5/nci.c
+++ b/drivers/nfc/s3fwrn5/nci.c
@@ -20,7 +20,7 @@ static int s3fwrn5_nci_prop_rsp(struct nci_dev *ndev, struct sk_buff *skb)
 	return 0;
 }
 
-struct nci_driver_ops s3fwrn5_nci_prop_ops[4] = {
+const struct nci_driver_ops s3fwrn5_nci_prop_ops[4] = {
 	{
 		.opcode = nci_opcode_pack(NCI_GID_PROPRIETARY,
 				NCI_PROP_SET_RFREG),
diff --git a/drivers/nfc/s3fwrn5/nci.h b/drivers/nfc/s3fwrn5/nci.h
index 5c22c5315f79..c2d906591e9e 100644
--- a/drivers/nfc/s3fwrn5/nci.h
+++ b/drivers/nfc/s3fwrn5/nci.h
@@ -50,7 +50,7 @@ struct nci_prop_fw_cfg_rsp {
 	__u8 status;
 };
 
-extern struct nci_driver_ops s3fwrn5_nci_prop_ops[4];
+extern const struct nci_driver_ops s3fwrn5_nci_prop_ops[4];
 int s3fwrn5_nci_rf_configure(struct s3fwrn5_info *info, const char *fw_name);
 
 #endif /* __LOCAL_S3FWRN5_NCI_H_ */
diff --git a/drivers/nfc/st-nci/core.c b/drivers/nfc/st-nci/core.c
index f6fce34a77da..72bb51efdf9c 100644
--- a/drivers/nfc/st-nci/core.c
+++ b/drivers/nfc/st-nci/core.c
@@ -86,7 +86,7 @@ static int st_nci_prop_rsp_packet(struct nci_dev *ndev,
 	return 0;
 }
 
-static struct nci_driver_ops st_nci_prop_ops[] = {
+static const struct nci_driver_ops st_nci_prop_ops[] = {
 	{
 		.opcode = nci_opcode_pack(NCI_GID_PROPRIETARY,
 					  ST_NCI_CORE_PROP),
diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h
index 5dae7e2cbc49..e7118e0cc3b1 100644
--- a/include/net/nfc/nci_core.h
+++ b/include/net/nfc/nci_core.h
@@ -82,10 +82,10 @@ struct nci_ops {
 	void  (*hci_cmd_received)(struct nci_dev *ndev, u8 pipe, u8 cmd,
 				  struct sk_buff *skb);
 
-	struct nci_driver_ops *prop_ops;
+	const struct nci_driver_ops *prop_ops;
 	size_t n_prop_ops;
 
-	struct nci_driver_ops *core_ops;
+	const struct nci_driver_ops *core_ops;
 	size_t n_core_ops;
 };
 
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index a7d26f2791b0..50c625940fa3 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -1363,12 +1363,12 @@ int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, const void *payl
 EXPORT_SYMBOL(nci_send_cmd);
 
 /* Proprietary commands API */
-static struct nci_driver_ops *ops_cmd_lookup(struct nci_driver_ops *ops,
-					     size_t n_ops,
-					     __u16 opcode)
+static const struct nci_driver_ops *ops_cmd_lookup(const struct nci_driver_ops *ops,
+						   size_t n_ops,
+						   __u16 opcode)
 {
 	size_t i;
-	struct nci_driver_ops *op;
+	const struct nci_driver_ops *op;
 
 	if (!ops || !n_ops)
 		return NULL;
@@ -1383,10 +1383,10 @@ static struct nci_driver_ops *ops_cmd_lookup(struct nci_driver_ops *ops,
 }
 
 static int nci_op_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode,
-			     struct sk_buff *skb, struct nci_driver_ops *ops,
+			     struct sk_buff *skb, const struct nci_driver_ops *ops,
 			     size_t n_ops)
 {
-	struct nci_driver_ops *op;
+	const struct nci_driver_ops *op;
 
 	op = ops_cmd_lookup(ops, n_ops, rsp_opcode);
 	if (!op || !op->rsp)
@@ -1396,10 +1396,10 @@ static int nci_op_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode,
 }
 
 static int nci_op_ntf_packet(struct nci_dev *ndev, __u16 ntf_opcode,
-			     struct sk_buff *skb, struct nci_driver_ops *ops,
+			     struct sk_buff *skb, const struct nci_driver_ops *ops,
 			     size_t n_ops)
 {
-	struct nci_driver_ops *op;
+	const struct nci_driver_ops *op;
 
 	op = ops_cmd_lookup(ops, n_ops, ntf_opcode);
 	if (!op || !op->ntf)
-- 
cgit v1.2.3


From 15944ad2e5a1cc2ef2c6fa60b04e464dbb4ddb61 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Sat, 24 Jul 2021 23:49:23 +0200
Subject: nfc: constify pointer to nfc_vendor_cmd

Neither the core nor the drivers modify the passed pointer to struct
nfc_vendor_cmd, so make it a pointer to const for correctness and
safety.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/nfc/st-nci/vendor_cmds.c   | 2 +-
 drivers/nfc/st21nfca/vendor_cmds.c | 2 +-
 include/net/nfc/hci.h              | 2 +-
 include/net/nfc/nci_core.h         | 2 +-
 include/net/nfc/nfc.h              | 4 ++--
 net/nfc/netlink.c                  | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/nfc/st-nci/vendor_cmds.c b/drivers/nfc/st-nci/vendor_cmds.c
index 94b600029a2a..30d2912d1a05 100644
--- a/drivers/nfc/st-nci/vendor_cmds.c
+++ b/drivers/nfc/st-nci/vendor_cmds.c
@@ -371,7 +371,7 @@ static int st_nci_manufacturer_specific(struct nfc_dev *dev, void *data,
 	return nfc_vendor_cmd_reply(msg);
 }
 
-static struct nfc_vendor_cmd st_nci_vendor_cmds[] = {
+static const struct nfc_vendor_cmd st_nci_vendor_cmds[] = {
 	{
 		.vendor_id = ST_NCI_VENDOR_OUI,
 		.subcmd = FACTORY_MODE,
diff --git a/drivers/nfc/st21nfca/vendor_cmds.c b/drivers/nfc/st21nfca/vendor_cmds.c
index 62332ca91554..74882866dbaf 100644
--- a/drivers/nfc/st21nfca/vendor_cmds.c
+++ b/drivers/nfc/st21nfca/vendor_cmds.c
@@ -295,7 +295,7 @@ exit:
 	return r;
 }
 
-static struct nfc_vendor_cmd st21nfca_vendor_cmds[] = {
+static const struct nfc_vendor_cmd st21nfca_vendor_cmds[] = {
 	{
 		.vendor_id = ST21NFCA_VENDOR_OUI,
 		.subcmd = FACTORY_MODE,
diff --git a/include/net/nfc/hci.h b/include/net/nfc/hci.h
index b35f37a57686..2daec8036be9 100644
--- a/include/net/nfc/hci.h
+++ b/include/net/nfc/hci.h
@@ -168,7 +168,7 @@ void nfc_hci_set_clientdata(struct nfc_hci_dev *hdev, void *clientdata);
 void *nfc_hci_get_clientdata(struct nfc_hci_dev *hdev);
 
 static inline int nfc_hci_set_vendor_cmds(struct nfc_hci_dev *hdev,
-					  struct nfc_vendor_cmd *cmds,
+					  const struct nfc_vendor_cmd *cmds,
 					  int n_cmds)
 {
 	return nfc_set_vendor_cmds(hdev->ndev, cmds, n_cmds);
diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h
index e7118e0cc3b1..00f2c60971d7 100644
--- a/include/net/nfc/nci_core.h
+++ b/include/net/nfc/nci_core.h
@@ -343,7 +343,7 @@ static inline void *nci_get_drvdata(struct nci_dev *ndev)
 }
 
 static inline int nci_set_vendor_cmds(struct nci_dev *ndev,
-				      struct nfc_vendor_cmd *cmds,
+				      const struct nfc_vendor_cmd *cmds,
 				      int n_cmds)
 {
 	return nfc_set_vendor_cmds(ndev->nfc_dev, cmds, n_cmds);
diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 2cd3a261bcbc..31672021d071 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -188,7 +188,7 @@ struct nfc_dev {
 
 	struct rfkill *rfkill;
 
-	struct nfc_vendor_cmd *vendor_cmds;
+	const struct nfc_vendor_cmd *vendor_cmds;
 	int n_vendor_cmds;
 
 	struct nfc_ops *ops;
@@ -297,7 +297,7 @@ void nfc_send_to_raw_sock(struct nfc_dev *dev, struct sk_buff *skb,
 			  u8 payload_type, u8 direction);
 
 static inline int nfc_set_vendor_cmds(struct nfc_dev *dev,
-				      struct nfc_vendor_cmd *cmds,
+				      const struct nfc_vendor_cmd *cmds,
 				      int n_cmds)
 {
 	if (dev->vendor_cmds || dev->n_vendor_cmds)
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index 722f7ef891e1..70467a82be8f 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -1531,7 +1531,7 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb,
 			       struct genl_info *info)
 {
 	struct nfc_dev *dev;
-	struct nfc_vendor_cmd *cmd;
+	const struct nfc_vendor_cmd *cmd;
 	u32 dev_idx, vid, subcmd;
 	u8 *data;
 	size_t data_len;
-- 
cgit v1.2.3


From f6c802a726aea43a2937763f13560c1fd170fae6 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Sat, 24 Jul 2021 23:49:25 +0200
Subject: nfc: constify nfc_ops

Neither the core nor the drivers modify the passed pointer to struct
nfc_ops, so make it a pointer to const for correctness and safety.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/nfc/pn533/pn533.c | 2 +-
 include/net/nfc/nfc.h     | 4 ++--
 net/nfc/core.c            | 2 +-
 net/nfc/digital_core.c    | 2 +-
 net/nfc/hci/core.c        | 2 +-
 net/nfc/nci/core.c        | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c
index cd64bfe20402..2f3f3fe9a0ba 100644
--- a/drivers/nfc/pn533/pn533.c
+++ b/drivers/nfc/pn533/pn533.c
@@ -2623,7 +2623,7 @@ static int pn533_dev_down(struct nfc_dev *nfc_dev)
 	return ret;
 }
 
-static struct nfc_ops pn533_nfc_ops = {
+static const struct nfc_ops pn533_nfc_ops = {
 	.dev_up = pn533_dev_up,
 	.dev_down = pn533_dev_down,
 	.dep_link_up = pn533_dep_link_up,
diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 31672021d071..85b698794b14 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -191,14 +191,14 @@ struct nfc_dev {
 	const struct nfc_vendor_cmd *vendor_cmds;
 	int n_vendor_cmds;
 
-	struct nfc_ops *ops;
+	const struct nfc_ops *ops;
 	struct genl_info *cur_cmd_info;
 };
 #define to_nfc_dev(_dev) container_of(_dev, struct nfc_dev, dev)
 
 extern struct class nfc_class;
 
-struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops,
+struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops,
 				    u32 supported_protocols,
 				    int tx_headroom,
 				    int tx_tailroom);
diff --git a/net/nfc/core.c b/net/nfc/core.c
index 573c80c6ff7a..6ade54149b73 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -1048,7 +1048,7 @@ struct nfc_dev *nfc_get_device(unsigned int idx)
  * @tx_headroom: reserved space at beginning of skb
  * @tx_tailroom: reserved space at end of skb
  */
-struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops,
+struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops,
 				    u32 supported_protocols,
 				    int tx_headroom, int tx_tailroom)
 {
diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c
index 5044c7db577e..8f2572decccd 100644
--- a/net/nfc/digital_core.c
+++ b/net/nfc/digital_core.c
@@ -732,7 +732,7 @@ exit:
 	return rc;
 }
 
-static struct nfc_ops digital_nfc_ops = {
+static const struct nfc_ops digital_nfc_ops = {
 	.dev_up = digital_dev_up,
 	.dev_down = digital_dev_down,
 	.start_poll = digital_start_poll,
diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
index e37d30302b06..b33fe4ee1581 100644
--- a/net/nfc/hci/core.c
+++ b/net/nfc/hci/core.c
@@ -928,7 +928,7 @@ static int hci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name)
 	return hdev->ops->fw_download(hdev, firmware_name);
 }
 
-static struct nfc_ops hci_nfc_ops = {
+static const struct nfc_ops hci_nfc_ops = {
 	.dev_up = hci_dev_up,
 	.dev_down = hci_dev_down,
 	.start_poll = hci_start_poll,
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 50c625940fa3..400d66c4e210 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -1102,7 +1102,7 @@ static int nci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name)
 	return ndev->ops->fw_download(ndev, firmware_name);
 }
 
-static struct nfc_ops nci_nfc_ops = {
+static const struct nfc_ops nci_nfc_ops = {
 	.dev_up = nci_dev_up,
 	.dev_down = nci_dev_down,
 	.start_poll = nci_start_poll,
-- 
cgit v1.2.3


From 094c45c84d799fb0eb3617632097370843ba3678 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Sat, 24 Jul 2021 23:49:26 +0200
Subject: nfc: constify nfc_hci_ops

Neither the core nor the drivers modify the passed pointer to struct
nfc_hci_ops, so make it a pointer to const for correctness and safety.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/nfc/microread/microread.c | 2 +-
 drivers/nfc/pn544/pn544.c         | 2 +-
 drivers/nfc/st21nfca/core.c       | 2 +-
 include/net/nfc/hci.h             | 4 ++--
 net/nfc/hci/core.c                | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/nfc/microread/microread.c b/drivers/nfc/microread/microread.c
index 151a0631ec72..8e847524937c 100644
--- a/drivers/nfc/microread/microread.c
+++ b/drivers/nfc/microread/microread.c
@@ -625,7 +625,7 @@ static int microread_event_received(struct nfc_hci_dev *hdev, u8 pipe,
 	return r;
 }
 
-static struct nfc_hci_ops microread_hci_ops = {
+static const struct nfc_hci_ops microread_hci_ops = {
 	.open = microread_open,
 	.close = microread_close,
 	.hci_ready = microread_hci_ready,
diff --git a/drivers/nfc/pn544/pn544.c b/drivers/nfc/pn544/pn544.c
index f4d09ebba5c8..c2b4555ab4b7 100644
--- a/drivers/nfc/pn544/pn544.c
+++ b/drivers/nfc/pn544/pn544.c
@@ -881,7 +881,7 @@ static int pn544_hci_disable_se(struct nfc_hci_dev *hdev, u32 se_idx)
 	}
 }
 
-static struct nfc_hci_ops pn544_hci_ops = {
+static const struct nfc_hci_ops pn544_hci_ops = {
 	.open = pn544_hci_open,
 	.close = pn544_hci_close,
 	.hci_ready = pn544_hci_ready,
diff --git a/drivers/nfc/st21nfca/core.c b/drivers/nfc/st21nfca/core.c
index 675f8a342869..5e6c99fcfd27 100644
--- a/drivers/nfc/st21nfca/core.c
+++ b/drivers/nfc/st21nfca/core.c
@@ -912,7 +912,7 @@ static int st21nfca_hci_event_received(struct nfc_hci_dev *hdev, u8 pipe,
 	}
 }
 
-static struct nfc_hci_ops st21nfca_hci_ops = {
+static const struct nfc_hci_ops st21nfca_hci_ops = {
 	.open = st21nfca_hci_open,
 	.close = st21nfca_hci_close,
 	.load_session = st21nfca_hci_load_session,
diff --git a/include/net/nfc/hci.h b/include/net/nfc/hci.h
index 2daec8036be9..756c11084f65 100644
--- a/include/net/nfc/hci.h
+++ b/include/net/nfc/hci.h
@@ -118,7 +118,7 @@ struct nfc_hci_dev {
 
 	struct sk_buff_head msg_rx_queue;
 
-	struct nfc_hci_ops *ops;
+	const struct nfc_hci_ops *ops;
 
 	struct nfc_llc *llc;
 
@@ -151,7 +151,7 @@ struct nfc_hci_dev {
 };
 
 /* hci device allocation */
-struct nfc_hci_dev *nfc_hci_allocate_device(struct nfc_hci_ops *ops,
+struct nfc_hci_dev *nfc_hci_allocate_device(const struct nfc_hci_ops *ops,
 					    struct nfc_hci_init_data *init_data,
 					    unsigned long quirks,
 					    u32 protocols,
diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
index b33fe4ee1581..ff94ac774937 100644
--- a/net/nfc/hci/core.c
+++ b/net/nfc/hci/core.c
@@ -947,7 +947,7 @@ static const struct nfc_ops hci_nfc_ops = {
 	.se_io = hci_se_io,
 };
 
-struct nfc_hci_dev *nfc_hci_allocate_device(struct nfc_hci_ops *ops,
+struct nfc_hci_dev *nfc_hci_allocate_device(const struct nfc_hci_ops *ops,
 					    struct nfc_hci_init_data *init_data,
 					    unsigned long quirks,
 					    u32 protocols,
-- 
cgit v1.2.3


From 7186aac9c22de855220ebc54df7af399647bd1fc Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Sat, 24 Jul 2021 23:49:28 +0200
Subject: nfc: constify nfc_digital_ops

Neither the core nor the drivers modify the passed pointer to struct
nfc_digital_ops, so make it a pointer to const for correctness and safety.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/nfc/nfcsim.c      | 2 +-
 drivers/nfc/port100.c     | 2 +-
 drivers/nfc/st95hf/core.c | 2 +-
 drivers/nfc/trf7970a.c    | 2 +-
 include/net/nfc/digital.h | 4 ++--
 net/nfc/digital_core.c    | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/nfc/nfcsim.c b/drivers/nfc/nfcsim.c
index a9864fcdfba6..143dc49b815b 100644
--- a/drivers/nfc/nfcsim.c
+++ b/drivers/nfc/nfcsim.c
@@ -320,7 +320,7 @@ static int nfcsim_tg_listen(struct nfc_digital_dev *ddev, u16 timeout,
 	return nfcsim_send(ddev, NULL, timeout, cb, arg);
 }
 
-static struct nfc_digital_ops nfcsim_digital_ops = {
+static const struct nfc_digital_ops nfcsim_digital_ops = {
 	.in_configure_hw = nfcsim_in_configure_hw,
 	.in_send_cmd = nfcsim_in_send_cmd,
 
diff --git a/drivers/nfc/port100.c b/drivers/nfc/port100.c
index 1d614f9d864a..ccb5c5fab905 100644
--- a/drivers/nfc/port100.c
+++ b/drivers/nfc/port100.c
@@ -1463,7 +1463,7 @@ static int port100_listen(struct nfc_digital_dev *ddev, u16 timeout,
 	return port100_tg_send_cmd(ddev, skb, timeout, cb, arg);
 }
 
-static struct nfc_digital_ops port100_digital_ops = {
+static const struct nfc_digital_ops port100_digital_ops = {
 	.in_configure_hw = port100_in_configure_hw,
 	.in_send_cmd = port100_in_send_cmd,
 
diff --git a/drivers/nfc/st95hf/core.c b/drivers/nfc/st95hf/core.c
index 2dc788c363fd..993818742570 100644
--- a/drivers/nfc/st95hf/core.c
+++ b/drivers/nfc/st95hf/core.c
@@ -1037,7 +1037,7 @@ static void st95hf_abort_cmd(struct nfc_digital_dev *ddev)
 {
 }
 
-static struct nfc_digital_ops st95hf_nfc_digital_ops = {
+static const struct nfc_digital_ops st95hf_nfc_digital_ops = {
 	.in_configure_hw = st95hf_in_configure_hw,
 	.in_send_cmd = st95hf_in_send_cmd,
 
diff --git a/drivers/nfc/trf7970a.c b/drivers/nfc/trf7970a.c
index 33978022ae47..1aed44629aaa 100644
--- a/drivers/nfc/trf7970a.c
+++ b/drivers/nfc/trf7970a.c
@@ -1861,7 +1861,7 @@ static void trf7970a_abort_cmd(struct nfc_digital_dev *ddev)
 	mutex_unlock(&trf->lock);
 }
 
-static struct nfc_digital_ops trf7970a_nfc_ops = {
+static const struct nfc_digital_ops trf7970a_nfc_ops = {
 	.in_configure_hw	= trf7970a_in_configure_hw,
 	.in_send_cmd		= trf7970a_send_cmd,
 	.tg_configure_hw	= trf7970a_tg_configure_hw,
diff --git a/include/net/nfc/digital.h b/include/net/nfc/digital.h
index 963db96bcbbb..bb3e8fdc0692 100644
--- a/include/net/nfc/digital.h
+++ b/include/net/nfc/digital.h
@@ -191,7 +191,7 @@ struct digital_poll_tech {
 
 struct nfc_digital_dev {
 	struct nfc_dev *nfc_dev;
-	struct nfc_digital_ops *ops;
+	const struct nfc_digital_ops *ops;
 
 	u32 protocols;
 
@@ -236,7 +236,7 @@ struct nfc_digital_dev {
 	void (*skb_add_crc)(struct sk_buff *skb);
 };
 
-struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops,
+struct nfc_digital_dev *nfc_digital_allocate_device(const struct nfc_digital_ops *ops,
 						    __u32 supported_protocols,
 						    __u32 driver_capabilities,
 						    int tx_headroom,
diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c
index 8f2572decccd..fefc03674f4f 100644
--- a/net/nfc/digital_core.c
+++ b/net/nfc/digital_core.c
@@ -745,7 +745,7 @@ static const struct nfc_ops digital_nfc_ops = {
 	.im_transceive = digital_in_send,
 };
 
-struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops,
+struct nfc_digital_dev *nfc_digital_allocate_device(const struct nfc_digital_ops *ops,
 					    __u32 supported_protocols,
 					    __u32 driver_capabilities,
 					    int tx_headroom, int tx_tailroom)
-- 
cgit v1.2.3


From 616d5769345528b989294a242a5906b157a92837 Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@nvidia.com>
Date: Sun, 18 Jul 2021 14:54:13 +0300
Subject: IB/mlx5: Rename is_apu_thread_cq function to is_apu_cq

is_apu_thread_cq() used to detect CQs which are attached to APU
threads. This was extended to support other elements as well,
so the function was renamed to is_apu_cq().

c_eqn_or_apu_element was extended from 8 bits to 32 bits, which wan't
reflected when the APU support was first introduced.

Acked-by: Michael S. Tsirkin <mst@redhat.com> # vdpa
Signed-off-by: Tal Gilboa <talgi@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/cq.c                            | 2 +-
 drivers/infiniband/hw/mlx5/devx.c                          | 7 +++----
 drivers/net/ethernet/mellanox/mlx5/core/cq.c               | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c          | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c        | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c | 2 +-
 drivers/vdpa/mlx5/net/mlx5_vnet.c                          | 2 +-
 include/linux/mlx5/mlx5_ifc.h                              | 5 ++---
 8 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 7abeb576b3c5..18b55d2eba40 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -997,7 +997,7 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 				  MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD));
 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries));
 	MLX5_SET(cqc, cqc, uar_page, index);
-	MLX5_SET(cqc, cqc, c_eqn, eqn);
+	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
 	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
 	if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN)
 		MLX5_SET(cqc, cqc, oi, 1);
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index eb9b0a2707f8..e994aefcc40f 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1437,11 +1437,10 @@ out:
 	rcu_read_unlock();
 }
 
-static bool is_apu_thread_cq(struct mlx5_ib_dev *dev, const void *in)
+static bool is_apu_cq(struct mlx5_ib_dev *dev, const void *in)
 {
 	if (!MLX5_CAP_GEN(dev->mdev, apu) ||
-	    !MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context),
-		      apu_thread_cq))
+	    !MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), apu_cq))
 		return false;
 
 	return true;
@@ -1501,7 +1500,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 		err = mlx5_core_create_dct(dev, &obj->core_dct, cmd_in,
 					   cmd_in_len, cmd_out, cmd_out_len);
 	} else if (opcode == MLX5_CMD_OP_CREATE_CQ &&
-		   !is_apu_thread_cq(dev, cmd_in)) {
+		   !is_apu_cq(dev, cmd_in)) {
 		obj->flags |= DEVX_OBJ_FLAGS_CQ;
 		obj->core_cq.comp = devx_cq_comp;
 		err = mlx5_core_create_cq(dev->mdev, &obj->core_cq,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index df3e4938ecdd..99ec278d0370 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -89,7 +89,8 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq,
 int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 			u32 *in, int inlen, u32 *out, int outlen)
 {
-	int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), c_eqn);
+	int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context),
+			   c_eqn_or_apu_element);
 	u32 din[MLX5_ST_SZ_DW(destroy_cq_in)] = {};
 	struct mlx5_eq_comp *eq;
 	int err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index d09e65557e75..cd2042d11968 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1627,7 +1627,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 				  (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas));
 
 	MLX5_SET(cqc,   cqc, cq_period_mode, param->cq_period_mode);
-	MLX5_SET(cqc,   cqc, c_eqn,         eqn);
+	MLX5_SET(cqc,   cqc, c_eqn_or_apu_element, eqn);
 	MLX5_SET(cqc,   cqc, uar_page,      mdev->priv.uar->index);
 	MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
 					    MLX5_ADAPTER_PAGE_SHIFT);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index bd66ab2af5b5..9b2cca6d9620 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -454,7 +454,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 
 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(cq_size));
-	MLX5_SET(cqc, cqc, c_eqn, eqn);
+	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
 	MLX5_SET(cqc, cqc, uar_page, fdev->conn_res.uar->index);
 	MLX5_SET(cqc, cqc, log_page_size, conn->cq.wq_ctrl.buf.page_shift -
 			   MLX5_ADAPTER_PAGE_SHIFT);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
index 12cf323a5943..754f89222858 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
@@ -790,7 +790,7 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
 
 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
-	MLX5_SET(cqc, cqc, c_eqn, eqn);
+	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
 	MLX5_SET(cqc, cqc, uar_page, uar->index);
 	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
 		 MLX5_ADAPTER_PAGE_SHIFT);
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 2a31467f7ac5..de8e8a1e13e7 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -573,7 +573,7 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
-	MLX5_SET(cqc, cqc, c_eqn, eqn);
+	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
 
 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3dd6641e942c..0b413f365699 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -3919,7 +3919,7 @@ struct mlx5_ifc_cqc_bits {
 	u8         status[0x4];
 	u8         reserved_at_4[0x2];
 	u8         dbr_umem_valid[0x1];
-	u8         apu_thread_cq[0x1];
+	u8         apu_cq[0x1];
 	u8         cqe_sz[0x3];
 	u8         cc[0x1];
 	u8         reserved_at_c[0x1];
@@ -3945,8 +3945,7 @@ struct mlx5_ifc_cqc_bits {
 	u8         cq_period[0xc];
 	u8         cq_max_count[0x10];
 
-	u8         reserved_at_a0[0x18];
-	u8         c_eqn[0x8];
+	u8         c_eqn_or_apu_element[0x20];
 
 	u8         reserved_at_c0[0x3];
 	u8         log_page_size[0x5];
-- 
cgit v1.2.3


From c757096ea1033c46ab768709847f7776b7e92a92 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Wed, 9 Oct 2019 06:41:08 +0200
Subject: can: rx-offload: add skb queue for use during ISR

Adding a skb to the skb_queue in rx-offload requires to take a lock.

This commit avoids this by adding an unlocked skb queue that is
appended at the end of the ISR. Having one lock at the end of the ISR
should be OK as the HW is empty, not about to overflow.

Link: https://lore.kernel.org/r/20210724204745.736053-2-mkl@pengutronix.de
Tested-by: Oleksij Rempel <o.rempel@pengutronix.de>
Co-developed-by: Kurt Van Dijck <dev.kurt@vandijck-laurijssen.be>
Signed-off-by: Kurt Van Dijck <dev.kurt@vandijck-laurijssen.be>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/rx-offload.c               | 67 +++++++++++++-------------
 drivers/net/can/flexcan.c                      |  3 ++
 drivers/net/can/m_can/m_can.c                  |  3 ++
 drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c |  6 ++-
 drivers/net/can/ti_hecc.c                      |  2 +
 include/linux/can/rx-offload.h                 |  2 +
 6 files changed, 48 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/rx-offload.c b/drivers/net/can/dev/rx-offload.c
index ab2c1543786c..d0bdb6db3a57 100644
--- a/drivers/net/can/dev/rx-offload.c
+++ b/drivers/net/can/dev/rx-offload.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2014      Protonic Holland,
  *                         David Jander
- * Copyright (C) 2014-2017 Pengutronix,
+ * Copyright (C) 2014-2021 Pengutronix,
  *                         Marc Kleine-Budde <kernel@pengutronix.de>
  */
 
@@ -174,10 +174,8 @@ can_rx_offload_offload_one(struct can_rx_offload *offload, unsigned int n)
 int can_rx_offload_irq_offload_timestamp(struct can_rx_offload *offload,
 					 u64 pending)
 {
-	struct sk_buff_head skb_queue;
 	unsigned int i;
-
-	__skb_queue_head_init(&skb_queue);
+	int received = 0;
 
 	for (i = offload->mb_first;
 	     can_rx_offload_le(offload, i, offload->mb_last);
@@ -191,26 +189,12 @@ int can_rx_offload_irq_offload_timestamp(struct can_rx_offload *offload,
 		if (IS_ERR_OR_NULL(skb))
 			continue;
 
-		__skb_queue_add_sort(&skb_queue, skb, can_rx_offload_compare);
-	}
-
-	if (!skb_queue_empty(&skb_queue)) {
-		unsigned long flags;
-		u32 queue_len;
-
-		spin_lock_irqsave(&offload->skb_queue.lock, flags);
-		skb_queue_splice_tail(&skb_queue, &offload->skb_queue);
-		spin_unlock_irqrestore(&offload->skb_queue.lock, flags);
-
-		queue_len = skb_queue_len(&offload->skb_queue);
-		if (queue_len > offload->skb_queue_len_max / 8)
-			netdev_dbg(offload->dev, "%s: queue_len=%d\n",
-				   __func__, queue_len);
-
-		can_rx_offload_schedule(offload);
+		__skb_queue_add_sort(&offload->skb_irq_queue, skb,
+				     can_rx_offload_compare);
+		received++;
 	}
 
-	return skb_queue_len(&skb_queue);
+	return received;
 }
 EXPORT_SYMBOL_GPL(can_rx_offload_irq_offload_timestamp);
 
@@ -226,13 +210,10 @@ int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload)
 		if (!skb)
 			break;
 
-		skb_queue_tail(&offload->skb_queue, skb);
+		__skb_queue_tail(&offload->skb_irq_queue, skb);
 		received++;
 	}
 
-	if (received)
-		can_rx_offload_schedule(offload);
-
 	return received;
 }
 EXPORT_SYMBOL_GPL(can_rx_offload_irq_offload_fifo);
@@ -241,7 +222,6 @@ int can_rx_offload_queue_sorted(struct can_rx_offload *offload,
 				struct sk_buff *skb, u32 timestamp)
 {
 	struct can_rx_offload_cb *cb;
-	unsigned long flags;
 
 	if (skb_queue_len(&offload->skb_queue) >
 	    offload->skb_queue_len_max) {
@@ -252,11 +232,8 @@ int can_rx_offload_queue_sorted(struct can_rx_offload *offload,
 	cb = can_rx_offload_get_cb(skb);
 	cb->timestamp = timestamp;
 
-	spin_lock_irqsave(&offload->skb_queue.lock, flags);
-	__skb_queue_add_sort(&offload->skb_queue, skb, can_rx_offload_compare);
-	spin_unlock_irqrestore(&offload->skb_queue.lock, flags);
-
-	can_rx_offload_schedule(offload);
+	__skb_queue_add_sort(&offload->skb_irq_queue, skb,
+			     can_rx_offload_compare);
 
 	return 0;
 }
@@ -295,13 +272,33 @@ int can_rx_offload_queue_tail(struct can_rx_offload *offload,
 		return -ENOBUFS;
 	}
 
-	skb_queue_tail(&offload->skb_queue, skb);
-	can_rx_offload_schedule(offload);
+	__skb_queue_tail(&offload->skb_irq_queue, skb);
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(can_rx_offload_queue_tail);
 
+void can_rx_offload_irq_finish(struct can_rx_offload *offload)
+{
+	unsigned long flags;
+	int queue_len;
+
+	if (skb_queue_empty_lockless(&offload->skb_irq_queue))
+		return;
+
+	spin_lock_irqsave(&offload->skb_queue.lock, flags);
+	skb_queue_splice_tail_init(&offload->skb_irq_queue, &offload->skb_queue);
+	spin_unlock_irqrestore(&offload->skb_queue.lock, flags);
+
+	queue_len = skb_queue_len(&offload->skb_queue);
+	if (queue_len > offload->skb_queue_len_max / 8)
+		netdev_dbg(offload->dev, "%s: queue_len=%d\n",
+			   __func__, queue_len);
+
+	can_rx_offload_schedule(offload);
+}
+EXPORT_SYMBOL_GPL(can_rx_offload_irq_finish);
+
 static int can_rx_offload_init_queue(struct net_device *dev,
 				     struct can_rx_offload *offload,
 				     unsigned int weight)
@@ -312,6 +309,7 @@ static int can_rx_offload_init_queue(struct net_device *dev,
 	offload->skb_queue_len_max = 2 << fls(weight);
 	offload->skb_queue_len_max *= 4;
 	skb_queue_head_init(&offload->skb_queue);
+	__skb_queue_head_init(&offload->skb_irq_queue);
 
 	netif_napi_add(dev, &offload->napi, can_rx_offload_napi_poll, weight);
 
@@ -373,5 +371,6 @@ void can_rx_offload_del(struct can_rx_offload *offload)
 {
 	netif_napi_del(&offload->napi);
 	skb_queue_purge(&offload->skb_queue);
+	__skb_queue_purge(&offload->skb_irq_queue);
 }
 EXPORT_SYMBOL_GPL(can_rx_offload_del);
diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
index 57f3635ad8d7..d9dcf6a8412b 100644
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@ -1198,6 +1198,9 @@ static irqreturn_t flexcan_irq(int irq, void *dev_id)
 		}
 	}
 
+	if (handled)
+		can_rx_offload_irq_finish(&priv->offload);
+
 	return handled;
 }
 
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index bba2a449ac70..18461982f7a1 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1058,6 +1058,9 @@ static irqreturn_t m_can_isr(int irq, void *dev_id)
 		}
 	}
 
+	if (cdev->is_peripheral)
+		can_rx_offload_irq_finish(&cdev->offload);
+
 	return IRQ_HANDLED;
 }
 
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
index 47c3f408a799..f3b267ec22e0 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
@@ -2195,8 +2195,10 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id)
 			FIELD_GET(MCP251XFD_REG_INT_IE_MASK,
 				  priv->regs_status.intf);
 
-		if (!(intf_pending))
+		if (!(intf_pending)) {
+			can_rx_offload_irq_finish(&priv->offload);
 			return handled;
+		}
 
 		/* Some interrupts must be ACKed in the
 		 * MCP251XFD_REG_INT register.
@@ -2296,6 +2298,8 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id)
 	} while (1);
 
  out_fail:
+	can_rx_offload_irq_finish(&priv->offload);
+
 	netdev_err(priv->ndev, "IRQ handler returned %d (intf=0x%08x).\n",
 		   err, priv->regs_status.intf);
 	mcp251xfd_dump(priv);
diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c
index 73245d8836a9..353062ead98f 100644
--- a/drivers/net/can/ti_hecc.c
+++ b/drivers/net/can/ti_hecc.c
@@ -786,6 +786,8 @@ static irqreturn_t ti_hecc_interrupt(int irq, void *dev_id)
 		int_status = hecc_read(priv, HECC_CANGIF0);
 	}
 
+	can_rx_offload_irq_finish(&priv->offload);
+
 	return IRQ_HANDLED;
 }
 
diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h
index 40882df7105e..d71c938e17d0 100644
--- a/include/linux/can/rx-offload.h
+++ b/include/linux/can/rx-offload.h
@@ -20,6 +20,7 @@ struct can_rx_offload {
 					bool drop);
 
 	struct sk_buff_head skb_queue;
+	struct sk_buff_head skb_irq_queue;
 	u32 skb_queue_len_max;
 
 	unsigned int mb_first;
@@ -48,6 +49,7 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload,
 					 unsigned int *frame_len_ptr);
 int can_rx_offload_queue_tail(struct can_rx_offload *offload,
 			      struct sk_buff *skb);
+void can_rx_offload_irq_finish(struct can_rx_offload *offload);
 void can_rx_offload_del(struct can_rx_offload *offload);
 void can_rx_offload_enable(struct can_rx_offload *offload);
 
-- 
cgit v1.2.3


From 1e0d8e507ea42dd37f52636db300de7ea7118012 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Fri, 7 May 2021 17:58:30 +0200
Subject: can: rx-offload: can_rx_offload_irq_finish(): directly call
 napi_schedule()

Instead of calling can_rx_offload_schedule() call napi_schedule()
directly. As this was the last use of can_rx_offload_schedule() remove
this helper function.

Link: https://lore.kernel.org/r/20210724204745.736053-3-mkl@pengutronix.de
Tested-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/rx-offload.c | 2 +-
 include/linux/can/rx-offload.h   | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/rx-offload.c b/drivers/net/can/dev/rx-offload.c
index d0bdb6db3a57..82ade3aa5c13 100644
--- a/drivers/net/can/dev/rx-offload.c
+++ b/drivers/net/can/dev/rx-offload.c
@@ -295,7 +295,7 @@ void can_rx_offload_irq_finish(struct can_rx_offload *offload)
 		netdev_dbg(offload->dev, "%s: queue_len=%d\n",
 			   __func__, queue_len);
 
-	can_rx_offload_schedule(offload);
+	napi_schedule(&offload->napi);
 }
 EXPORT_SYMBOL_GPL(can_rx_offload_irq_finish);
 
diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h
index d71c938e17d0..516f64df0ebc 100644
--- a/include/linux/can/rx-offload.h
+++ b/include/linux/can/rx-offload.h
@@ -53,11 +53,6 @@ void can_rx_offload_irq_finish(struct can_rx_offload *offload);
 void can_rx_offload_del(struct can_rx_offload *offload);
 void can_rx_offload_enable(struct can_rx_offload *offload);
 
-static inline void can_rx_offload_schedule(struct can_rx_offload *offload)
-{
-	napi_schedule(&offload->napi);
-}
-
 static inline void can_rx_offload_disable(struct can_rx_offload *offload)
 {
 	napi_disable(&offload->napi);
-- 
cgit v1.2.3


From 30bfec4fec5902731c8823f51c5332e6f2b2312a Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 10 May 2021 22:51:39 +0200
Subject: can: rx-offload: can_rx_offload_threaded_irq_finish(): add new
 function to be called from threaded interrupt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After reading all CAN frames from the controller in the IRQ handler
and storing them into a skb_queue, the driver calls napi_schedule().
In the napi poll function the skb from the skb_queue are then pushed
into the networking stack.

However if napi_schedule() is called from a threaded IRQ handler this
triggers the following error:

| NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #08!!!

To avoid this, create a new rx-offload
function (can_rx_offload_threaded_irq_finish()) with a call to
local_bh_disable()/local_bh_enable() around the napi_schedule() call.

Convert all drivers that call can_rx_offload_irq_finish() from
threaded IRQ context to can_rx_offload_threaded_irq_finish().

Link: https://lore.kernel.org/r/20210724204745.736053-4-mkl@pengutronix.de
Suggested-by: Daniel Glöckner <dg@emlix.com>
Tested-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/rx-offload.c               | 23 +++++++++++++++++++++++
 drivers/net/can/m_can/m_can.c                  |  2 +-
 drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c |  4 ++--
 include/linux/can/rx-offload.h                 |  1 +
 4 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/rx-offload.c b/drivers/net/can/dev/rx-offload.c
index 82ade3aa5c13..37b0cc65237b 100644
--- a/drivers/net/can/dev/rx-offload.c
+++ b/drivers/net/can/dev/rx-offload.c
@@ -299,6 +299,29 @@ void can_rx_offload_irq_finish(struct can_rx_offload *offload)
 }
 EXPORT_SYMBOL_GPL(can_rx_offload_irq_finish);
 
+void can_rx_offload_threaded_irq_finish(struct can_rx_offload *offload)
+{
+	unsigned long flags;
+	int queue_len;
+
+	if (skb_queue_empty_lockless(&offload->skb_irq_queue))
+		return;
+
+	spin_lock_irqsave(&offload->skb_queue.lock, flags);
+	skb_queue_splice_tail_init(&offload->skb_irq_queue, &offload->skb_queue);
+	spin_unlock_irqrestore(&offload->skb_queue.lock, flags);
+
+	queue_len = skb_queue_len(&offload->skb_queue);
+	if (queue_len > offload->skb_queue_len_max / 8)
+		netdev_dbg(offload->dev, "%s: queue_len=%d\n",
+			   __func__, queue_len);
+
+	local_bh_disable();
+	napi_schedule(&offload->napi);
+	local_bh_enable();
+}
+EXPORT_SYMBOL_GPL(can_rx_offload_threaded_irq_finish);
+
 static int can_rx_offload_init_queue(struct net_device *dev,
 				     struct can_rx_offload *offload,
 				     unsigned int weight)
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 18461982f7a1..317cdc98c539 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1059,7 +1059,7 @@ static irqreturn_t m_can_isr(int irq, void *dev_id)
 	}
 
 	if (cdev->is_peripheral)
-		can_rx_offload_irq_finish(&cdev->offload);
+		can_rx_offload_threaded_irq_finish(&cdev->offload);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
index f3b267ec22e0..6962ab2749df 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
@@ -2196,7 +2196,7 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id)
 				  priv->regs_status.intf);
 
 		if (!(intf_pending)) {
-			can_rx_offload_irq_finish(&priv->offload);
+			can_rx_offload_threaded_irq_finish(&priv->offload);
 			return handled;
 		}
 
@@ -2298,7 +2298,7 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id)
 	} while (1);
 
  out_fail:
-	can_rx_offload_irq_finish(&priv->offload);
+	can_rx_offload_threaded_irq_finish(&priv->offload);
 
 	netdev_err(priv->ndev, "IRQ handler returned %d (intf=0x%08x).\n",
 		   err, priv->regs_status.intf);
diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h
index 516f64df0ebc..c11477620403 100644
--- a/include/linux/can/rx-offload.h
+++ b/include/linux/can/rx-offload.h
@@ -50,6 +50,7 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload,
 int can_rx_offload_queue_tail(struct can_rx_offload *offload,
 			      struct sk_buff *skb);
 void can_rx_offload_irq_finish(struct can_rx_offload *offload);
+void can_rx_offload_threaded_irq_finish(struct can_rx_offload *offload);
 void can_rx_offload_del(struct can_rx_offload *offload);
 void can_rx_offload_enable(struct can_rx_offload *offload);
 
-- 
cgit v1.2.3


From 8345a330738149389dc8883573c9264965922e08 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Wed, 16 Jun 2021 11:55:26 +0200
Subject: can: bittiming: fix documentation for struct can_tdc

This patch fixes a typo in the documentation for struct can_tdc::tdcv.
The number "0" refers to automatic mode not the letter "O".

Further two grammar errors in the documentation for struct can_tdc are
fixed.

First grammar error: add a missing third person 's'.

Second grammar error: replace "such as" by "such that". The intent is
to give a condition, not an example.

Fixes: 289ea9e4ae59 ("can: add new CAN FD bittiming parameters: Transmitter Delay Compensation (TDC)")
Link: https://lore.kernel.org/r/20210616095922.2430415-1-mkl@pengutronix.de
Link: https://lore.kernel.org/r/20210616124057.60723-1-mailhol.vincent@wanadoo.fr
Co-developed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Acked-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/bittiming.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index ae7a3411167c..9de6e9053e34 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -37,7 +37,7 @@
  *	quanta, from when the bit is sent on the TX pin to when it is
  *	received on the RX pin of the transmitter. Possible options:
  *
- *	  O: automatic mode. The controller dynamically measure @tdcv
+ *	  0: automatic mode. The controller dynamically measures @tdcv
  *	  for each transmitted CAN FD frame.
  *
  *	  Other values: manual mode. Use the fixed provided value.
@@ -45,7 +45,7 @@
  * @tdco: Transmitter Delay Compensation Offset. Offset value, in time
  *	quanta, defining the distance between the start of the bit
  *	reception on the RX pin of the transceiver and the SSP
- *	position such as SSP = @tdcv + @tdco.
+ *	position such that SSP = @tdcv + @tdco.
  *
  *	If @tdco is zero, then TDC is disabled and both @tdcv and
  *	@tdcf should be ignored.
-- 
cgit v1.2.3


From 896e7f3e7424d6cc1436172740aa76ebb2c1b248 Mon Sep 17 00:00:00 2001
From: Angelo Dureghello <angelo@kernel-space.org>
Date: Fri, 2 Jul 2021 11:48:37 +0200
Subject: can: flexcan: add platform data header

Add platform data header for flexcan.

Link: https://lore.kernel.org/r/20210702094841.327679-1-angelo@kernel-space.org
Signed-off-by: Angelo Dureghello <angelo@kernel-space.org>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/platform/flexcan.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 include/linux/can/platform/flexcan.h

(limited to 'include')

diff --git a/include/linux/can/platform/flexcan.h b/include/linux/can/platform/flexcan.h
new file mode 100644
index 000000000000..1b536fb999de
--- /dev/null
+++ b/include/linux/can/platform/flexcan.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021  Angelo Dureghello <angelo@kernel-space.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _CAN_PLATFORM_FLEXCAN_H
+#define _CAN_PLATFORM_FLEXCAN_H
+
+struct flexcan_platform_data {
+	u32 clock_frequency;
+	u8 clk_src;
+};
+
+#endif /* _CAN_PLATFORM_FLEXCAN_H */
-- 
cgit v1.2.3


From 26ab7b384525ccfa678c518577f7f0d841209c8b Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 23 Apr 2021 20:34:48 +0300
Subject: net/mlx5e: Block LRO if firmware asks for tunneled LRO

This commit does a cleanup in LRO configuration.

LRO is a parameter of an RQ, but its state is changed by modifying a TIR
related to the RQ.

The current status: LRO for tunneled packets is not supported in the
driver, inner TIRs may enable LRO on creation, but LRO status of inner
TIRs isn't changed in mlx5e_modify_tirs_lro(). This is inconsistent, but
as long as the firmware doesn't declare support for tunneled LRO, it
works, because the same RQs are shared between the inner and outer TIRs.

This commit does two fixes:

1. If the firmware has the tunneled LRO capability, LRO is blocked
altogether, because it's not possible to block it for inner TIRs only,
when the same RQs are shared between inner and outer TIRs, and the
driver won't be able to handle tunneled LRO traffic.

2. mlx5e_modify_tirs_lro() is patched to modify LRO state for all TIRs,
including inner ones, because all TIRs related to an RQ should agree on
their LRO state.

Fixes: 7b3722fa9ef6 ("net/mlx5e: Support RSS for GRE tunneled packets")
Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 15 +++++++++++++++
 include/linux/mlx5/mlx5_ifc.h                     |  3 ++-
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index d09e65557e75..b651134b0f6b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2576,6 +2576,14 @@ static int mlx5e_modify_tirs_lro(struct mlx5e_priv *priv)
 		err = mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in);
 		if (err)
 			goto free_in;
+
+		/* Verify inner tirs resources allocated */
+		if (!priv->inner_indir_tir[0].tirn)
+			continue;
+
+		err = mlx5_core_modify_tir(mdev, priv->inner_indir_tir[tt].tirn, in);
+		if (err)
+			goto free_in;
 	}
 
 	for (ix = 0; ix < priv->max_nch; ix++) {
@@ -4808,7 +4816,14 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 	netdev->hw_enc_features  |= NETIF_F_HW_VLAN_CTAG_TX;
 	netdev->hw_enc_features  |= NETIF_F_HW_VLAN_CTAG_RX;
 
+	/* Tunneled LRO is not supported in the driver, and the same RQs are
+	 * shared between inner and outer TIRs, so the driver can't disable LRO
+	 * for inner TIRs while having it enabled for outer TIRs. Due to this,
+	 * block LRO altogether if the firmware declares tunneled LRO support.
+	 */
 	if (!!MLX5_CAP_ETH(mdev, lro_cap) &&
+	    !MLX5_CAP_ETH(mdev, tunnel_lro_vxlan) &&
+	    !MLX5_CAP_ETH(mdev, tunnel_lro_gre) &&
 	    mlx5e_check_fragmented_striding_rq_cap(mdev))
 		netdev->vlan_features    |= NETIF_F_LRO;
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index b0009aa3647f..6bbae0c3bc0b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -921,7 +921,8 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         scatter_fcs[0x1];
 	u8         enhanced_multi_pkt_send_wqe[0x1];
 	u8         tunnel_lso_const_out_ip_id[0x1];
-	u8         reserved_at_1c[0x2];
+	u8         tunnel_lro_gre[0x1];
+	u8         tunnel_lro_vxlan[0x1];
 	u8         tunnel_stateless_gre[0x1];
 	u8         tunnel_stateless_vxlan[0x1];
 
-- 
cgit v1.2.3


From ee80dd2e89ecce9c5dd6f556b8f581c9e1cbb605 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 26 Jul 2021 19:55:29 +0300
Subject: net: bridge: add a helper for retrieving port VLANs from the data
 path

Introduce a brother of br_vlan_get_info() which is protected by the RCU
mechanism, as opposed to br_vlan_get_info() which relies on taking the
write-side rtnl_mutex.

This is needed for drivers which need to find out whether a bridge port
has a VLAN configured or not. For example, certain DSA switches might
not offer complete source port identification to the CPU on RX, just the
VLAN in which the packet was received. Based on this VLAN, we cannot set
an accurate skb->dev ingress port, but at least we can configure one
that behaves the same as the correct one would (this is possible because
DSA sets skb->offload_fwd_mark = 1).

When we look at the bridge RX handler (br_handle_frame), we see that
what matters regarding skb->dev is the VLAN ID and the port STP state.
So we need to select an skb->dev that has the same bridge VLAN as the
packet we're receiving, and is in the LEARNING or FORWARDING STP state.
The latter is easy, but for the former, we should somehow keep a shadow
list of the bridge VLANs on each port, and a lookup table between VLAN
ID and the 'designated port for imprecise RX'. That is rather
complicated to keep in sync properly (the designated port per VLAN needs
to be updated on the addition and removal of a VLAN, as well as on the
join/leave events of the bridge on that port).

So, to avoid all that complexity, let's just iterate through our finite
number of ports and ask the bridge, for each packet: "do you have this
VLAN configured on this port?".

Cc: Roopa Prabhu <roopa@nvidia.com>
Cc: Nikolay Aleksandrov <nikolay@nvidia.com>
Cc: Ido Schimmel <idosch@nvidia.com>
Cc: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h |  8 ++++++++
 net/bridge/br_vlan.c      | 27 +++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index f0b4ffbd8582..b73b4ff749e1 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -111,6 +111,8 @@ int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid);
 int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto);
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo);
+int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid,
+			 struct bridge_vlan_info *p_vinfo);
 #else
 static inline bool br_vlan_enabled(const struct net_device *dev)
 {
@@ -137,6 +139,12 @@ static inline int br_vlan_get_info(const struct net_device *dev, u16 vid,
 {
 	return -EINVAL;
 }
+
+static inline int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid,
+				       struct bridge_vlan_info *p_vinfo)
+{
+	return -EINVAL;
+}
 #endif
 
 #if IS_ENABLED(CONFIG_BRIDGE)
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 805206f31795..8cfd035bbaf9 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1449,6 +1449,33 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid,
 }
 EXPORT_SYMBOL_GPL(br_vlan_get_info);
 
+int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid,
+			 struct bridge_vlan_info *p_vinfo)
+{
+	struct net_bridge_vlan_group *vg;
+	struct net_bridge_vlan *v;
+	struct net_bridge_port *p;
+
+	p = br_port_get_check_rcu(dev);
+	if (p)
+		vg = nbp_vlan_group_rcu(p);
+	else if (netif_is_bridge_master(dev))
+		vg = br_vlan_group_rcu(netdev_priv(dev));
+	else
+		return -EINVAL;
+
+	v = br_vlan_find(vg, vid);
+	if (!v)
+		return -ENOENT;
+
+	p_vinfo->vid = vid;
+	p_vinfo->flags = v->flags;
+	if (vid == br_get_pvid(vg))
+		p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu);
+
 static int br_vlan_is_bind_vlan_dev(const struct net_device *dev)
 {
 	return is_vlan_dev(dev) &&
-- 
cgit v1.2.3


From b6ad86e6ad6c46e52cac218e62613c6c47cf7fa0 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 26 Jul 2021 19:55:35 +0300
Subject: net: dsa: sja1105: add bridge TX data plane offload based on
 tag_8021q

The main desire for having this feature in sja1105 is to support network
stack termination for traffic coming from a VLAN-aware bridge.

For sja1105, offloading the bridge data plane means sending packets
as-is, with the proper VLAN tag, to the chip. The chip will look up its
FDB and forward them to the correct destination port.

But we support bridge data plane offload even for VLAN-unaware bridges,
and the implementation there is different. In fact, VLAN-unaware
bridging is governed by tag_8021q, so it makes sense to have the
.bridge_fwd_offload_add() implementation fully within tag_8021q.
The key difference is that we only support 1 VLAN-aware bridge, but we
support multiple VLAN-unaware bridges. So we need to make sure that the
forwarding domain is not crossed by packets injected from the stack.

For this, we introduce the concept of a tag_8021q TX VLAN for bridge
forwarding offload. As opposed to the regular TX VLANs which contain
only 2 ports (the user port and the CPU port), a bridge data plane TX
VLAN is "multicast" (or "imprecise"): it contains all the ports that are
part of a certain bridge, and the hardware will select where the packet
goes within this "imprecise" forwarding domain.

Each VLAN-unaware bridge has its own "imprecise" TX VLAN, so we make use
of the unique "bridge_num" provided by DSA for the data plane offload.
We use the same 3 bits from the tag_8021q VLAN ID format to encode this
bridge number.

Note that these 3 bit positions have been used before for sub-VLANs in
best-effort VLAN filtering mode. The difference is that for best-effort,
the sub-VLANs were only valid on RX (and it was documented that the
sub-VLAN field needed to be transmitted as zero). Whereas for the bridge
data plane offload, these 3 bits are only valid on TX.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105_main.c |  4 +++
 include/linux/dsa/8021q.h              | 10 +++++++
 net/dsa/tag_8021q.c                    | 48 +++++++++++++++++++++++++++++++---
 net/dsa/tag_sja1105.c                  | 31 ++++++++++++++++++++++
 4 files changed, 89 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index a6a671f0fca5..da042e211dda 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -2367,6 +2367,8 @@ static int sja1105_setup(struct dsa_switch *ds)
 	 */
 	ds->vlan_filtering_is_global = true;
 	ds->untag_bridge_pvid = true;
+	/* tag_8021q has 3 bits for the VBID, and the value 0 is reserved */
+	ds->num_fwd_offloading_bridges = 7;
 
 	/* Advertise the 8 egress queues */
 	ds->num_tx_queues = SJA1105_NUM_TC;
@@ -2880,6 +2882,8 @@ static const struct dsa_switch_ops sja1105_switch_ops = {
 	.tag_8021q_vlan_add	= sja1105_dsa_8021q_vlan_add,
 	.tag_8021q_vlan_del	= sja1105_dsa_8021q_vlan_del,
 	.port_prechangeupper	= sja1105_prechangeupper,
+	.port_bridge_tx_fwd_offload = dsa_tag_8021q_bridge_tx_fwd_offload,
+	.port_bridge_tx_fwd_unoffload = dsa_tag_8021q_bridge_tx_fwd_unoffload,
 };
 
 static const struct of_device_id sja1105_dt_ids[];
diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index ec5abfcdefd1..c7fa4a3498fe 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -35,6 +35,16 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 
 void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id);
 
+int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port,
+					struct net_device *br,
+					int bridge_num);
+
+void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port,
+					   struct net_device *br,
+					   int bridge_num);
+
+u16 dsa_8021q_bridge_tx_fwd_offload_vid(int bridge_num);
+
 u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port);
 
 u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port);
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 51dcde7db26b..654697ebb6f3 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -17,7 +17,7 @@
  *
  * | 11  | 10  |  9  |  8  |  7  |  6  |  5  |  4  |  3  |  2  |  1  |  0  |
  * +-----------+-----+-----------------+-----------+-----------------------+
- * |    DIR    | RSV |    SWITCH_ID    |    RSV    |          PORT         |
+ * |    DIR    | VBID|    SWITCH_ID    |   VBID    |          PORT         |
  * +-----------+-----+-----------------+-----------+-----------------------+
  *
  * DIR - VID[11:10]:
@@ -30,9 +30,10 @@
  * SWITCH_ID - VID[8:6]:
  *	Index of switch within DSA tree. Must be between 0 and 7.
  *
- * RSV - VID[5:4]:
- *	To be used for further expansion of PORT or for other purposes.
- *	Must be transmitted as zero and ignored on receive.
+ * VBID - { VID[9], VID[5:4] }:
+ *	Virtual bridge ID. If between 1 and 7, packet targets the broadcast
+ *	domain of a bridge. If transmitted as zero, packet targets a single
+ *	port. Field only valid on transmit, must be ignored on receive.
  *
  * PORT - VID[3:0]:
  *	Index of switch port. Must be between 0 and 15.
@@ -50,11 +51,30 @@
 #define DSA_8021Q_SWITCH_ID(x)		(((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \
 						 DSA_8021Q_SWITCH_ID_MASK)
 
+#define DSA_8021Q_VBID_HI_SHIFT		9
+#define DSA_8021Q_VBID_HI_MASK		GENMASK(9, 9)
+#define DSA_8021Q_VBID_LO_SHIFT		4
+#define DSA_8021Q_VBID_LO_MASK		GENMASK(5, 4)
+#define DSA_8021Q_VBID_HI(x)		(((x) & GENMASK(2, 2)) >> 2)
+#define DSA_8021Q_VBID_LO(x)		((x) & GENMASK(1, 0))
+#define DSA_8021Q_VBID(x)		\
+		(((DSA_8021Q_VBID_LO(x) << DSA_8021Q_VBID_LO_SHIFT) & \
+		  DSA_8021Q_VBID_LO_MASK) | \
+		 ((DSA_8021Q_VBID_HI(x) << DSA_8021Q_VBID_HI_SHIFT) & \
+		  DSA_8021Q_VBID_HI_MASK))
+
 #define DSA_8021Q_PORT_SHIFT		0
 #define DSA_8021Q_PORT_MASK		GENMASK(3, 0)
 #define DSA_8021Q_PORT(x)		(((x) << DSA_8021Q_PORT_SHIFT) & \
 						 DSA_8021Q_PORT_MASK)
 
+u16 dsa_8021q_bridge_tx_fwd_offload_vid(int bridge_num)
+{
+	/* The VBID value of 0 is reserved for precise TX */
+	return DSA_8021Q_DIR_TX | DSA_8021Q_VBID(bridge_num + 1);
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_bridge_tx_fwd_offload_vid);
+
 /* Returns the VID to be inserted into the frame from xmit for switch steering
  * instructions on egress. Encodes switch ID and port ID.
  */
@@ -387,6 +407,26 @@ int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds,
 	return 0;
 }
 
+int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port,
+					struct net_device *br,
+					int bridge_num)
+{
+	u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num);
+
+	return dsa_port_tag_8021q_vlan_add(dsa_to_port(ds, port), tx_vid);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_offload);
+
+void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port,
+					   struct net_device *br,
+					   int bridge_num)
+{
+	u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num);
+
+	dsa_port_tag_8021q_vlan_del(dsa_to_port(ds, port), tx_vid);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_unoffload);
+
 /* Set up a port's tag_8021q RX and TX VLAN for standalone mode operation */
 static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port)
 {
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index f142a933c5e2..cddee4b499d8 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -133,6 +133,31 @@ static u16 sja1105_xmit_tpid(struct sja1105_port *sp)
 	return sp->xmit_tpid;
 }
 
+static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb,
+					      struct net_device *netdev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct net_device *br = dp->bridge_dev;
+	u16 tx_vid;
+
+	/* If the port is under a VLAN-aware bridge, just slide the
+	 * VLAN-tagged packet into the FDB and hope for the best.
+	 * This works because we support a single VLAN-aware bridge
+	 * across the entire dst, and its VLANs cannot be shared with
+	 * any standalone port.
+	 */
+	if (br_vlan_enabled(br))
+		return skb;
+
+	/* If the port is under a VLAN-unaware bridge, use an imprecise
+	 * TX VLAN that targets the bridge's entire broadcast domain,
+	 * instead of just the specific port.
+	 */
+	tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(dp->bridge_num);
+
+	return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), tx_vid);
+}
+
 static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
 				    struct net_device *netdev)
 {
@@ -141,6 +166,9 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
 
+	if (skb->offload_fwd_mark)
+		return sja1105_imprecise_xmit(skb, netdev);
+
 	/* Transmitting management traffic does not rely upon switch tagging,
 	 * but instead SPI-installed management routes. Part 2 of this
 	 * is the .port_deferred_xmit driver callback.
@@ -165,6 +193,9 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb,
 	__be16 *tx_header;
 	int trailer_pos;
 
+	if (skb->offload_fwd_mark)
+		return sja1105_imprecise_xmit(skb, netdev);
+
 	/* Transmitting control packets is done using in-band control
 	 * extensions, while data packets are transmitted using
 	 * tag_8021q TX VLANs.
-- 
cgit v1.2.3


From edac6f6332d96aab59af5f27a195f55cd080f034 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 26 Jul 2021 19:55:36 +0300
Subject: Revert "net: dsa: Allow drivers to filter packets they can decode
 source port from"

This reverts commit cc1939e4b3aaf534fb2f3706820012036825731c.

Currently 2 classes of DSA drivers are able to send/receive packets
directly through the DSA master:
- drivers with DSA_TAG_PROTO_NONE
- sja1105

Now that sja1105 has gained the ability to perform traffic termination
even under the tricky case (VLAN-aware bridge), and that is much more
functional (we can perform VLAN-aware bridging with foreign interfaces),
there is no reason to keep this code in the receive path of the network
core. So delete it.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h  | 15 ---------------
 net/dsa/port.c     |  1 -
 net/ethernet/eth.c |  6 +-----
 3 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index f8eb2dc3fbef..55fcac854058 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -88,11 +88,6 @@ struct dsa_device_ops {
 			       struct packet_type *pt);
 	void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
 			     int *offset);
-	/* Used to determine which traffic should match the DSA filter in
-	 * eth_type_trans, and which, if any, should bypass it and be processed
-	 * as regular on the master net device.
-	 */
-	bool (*filter)(const struct sk_buff *skb, struct net_device *dev);
 	unsigned int needed_headroom;
 	unsigned int needed_tailroom;
 	const char *name;
@@ -246,7 +241,6 @@ struct dsa_port {
 	struct dsa_switch_tree *dst;
 	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev,
 			       struct packet_type *pt);
-	bool (*filter)(const struct sk_buff *skb, struct net_device *dev);
 
 	enum {
 		DSA_PORT_TYPE_UNUSED = 0,
@@ -985,15 +979,6 @@ static inline bool netdev_uses_dsa(const struct net_device *dev)
 	return false;
 }
 
-static inline bool dsa_can_decode(const struct sk_buff *skb,
-				  struct net_device *dev)
-{
-#if IS_ENABLED(CONFIG_NET_DSA)
-	return !dev->dsa_ptr->filter || dev->dsa_ptr->filter(skb, dev);
-#endif
-	return false;
-}
-
 /* All DSA tags that push the EtherType to the right (basically all except tail
  * tags, which don't break dissection) can be treated the same from the
  * perspective of the flow dissector.
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 7b9bf45a76b6..b927d94b6934 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -888,7 +888,6 @@ int dsa_port_mrp_del_ring_role(const struct dsa_port *dp,
 void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp,
 			       const struct dsa_device_ops *tag_ops)
 {
-	cpu_dp->filter = tag_ops->filter;
 	cpu_dp->rcv = tag_ops->rcv;
 	cpu_dp->tag_ops = tag_ops;
 }
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 9cce612e8976..171ba75b74c9 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -182,12 +182,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 	 * at all, so we check here whether one of those tagging
 	 * variants has been configured on the receiving interface,
 	 * and if so, set skb->protocol without looking at the packet.
-	 * The DSA tagging protocol may be able to decode some but not all
-	 * traffic (for example only for management). In that case give it the
-	 * option to filter the packets from which it can decode source port
-	 * information.
 	 */
-	if (unlikely(netdev_uses_dsa(dev)) && dsa_can_decode(skb, dev))
+	if (unlikely(netdev_uses_dsa(dev)))
 		return htons(ETH_P_XDSA);
 
 	if (likely(eth_proto_is_802_3(eth->h_proto)))
-- 
cgit v1.2.3


From f9b282b36dfa9b6c6d6b3e8816cdf0e4defff482 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Tue, 27 Jul 2021 11:41:41 +0800
Subject: net: netlink: add the case when nlh is NULL

Add the case when nlh is NULL in nlmsg_report(),
so that the caller doesn't need to deal with this case.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h   | 2 +-
 net/core/rtnetlink.c    | 6 +-----
 net/netlink/genetlink.c | 9 ++++-----
 3 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 1ceec518ab49..7a2a9d3144ba 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -885,7 +885,7 @@ static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh,
  */
 static inline int nlmsg_report(const struct nlmsghdr *nlh)
 {
-	return !!(nlh->nlmsg_flags & NLM_F_ECHO);
+	return nlh ? !!(nlh->nlmsg_flags & NLM_F_ECHO) : 0;
 }
 
 /**
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 670d74ab91ae..e79aaf1f7139 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -726,12 +726,8 @@ void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
 		 struct nlmsghdr *nlh, gfp_t flags)
 {
 	struct sock *rtnl = net->rtnl;
-	int report = 0;
 
-	if (nlh)
-		report = nlmsg_report(nlh);
-
-	nlmsg_notify(rtnl, skb, pid, group, report, flags);
+	nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags);
 }
 EXPORT_SYMBOL(rtnl_notify);
 
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 2d6fdf40df66..ae58da608a31 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -1485,6 +1485,7 @@ int genlmsg_multicast_allns(const struct genl_family *family,
 {
 	if (WARN_ON_ONCE(group >= family->n_mcgrps))
 		return -EINVAL;
+
 	group = family->mcgrp_offset + group;
 	return genlmsg_mcast(skb, portid, group, flags);
 }
@@ -1495,14 +1496,12 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb,
 {
 	struct net *net = genl_info_net(info);
 	struct sock *sk = net->genl_sock;
-	int report = 0;
-
-	if (info->nlhdr)
-		report = nlmsg_report(info->nlhdr);
 
 	if (WARN_ON_ONCE(group >= family->n_mcgrps))
 		return;
+
 	group = family->mcgrp_offset + group;
-	nlmsg_notify(sk, skb, info->snd_portid, group, report, flags);
+	nlmsg_notify(sk, skb, info->snd_portid, group,
+		     nlmsg_report(info->nlhdr), flags);
 }
 EXPORT_SYMBOL(genl_notify);
-- 
cgit v1.2.3


From e4252cb66637b846b916cca7c2cdb4ed22ab2fc3 Mon Sep 17 00:00:00 2001
From: Mark Gray <mark.d.gray@redhat.com>
Date: Fri, 23 Jul 2021 10:24:12 -0400
Subject: openvswitch: update kdoc OVS_DP_ATTR_PER_CPU_PIDS

Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 6571b57b2268..0e436a3755f1 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -70,7 +70,7 @@ enum ovs_datapath_cmd {
  * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
  * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
  * not be sent.
- * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
+ * @OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
  * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
  * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
  * datapath.  Always present in notifications.
-- 
cgit v1.2.3


From 784dcfa56e0453bb197601ba0b8196f6f892ebcb Mon Sep 17 00:00:00 2001
From: Mark Gray <mark.d.gray@redhat.com>
Date: Fri, 23 Jul 2021 10:24:13 -0400
Subject: openvswitch: fix alignment issues

Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  4 ++--
 net/openvswitch/actions.c        |  6 ++++--
 net/openvswitch/datapath.c       | 16 ++++++++++------
 3 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 0e436a3755f1..150bcff49b1c 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -89,8 +89,8 @@ enum ovs_datapath_attr {
 	OVS_DP_ATTR_USER_FEATURES,	/* OVS_DP_F_*  */
 	OVS_DP_ATTR_PAD,
 	OVS_DP_ATTR_MASKS_CACHE_SIZE,
-	OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls in per-cpu
-				     * dispatch mode
+	OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls in
+				     * per-cpu dispatch mode
 				     */
 	__OVS_DP_ATTR_MAX
 };
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index f79679746c62..076774034bb9 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -924,9 +924,11 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 			break;
 
 		case OVS_USERSPACE_ATTR_PID:
-			if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
+			if (dp->user_features &
+			    OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
 				upcall.portid =
-				   ovs_dp_get_upcall_portid(dp, smp_processor_id());
+				  ovs_dp_get_upcall_portid(dp,
+							   smp_processor_id());
 			else
 				upcall.portid = nla_get_u32(a);
 			break;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 7a4edafdc685..e6f0ae5618dd 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -244,7 +244,8 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
 		upcall.cmd = OVS_PACKET_CMD_MISS;
 
 		if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
-			upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id());
+			upcall.portid =
+			    ovs_dp_get_upcall_portid(dp, smp_processor_id());
 		else
 			upcall.portid = ovs_vport_find_upcall_portid(p, skb);
 
@@ -1636,13 +1637,16 @@ u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id)
 	if (dp_nlsk_pids) {
 		if (cpu_id < dp_nlsk_pids->n_pids) {
 			return dp_nlsk_pids->pids[cpu_id];
-		} else if (dp_nlsk_pids->n_pids > 0 && cpu_id >= dp_nlsk_pids->n_pids) {
-			/* If the number of netlink PIDs is mismatched with the number of
-			 * CPUs as seen by the kernel, log this and send the upcall to an
-			 * arbitrary socket (0) in order to not drop packets
+		} else if (dp_nlsk_pids->n_pids > 0 &&
+			   cpu_id >= dp_nlsk_pids->n_pids) {
+			/* If the number of netlink PIDs is mismatched with
+			 * the number of CPUs as seen by the kernel, log this
+			 * and send the upcall to an arbitrary socket (0) in
+			 * order to not drop packets
 			 */
 			pr_info_ratelimited("cpu_id mismatch with handler threads");
-			return dp_nlsk_pids->pids[cpu_id % dp_nlsk_pids->n_pids];
+			return dp_nlsk_pids->pids[cpu_id %
+						  dp_nlsk_pids->n_pids];
 		} else {
 			return 0;
 		}
-- 
cgit v1.2.3


From 409f386b8e5d69ba5b13bb94618b6b668a7a3736 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Tue, 27 Jul 2021 21:14:13 +0800
Subject: qdisc: add new field for qdisc_enqueue tracepoint

qdisc_enqueue tracepoint can work with qdisc:qdisc_dequeue
to measure packets latency in qdisc queues.

Add a new field txq for it, then we can retrieve more info.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/qdisc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
index c3006c6b4a87..59c945b66f9c 100644
--- a/include/trace/events/qdisc.h
+++ b/include/trace/events/qdisc.h
@@ -54,6 +54,7 @@ TRACE_EVENT(qdisc_enqueue,
 
 	TP_STRUCT__entry(
 		__field(struct Qdisc *, qdisc)
+		__field(const struct netdev_queue *, txq)
 		__field(void *,	skbaddr)
 		__field(int, ifindex)
 		__field(u32, handle)
@@ -62,6 +63,7 @@ TRACE_EVENT(qdisc_enqueue,
 
 	TP_fast_assign(
 		__entry->qdisc = qdisc;
+		__entry->txq	 = txq;
 		__entry->skbaddr = skb;
 		__entry->ifindex = txq->dev ? txq->dev->ifindex : 0;
 		__entry->handle	 = qdisc->handle;
-- 
cgit v1.2.3


From b9067f5dc4a07c8e24e01a1b277c6722d91be39e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 27 Jul 2021 15:44:47 +0200
Subject: net: split out SIOCDEVPRIVATE handling from dev_ioctl

SIOCDEVPRIVATE ioctl commands are mainly used in really old
drivers, and they have a number of problems:

- They hide behind the normal .ndo_do_ioctl function that
  is also used for other things in modern drivers, so it's
  hard to spot a driver that actually uses one of these

- Since drivers use a number different calling conventions,
  it is impossible to support compat mode for them in
  a generic way.

- With all drivers using the same 16 commands codes, there
  is no way to introspect the data being passed through
  things like strace.

Add a new net_device_ops callback pointer, to address the
first two of these. Separating them from .ndo_do_ioctl
makes it easy to grep for drivers with a .ndo_siocdevprivate
callback, and the unwieldy name hopefully makes it easier
to spot in code review.

By passing the ifreq structure and the ifr_data pointer
separately, it is no longer necessary to overload these,
and the driver can use either one for a given command.

Cc: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdevices.rst |  7 +++++++
 include/linux/netdevice.h               |  3 +++
 net/core/dev_ioctl.c                    | 25 ++++++++++++++++++++++---
 3 files changed, 32 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index 17bdcb746dcf..02f1faac839a 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -222,6 +222,13 @@ ndo_do_ioctl:
 	Synchronization: rtnl_lock() semaphore.
 	Context: process
 
+ndo_siocdevprivate:
+	Synchronization: rtnl_lock() semaphore.
+	Context: process
+
+	This is used to implement SIOCDEVPRIVATE ioctl helpers.
+	These should not be added to new drivers, so don't use.
+
 ndo_get_stats:
 	Synchronization: rtnl_lock() semaphore, dev_base_lock rwlock, or RCU.
 	Context: atomic (can't sleep under rwlock or RCU)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c871dc223dfa..670e1a8e5928 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1361,6 +1361,9 @@ struct net_device_ops {
 	int			(*ndo_validate_addr)(struct net_device *dev);
 	int			(*ndo_do_ioctl)(struct net_device *dev,
 					        struct ifreq *ifr, int cmd);
+	int			(*ndo_siocdevprivate)(struct net_device *dev,
+						      struct ifreq *ifr,
+						      void __user *data, int cmd);
 	int			(*ndo_set_config)(struct net_device *dev,
 					          struct ifmap *map);
 	int			(*ndo_change_mtu)(struct net_device *dev,
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 950e2fe5d56a..75e3e340d884 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -259,6 +259,23 @@ static int dev_do_ioctl(struct net_device *dev,
 	return err;
 }
 
+static int dev_siocdevprivate(struct net_device *dev,
+			      struct ifreq *ifr, unsigned int cmd)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	void __user *data = ifr->ifr_data;
+
+	if (ops->ndo_siocdevprivate) {
+		if (netif_device_present(dev))
+			return ops->ndo_siocdevprivate(dev, ifr, data, cmd);
+		else
+			return -ENODEV;
+	}
+
+	/* fall back to do_ioctl for drivers not yet converted */
+	return dev_do_ioctl(dev, ifr, cmd);
+}
+
 /*
  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
  */
@@ -336,9 +353,11 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 	 *	Unknown or private ioctl
 	 */
 	default:
-		if ((cmd >= SIOCDEVPRIVATE &&
-		    cmd <= SIOCDEVPRIVATE + 15) ||
-		    cmd == SIOCBONDENSLAVE ||
+		if (cmd >= SIOCDEVPRIVATE &&
+		    cmd <= SIOCDEVPRIVATE + 15)
+			return dev_siocdevprivate(dev, ifr, cmd);
+
+		if (cmd == SIOCBONDENSLAVE ||
 		    cmd == SIOCBONDRELEASE ||
 		    cmd == SIOCBONDSETHWADDR ||
 		    cmd == SIOCBONDSLAVEINFOQUERY ||
-- 
cgit v1.2.3


From 25ec92fbdd23a0a2bfd2bdf489e60ea4f0ae46d1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 27 Jul 2021 15:45:04 +0200
Subject: hamradio: use ndo_siocdevprivate

hamradio uses a set of private ioctls that do seem to work
correctly in compat mode, as they only rely on the ifr_data
pointer.

Move them over to the ndo_siocdevprivate callback as a cleanup.

Cc: Thomas Sailer <t.sailer@alumni.ethz.ch>
Cc: Joerg Reuter <jreuter@yaina.de>
Cc: Jean-Paul Roubelat <jpr@f6fbb.org>
Cc: linux-hams@vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hamradio/baycom_epp.c     |  9 +++++----
 drivers/net/hamradio/baycom_par.c     | 12 ++++++------
 drivers/net/hamradio/baycom_ser_fdx.c | 12 ++++++------
 drivers/net/hamradio/baycom_ser_hdx.c | 12 ++++++------
 drivers/net/hamradio/bpqether.c       |  9 +++++----
 drivers/net/hamradio/dmascc.c         | 18 ++++++++----------
 drivers/net/hamradio/hdlcdrv.c        | 20 +++++++++++---------
 drivers/net/hamradio/scc.c            | 13 ++++++++-----
 drivers/net/hamradio/yam.c            | 19 +++++++++----------
 include/linux/hdlcdrv.h               |  2 +-
 10 files changed, 65 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 4435a1195194..775dcf4ebde5 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -1005,7 +1005,8 @@ static int baycom_setmode(struct baycom_state *bc, const char *modestr)
 
 /* --------------------------------------------------------------------- */
 
-static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int baycom_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+				 void __user *data, int cmd)
 {
 	struct baycom_state *bc = netdev_priv(dev);
 	struct hdlcdrv_ioctl hi;
@@ -1013,7 +1014,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	if (cmd != SIOCDEVPRIVATE)
 		return -ENOIOCTLCMD;
 
-	if (copy_from_user(&hi, ifr->ifr_data, sizeof(hi)))
+	if (copy_from_user(&hi, data, sizeof(hi)))
 		return -EFAULT;
 	switch (hi.cmd) {
 	default:
@@ -1104,7 +1105,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		return HDLCDRV_PARMASK_IOBASE;
 
 	}
-	if (copy_to_user(ifr->ifr_data, &hi, sizeof(hi)))
+	if (copy_to_user(data, &hi, sizeof(hi)))
 		return -EFAULT;
 	return 0;
 }
@@ -1114,7 +1115,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 static const struct net_device_ops baycom_netdev_ops = {
 	.ndo_open	     = epp_open,
 	.ndo_stop	     = epp_close,
-	.ndo_do_ioctl	     = baycom_ioctl,
+	.ndo_siocdevprivate  = baycom_siocdevprivate,
 	.ndo_start_xmit      = baycom_send_packet,
 	.ndo_set_mac_address = baycom_set_mac_address,
 };
diff --git a/drivers/net/hamradio/baycom_par.c b/drivers/net/hamradio/baycom_par.c
index 6a3dc7b3f28a..fd7da5bb1fa5 100644
--- a/drivers/net/hamradio/baycom_par.c
+++ b/drivers/net/hamradio/baycom_par.c
@@ -380,7 +380,7 @@ static int par96_close(struct net_device *dev)
  * ===================== hdlcdrv driver interface =========================
  */
 
-static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
+static int baycom_ioctl(struct net_device *dev, void __user *data,
 			struct hdlcdrv_ioctl *hi, int cmd);
 
 /* --------------------------------------------------------------------- */
@@ -408,7 +408,7 @@ static int baycom_setmode(struct baycom_state *bc, const char *modestr)
 
 /* --------------------------------------------------------------------- */
 
-static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
+static int baycom_ioctl(struct net_device *dev, void __user *data,
 			struct hdlcdrv_ioctl *hi, int cmd)
 {
 	struct baycom_state *bc;
@@ -428,7 +428,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 
 	case HDLCDRVCTL_GETMODE:
 		strcpy(hi->data.modename, bc->options ? "par96" : "picpar");
-		if (copy_to_user(ifr->ifr_data, hi, sizeof(struct hdlcdrv_ioctl)))
+		if (copy_to_user(data, hi, sizeof(struct hdlcdrv_ioctl)))
 			return -EFAULT;
 		return 0;
 
@@ -440,7 +440,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 
 	case HDLCDRVCTL_MODELIST:
 		strcpy(hi->data.modename, "par96,picpar");
-		if (copy_to_user(ifr->ifr_data, hi, sizeof(struct hdlcdrv_ioctl)))
+		if (copy_to_user(data, hi, sizeof(struct hdlcdrv_ioctl)))
 			return -EFAULT;
 		return 0;
 
@@ -449,7 +449,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 
 	}
 
-	if (copy_from_user(&bi, ifr->ifr_data, sizeof(bi)))
+	if (copy_from_user(&bi, data, sizeof(bi)))
 		return -EFAULT;
 	switch (bi.cmd) {
 	default:
@@ -464,7 +464,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 #endif /* BAYCOM_DEBUG */
 
 	}
-	if (copy_to_user(ifr->ifr_data, &bi, sizeof(bi)))
+	if (copy_to_user(data, &bi, sizeof(bi)))
 		return -EFAULT;
 	return 0;
 
diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c
index 04bb409707fc..646f605e358f 100644
--- a/drivers/net/hamradio/baycom_ser_fdx.c
+++ b/drivers/net/hamradio/baycom_ser_fdx.c
@@ -462,7 +462,7 @@ static int ser12_close(struct net_device *dev)
 
 /* --------------------------------------------------------------------- */
 
-static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
+static int baycom_ioctl(struct net_device *dev, void __user *data,
 			struct hdlcdrv_ioctl *hi, int cmd);
 
 /* --------------------------------------------------------------------- */
@@ -497,7 +497,7 @@ static int baycom_setmode(struct baycom_state *bc, const char *modestr)
 
 /* --------------------------------------------------------------------- */
 
-static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
+static int baycom_ioctl(struct net_device *dev, void __user *data,
 			struct hdlcdrv_ioctl *hi, int cmd)
 {
 	struct baycom_state *bc;
@@ -519,7 +519,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 		sprintf(hi->data.modename, "ser%u", bc->baud / 100);
 		if (bc->opt_dcd <= 0)
 			strcat(hi->data.modename, (!bc->opt_dcd) ? "*" : "+");
-		if (copy_to_user(ifr->ifr_data, hi, sizeof(struct hdlcdrv_ioctl)))
+		if (copy_to_user(data, hi, sizeof(struct hdlcdrv_ioctl)))
 			return -EFAULT;
 		return 0;
 
@@ -531,7 +531,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 
 	case HDLCDRVCTL_MODELIST:
 		strcpy(hi->data.modename, "ser12,ser3,ser24");
-		if (copy_to_user(ifr->ifr_data, hi, sizeof(struct hdlcdrv_ioctl)))
+		if (copy_to_user(data, hi, sizeof(struct hdlcdrv_ioctl)))
 			return -EFAULT;
 		return 0;
 
@@ -540,7 +540,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 
 	}
 
-	if (copy_from_user(&bi, ifr->ifr_data, sizeof(bi)))
+	if (copy_from_user(&bi, data, sizeof(bi)))
 		return -EFAULT;
 	switch (bi.cmd) {
 	default:
@@ -555,7 +555,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 #endif /* BAYCOM_DEBUG */
 
 	}
-	if (copy_to_user(ifr->ifr_data, &bi, sizeof(bi)))
+	if (copy_to_user(data, &bi, sizeof(bi)))
 		return -EFAULT;
 	return 0;
 
diff --git a/drivers/net/hamradio/baycom_ser_hdx.c b/drivers/net/hamradio/baycom_ser_hdx.c
index a1acb3a47bdb..5d1ab4840753 100644
--- a/drivers/net/hamradio/baycom_ser_hdx.c
+++ b/drivers/net/hamradio/baycom_ser_hdx.c
@@ -521,7 +521,7 @@ static int ser12_close(struct net_device *dev)
 
 /* --------------------------------------------------------------------- */
 
-static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
+static int baycom_ioctl(struct net_device *dev, void __user *data,
 			struct hdlcdrv_ioctl *hi, int cmd);
 
 /* --------------------------------------------------------------------- */
@@ -551,7 +551,7 @@ static int baycom_setmode(struct baycom_state *bc, const char *modestr)
 
 /* --------------------------------------------------------------------- */
 
-static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
+static int baycom_ioctl(struct net_device *dev, void __user *data,
 			struct hdlcdrv_ioctl *hi, int cmd)
 {
 	struct baycom_state *bc;
@@ -573,7 +573,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 		strcpy(hi->data.modename, "ser12");
 		if (bc->opt_dcd <= 0)
 			strcat(hi->data.modename, (!bc->opt_dcd) ? "*" : (bc->opt_dcd == -2) ? "@" : "+");
-		if (copy_to_user(ifr->ifr_data, hi, sizeof(struct hdlcdrv_ioctl)))
+		if (copy_to_user(data, hi, sizeof(struct hdlcdrv_ioctl)))
 			return -EFAULT;
 		return 0;
 
@@ -585,7 +585,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 
 	case HDLCDRVCTL_MODELIST:
 		strcpy(hi->data.modename, "ser12");
-		if (copy_to_user(ifr->ifr_data, hi, sizeof(struct hdlcdrv_ioctl)))
+		if (copy_to_user(data, hi, sizeof(struct hdlcdrv_ioctl)))
 			return -EFAULT;
 		return 0;
 
@@ -594,7 +594,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 
 	}
 
-	if (copy_from_user(&bi, ifr->ifr_data, sizeof(bi)))
+	if (copy_from_user(&bi, data, sizeof(bi)))
 		return -EFAULT;
 	switch (bi.cmd) {
 	default:
@@ -609,7 +609,7 @@ static int baycom_ioctl(struct net_device *dev, struct ifreq *ifr,
 #endif /* BAYCOM_DEBUG */
 
 	}
-	if (copy_to_user(ifr->ifr_data, &bi, sizeof(bi)))
+	if (copy_to_user(data, &bi, sizeof(bi)))
 		return -EFAULT;
 	return 0;
 
diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
index 0e623c2e8b2d..d967b0748773 100644
--- a/drivers/net/hamradio/bpqether.c
+++ b/drivers/net/hamradio/bpqether.c
@@ -314,9 +314,10 @@ static int bpq_set_mac_address(struct net_device *dev, void *addr)
  *					source ethernet address (broadcast
  *					or multicast: accept all)
  */
-static int bpq_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int bpq_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+			      void __user *data, int cmd)
 {
-	struct bpq_ethaddr __user *ethaddr = ifr->ifr_data;
+	struct bpq_ethaddr __user *ethaddr = data;
 	struct bpqdev *bpq = netdev_priv(dev);
 	struct bpq_req req;
 
@@ -325,7 +326,7 @@ static int bpq_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 
 	switch (cmd) {
 		case SIOCSBPQETHOPT:
-			if (copy_from_user(&req, ifr->ifr_data, sizeof(struct bpq_req)))
+			if (copy_from_user(&req, data, sizeof(struct bpq_req)))
 				return -EFAULT;
 			switch (req.cmd) {
 				case SIOCGBPQETHPARAM:
@@ -448,7 +449,7 @@ static const struct net_device_ops bpq_netdev_ops = {
 	.ndo_stop	     = bpq_close,
 	.ndo_start_xmit	     = bpq_xmit,
 	.ndo_set_mac_address = bpq_set_mac_address,
-	.ndo_do_ioctl	     = bpq_ioctl,
+	.ndo_siocdevprivate  = bpq_siocdevprivate,
 };
 
 static void bpq_setup(struct net_device *dev)
diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c
index c25c8c99c5c7..b50b7fafd8d6 100644
--- a/drivers/net/hamradio/dmascc.c
+++ b/drivers/net/hamradio/dmascc.c
@@ -225,7 +225,8 @@ static int read_scc_data(struct scc_priv *priv);
 
 static int scc_open(struct net_device *dev);
 static int scc_close(struct net_device *dev);
-static int scc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
+static int scc_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+			      void __user *data, int cmd);
 static int scc_send_packet(struct sk_buff *skb, struct net_device *dev);
 static int scc_set_mac_address(struct net_device *dev, void *sa);
 
@@ -432,7 +433,7 @@ static const struct net_device_ops scc_netdev_ops = {
 	.ndo_open = scc_open,
 	.ndo_stop = scc_close,
 	.ndo_start_xmit = scc_send_packet,
-	.ndo_do_ioctl = scc_ioctl,
+	.ndo_siocdevprivate = scc_siocdevprivate,
 	.ndo_set_mac_address = scc_set_mac_address,
 };
 
@@ -881,15 +882,13 @@ static int scc_close(struct net_device *dev)
 }
 
 
-static int scc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int scc_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd)
 {
 	struct scc_priv *priv = dev->ml_priv;
 
 	switch (cmd) {
 	case SIOCGSCCPARAM:
-		if (copy_to_user
-		    (ifr->ifr_data, &priv->param,
-		     sizeof(struct scc_param)))
+		if (copy_to_user(data, &priv->param, sizeof(struct scc_param)))
 			return -EFAULT;
 		return 0;
 	case SIOCSSCCPARAM:
@@ -897,13 +896,12 @@ static int scc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			return -EPERM;
 		if (netif_running(dev))
 			return -EAGAIN;
-		if (copy_from_user
-		    (&priv->param, ifr->ifr_data,
-		     sizeof(struct scc_param)))
+		if (copy_from_user(&priv->param, data,
+				   sizeof(struct scc_param)))
 			return -EFAULT;
 		return 0;
 	default:
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 }
 
diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c
index cbaf1cdde7cb..5805cfc83854 100644
--- a/drivers/net/hamradio/hdlcdrv.c
+++ b/drivers/net/hamradio/hdlcdrv.c
@@ -483,23 +483,25 @@ static int hdlcdrv_close(struct net_device *dev)
 
 /* --------------------------------------------------------------------- */
 
-static int hdlcdrv_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int hdlcdrv_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+				  void __user *data, int cmd)
 {
 	struct hdlcdrv_state *s = netdev_priv(dev);
 	struct hdlcdrv_ioctl bi;
 
-	if (cmd != SIOCDEVPRIVATE) {
-		if (s->ops && s->ops->ioctl)
-			return s->ops->ioctl(dev, ifr, &bi, cmd);
+	if (cmd != SIOCDEVPRIVATE)
 		return -ENOIOCTLCMD;
-	}
-	if (copy_from_user(&bi, ifr->ifr_data, sizeof(bi)))
+
+	if (in_compat_syscall()) /* to be implemented */
+		return -ENOIOCTLCMD;
+
+	if (copy_from_user(&bi, data, sizeof(bi)))
 		return -EFAULT;
 
 	switch (bi.cmd) {
 	default:
 		if (s->ops && s->ops->ioctl)
-			return s->ops->ioctl(dev, ifr, &bi, cmd);
+			return s->ops->ioctl(dev, data, &bi, cmd);
 		return -ENOIOCTLCMD;
 
 	case HDLCDRVCTL_GETCHANNELPAR:
@@ -605,7 +607,7 @@ static int hdlcdrv_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		break;
 		
 	}
-	if (copy_to_user(ifr->ifr_data, &bi, sizeof(bi)))
+	if (copy_to_user(data, &bi, sizeof(bi)))
 		return -EFAULT;
 	return 0;
 
@@ -617,7 +619,7 @@ static const struct net_device_ops hdlcdrv_netdev = {
 	.ndo_open	= hdlcdrv_open,
 	.ndo_stop	= hdlcdrv_close,
 	.ndo_start_xmit = hdlcdrv_send_packet,
-	.ndo_do_ioctl	= hdlcdrv_ioctl,
+	.ndo_siocdevprivate  = hdlcdrv_siocdevprivate,
 	.ndo_set_mac_address = hdlcdrv_set_mac_address,
 };
 
diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c
index 3f1edd0526a4..e0bb131a33d7 100644
--- a/drivers/net/hamradio/scc.c
+++ b/drivers/net/hamradio/scc.c
@@ -210,7 +210,8 @@ static int scc_net_close(struct net_device *dev);
 static void scc_net_rx(struct scc_channel *scc, struct sk_buff *skb);
 static netdev_tx_t scc_net_tx(struct sk_buff *skb,
 			      struct net_device *dev);
-static int scc_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
+static int scc_net_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+				  void __user *data, int cmd);
 static int scc_net_set_mac_address(struct net_device *dev, void *addr);
 static struct net_device_stats * scc_net_get_stats(struct net_device *dev);
 
@@ -1550,7 +1551,7 @@ static const struct net_device_ops scc_netdev_ops = {
 	.ndo_start_xmit	     = scc_net_tx,
 	.ndo_set_mac_address = scc_net_set_mac_address,
 	.ndo_get_stats       = scc_net_get_stats,
-	.ndo_do_ioctl        = scc_net_ioctl,
+	.ndo_siocdevprivate  = scc_net_siocdevprivate,
 };
 
 /* ----> Initialize device <----- */
@@ -1703,7 +1704,8 @@ static netdev_tx_t scc_net_tx(struct sk_buff *skb, struct net_device *dev)
  * SIOCSCCCAL		- send calib. pattern	arg: (struct scc_calibrate *) arg
  */
 
-static int scc_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int scc_net_siocdevprivate(struct net_device *dev,
+				  struct ifreq *ifr, void __user *arg, int cmd)
 {
 	struct scc_kiss_cmd kiss_cmd;
 	struct scc_mem_config memcfg;
@@ -1712,8 +1714,6 @@ static int scc_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	struct scc_channel *scc = (struct scc_channel *) dev->ml_priv;
 	int chan;
 	unsigned char device_name[IFNAMSIZ];
-	void __user *arg = ifr->ifr_data;
-	
 	
 	if (!Driver_Initialized)
 	{
@@ -1722,6 +1722,9 @@ static int scc_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			int found = 1;
 
 			if (!capable(CAP_SYS_RAWIO)) return -EPERM;
+			if (in_compat_syscall())
+				return -EOPNOTSUPP;
+
 			if (!arg) return -EFAULT;
 
 			if (Nchips >= SCC_MAXCHIPS) 
diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c
index d4911041596c..6ddacbdb224b 100644
--- a/drivers/net/hamradio/yam.c
+++ b/drivers/net/hamradio/yam.c
@@ -920,15 +920,15 @@ static int yam_close(struct net_device *dev)
 
 /* --------------------------------------------------------------------- */
 
-static int yam_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int yam_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd)
 {
 	struct yam_port *yp = netdev_priv(dev);
 	struct yamdrv_ioctl_cfg yi;
 	struct yamdrv_ioctl_mcs *ym;
 	int ioctl_cmd;
 
-	if (copy_from_user(&ioctl_cmd, ifr->ifr_data, sizeof(int)))
-		 return -EFAULT;
+	if (copy_from_user(&ioctl_cmd, data, sizeof(int)))
+		return -EFAULT;
 
 	if (yp->magic != YAM_MAGIC)
 		return -EINVAL;
@@ -947,8 +947,7 @@ static int yam_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	case SIOCYAMSMCS:
 		if (netif_running(dev))
 			return -EINVAL;		/* Cannot change this parameter when up */
-		ym = memdup_user(ifr->ifr_data,
-				 sizeof(struct yamdrv_ioctl_mcs));
+		ym = memdup_user(data, sizeof(struct yamdrv_ioctl_mcs));
 		if (IS_ERR(ym))
 			return PTR_ERR(ym);
 		if (ym->cmd != SIOCYAMSMCS)
@@ -965,8 +964,8 @@ static int yam_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	case SIOCYAMSCFG:
 		if (!capable(CAP_SYS_RAWIO))
 			return -EPERM;
-		if (copy_from_user(&yi, ifr->ifr_data, sizeof(struct yamdrv_ioctl_cfg)))
-			 return -EFAULT;
+		if (copy_from_user(&yi, data, sizeof(struct yamdrv_ioctl_cfg)))
+			return -EFAULT;
 
 		if (yi.cmd != SIOCYAMSCFG)
 			return -EINVAL;
@@ -1045,8 +1044,8 @@ static int yam_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		yi.cfg.txtail = yp->txtail;
 		yi.cfg.persist = yp->pers;
 		yi.cfg.slottime = yp->slot;
-		if (copy_to_user(ifr->ifr_data, &yi, sizeof(struct yamdrv_ioctl_cfg)))
-			 return -EFAULT;
+		if (copy_to_user(data, &yi, sizeof(struct yamdrv_ioctl_cfg)))
+			return -EFAULT;
 		break;
 
 	default:
@@ -1074,7 +1073,7 @@ static const struct net_device_ops yam_netdev_ops = {
 	.ndo_open	     = yam_open,
 	.ndo_stop	     = yam_close,
 	.ndo_start_xmit      = yam_send_packet,
-	.ndo_do_ioctl 	     = yam_ioctl,
+	.ndo_siocdevprivate  = yam_siocdevprivate,
 	.ndo_set_mac_address = yam_set_mac_address,
 };
 
diff --git a/include/linux/hdlcdrv.h b/include/linux/hdlcdrv.h
index d4d633a49d36..5d70c3f98f5b 100644
--- a/include/linux/hdlcdrv.h
+++ b/include/linux/hdlcdrv.h
@@ -79,7 +79,7 @@ struct hdlcdrv_ops {
 	 */
 	int (*open)(struct net_device *);
 	int (*close)(struct net_device *);
-	int (*ioctl)(struct net_device *, struct ifreq *, 
+	int (*ioctl)(struct net_device *, void __user *,
 		     struct hdlcdrv_ioctl *, int);
 };
 
-- 
cgit v1.2.3


From 3e7a1c7c561ed8508fbdb98ed5708175bbcf7938 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 27 Jul 2021 15:45:06 +0200
Subject: ip_tunnel: use ndo_siocdevprivate

The various ipv4 and ipv6 tunnel drivers each implement a set
of 12 SIOCDEVPRIVATE commands for managing tunnels. These
all work correctly in compat mode.

Move them over to the new .ndo_siocdevprivate operation.

Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: David Ahern <dsahern@kernel.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h |  3 ++-
 net/ipv4/ip_gre.c        |  2 +-
 net/ipv4/ip_tunnel.c     |  9 +++++----
 net/ipv4/ip_vti.c        |  2 +-
 net/ipv4/ipip.c          |  2 +-
 net/ipv6/ip6_gre.c       | 17 +++++++++--------
 net/ipv6/ip6_tunnel.c    | 21 +++++++++++----------
 net/ipv6/ip6_vti.c       | 21 +++++++++++----------
 net/ipv6/sit.c           | 35 ++++++++++++++++++-----------------
 9 files changed, 59 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 548b65bd3973..bc3b13ec93c9 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -270,7 +270,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		       const u8 proto, int tunnel_hlen);
 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd);
-int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
+int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+			     void __user *data, int cmd);
 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 12dca0c85f3c..6ebf05859acb 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -923,7 +923,7 @@ static const struct net_device_ops ipgre_netdev_ops = {
 	.ndo_stop		= ipgre_close,
 #endif
 	.ndo_start_xmit		= ipgre_xmit,
-	.ndo_do_ioctl		= ip_tunnel_ioctl,
+	.ndo_siocdevprivate	= ip_tunnel_siocdevprivate,
 	.ndo_change_mtu		= ip_tunnel_change_mtu,
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_get_iflink		= ip_tunnel_get_iflink,
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 0dca00745ac3..7f0e810c06f4 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -958,19 +958,20 @@ done:
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
 
-int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+			     void __user *data, int cmd)
 {
 	struct ip_tunnel_parm p;
 	int err;
 
-	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+	if (copy_from_user(&p, data, sizeof(p)))
 		return -EFAULT;
 	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
-	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+	if (!err && copy_to_user(data, &p, sizeof(p)))
 		return -EFAULT;
 	return err;
 }
-EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
+EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
 
 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 {
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index eb560eecee08..efe25a0172e6 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -405,7 +405,7 @@ static const struct net_device_ops vti_netdev_ops = {
 	.ndo_init	= vti_tunnel_init,
 	.ndo_uninit	= ip_tunnel_uninit,
 	.ndo_start_xmit	= vti_tunnel_xmit,
-	.ndo_do_ioctl	= ip_tunnel_ioctl,
+	.ndo_siocdevprivate = ip_tunnel_siocdevprivate,
 	.ndo_change_mtu	= ip_tunnel_change_mtu,
 	.ndo_get_stats64 = dev_get_tstats64,
 	.ndo_get_iflink = ip_tunnel_get_iflink,
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 266c65577ba6..3aa78ccbec3e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -347,7 +347,7 @@ static const struct net_device_ops ipip_netdev_ops = {
 	.ndo_init       = ipip_tunnel_init,
 	.ndo_uninit     = ip_tunnel_uninit,
 	.ndo_start_xmit	= ipip_tunnel_xmit,
-	.ndo_do_ioctl	= ip_tunnel_ioctl,
+	.ndo_siocdevprivate = ip_tunnel_siocdevprivate,
 	.ndo_change_mtu = ip_tunnel_change_mtu,
 	.ndo_get_stats64 = dev_get_tstats64,
 	.ndo_get_iflink = ip_tunnel_get_iflink,
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index bc224f917bbd..3ad201d372d8 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1244,8 +1244,9 @@ static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u,
 	memcpy(u->name, p->name, sizeof(u->name));
 }
 
-static int ip6gre_tunnel_ioctl(struct net_device *dev,
-	struct ifreq *ifr, int cmd)
+static int ip6gre_tunnel_siocdevprivate(struct net_device *dev,
+					struct ifreq *ifr, void __user *data,
+					int cmd)
 {
 	int err = 0;
 	struct ip6_tnl_parm2 p;
@@ -1259,7 +1260,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
 	switch (cmd) {
 	case SIOCGETTUNNEL:
 		if (dev == ign->fb_tunnel_dev) {
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+			if (copy_from_user(&p, data, sizeof(p))) {
 				err = -EFAULT;
 				break;
 			}
@@ -1270,7 +1271,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
 		}
 		memset(&p, 0, sizeof(p));
 		ip6gre_tnl_parm_to_user(&p, &t->parms);
-		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+		if (copy_to_user(data, &p, sizeof(p)))
 			err = -EFAULT;
 		break;
 
@@ -1281,7 +1282,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
 			goto done;
 
 		err = -EFAULT;
-		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+		if (copy_from_user(&p, data, sizeof(p)))
 			goto done;
 
 		err = -EINVAL;
@@ -1318,7 +1319,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
 
 			memset(&p, 0, sizeof(p));
 			ip6gre_tnl_parm_to_user(&p, &t->parms);
-			if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+			if (copy_to_user(data, &p, sizeof(p)))
 				err = -EFAULT;
 		} else
 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
@@ -1331,7 +1332,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
 
 		if (dev == ign->fb_tunnel_dev) {
 			err = -EFAULT;
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+			if (copy_from_user(&p, data, sizeof(p)))
 				goto done;
 			err = -ENOENT;
 			ip6gre_tnl_parm_from_user(&p1, &p);
@@ -1398,7 +1399,7 @@ static const struct net_device_ops ip6gre_netdev_ops = {
 	.ndo_init		= ip6gre_tunnel_init,
 	.ndo_uninit		= ip6gre_tunnel_uninit,
 	.ndo_start_xmit		= ip6gre_tunnel_xmit,
-	.ndo_do_ioctl		= ip6gre_tunnel_ioctl,
+	.ndo_siocdevprivate	= ip6gre_tunnel_siocdevprivate,
 	.ndo_change_mtu		= ip6_tnl_change_mtu,
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_get_iflink		= ip6_tnl_get_iflink,
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 322698d9fcf4..20a67efda47f 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1581,9 +1581,10 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p)
 }
 
 /**
- * ip6_tnl_ioctl - configure ipv6 tunnels from userspace
+ * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace
  *   @dev: virtual device associated with tunnel
- *   @ifr: parameters passed from userspace
+ *   @ifr: unused
+ *   @data: parameters passed from userspace
  *   @cmd: command to be performed
  *
  * Description:
@@ -1609,7 +1610,8 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p)
  **/
 
 static int
-ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+		       void __user *data, int cmd)
 {
 	int err = 0;
 	struct ip6_tnl_parm p;
@@ -1623,7 +1625,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	switch (cmd) {
 	case SIOCGETTUNNEL:
 		if (dev == ip6n->fb_tnl_dev) {
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+			if (copy_from_user(&p, data, sizeof(p))) {
 				err = -EFAULT;
 				break;
 			}
@@ -1635,9 +1637,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			memset(&p, 0, sizeof(p));
 		}
 		ip6_tnl_parm_to_user(&p, &t->parms);
-		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) {
+		if (copy_to_user(data, &p, sizeof(p)))
 			err = -EFAULT;
-		}
 		break;
 	case SIOCADDTUNNEL:
 	case SIOCCHGTUNNEL:
@@ -1645,7 +1646,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			break;
 		err = -EFAULT;
-		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+		if (copy_from_user(&p, data, sizeof(p)))
 			break;
 		err = -EINVAL;
 		if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP &&
@@ -1669,7 +1670,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		if (!IS_ERR(t)) {
 			err = 0;
 			ip6_tnl_parm_to_user(&p, &t->parms);
-			if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+			if (copy_to_user(data, &p, sizeof(p)))
 				err = -EFAULT;
 
 		} else {
@@ -1683,7 +1684,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 
 		if (dev == ip6n->fb_tnl_dev) {
 			err = -EFAULT;
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+			if (copy_from_user(&p, data, sizeof(p)))
 				break;
 			err = -ENOENT;
 			ip6_tnl_parm_from_user(&p1, &p);
@@ -1802,7 +1803,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
 	.ndo_init	= ip6_tnl_dev_init,
 	.ndo_uninit	= ip6_tnl_dev_uninit,
 	.ndo_start_xmit = ip6_tnl_start_xmit,
-	.ndo_do_ioctl	= ip6_tnl_ioctl,
+	.ndo_siocdevprivate = ip6_tnl_siocdevprivate,
 	.ndo_change_mtu = ip6_tnl_change_mtu,
 	.ndo_get_stats64 = dev_get_tstats64,
 	.ndo_get_iflink = ip6_tnl_get_iflink,
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 2d048e21abbb..1d8e3ffa225d 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -771,13 +771,14 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
 }
 
 /**
- * vti6_ioctl - configure vti6 tunnels from userspace
+ * vti6_siocdevprivate - configure vti6 tunnels from userspace
  *   @dev: virtual device associated with tunnel
- *   @ifr: parameters passed from userspace
+ *   @ifr: unused
+ *   @data: parameters passed from userspace
  *   @cmd: command to be performed
  *
  * Description:
- *   vti6_ioctl() is used for managing vti6 tunnels
+ *   vti6_siocdevprivate() is used for managing vti6 tunnels
  *   from userspace.
  *
  *   The possible commands are the following:
@@ -798,7 +799,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
  *   %-ENODEV if attempting to change or delete a nonexisting device
  **/
 static int
-vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd)
 {
 	int err = 0;
 	struct ip6_tnl_parm2 p;
@@ -810,7 +811,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	switch (cmd) {
 	case SIOCGETTUNNEL:
 		if (dev == ip6n->fb_tnl_dev) {
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+			if (copy_from_user(&p, data, sizeof(p))) {
 				err = -EFAULT;
 				break;
 			}
@@ -822,7 +823,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		if (!t)
 			t = netdev_priv(dev);
 		vti6_parm_to_user(&p, &t->parms);
-		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+		if (copy_to_user(data, &p, sizeof(p)))
 			err = -EFAULT;
 		break;
 	case SIOCADDTUNNEL:
@@ -831,7 +832,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			break;
 		err = -EFAULT;
-		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+		if (copy_from_user(&p, data, sizeof(p)))
 			break;
 		err = -EINVAL;
 		if (p.proto != IPPROTO_IPV6  && p.proto != 0)
@@ -852,7 +853,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		if (t) {
 			err = 0;
 			vti6_parm_to_user(&p, &t->parms);
-			if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+			if (copy_to_user(data, &p, sizeof(p)))
 				err = -EFAULT;
 
 		} else
@@ -865,7 +866,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 
 		if (dev == ip6n->fb_tnl_dev) {
 			err = -EFAULT;
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+			if (copy_from_user(&p, data, sizeof(p)))
 				break;
 			err = -ENOENT;
 			vti6_parm_from_user(&p1, &p);
@@ -890,7 +891,7 @@ static const struct net_device_ops vti6_netdev_ops = {
 	.ndo_init	= vti6_dev_init,
 	.ndo_uninit	= vti6_dev_uninit,
 	.ndo_start_xmit = vti6_tnl_xmit,
-	.ndo_do_ioctl	= vti6_ioctl,
+	.ndo_siocdevprivate = vti6_siocdevprivate,
 	.ndo_get_stats64 = dev_get_tstats64,
 	.ndo_get_iflink = ip6_tnl_get_iflink,
 };
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 33adc12b697d..ef0c7a7c18e2 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -299,9 +299,8 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr)
 
 }
 
-static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr)
+static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __user *a)
 {
-	struct ip_tunnel_prl __user *a = ifr->ifr_ifru.ifru_data;
 	struct ip_tunnel *t = netdev_priv(dev);
 	struct ip_tunnel_prl kprl, *kp;
 	struct ip_tunnel_prl_entry *prl;
@@ -454,8 +453,8 @@ out:
 	return err;
 }
 
-static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ifreq *ifr,
-		int cmd)
+static int ipip6_tunnel_prl_ctl(struct net_device *dev,
+				struct ip_tunnel_prl __user *data, int cmd)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
 	struct ip_tunnel_prl prl;
@@ -466,7 +465,7 @@ static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ifreq *ifr,
 	if (dev == dev_to_sit_net(dev)->fb_tunnel_dev)
 		return -EINVAL;
 
-	if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl)))
+	if (copy_from_user(&prl, data, sizeof(prl)))
 		return -EFAULT;
 
 	switch (cmd) {
@@ -1198,14 +1197,14 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
 }
 
 static int
-ipip6_tunnel_get6rd(struct net_device *dev, struct ifreq *ifr)
+ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
 	struct ip_tunnel_6rd ip6rd;
 	struct ip_tunnel_parm p;
 
 	if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) {
-		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+		if (copy_from_user(&p, data, sizeof(p)))
 			return -EFAULT;
 		t = ipip6_tunnel_locate(t->net, &p, 0);
 	}
@@ -1216,13 +1215,14 @@ ipip6_tunnel_get6rd(struct net_device *dev, struct ifreq *ifr)
 	ip6rd.relay_prefix = t->ip6rd.relay_prefix;
 	ip6rd.prefixlen = t->ip6rd.prefixlen;
 	ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen;
-	if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, sizeof(ip6rd)))
+	if (copy_to_user(data, &ip6rd, sizeof(ip6rd)))
 		return -EFAULT;
 	return 0;
 }
 
 static int
-ipip6_tunnel_6rdctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+ipip6_tunnel_6rdctl(struct net_device *dev, struct ip_tunnel_6rd __user *data,
+		    int cmd)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
 	struct ip_tunnel_6rd ip6rd;
@@ -1230,7 +1230,7 @@ ipip6_tunnel_6rdctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 
 	if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
-	if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, sizeof(ip6rd)))
+	if (copy_from_user(&ip6rd, data, sizeof(ip6rd)))
 		return -EFAULT;
 
 	if (cmd != SIOCDEL6RD) {
@@ -1369,27 +1369,28 @@ ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 }
 
 static int
-ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+ipip6_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+			    void __user *data, int cmd)
 {
 	switch (cmd) {
 	case SIOCGETTUNNEL:
 	case SIOCADDTUNNEL:
 	case SIOCCHGTUNNEL:
 	case SIOCDELTUNNEL:
-		return ip_tunnel_ioctl(dev, ifr, cmd);
+		return ip_tunnel_siocdevprivate(dev, ifr, data, cmd);
 	case SIOCGETPRL:
-		return ipip6_tunnel_get_prl(dev, ifr);
+		return ipip6_tunnel_get_prl(dev, data);
 	case SIOCADDPRL:
 	case SIOCDELPRL:
 	case SIOCCHGPRL:
-		return ipip6_tunnel_prl_ctl(dev, ifr, cmd);
+		return ipip6_tunnel_prl_ctl(dev, data, cmd);
 #ifdef CONFIG_IPV6_SIT_6RD
 	case SIOCGET6RD:
-		return ipip6_tunnel_get6rd(dev, ifr);
+		return ipip6_tunnel_get6rd(dev, data);
 	case SIOCADD6RD:
 	case SIOCCHG6RD:
 	case SIOCDEL6RD:
-		return ipip6_tunnel_6rdctl(dev, ifr, cmd);
+		return ipip6_tunnel_6rdctl(dev, data, cmd);
 #endif
 	default:
 		return -EINVAL;
@@ -1400,7 +1401,7 @@ static const struct net_device_ops ipip6_netdev_ops = {
 	.ndo_init	= ipip6_tunnel_init,
 	.ndo_uninit	= ipip6_tunnel_uninit,
 	.ndo_start_xmit	= sit_tunnel_xmit,
-	.ndo_do_ioctl	= ipip6_tunnel_ioctl,
+	.ndo_siocdevprivate = ipip6_tunnel_siocdevprivate,
 	.ndo_get_stats64 = dev_get_tstats64,
 	.ndo_get_iflink = ip_tunnel_get_iflink,
 	.ndo_tunnel_ctl = ipip6_tunnel_ctl,
-- 
cgit v1.2.3


From a554bf96b49db4c208e305ae92546422e9489380 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 27 Jul 2021 15:45:12 +0200
Subject: dev_ioctl: pass SIOCDEVPRIVATE data separately

The compat handlers for SIOCDEVPRIVATE are incorrect for any driver that
passes data as part of struct ifreq rather than as an ifr_data pointer, or
that passes data back this way, since the compat_ifr_data_ioctl() helper
overwrites the ifr_data pointer and does not copy anything back out.

Since all drivers using devprivate commands are now converted to the
new .ndo_siocdevprivate callback, fix this by adding the missing piece
and passing the pointer separately the whole way.

This further unifies the native and compat logic for socket ioctls,
as the new code now passes the correct pointer as well as the correct
data for both native and compat ioctls.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  4 ++--
 net/core/dev_ioctl.c      | 22 ++++++++++---------
 net/ethtool/ioctl.c       |  3 +--
 net/socket.c              | 55 +++++++++++------------------------------------
 4 files changed, 28 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 670e1a8e5928..658d8cf57342 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4012,9 +4012,9 @@ bool dev_valid_name(const char *name);
 int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg);
 int put_user_ifreq(struct ifreq *ifr, void __user *arg);
 int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
-		bool *need_copyout);
+		void __user *data, bool *need_copyout);
 int dev_ifconf(struct net *net, struct ifconf __user *ifc);
-int dev_ethtool(struct net *net, struct ifreq *);
+int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata);
 unsigned int dev_get_flags(const struct net_device *);
 int __dev_change_flags(struct net_device *dev, unsigned int flags,
 		       struct netlink_ext_ack *extack);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 75e3e340d884..3ace1e4f6b80 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -259,11 +259,10 @@ static int dev_do_ioctl(struct net_device *dev,
 	return err;
 }
 
-static int dev_siocdevprivate(struct net_device *dev,
-			      struct ifreq *ifr, unsigned int cmd)
+static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+			      void __user *data, unsigned int cmd)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
-	void __user *data = ifr->ifr_data;
 
 	if (ops->ndo_siocdevprivate) {
 		if (netif_device_present(dev))
@@ -273,13 +272,15 @@ static int dev_siocdevprivate(struct net_device *dev,
 	}
 
 	/* fall back to do_ioctl for drivers not yet converted */
+	ifr->ifr_data = data;
 	return dev_do_ioctl(dev, ifr, cmd);
 }
 
 /*
  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
  */
-static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
+static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
+		      unsigned int cmd)
 {
 	int err;
 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
@@ -355,7 +356,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 	default:
 		if (cmd >= SIOCDEVPRIVATE &&
 		    cmd <= SIOCDEVPRIVATE + 15)
-			return dev_siocdevprivate(dev, ifr, cmd);
+			return dev_siocdevprivate(dev, ifr, data, cmd);
 
 		if (cmd == SIOCBONDENSLAVE ||
 		    cmd == SIOCBONDRELEASE ||
@@ -424,7 +425,8 @@ EXPORT_SYMBOL(dev_load);
  *	positive or a negative errno code on error.
  */
 
-int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout)
+int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
+	      void __user *data, bool *need_copyout)
 {
 	int ret;
 	char *colon;
@@ -475,7 +477,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
 	case SIOCETHTOOL:
 		dev_load(net, ifr->ifr_name);
 		rtnl_lock();
-		ret = dev_ethtool(net, ifr);
+		ret = dev_ethtool(net, ifr, data);
 		rtnl_unlock();
 		if (colon)
 			*colon = ':';
@@ -494,7 +496,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 		rtnl_lock();
-		ret = dev_ifsioc(net, ifr, cmd);
+		ret = dev_ifsioc(net, ifr, data, cmd);
 		rtnl_unlock();
 		if (colon)
 			*colon = ':';
@@ -540,7 +542,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
 	case SIOCBONDINFOQUERY:
 		dev_load(net, ifr->ifr_name);
 		rtnl_lock();
-		ret = dev_ifsioc(net, ifr, cmd);
+		ret = dev_ifsioc(net, ifr, data, cmd);
 		rtnl_unlock();
 		if (need_copyout)
 			*need_copyout = false;
@@ -565,7 +567,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
 		     cmd <= SIOCDEVPRIVATE + 15)) {
 			dev_load(net, ifr->ifr_name);
 			rtnl_lock();
-			ret = dev_ifsioc(net, ifr, cmd);
+			ret = dev_ifsioc(net, ifr, data, cmd);
 			rtnl_unlock();
 			return ret;
 		}
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index af011534bcb2..b0fa2b00ad43 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -2685,10 +2685,9 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr)
 
 /* The main entry point in this file.  Called from net/core/dev_ioctl.c */
 
-int dev_ethtool(struct net *net, struct ifreq *ifr)
+int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr)
 {
 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
-	void __user *useraddr = ifr->ifr_data;
 	u32 ethcmd, sub_cmd;
 	int rc;
 	netdev_features_t old_features;
diff --git a/net/socket.c b/net/socket.c
index 84de89c1ee9d..ddce6327633e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1092,6 +1092,7 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
 	bool need_copyout;
 	int err;
 	void __user *argp = (void __user *)arg;
+	void __user *data;
 
 	err = sock->ops->ioctl(sock, cmd, arg);
 
@@ -1102,11 +1103,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
 	if (err != -ENOIOCTLCMD)
 		return err;
 
-	if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
+	if (get_user_ifreq(&ifr, &data, argp))
 		return -EFAULT;
-	err = dev_ioctl(net, cmd, &ifr, &need_copyout);
+	err = dev_ioctl(net, cmd, &ifr, data, &need_copyout);
 	if (!err && need_copyout)
-		if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
+		if (put_user_ifreq(&ifr, argp))
 			return -EFAULT;
 
 	return err;
@@ -1130,12 +1131,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	net = sock_net(sk);
 	if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) {
 		struct ifreq ifr;
+		void __user *data;
 		bool need_copyout;
-		if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
+		if (get_user_ifreq(&ifr, &data, argp))
 			return -EFAULT;
-		err = dev_ioctl(net, cmd, &ifr, &need_copyout);
+		err = dev_ioctl(net, cmd, &ifr, data, &need_copyout);
 		if (!err && need_copyout)
-			if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
+			if (put_user_ifreq(&ifr, argp))
 				return -EFAULT;
 	} else
 #ifdef CONFIG_WEXT_CORE
@@ -3186,7 +3188,7 @@ static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32
 	saved = ifr.ifr_settings.ifs_ifsu.raw_hdlc;
 	ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32);
 
-	err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL);
+	err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL, NULL);
 	if (!err) {
 		ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved;
 		if (put_user_ifreq(&ifr, uifr32))
@@ -3200,42 +3202,13 @@ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
 				 struct compat_ifreq __user *u_ifreq32)
 {
 	struct ifreq ifreq;
-	u32 data32;
+	void __user *data;
 
-	if (copy_from_user(ifreq.ifr_name, u_ifreq32->ifr_name, IFNAMSIZ))
+	if (get_user_ifreq(&ifreq, &data, u_ifreq32))
 		return -EFAULT;
-	if (get_user(data32, &u_ifreq32->ifr_data))
-		return -EFAULT;
-	ifreq.ifr_data = compat_ptr(data32);
+	ifreq.ifr_data = data;
 
-	return dev_ioctl(net, cmd, &ifreq, NULL);
-}
-
-static int compat_ifreq_ioctl(struct net *net, struct socket *sock,
-			      unsigned int cmd,
-			      unsigned long arg,
-			      struct compat_ifreq __user *uifr32)
-{
-	struct ifreq ifr;
-	bool need_copyout;
-	int err;
-
-	err = sock->ops->ioctl(sock, cmd, arg);
-
-	/* If this ioctl is unknown try to hand it down
-	 * to the NIC driver.
-	 */
-	if (err != -ENOIOCTLCMD)
-		return err;
-
-	if (get_user_ifreq(&ifr, NULL, uifr32))
-		return -EFAULT;
-	err = dev_ioctl(net, cmd, &ifr, &need_copyout);
-	if (!err && need_copyout)
-		if (put_user_ifreq(&ifr, uifr32))
-			return -EFAULT;
-
-	return err;
+	return dev_ioctl(net, cmd, &ifreq, data, NULL);
 }
 
 /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
@@ -3337,8 +3310,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCBONDRELEASE:
 	case SIOCBONDSETHWADDR:
 	case SIOCBONDCHANGEACTIVE:
-		return compat_ifreq_ioctl(net, sock, cmd, arg, argp);
-
 	case SIOCSARP:
 	case SIOCGARP:
 	case SIOCDARP:
-- 
cgit v1.2.3


From a76053707dbf0dc020a73b4d90cd952409ef3691 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 27 Jul 2021 15:45:13 +0200
Subject: dev_ioctl: split out ndo_eth_ioctl

Most users of ndo_do_ioctl are ethernet drivers that implement
the MII commands SIOCGMIIPHY/SIOCGMIIREG/SIOCSMIIREG, or hardware
timestamping with SIOCSHWTSTAMP/SIOCGHWTSTAMP.

Separate these from the few drivers that use ndo_do_ioctl to
implement SIOCBOND, SIOCBR and SIOCWANDEV commands.

This is a purely cosmetic change intended to help readers find
their way through the implementation.

Cc: Doug Ledford <dledford@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Vivien Didelot <vivien.didelot@gmail.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Vladimir Oltean <olteanv@gmail.com>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: linux-rdma@vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdevices.rst            |  4 +++
 Documentation/networking/timestamping.rst          |  6 ++--
 drivers/infiniband/ulp/ipoib/ipoib_main.c          |  8 ++---
 drivers/net/bonding/bond_main.c                    | 42 +++++++++++++++-------
 drivers/net/ethernet/3com/3c574_cs.c               |  2 +-
 drivers/net/ethernet/3com/3c59x.c                  |  4 +--
 drivers/net/ethernet/8390/ax88796.c                |  2 +-
 drivers/net/ethernet/8390/axnet_cs.c               |  2 +-
 drivers/net/ethernet/8390/pcnet_cs.c               |  2 +-
 drivers/net/ethernet/actions/owl-emac.c            |  6 ++--
 drivers/net/ethernet/adaptec/starfire.c            |  2 +-
 drivers/net/ethernet/agere/et131x.c                |  2 +-
 drivers/net/ethernet/allwinner/sun4i-emac.c        |  2 +-
 drivers/net/ethernet/amd/amd8111e.c                |  2 +-
 drivers/net/ethernet/amd/au1000_eth.c              |  2 +-
 drivers/net/ethernet/amd/pcnet32.c                 |  2 +-
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c           |  2 +-
 drivers/net/ethernet/aquantia/atlantic/aq_main.c   |  2 +-
 drivers/net/ethernet/arc/emac_main.c               |  2 +-
 drivers/net/ethernet/atheros/ag71xx.c              |  2 +-
 drivers/net/ethernet/atheros/alx/main.c            |  2 +-
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c    |  2 +-
 drivers/net/ethernet/atheros/atl1e/atl1e_main.c    |  2 +-
 drivers/net/ethernet/atheros/atlx/atl1.c           |  2 +-
 drivers/net/ethernet/atheros/atlx/atl2.c           |  2 +-
 drivers/net/ethernet/broadcom/b44.c                |  2 +-
 drivers/net/ethernet/broadcom/bcm63xx_enet.c       |  4 +--
 drivers/net/ethernet/broadcom/bgmac.c              |  2 +-
 drivers/net/ethernet/broadcom/bnx2.c               |  2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c   |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          |  2 +-
 drivers/net/ethernet/broadcom/genet/bcmgenet.c     |  2 +-
 drivers/net/ethernet/broadcom/sb1250-mac.c         |  2 +-
 drivers/net/ethernet/broadcom/tg3.c                |  2 +-
 drivers/net/ethernet/cadence/macb_main.c           |  4 +--
 drivers/net/ethernet/cavium/liquidio/lio_main.c    |  2 +-
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c |  2 +-
 drivers/net/ethernet/cavium/octeon/octeon_mgmt.c   |  2 +-
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   |  2 +-
 drivers/net/ethernet/chelsio/cxgb/cxgb2.c          |  2 +-
 drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c    |  2 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |  2 +-
 .../net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c    |  2 +-
 drivers/net/ethernet/cirrus/ep93xx_eth.c           |  2 +-
 drivers/net/ethernet/davicom/dm9000.c              |  2 +-
 drivers/net/ethernet/dec/tulip/tulip_core.c        |  2 +-
 drivers/net/ethernet/dec/tulip/winbond-840.c       |  2 +-
 drivers/net/ethernet/dlink/dl2k.c                  |  2 +-
 drivers/net/ethernet/dlink/sundance.c              |  2 +-
 drivers/net/ethernet/dnet.c                        |  2 +-
 drivers/net/ethernet/ethoc.c                       |  2 +-
 drivers/net/ethernet/faraday/ftgmac100.c           |  2 +-
 drivers/net/ethernet/faraday/ftmac100.c            |  2 +-
 drivers/net/ethernet/fealnx.c                      |  2 +-
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c     |  2 +-
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c   |  2 +-
 drivers/net/ethernet/freescale/enetc/enetc_pf.c    |  2 +-
 drivers/net/ethernet/freescale/enetc/enetc_vf.c    |  2 +-
 drivers/net/ethernet/freescale/fec_main.c          |  2 +-
 drivers/net/ethernet/freescale/fec_mpc52xx.c       |  2 +-
 .../net/ethernet/freescale/fs_enet/fs_enet-main.c  |  2 +-
 drivers/net/ethernet/freescale/gianfar.c           |  2 +-
 drivers/net/ethernet/freescale/ucc_geth.c          |  2 +-
 drivers/net/ethernet/hisilicon/hisi_femac.c        |  2 +-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c      |  2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c    |  2 +-
 drivers/net/ethernet/ibm/emac/core.c               |  4 +--
 drivers/net/ethernet/ibm/ibmveth.c                 |  2 +-
 drivers/net/ethernet/intel/e100.c                  |  2 +-
 drivers/net/ethernet/intel/e1000/e1000_main.c      |  2 +-
 drivers/net/ethernet/intel/e1000e/netdev.c         |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c        |  2 +-
 drivers/net/ethernet/intel/ice/ice_main.c          |  6 ++--
 drivers/net/ethernet/intel/igb/igb_main.c          |  2 +-
 drivers/net/ethernet/intel/igbvf/netdev.c          |  2 +-
 drivers/net/ethernet/intel/igc/igc_main.c          |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c      |  2 +-
 drivers/net/ethernet/jme.c                         |  2 +-
 drivers/net/ethernet/korina.c                      |  2 +-
 drivers/net/ethernet/lantiq_etop.c                 |  2 +-
 drivers/net/ethernet/marvell/mv643xx_eth.c         |  2 +-
 drivers/net/ethernet/marvell/mvneta.c              |  2 +-
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c    |  2 +-
 .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c   |  2 +-
 drivers/net/ethernet/marvell/pxa168_eth.c          |  2 +-
 drivers/net/ethernet/marvell/skge.c                |  2 +-
 drivers/net/ethernet/marvell/sky2.c                |  4 +--
 drivers/net/ethernet/mediatek/mtk_eth_soc.c        |  2 +-
 drivers/net/ethernet/mediatek/mtk_star_emac.c      |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  |  2 +-
 .../ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c |  2 +-
 .../ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c |  4 +--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     |  2 +-
 drivers/net/ethernet/micrel/ks8851_common.c        |  2 +-
 drivers/net/ethernet/micrel/ksz884x.c              |  2 +-
 drivers/net/ethernet/microchip/lan743x_main.c      |  2 +-
 drivers/net/ethernet/mscc/ocelot_net.c             |  2 +-
 drivers/net/ethernet/natsemi/natsemi.c             |  2 +-
 drivers/net/ethernet/neterion/s2io.c               |  2 +-
 drivers/net/ethernet/neterion/vxge/vxge-main.c     |  2 +-
 drivers/net/ethernet/nxp/lpc_eth.c                 |  2 +-
 .../net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c   |  2 +-
 drivers/net/ethernet/packetengines/hamachi.c       |  2 +-
 drivers/net/ethernet/packetengines/yellowfin.c     |  2 +-
 drivers/net/ethernet/pensando/ionic/ionic_lif.c    |  4 +--
 drivers/net/ethernet/qlogic/qede/qede_main.c       |  2 +-
 drivers/net/ethernet/qualcomm/emac/emac.c          |  2 +-
 drivers/net/ethernet/rdc/r6040.c                   |  2 +-
 drivers/net/ethernet/realtek/8139cp.c              |  2 +-
 drivers/net/ethernet/realtek/8139too.c             |  2 +-
 drivers/net/ethernet/realtek/r8169_main.c          |  2 +-
 drivers/net/ethernet/renesas/ravb_main.c           |  2 +-
 drivers/net/ethernet/renesas/sh_eth.c              |  4 +--
 drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c    |  2 +-
 drivers/net/ethernet/sfc/efx.c                     |  2 +-
 drivers/net/ethernet/sfc/falcon/efx.c              |  2 +-
 drivers/net/ethernet/sgi/ioc3-eth.c                |  2 +-
 drivers/net/ethernet/sgi/meth.c                    |  2 +-
 drivers/net/ethernet/sis/sis190.c                  |  2 +-
 drivers/net/ethernet/sis/sis900.c                  |  2 +-
 drivers/net/ethernet/smsc/epic100.c                |  2 +-
 drivers/net/ethernet/smsc/smc91c92_cs.c            |  2 +-
 drivers/net/ethernet/smsc/smsc911x.c               |  2 +-
 drivers/net/ethernet/smsc/smsc9420.c               |  2 +-
 drivers/net/ethernet/socionext/netsec.c            |  2 +-
 drivers/net/ethernet/socionext/sni_ave.c           |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  2 +-
 drivers/net/ethernet/sun/cassini.c                 |  2 +-
 drivers/net/ethernet/sun/niu.c                     |  2 +-
 drivers/net/ethernet/sun/sungem.c                  |  2 +-
 drivers/net/ethernet/synopsys/dwc-xlgmac-net.c     |  2 +-
 drivers/net/ethernet/ti/am65-cpsw-nuss.c           |  2 +-
 drivers/net/ethernet/ti/cpmac.c                    |  2 +-
 drivers/net/ethernet/ti/cpsw.c                     |  2 +-
 drivers/net/ethernet/ti/cpsw_new.c                 |  2 +-
 drivers/net/ethernet/ti/davinci_emac.c             |  2 +-
 drivers/net/ethernet/ti/netcp_core.c               |  2 +-
 drivers/net/ethernet/ti/tlan.c                     |  2 +-
 drivers/net/ethernet/toshiba/spider_net.c          |  2 +-
 drivers/net/ethernet/toshiba/tc35815.c             |  2 +-
 drivers/net/ethernet/tundra/tsi108_eth.c           |  2 +-
 drivers/net/ethernet/via/via-rhine.c               |  2 +-
 drivers/net/ethernet/via/via-velocity.c            |  2 +-
 drivers/net/ethernet/xilinx/ll_temac_main.c        |  2 +-
 drivers/net/ethernet/xilinx/xilinx_axienet_main.c  |  2 +-
 drivers/net/ethernet/xilinx/xilinx_emaclite.c      |  2 +-
 drivers/net/ethernet/xircom/xirc2ps_cs.c           |  2 +-
 drivers/net/ethernet/xscale/ixp4xx_eth.c           |  2 +-
 drivers/net/macvlan.c                              |  8 ++---
 drivers/net/phy/phy.c                              |  4 +--
 drivers/net/usb/asix_devices.c                     |  6 ++--
 drivers/net/usb/ax88172a.c                         |  2 +-
 drivers/net/usb/ax88179_178a.c                     |  2 +-
 drivers/net/usb/dm9601.c                           |  2 +-
 drivers/net/usb/lan78xx.c                          |  2 +-
 drivers/net/usb/mcs7830.c                          |  2 +-
 drivers/net/usb/r8152.c                            |  2 +-
 drivers/net/usb/smsc75xx.c                         |  2 +-
 drivers/net/usb/smsc95xx.c                         |  2 +-
 drivers/net/usb/sr9700.c                           |  2 +-
 drivers/net/usb/sr9800.c                           |  2 +-
 drivers/s390/net/qeth_l2_main.c                    |  2 +-
 drivers/s390/net/qeth_l3_main.c                    |  4 +--
 drivers/staging/octeon/ethernet.c                  | 12 +++----
 include/linux/netdevice.h                          |  6 ++++
 include/net/dsa.h                                  | 14 ++++----
 net/8021q/vlan_dev.c                               |  6 ++--
 net/core/dev_ioctl.c                               | 38 ++++++++++++++------
 net/dsa/master.c                                   |  6 ++--
 net/dsa/slave.c                                    |  2 +-
 172 files changed, 273 insertions(+), 231 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index 02f1faac839a..f57f255f2397 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -229,6 +229,10 @@ ndo_siocdevprivate:
 	This is used to implement SIOCDEVPRIVATE ioctl helpers.
 	These should not be added to new drivers, so don't use.
 
+ndo_eth_ioctl:
+	Synchronization: rtnl_lock() semaphore.
+	Context: process
+
 ndo_get_stats:
 	Synchronization: rtnl_lock() semaphore, dev_base_lock rwlock, or RCU.
 	Context: atomic (can't sleep under rwlock or RCU)
diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst
index 7db3985359bc..a722eb30e014 100644
--- a/Documentation/networking/timestamping.rst
+++ b/Documentation/networking/timestamping.rst
@@ -625,7 +625,7 @@ interfaces of a DSA switch to share the same PHC.
 By design, PTP timestamping with a DSA switch does not need any special
 handling in the driver for the host port it is attached to.  However, when the
 host port also supports PTP timestamping, DSA will take care of intercepting
-the ``.ndo_do_ioctl`` calls towards the host port, and block attempts to enable
+the ``.ndo_eth_ioctl`` calls towards the host port, and block attempts to enable
 hardware timestamping on it. This is because the SO_TIMESTAMPING API does not
 allow the delivery of multiple hardware timestamps for the same packet, so
 anybody else except for the DSA switch port must be prevented from doing so.
@@ -688,7 +688,7 @@ ethtool ioctl operations for them need to be mediated by their respective MAC
 driver.  Therefore, as opposed to DSA switches, modifications need to be done
 to each individual MAC driver for PHY timestamping support. This entails:
 
-- Checking, in ``.ndo_do_ioctl``, whether ``phy_has_hwtstamp(netdev->phydev)``
+- Checking, in ``.ndo_eth_ioctl``, whether ``phy_has_hwtstamp(netdev->phydev)``
   is true or not. If it is, then the MAC driver should not process this request
   but instead pass it on to the PHY using ``phy_mii_ioctl()``.
 
@@ -747,7 +747,7 @@ For example, a typical driver design for TX timestamping might be to split the
 transmission part into 2 portions:
 
 1. "TX": checks whether PTP timestamping has been previously enabled through
-   the ``.ndo_do_ioctl`` ("``priv->hwtstamp_tx_enabled == true``") and the
+   the ``.ndo_eth_ioctl`` ("``priv->hwtstamp_tx_enabled == true``") and the
    current skb requires a TX timestamp ("``skb_shinfo(skb)->tx_flags &
    SKBTX_HW_TSTAMP``"). If this is true, it sets the
    "``skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS``" flag. Note: as
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index abf60f4d9203..0aa8629fdf62 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1745,10 +1745,10 @@ static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr,
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
 
-	if (!priv->rn_ops->ndo_do_ioctl)
+	if (!priv->rn_ops->ndo_eth_ioctl)
 		return -EOPNOTSUPP;
 
-	return priv->rn_ops->ndo_do_ioctl(dev, ifr, cmd);
+	return priv->rn_ops->ndo_eth_ioctl(dev, ifr, cmd);
 }
 
 static int ipoib_dev_init(struct net_device *dev)
@@ -2078,7 +2078,7 @@ static const struct net_device_ops ipoib_netdev_ops_pf = {
 	.ndo_set_vf_guid	 = ipoib_set_vf_guid,
 	.ndo_set_mac_address	 = ipoib_set_mac,
 	.ndo_get_stats64	 = ipoib_get_stats,
-	.ndo_do_ioctl		 = ipoib_ioctl,
+	.ndo_eth_ioctl		 = ipoib_ioctl,
 };
 
 static const struct net_device_ops ipoib_netdev_ops_vf = {
@@ -2093,7 +2093,7 @@ static const struct net_device_ops ipoib_netdev_ops_vf = {
 	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
 	.ndo_get_iflink		 = ipoib_get_iflink,
 	.ndo_get_stats64	 = ipoib_get_stats,
-	.ndo_do_ioctl		 = ipoib_ioctl,
+	.ndo_eth_ioctl		 = ipoib_ioctl,
 };
 
 static const struct net_device_ops ipoib_netdev_default_pf = {
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 96864183f92e..23769e937c28 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -732,7 +732,7 @@ static int bond_check_dev_link(struct bonding *bond,
 			BMSR_LSTATUS : 0;
 
 	/* Ethtool can't be used, fallback to MII ioctls. */
-	ioctl = slave_ops->ndo_do_ioctl;
+	ioctl = slave_ops->ndo_eth_ioctl;
 	if (ioctl) {
 		/* TODO: set pointer to correct ioctl on a per team member
 		 *       bases to make this more efficient. that is, once
@@ -756,7 +756,7 @@ static int bond_check_dev_link(struct bonding *bond,
 		}
 	}
 
-	/* If reporting, report that either there's no dev->do_ioctl,
+	/* If reporting, report that either there's no ndo_eth_ioctl,
 	 * or both SIOCGMIIREG and get_link failed (meaning that we
 	 * cannot report link status).  If not reporting, pretend
 	 * we're ok.
@@ -1733,7 +1733,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 
 	if (!bond->params.use_carrier &&
 	    slave_dev->ethtool_ops->get_link == NULL &&
-	    slave_ops->ndo_do_ioctl == NULL) {
+	    slave_ops->ndo_eth_ioctl == NULL) {
 		slave_warn(bond_dev, slave_dev, "no link monitoring support\n");
 	}
 
@@ -3962,20 +3962,13 @@ static void bond_get_stats(struct net_device *bond_dev,
 	rcu_read_unlock();
 }
 
-static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
+static int bond_eth_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
-	struct net_device *slave_dev = NULL;
-	struct ifbond k_binfo;
-	struct ifbond __user *u_binfo = NULL;
-	struct ifslave k_sinfo;
-	struct ifslave __user *u_sinfo = NULL;
 	struct mii_ioctl_data *mii = NULL;
-	struct bond_opt_value newval;
-	struct net *net;
-	int res = 0;
+	int res;
 
-	netdev_dbg(bond_dev, "bond_ioctl: cmd=%d\n", cmd);
+	netdev_dbg(bond_dev, "bond_eth_ioctl: cmd=%d\n", cmd);
 
 	switch (cmd) {
 	case SIOCGMIIPHY:
@@ -4000,6 +3993,28 @@ static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd
 		}
 
 		return 0;
+	default:
+		res = -EOPNOTSUPP;
+	}
+
+	return res;
+}
+
+static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct net_device *slave_dev = NULL;
+	struct ifbond k_binfo;
+	struct ifbond __user *u_binfo = NULL;
+	struct ifslave k_sinfo;
+	struct ifslave __user *u_sinfo = NULL;
+	struct bond_opt_value newval;
+	struct net *net;
+	int res = 0;
+
+	netdev_dbg(bond_dev, "bond_ioctl: cmd=%d\n", cmd);
+
+	switch (cmd) {
 	case SIOCBONDINFOQUERY:
 		u_binfo = (struct ifbond __user *)ifr->ifr_data;
 
@@ -4972,6 +4987,7 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_start_xmit		= bond_start_xmit,
 	.ndo_select_queue	= bond_select_queue,
 	.ndo_get_stats64	= bond_get_stats,
+	.ndo_eth_ioctl		= bond_eth_ioctl,
 	.ndo_do_ioctl		= bond_do_ioctl,
 	.ndo_siocdevprivate	= bond_siocdevprivate,
 	.ndo_change_rx_flags	= bond_change_rx_flags,
diff --git a/drivers/net/ethernet/3com/3c574_cs.c b/drivers/net/ethernet/3com/3c574_cs.c
index f66e7fb9a2bb..dd4d3c48b98d 100644
--- a/drivers/net/ethernet/3com/3c574_cs.c
+++ b/drivers/net/ethernet/3com/3c574_cs.c
@@ -252,7 +252,7 @@ static const struct net_device_ops el3_netdev_ops = {
 	.ndo_start_xmit		= el3_start_xmit,
 	.ndo_tx_timeout 	= el3_tx_timeout,
 	.ndo_get_stats		= el3_get_stats,
-	.ndo_do_ioctl		= el3_ioctl,
+	.ndo_eth_ioctl		= el3_ioctl,
 	.ndo_set_rx_mode	= set_multicast_list,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
index 7d7d3ffe25c3..17c16333a412 100644
--- a/drivers/net/ethernet/3com/3c59x.c
+++ b/drivers/net/ethernet/3com/3c59x.c
@@ -1052,7 +1052,7 @@ static const struct net_device_ops boomrang_netdev_ops = {
 	.ndo_tx_timeout		= vortex_tx_timeout,
 	.ndo_get_stats		= vortex_get_stats,
 #ifdef CONFIG_PCI
-	.ndo_do_ioctl 		= vortex_ioctl,
+	.ndo_eth_ioctl		= vortex_ioctl,
 #endif
 	.ndo_set_rx_mode	= set_rx_mode,
 	.ndo_set_mac_address 	= eth_mac_addr,
@@ -1069,7 +1069,7 @@ static const struct net_device_ops vortex_netdev_ops = {
 	.ndo_tx_timeout		= vortex_tx_timeout,
 	.ndo_get_stats		= vortex_get_stats,
 #ifdef CONFIG_PCI
-	.ndo_do_ioctl 		= vortex_ioctl,
+	.ndo_eth_ioctl		= vortex_ioctl,
 #endif
 	.ndo_set_rx_mode	= set_rx_mode,
 	.ndo_set_mac_address 	= eth_mac_addr,
diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index 172947fc051a..9595dd1f32ca 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -635,7 +635,7 @@ static void ax_eeprom_register_write(struct eeprom_93cx6 *eeprom)
 static const struct net_device_ops ax_netdev_ops = {
 	.ndo_open		= ax_open,
 	.ndo_stop		= ax_close,
-	.ndo_do_ioctl		= ax_ioctl,
+	.ndo_eth_ioctl		= ax_ioctl,
 
 	.ndo_start_xmit		= ax_ei_start_xmit,
 	.ndo_tx_timeout		= ax_ei_tx_timeout,
diff --git a/drivers/net/ethernet/8390/axnet_cs.c b/drivers/net/ethernet/8390/axnet_cs.c
index 8c321dfc7b3b..3c370e686ec3 100644
--- a/drivers/net/ethernet/8390/axnet_cs.c
+++ b/drivers/net/ethernet/8390/axnet_cs.c
@@ -128,7 +128,7 @@ static inline struct axnet_dev *PRIV(struct net_device *dev)
 static const struct net_device_ops axnet_netdev_ops = {
 	.ndo_open 		= axnet_open,
 	.ndo_stop		= axnet_close,
-	.ndo_do_ioctl		= axnet_ioctl,
+	.ndo_eth_ioctl		= axnet_ioctl,
 	.ndo_start_xmit		= axnet_start_xmit,
 	.ndo_tx_timeout		= axnet_tx_timeout,
 	.ndo_get_stats		= get_stats,
diff --git a/drivers/net/ethernet/8390/pcnet_cs.c b/drivers/net/ethernet/8390/pcnet_cs.c
index cac036706382..96ad72abd373 100644
--- a/drivers/net/ethernet/8390/pcnet_cs.c
+++ b/drivers/net/ethernet/8390/pcnet_cs.c
@@ -223,7 +223,7 @@ static const struct net_device_ops pcnet_netdev_ops = {
 	.ndo_set_config		= set_config,
 	.ndo_start_xmit 	= ei_start_xmit,
 	.ndo_get_stats		= ei_get_stats,
-	.ndo_do_ioctl 		= ei_ioctl,
+	.ndo_eth_ioctl		= ei_ioctl,
 	.ndo_set_rx_mode	= ei_set_multicast_list,
 	.ndo_tx_timeout 	= ei_tx_timeout,
 	.ndo_set_mac_address 	= eth_mac_addr,
diff --git a/drivers/net/ethernet/actions/owl-emac.c b/drivers/net/ethernet/actions/owl-emac.c
index b8e771c2bc40..c4ecf4fcadf8 100644
--- a/drivers/net/ethernet/actions/owl-emac.c
+++ b/drivers/net/ethernet/actions/owl-emac.c
@@ -1179,8 +1179,8 @@ static int owl_emac_ndo_set_mac_addr(struct net_device *netdev, void *addr)
 	return owl_emac_setup_frame_xmit(netdev_priv(netdev));
 }
 
-static int owl_emac_ndo_do_ioctl(struct net_device *netdev,
-				 struct ifreq *req, int cmd)
+static int owl_emac_ndo_eth_ioctl(struct net_device *netdev,
+				  struct ifreq *req, int cmd)
 {
 	if (!netif_running(netdev))
 		return -EINVAL;
@@ -1224,7 +1224,7 @@ static const struct net_device_ops owl_emac_netdev_ops = {
 	.ndo_set_rx_mode	= owl_emac_ndo_set_rx_mode,
 	.ndo_set_mac_address	= owl_emac_ndo_set_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= owl_emac_ndo_do_ioctl,
+	.ndo_eth_ioctl		= owl_emac_ndo_eth_ioctl,
 	.ndo_tx_timeout         = owl_emac_ndo_tx_timeout,
 	.ndo_get_stats		= owl_emac_ndo_get_stats,
 };
diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c
index 7965e5e3c985..e0f6cc910bd2 100644
--- a/drivers/net/ethernet/adaptec/starfire.c
+++ b/drivers/net/ethernet/adaptec/starfire.c
@@ -625,7 +625,7 @@ static const struct net_device_ops netdev_ops = {
 	.ndo_tx_timeout		= tx_timeout,
 	.ndo_get_stats		= get_stats,
 	.ndo_set_rx_mode	= set_rx_mode,
-	.ndo_do_ioctl		= netdev_ioctl,
+	.ndo_eth_ioctl		= netdev_ioctl,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 #ifdef VLAN_SUPPORT
diff --git a/drivers/net/ethernet/agere/et131x.c b/drivers/net/ethernet/agere/et131x.c
index 41f8821f792d..920633161174 100644
--- a/drivers/net/ethernet/agere/et131x.c
+++ b/drivers/net/ethernet/agere/et131x.c
@@ -3882,7 +3882,7 @@ static const struct net_device_ops et131x_netdev_ops = {
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_get_stats		= et131x_stats,
-	.ndo_do_ioctl		= phy_do_ioctl,
+	.ndo_eth_ioctl		= phy_do_ioctl,
 };
 
 static int et131x_pci_setup(struct pci_dev *pdev,
diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c
index f99ae317c188..037baea1c738 100644
--- a/drivers/net/ethernet/allwinner/sun4i-emac.c
+++ b/drivers/net/ethernet/allwinner/sun4i-emac.c
@@ -774,7 +774,7 @@ static const struct net_device_ops emac_netdev_ops = {
 	.ndo_start_xmit		= emac_start_xmit,
 	.ndo_tx_timeout		= emac_timeout,
 	.ndo_set_rx_mode	= emac_set_rx_mode,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= emac_set_mac_address,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c
index 9cac5aa75a73..92e4246dc359 100644
--- a/drivers/net/ethernet/amd/amd8111e.c
+++ b/drivers/net/ethernet/amd/amd8111e.c
@@ -1729,7 +1729,7 @@ static const struct net_device_ops amd8111e_netdev_ops = {
 	.ndo_set_rx_mode	= amd8111e_set_multicast_list,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= amd8111e_set_mac_address,
-	.ndo_do_ioctl		= amd8111e_ioctl,
+	.ndo_eth_ioctl		= amd8111e_ioctl,
 	.ndo_change_mtu		= amd8111e_change_mtu,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	 = amd8111e_poll,
diff --git a/drivers/net/ethernet/amd/au1000_eth.c b/drivers/net/ethernet/amd/au1000_eth.c
index 19e195420e24..9c1636222b99 100644
--- a/drivers/net/ethernet/amd/au1000_eth.c
+++ b/drivers/net/ethernet/amd/au1000_eth.c
@@ -1051,7 +1051,7 @@ static const struct net_device_ops au1000_netdev_ops = {
 	.ndo_stop		= au1000_close,
 	.ndo_start_xmit		= au1000_tx,
 	.ndo_set_rx_mode	= au1000_multicast_list,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_tx_timeout		= au1000_tx_timeout,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
index 4100ab07e6b7..70d76fdb9f56 100644
--- a/drivers/net/ethernet/amd/pcnet32.c
+++ b/drivers/net/ethernet/amd/pcnet32.c
@@ -1572,7 +1572,7 @@ static const struct net_device_ops pcnet32_netdev_ops = {
 	.ndo_tx_timeout		= pcnet32_tx_timeout,
 	.ndo_get_stats		= pcnet32_get_stats,
 	.ndo_set_rx_mode	= pcnet32_set_multicast_list,
-	.ndo_do_ioctl		= pcnet32_ioctl,
+	.ndo_eth_ioctl		= pcnet32_ioctl,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 4f714f874c4f..17a585adfb49 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -2284,7 +2284,7 @@ static const struct net_device_ops xgbe_netdev_ops = {
 	.ndo_set_rx_mode	= xgbe_set_rx_mode,
 	.ndo_set_mac_address	= xgbe_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= xgbe_ioctl,
+	.ndo_eth_ioctl		= xgbe_ioctl,
 	.ndo_change_mtu		= xgbe_change_mtu,
 	.ndo_tx_timeout		= xgbe_tx_timeout,
 	.ndo_get_stats64	= xgbe_get_stats64,
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_main.c b/drivers/net/ethernet/aquantia/atlantic/aq_main.c
index 4af0cd9530de..e22935ce9573 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_main.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_main.c
@@ -421,7 +421,7 @@ static const struct net_device_ops aq_ndev_ops = {
 	.ndo_change_mtu = aq_ndev_change_mtu,
 	.ndo_set_mac_address = aq_ndev_set_mac_address,
 	.ndo_set_features = aq_ndev_set_features,
-	.ndo_do_ioctl = aq_ndev_ioctl,
+	.ndo_eth_ioctl = aq_ndev_ioctl,
 	.ndo_vlan_rx_add_vid = aq_ndo_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = aq_ndo_vlan_rx_kill_vid,
 	.ndo_setup_tc = aq_ndo_setup_tc,
diff --git a/drivers/net/ethernet/arc/emac_main.c b/drivers/net/ethernet/arc/emac_main.c
index 67b8113a2b53..38c288ec9059 100644
--- a/drivers/net/ethernet/arc/emac_main.c
+++ b/drivers/net/ethernet/arc/emac_main.c
@@ -844,7 +844,7 @@ static const struct net_device_ops arc_emac_netdev_ops = {
 	.ndo_set_mac_address	= arc_emac_set_address,
 	.ndo_get_stats		= arc_emac_stats,
 	.ndo_set_rx_mode	= arc_emac_set_rx_mode,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= arc_emac_poll_controller,
 #endif
diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c
index 1ba81b1eb6fd..02ae98aabf91 100644
--- a/drivers/net/ethernet/atheros/ag71xx.c
+++ b/drivers/net/ethernet/atheros/ag71xx.c
@@ -1851,7 +1851,7 @@ static const struct net_device_ops ag71xx_netdev_ops = {
 	.ndo_open		= ag71xx_open,
 	.ndo_stop		= ag71xx_stop,
 	.ndo_start_xmit		= ag71xx_hard_start_xmit,
-	.ndo_do_ioctl		= phy_do_ioctl,
+	.ndo_eth_ioctl		= phy_do_ioctl,
 	.ndo_tx_timeout		= ag71xx_tx_timeout,
 	.ndo_change_mtu		= ag71xx_change_mtu,
 	.ndo_set_mac_address	= eth_mac_addr,
diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c
index 11ef1fbe7aee..4ea157efca86 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -1701,7 +1701,7 @@ static const struct net_device_ops alx_netdev_ops = {
 	.ndo_validate_addr      = eth_validate_addr,
 	.ndo_set_mac_address    = alx_set_mac_address,
 	.ndo_change_mtu         = alx_change_mtu,
-	.ndo_do_ioctl           = alx_ioctl,
+	.ndo_eth_ioctl           = alx_ioctl,
 	.ndo_tx_timeout         = alx_tx_timeout,
 	.ndo_fix_features	= alx_fix_features,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index 1c6246a5dc22..3b51b172b317 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -2609,7 +2609,7 @@ static const struct net_device_ops atl1c_netdev_ops = {
 	.ndo_change_mtu		= atl1c_change_mtu,
 	.ndo_fix_features	= atl1c_fix_features,
 	.ndo_set_features	= atl1c_set_features,
-	.ndo_do_ioctl		= atl1c_ioctl,
+	.ndo_eth_ioctl		= atl1c_ioctl,
 	.ndo_tx_timeout		= atl1c_tx_timeout,
 	.ndo_get_stats		= atl1c_get_stats,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
index 2eb0a2ab69f6..753973ac922e 100644
--- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
+++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
@@ -2247,7 +2247,7 @@ static const struct net_device_ops atl1e_netdev_ops = {
 	.ndo_fix_features	= atl1e_fix_features,
 	.ndo_set_features	= atl1e_set_features,
 	.ndo_change_mtu		= atl1e_change_mtu,
-	.ndo_do_ioctl		= atl1e_ioctl,
+	.ndo_eth_ioctl		= atl1e_ioctl,
 	.ndo_tx_timeout		= atl1e_tx_timeout,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= atl1e_netpoll,
diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c b/drivers/net/ethernet/atheros/atlx/atl1.c
index c67201a13cf5..68f6c0bbd945 100644
--- a/drivers/net/ethernet/atheros/atlx/atl1.c
+++ b/drivers/net/ethernet/atheros/atlx/atl1.c
@@ -2885,7 +2885,7 @@ static const struct net_device_ops atl1_netdev_ops = {
 	.ndo_change_mtu		= atl1_change_mtu,
 	.ndo_fix_features	= atlx_fix_features,
 	.ndo_set_features	= atlx_set_features,
-	.ndo_do_ioctl		= atlx_ioctl,
+	.ndo_eth_ioctl		= atlx_ioctl,
 	.ndo_tx_timeout		= atlx_tx_timeout,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= atl1_poll_controller,
diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c
index 0cc0db04c27d..b69298ddb647 100644
--- a/drivers/net/ethernet/atheros/atlx/atl2.c
+++ b/drivers/net/ethernet/atheros/atlx/atl2.c
@@ -1293,7 +1293,7 @@ static const struct net_device_ops atl2_netdev_ops = {
 	.ndo_change_mtu		= atl2_change_mtu,
 	.ndo_fix_features	= atl2_fix_features,
 	.ndo_set_features	= atl2_set_features,
-	.ndo_do_ioctl		= atl2_ioctl,
+	.ndo_eth_ioctl		= atl2_ioctl,
 	.ndo_tx_timeout		= atl2_tx_timeout,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= atl2_poll_controller,
diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c
index ad2655efe423..fa784953c601 100644
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -2198,7 +2198,7 @@ static const struct net_device_ops b44_netdev_ops = {
 	.ndo_set_rx_mode	= b44_set_rx_mode,
 	.ndo_set_mac_address	= b44_set_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= b44_ioctl,
+	.ndo_eth_ioctl		= b44_ioctl,
 	.ndo_tx_timeout		= b44_tx_timeout,
 	.ndo_change_mtu		= b44_change_mtu,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 977f097fc7bf..5ec056a26cf8 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -1699,7 +1699,7 @@ static const struct net_device_ops bcm_enet_ops = {
 	.ndo_start_xmit		= bcm_enet_start_xmit,
 	.ndo_set_mac_address	= bcm_enet_set_mac_address,
 	.ndo_set_rx_mode	= bcm_enet_set_multicast_list,
-	.ndo_do_ioctl		= bcm_enet_ioctl,
+	.ndo_eth_ioctl		= bcm_enet_ioctl,
 	.ndo_change_mtu		= bcm_enet_change_mtu,
 };
 
@@ -2446,7 +2446,7 @@ static const struct net_device_ops bcm_enetsw_ops = {
 	.ndo_stop		= bcm_enetsw_stop,
 	.ndo_start_xmit		= bcm_enet_start_xmit,
 	.ndo_change_mtu		= bcm_enet_change_mtu,
-	.ndo_do_ioctl		= bcm_enetsw_ioctl,
+	.ndo_eth_ioctl		= bcm_enetsw_ioctl,
 };
 
 
diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c
index 075f6e146b29..fe4d99abd548 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -1263,7 +1263,7 @@ static const struct net_device_ops bgmac_netdev_ops = {
 	.ndo_set_rx_mode	= bgmac_set_rx_mode,
 	.ndo_set_mac_address	= bgmac_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl           = phy_do_ioctl_running,
+	.ndo_eth_ioctl           = phy_do_ioctl_running,
 	.ndo_change_mtu		= bgmac_change_mtu,
 };
 
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index bee6cfad9fc6..89ee1c0e9c79 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -8546,7 +8546,7 @@ static const struct net_device_ops bnx2_netdev_ops = {
 	.ndo_stop		= bnx2_close,
 	.ndo_get_stats64	= bnx2_get_stats64,
 	.ndo_set_rx_mode	= bnx2_set_rx_mode,
-	.ndo_do_ioctl		= bnx2_ioctl,
+	.ndo_eth_ioctl		= bnx2_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= bnx2_change_mac_addr,
 	.ndo_change_mtu		= bnx2_change_mtu,
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 2acbc73dcd18..6d98134913cd 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -13048,7 +13048,7 @@ static const struct net_device_ops bnx2x_netdev_ops = {
 	.ndo_set_rx_mode	= bnx2x_set_rx_mode,
 	.ndo_set_mac_address	= bnx2x_change_mac_addr,
 	.ndo_validate_addr	= bnx2x_validate_addr,
-	.ndo_do_ioctl		= bnx2x_ioctl,
+	.ndo_eth_ioctl		= bnx2x_ioctl,
 	.ndo_change_mtu		= bnx2x_change_mtu,
 	.ndo_fix_features	= bnx2x_fix_features,
 	.ndo_set_features	= bnx2x_set_features,
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 4db162cee911..e34c362a3c58 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -12667,7 +12667,7 @@ static const struct net_device_ops bnxt_netdev_ops = {
 	.ndo_stop		= bnxt_close,
 	.ndo_get_stats64	= bnxt_get_stats64,
 	.ndo_set_rx_mode	= bnxt_set_rx_mode,
-	.ndo_do_ioctl		= bnxt_ioctl,
+	.ndo_eth_ioctl		= bnxt_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= bnxt_change_mac_addr,
 	.ndo_change_mtu		= bnxt_change_mtu,
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index db74241935ab..63e2237e0cb4 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -3659,7 +3659,7 @@ static const struct net_device_ops bcmgenet_netdev_ops = {
 	.ndo_tx_timeout		= bcmgenet_timeout,
 	.ndo_set_rx_mode	= bcmgenet_set_rx_mode,
 	.ndo_set_mac_address	= bcmgenet_set_mac_addr,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_set_features	= bcmgenet_set_features,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= bcmgenet_poll_controller,
diff --git a/drivers/net/ethernet/broadcom/sb1250-mac.c b/drivers/net/ethernet/broadcom/sb1250-mac.c
index 5b4568c2ad1c..f38f40eb966e 100644
--- a/drivers/net/ethernet/broadcom/sb1250-mac.c
+++ b/drivers/net/ethernet/broadcom/sb1250-mac.c
@@ -2136,7 +2136,7 @@ static const struct net_device_ops sbmac_netdev_ops = {
 	.ndo_start_xmit		= sbmac_start_tx,
 	.ndo_set_rx_mode	= sbmac_set_rx_mode,
 	.ndo_tx_timeout		= sbmac_tx_timeout,
-	.ndo_do_ioctl		= sbmac_mii_ioctl,
+	.ndo_eth_ioctl		= sbmac_mii_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index b0e49643f483..6f82eeaa4b9f 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -14290,7 +14290,7 @@ static const struct net_device_ops tg3_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= tg3_set_rx_mode,
 	.ndo_set_mac_address	= tg3_set_mac_addr,
-	.ndo_do_ioctl		= tg3_ioctl,
+	.ndo_eth_ioctl		= tg3_ioctl,
 	.ndo_tx_timeout		= tg3_tx_timeout,
 	.ndo_change_mtu		= tg3_change_mtu,
 	.ndo_fix_features	= tg3_fix_features,
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 7d2fe13a52f8..181ebc235925 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -3664,7 +3664,7 @@ static const struct net_device_ops macb_netdev_ops = {
 	.ndo_start_xmit		= macb_start_xmit,
 	.ndo_set_rx_mode	= macb_set_rx_mode,
 	.ndo_get_stats		= macb_get_stats,
-	.ndo_do_ioctl		= macb_ioctl,
+	.ndo_eth_ioctl		= macb_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_change_mtu		= macb_change_mtu,
 	.ndo_set_mac_address	= eth_mac_addr,
@@ -4323,7 +4323,7 @@ static const struct net_device_ops at91ether_netdev_ops = {
 	.ndo_get_stats		= macb_get_stats,
 	.ndo_set_rx_mode	= macb_set_rx_mode,
 	.ndo_set_mac_address	= eth_mac_addr,
-	.ndo_do_ioctl		= macb_ioctl,
+	.ndo_eth_ioctl		= macb_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= at91ether_poll_controller,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 591229b96257..a4a5209a9386 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -3223,7 +3223,7 @@ static const struct net_device_ops lionetdevops = {
 	.ndo_vlan_rx_add_vid    = liquidio_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid   = liquidio_vlan_rx_kill_vid,
 	.ndo_change_mtu		= liquidio_change_mtu,
-	.ndo_do_ioctl		= liquidio_ioctl,
+	.ndo_eth_ioctl		= liquidio_ioctl,
 	.ndo_fix_features	= liquidio_fix_features,
 	.ndo_set_features	= liquidio_set_features,
 	.ndo_set_vf_mac		= liquidio_set_vf_mac,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index ffddb3126a32..3085dd455a17 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -1889,7 +1889,7 @@ static const struct net_device_ops lionetdevops = {
 	.ndo_vlan_rx_add_vid    = liquidio_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid   = liquidio_vlan_rx_kill_vid,
 	.ndo_change_mtu		= liquidio_change_mtu,
-	.ndo_do_ioctl		= liquidio_ioctl,
+	.ndo_eth_ioctl		= liquidio_ioctl,
 	.ndo_fix_features	= liquidio_fix_features,
 	.ndo_set_features	= liquidio_set_features,
 };
diff --git a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
index 48ff6fb0eed9..30463a6d1f8c 100644
--- a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
+++ b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
@@ -1373,7 +1373,7 @@ static const struct net_device_ops octeon_mgmt_ops = {
 	.ndo_start_xmit =		octeon_mgmt_xmit,
 	.ndo_set_rx_mode =		octeon_mgmt_set_rx_filtering,
 	.ndo_set_mac_address =		octeon_mgmt_set_mac_address,
-	.ndo_do_ioctl =			octeon_mgmt_ioctl,
+	.ndo_eth_ioctl =			octeon_mgmt_ioctl,
 	.ndo_change_mtu =		octeon_mgmt_change_mtu,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller =		octeon_mgmt_poll_controller,
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index e2b290135fd9..efaaa57d4ed5 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -2096,7 +2096,7 @@ static const struct net_device_ops nicvf_netdev_ops = {
 	.ndo_fix_features       = nicvf_fix_features,
 	.ndo_set_features       = nicvf_set_features,
 	.ndo_bpf		= nicvf_xdp,
-	.ndo_do_ioctl           = nicvf_ioctl,
+	.ndo_eth_ioctl           = nicvf_ioctl,
 	.ndo_set_rx_mode        = nicvf_set_rx_mode,
 };
 
diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c
index 512da98019c6..e7575d41f4f5 100644
--- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c
+++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c
@@ -924,7 +924,7 @@ static const struct net_device_ops cxgb_netdev_ops = {
 	.ndo_get_stats		= t1_get_stats,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= t1_set_rxmode,
-	.ndo_do_ioctl		= t1_ioctl,
+	.ndo_eth_ioctl		= t1_ioctl,
 	.ndo_change_mtu		= t1_change_mtu,
 	.ndo_set_mac_address	= t1_set_mac_addr,
 	.ndo_fix_features	= t1_fix_features,
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
index eae893d7d840..72af9d2a00ae 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
@@ -3184,7 +3184,7 @@ static const struct net_device_ops cxgb_netdev_ops = {
 	.ndo_get_stats		= cxgb_get_stats,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= cxgb_set_rxmode,
-	.ndo_do_ioctl		= cxgb_ioctl,
+	.ndo_eth_ioctl		= cxgb_ioctl,
 	.ndo_siocdevprivate	= cxgb_siocdevprivate,
 	.ndo_change_mtu		= cxgb_change_mtu,
 	.ndo_set_mac_address	= cxgb_set_mac_addr,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index dbf9a0e6601d..aa8573202c37 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3872,7 +3872,7 @@ static const struct net_device_ops cxgb4_netdev_ops = {
 	.ndo_set_mac_address  = cxgb_set_mac_addr,
 	.ndo_set_features     = cxgb_set_features,
 	.ndo_validate_addr    = eth_validate_addr,
-	.ndo_do_ioctl         = cxgb_ioctl,
+	.ndo_eth_ioctl         = cxgb_ioctl,
 	.ndo_change_mtu       = cxgb_change_mtu,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller  = cxgb_netpoll,
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index 2820a0bb971b..2842628ad2c5 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -2837,7 +2837,7 @@ static const struct net_device_ops cxgb4vf_netdev_ops	= {
 	.ndo_set_rx_mode	= cxgb4vf_set_rxmode,
 	.ndo_set_mac_address	= cxgb4vf_set_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= cxgb4vf_do_ioctl,
+	.ndo_eth_ioctl		= cxgb4vf_do_ioctl,
 	.ndo_change_mtu		= cxgb4vf_change_mtu,
 	.ndo_fix_features	= cxgb4vf_fix_features,
 	.ndo_set_features	= cxgb4vf_set_features,
diff --git a/drivers/net/ethernet/cirrus/ep93xx_eth.c b/drivers/net/ethernet/cirrus/ep93xx_eth.c
index 9f5e5ec69991..072fac5f5d24 100644
--- a/drivers/net/ethernet/cirrus/ep93xx_eth.c
+++ b/drivers/net/ethernet/cirrus/ep93xx_eth.c
@@ -733,7 +733,7 @@ static const struct net_device_ops ep93xx_netdev_ops = {
 	.ndo_open		= ep93xx_open,
 	.ndo_stop		= ep93xx_close,
 	.ndo_start_xmit		= ep93xx_xmit,
-	.ndo_do_ioctl		= ep93xx_ioctl,
+	.ndo_eth_ioctl		= ep93xx_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
 };
diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c
index 2a8bf53c2f75..e842de6f6635 100644
--- a/drivers/net/ethernet/davicom/dm9000.c
+++ b/drivers/net/ethernet/davicom/dm9000.c
@@ -1372,7 +1372,7 @@ static const struct net_device_ops dm9000_netdev_ops = {
 	.ndo_start_xmit		= dm9000_start_xmit,
 	.ndo_tx_timeout		= dm9000_timeout,
 	.ndo_set_rx_mode	= dm9000_hash_table,
-	.ndo_do_ioctl		= dm9000_ioctl,
+	.ndo_eth_ioctl		= dm9000_ioctl,
 	.ndo_set_features	= dm9000_set_features,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c
index c1dcd6ca1457..fcedd733bacb 100644
--- a/drivers/net/ethernet/dec/tulip/tulip_core.c
+++ b/drivers/net/ethernet/dec/tulip/tulip_core.c
@@ -1271,7 +1271,7 @@ static const struct net_device_ops tulip_netdev_ops = {
 	.ndo_tx_timeout		= tulip_tx_timeout,
 	.ndo_stop		= tulip_close,
 	.ndo_get_stats		= tulip_get_stats,
-	.ndo_do_ioctl 		= private_ioctl,
+	.ndo_eth_ioctl		= private_ioctl,
 	.ndo_set_rx_mode	= set_rx_mode,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
diff --git a/drivers/net/ethernet/dec/tulip/winbond-840.c b/drivers/net/ethernet/dec/tulip/winbond-840.c
index f6ff1f76eacb..07a48f6bf0fa 100644
--- a/drivers/net/ethernet/dec/tulip/winbond-840.c
+++ b/drivers/net/ethernet/dec/tulip/winbond-840.c
@@ -341,7 +341,7 @@ static const struct net_device_ops netdev_ops = {
 	.ndo_start_xmit		= start_tx,
 	.ndo_get_stats		= get_stats,
 	.ndo_set_rx_mode	= set_rx_mode,
-	.ndo_do_ioctl		= netdev_ioctl,
+	.ndo_eth_ioctl		= netdev_ioctl,
 	.ndo_tx_timeout		= tx_timeout,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
diff --git a/drivers/net/ethernet/dlink/dl2k.c b/drivers/net/ethernet/dlink/dl2k.c
index 734acb834c98..202ecb132053 100644
--- a/drivers/net/ethernet/dlink/dl2k.c
+++ b/drivers/net/ethernet/dlink/dl2k.c
@@ -95,7 +95,7 @@ static const struct net_device_ops netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_set_rx_mode	= set_multicast,
-	.ndo_do_ioctl		= rio_ioctl,
+	.ndo_eth_ioctl		= rio_ioctl,
 	.ndo_tx_timeout		= rio_tx_timeout,
 };
 
diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c
index ee0ca712dd1c..c36d186dffed 100644
--- a/drivers/net/ethernet/dlink/sundance.c
+++ b/drivers/net/ethernet/dlink/sundance.c
@@ -479,7 +479,7 @@ static const struct net_device_ops netdev_ops = {
 	.ndo_start_xmit		= start_tx,
 	.ndo_get_stats 		= get_stats,
 	.ndo_set_rx_mode	= set_rx_mode,
-	.ndo_do_ioctl 		= netdev_ioctl,
+	.ndo_eth_ioctl		= netdev_ioctl,
 	.ndo_tx_timeout		= tx_timeout,
 	.ndo_change_mtu		= change_mtu,
 	.ndo_set_mac_address 	= sundance_set_mac_addr,
diff --git a/drivers/net/ethernet/dnet.c b/drivers/net/ethernet/dnet.c
index 48c6eb142dcc..6c51cf991dad 100644
--- a/drivers/net/ethernet/dnet.c
+++ b/drivers/net/ethernet/dnet.c
@@ -742,7 +742,7 @@ static const struct net_device_ops dnet_netdev_ops = {
 	.ndo_stop		= dnet_close,
 	.ndo_get_stats		= dnet_get_stats,
 	.ndo_start_xmit		= dnet_start_xmit,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 };
diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c
index e1b43b07755b..ed1ed48e7483 100644
--- a/drivers/net/ethernet/ethoc.c
+++ b/drivers/net/ethernet/ethoc.c
@@ -1009,7 +1009,7 @@ static const struct ethtool_ops ethoc_ethtool_ops = {
 static const struct net_device_ops ethoc_netdev_ops = {
 	.ndo_open = ethoc_open,
 	.ndo_stop = ethoc_stop,
-	.ndo_do_ioctl = ethoc_ioctl,
+	.ndo_eth_ioctl = ethoc_ioctl,
 	.ndo_set_mac_address = ethoc_set_mac_address,
 	.ndo_set_rx_mode = ethoc_set_multicast_list,
 	.ndo_change_mtu = ethoc_change_mtu,
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index 11dbbfd38770..ff76e401a014 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1616,7 +1616,7 @@ static const struct net_device_ops ftgmac100_netdev_ops = {
 	.ndo_start_xmit		= ftgmac100_hard_start_xmit,
 	.ndo_set_mac_address	= ftgmac100_set_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= phy_do_ioctl,
+	.ndo_eth_ioctl		= phy_do_ioctl,
 	.ndo_tx_timeout		= ftgmac100_tx_timeout,
 	.ndo_set_rx_mode	= ftgmac100_set_rx_mode,
 	.ndo_set_features	= ftgmac100_set_features,
diff --git a/drivers/net/ethernet/faraday/ftmac100.c b/drivers/net/ethernet/faraday/ftmac100.c
index 5a1a8f2ea63c..8a341e2d5833 100644
--- a/drivers/net/ethernet/faraday/ftmac100.c
+++ b/drivers/net/ethernet/faraday/ftmac100.c
@@ -1043,7 +1043,7 @@ static const struct net_device_ops ftmac100_netdev_ops = {
 	.ndo_start_xmit		= ftmac100_hard_start_xmit,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= ftmac100_do_ioctl,
+	.ndo_eth_ioctl		= ftmac100_do_ioctl,
 };
 
 /******************************************************************************
diff --git a/drivers/net/ethernet/fealnx.c b/drivers/net/ethernet/fealnx.c
index 0f141c14d72d..25c91b3c5fd3 100644
--- a/drivers/net/ethernet/fealnx.c
+++ b/drivers/net/ethernet/fealnx.c
@@ -463,7 +463,7 @@ static const struct net_device_ops netdev_ops = {
 	.ndo_start_xmit		= start_tx,
 	.ndo_get_stats 		= get_stats,
 	.ndo_set_rx_mode	= set_rx_mode,
-	.ndo_do_ioctl		= mii_ioctl,
+	.ndo_eth_ioctl		= mii_ioctl,
 	.ndo_tx_timeout		= fealnx_tx_timeout,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index e6826561cf11..685d2d8a3b36 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -3157,7 +3157,7 @@ static const struct net_device_ops dpaa_ops = {
 	.ndo_set_mac_address = dpaa_set_mac_address,
 	.ndo_validate_addr = eth_validate_addr,
 	.ndo_set_rx_mode = dpaa_set_rx_mode,
-	.ndo_do_ioctl = dpaa_ioctl,
+	.ndo_eth_ioctl = dpaa_ioctl,
 	.ndo_setup_tc = dpaa_setup_tc,
 	.ndo_change_mtu = dpaa_change_mtu,
 	.ndo_bpf = dpaa_xdp,
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 973352393bd4..f664021c3ad1 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -2594,7 +2594,7 @@ static const struct net_device_ops dpaa2_eth_ops = {
 	.ndo_get_stats64 = dpaa2_eth_get_stats,
 	.ndo_set_rx_mode = dpaa2_eth_set_rx_mode,
 	.ndo_set_features = dpaa2_eth_set_features,
-	.ndo_do_ioctl = dpaa2_eth_ioctl,
+	.ndo_eth_ioctl = dpaa2_eth_ioctl,
 	.ndo_change_mtu = dpaa2_eth_change_mtu,
 	.ndo_bpf = dpaa2_eth_xdp,
 	.ndo_xdp_xmit = dpaa2_eth_xdp_xmit,
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
index c84f6c226743..60d94e0a07d6 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
@@ -735,7 +735,7 @@ static const struct net_device_ops enetc_ndev_ops = {
 	.ndo_set_vf_vlan	= enetc_pf_set_vf_vlan,
 	.ndo_set_vf_spoofchk	= enetc_pf_set_vf_spoofchk,
 	.ndo_set_features	= enetc_pf_set_features,
-	.ndo_do_ioctl		= enetc_ioctl,
+	.ndo_eth_ioctl		= enetc_ioctl,
 	.ndo_setup_tc		= enetc_setup_tc,
 	.ndo_bpf		= enetc_setup_bpf,
 	.ndo_xdp_xmit		= enetc_xdp_xmit,
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_vf.c b/drivers/net/ethernet/freescale/enetc/enetc_vf.c
index 03090ba7e226..1a9d1e8b772c 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_vf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_vf.c
@@ -99,7 +99,7 @@ static const struct net_device_ops enetc_ndev_ops = {
 	.ndo_get_stats		= enetc_get_stats,
 	.ndo_set_mac_address	= enetc_vf_set_mac_addr,
 	.ndo_set_features	= enetc_vf_set_features,
-	.ndo_do_ioctl		= enetc_ioctl,
+	.ndo_eth_ioctl		= enetc_ioctl,
 	.ndo_setup_tc		= enetc_setup_tc,
 };
 
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 8aea707a65a7..e361be85f26f 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3280,7 +3280,7 @@ static const struct net_device_ops fec_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_tx_timeout		= fec_timeout,
 	.ndo_set_mac_address	= fec_set_mac_address,
-	.ndo_do_ioctl		= fec_enet_ioctl,
+	.ndo_eth_ioctl		= fec_enet_ioctl,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= fec_poll_controller,
 #endif
diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c
index 02c47658a215..73ff359a15f1 100644
--- a/drivers/net/ethernet/freescale/fec_mpc52xx.c
+++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c
@@ -792,7 +792,7 @@ static const struct net_device_ops mpc52xx_fec_netdev_ops = {
 	.ndo_set_rx_mode = mpc52xx_fec_set_multicast_list,
 	.ndo_set_mac_address = mpc52xx_fec_set_mac_address,
 	.ndo_validate_addr = eth_validate_addr,
-	.ndo_do_ioctl = phy_do_ioctl,
+	.ndo_eth_ioctl = phy_do_ioctl,
 	.ndo_tx_timeout = mpc52xx_fec_tx_timeout,
 	.ndo_get_stats = mpc52xx_fec_get_stats,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
index 6ee325ad35c5..2db6e38a772e 100644
--- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
+++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
@@ -900,7 +900,7 @@ static const struct net_device_ops fs_enet_netdev_ops = {
 	.ndo_start_xmit		= fs_enet_start_xmit,
 	.ndo_tx_timeout		= fs_timeout,
 	.ndo_set_rx_mode	= fs_set_multicast_list,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 9646483137c4..af6ad94bf24a 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -3184,7 +3184,7 @@ static const struct net_device_ops gfar_netdev_ops = {
 	.ndo_set_features = gfar_set_features,
 	.ndo_set_rx_mode = gfar_set_multi,
 	.ndo_tx_timeout = gfar_timeout,
-	.ndo_do_ioctl = gfar_ioctl,
+	.ndo_eth_ioctl = gfar_ioctl,
 	.ndo_get_stats64 = gfar_get_stats64,
 	.ndo_change_carrier = fixed_phy_change_carrier,
 	.ndo_set_mac_address = gfar_set_mac_addr,
diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c
index 0acfafb73db1..3eb288d10b0c 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.c
+++ b/drivers/net/ethernet/freescale/ucc_geth.c
@@ -3516,7 +3516,7 @@ static const struct net_device_ops ucc_geth_netdev_ops = {
 	.ndo_set_mac_address	= ucc_geth_set_mac_addr,
 	.ndo_set_rx_mode	= ucc_geth_set_multi,
 	.ndo_tx_timeout		= ucc_geth_timeout,
-	.ndo_do_ioctl		= ucc_geth_ioctl,
+	.ndo_eth_ioctl		= ucc_geth_ioctl,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= ucc_netpoll,
 #endif
diff --git a/drivers/net/ethernet/hisilicon/hisi_femac.c b/drivers/net/ethernet/hisilicon/hisi_femac.c
index 3c4db4a6b431..22bf914f2dbd 100644
--- a/drivers/net/ethernet/hisilicon/hisi_femac.c
+++ b/drivers/net/ethernet/hisilicon/hisi_femac.c
@@ -685,7 +685,7 @@ static const struct net_device_ops hisi_femac_netdev_ops = {
 	.ndo_open		= hisi_femac_net_open,
 	.ndo_stop		= hisi_femac_net_close,
 	.ndo_start_xmit		= hisi_femac_net_xmit,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_set_mac_address	= hisi_femac_set_mac_address,
 	.ndo_set_rx_mode	= hisi_femac_net_set_rx_mode,
 };
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index ad534f9e41ab..343c605c4be8 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -1945,7 +1945,7 @@ static const struct net_device_ops hns_nic_netdev_ops = {
 	.ndo_tx_timeout = hns_nic_net_timeout,
 	.ndo_set_mac_address = hns_nic_net_set_mac_address,
 	.ndo_change_mtu = hns_nic_change_mtu,
-	.ndo_do_ioctl = phy_do_ioctl_running,
+	.ndo_eth_ioctl = phy_do_ioctl_running,
 	.ndo_set_features = hns_nic_set_features,
 	.ndo_fix_features = hns_nic_fix_features,
 	.ndo_get_stats64 = hns_nic_get_stats64,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index cdb5f14fb6bc..cb8d5da3654f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2852,7 +2852,7 @@ static const struct net_device_ops hns3_nic_netdev_ops = {
 	.ndo_start_xmit		= hns3_nic_net_xmit,
 	.ndo_tx_timeout		= hns3_nic_net_timeout,
 	.ndo_set_mac_address	= hns3_nic_net_set_mac_address,
-	.ndo_do_ioctl		= hns3_nic_do_ioctl,
+	.ndo_eth_ioctl		= hns3_nic_do_ioctl,
 	.ndo_change_mtu		= hns3_nic_change_mtu,
 	.ndo_set_features	= hns3_nic_set_features,
 	.ndo_features_check	= hns3_features_check,
diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index 471be6ec7e8a..664a91af662d 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -3011,7 +3011,7 @@ static const struct net_device_ops emac_netdev_ops = {
 	.ndo_stop		= emac_close,
 	.ndo_get_stats		= emac_stats,
 	.ndo_set_rx_mode	= emac_set_multicast_list,
-	.ndo_do_ioctl		= emac_ioctl,
+	.ndo_eth_ioctl		= emac_ioctl,
 	.ndo_tx_timeout		= emac_tx_timeout,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= emac_set_mac_address,
@@ -3023,7 +3023,7 @@ static const struct net_device_ops emac_gige_netdev_ops = {
 	.ndo_stop		= emac_close,
 	.ndo_get_stats		= emac_stats,
 	.ndo_set_rx_mode	= emac_set_multicast_list,
-	.ndo_do_ioctl		= emac_ioctl,
+	.ndo_eth_ioctl		= emac_ioctl,
 	.ndo_tx_timeout		= emac_tx_timeout,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= emac_set_mac_address,
diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index 737ba85e409f..3d9b4f99d357 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -1630,7 +1630,7 @@ static const struct net_device_ops ibmveth_netdev_ops = {
 	.ndo_stop		= ibmveth_close,
 	.ndo_start_xmit		= ibmveth_start_xmit,
 	.ndo_set_rx_mode	= ibmveth_set_multicast_list,
-	.ndo_do_ioctl		= ibmveth_ioctl,
+	.ndo_eth_ioctl		= ibmveth_ioctl,
 	.ndo_change_mtu		= ibmveth_change_mtu,
 	.ndo_fix_features	= ibmveth_fix_features,
 	.ndo_set_features	= ibmveth_set_features,
diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c
index 1ec924c556c5..373eb027b925 100644
--- a/drivers/net/ethernet/intel/e100.c
+++ b/drivers/net/ethernet/intel/e100.c
@@ -2809,7 +2809,7 @@ static const struct net_device_ops e100_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= e100_set_multicast_list,
 	.ndo_set_mac_address	= e100_set_mac_address,
-	.ndo_do_ioctl		= e100_do_ioctl,
+	.ndo_eth_ioctl		= e100_do_ioctl,
 	.ndo_tx_timeout		= e100_tx_timeout,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= e100_netpoll,
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index c2a109126c27..bed4f040face 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -832,7 +832,7 @@ static const struct net_device_ops e1000_netdev_ops = {
 	.ndo_set_mac_address	= e1000_set_mac,
 	.ndo_tx_timeout		= e1000_tx_timeout,
 	.ndo_change_mtu		= e1000_change_mtu,
-	.ndo_do_ioctl		= e1000_ioctl,
+	.ndo_eth_ioctl		= e1000_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_vlan_rx_add_vid	= e1000_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= e1000_vlan_rx_kill_vid,
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 3c22b509fa79..900b3ab998bd 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -7354,7 +7354,7 @@ static const struct net_device_ops e1000e_netdev_ops = {
 	.ndo_set_rx_mode	= e1000e_set_rx_mode,
 	.ndo_set_mac_address	= e1000_set_mac,
 	.ndo_change_mtu		= e1000_change_mtu,
-	.ndo_do_ioctl		= e1000_ioctl,
+	.ndo_eth_ioctl		= e1000_ioctl,
 	.ndo_tx_timeout		= e1000_tx_timeout,
 	.ndo_validate_addr	= eth_validate_addr,
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 53c1fbeee62a..5b4012a09acb 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -13256,7 +13256,7 @@ static const struct net_device_ops i40e_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= i40e_set_mac,
 	.ndo_change_mtu		= i40e_change_mtu,
-	.ndo_do_ioctl		= i40e_ioctl,
+	.ndo_eth_ioctl		= i40e_ioctl,
 	.ndo_tx_timeout		= i40e_tx_timeout,
 	.ndo_vlan_rx_add_vid	= i40e_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= i40e_vlan_rx_kill_vid,
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index ef8d1815af56..33916ed9e874 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -6558,12 +6558,12 @@ event_after:
 }
 
 /**
- * ice_do_ioctl - Access the hwtstamp interface
+ * ice_eth_ioctl - Access the hwtstamp interface
  * @netdev: network interface device structure
  * @ifr: interface request data
  * @cmd: ioctl command
  */
-static int ice_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
+static int ice_eth_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 {
 	struct ice_netdev_priv *np = netdev_priv(netdev);
 	struct ice_pf *pf = np->vsi->back;
@@ -7229,7 +7229,7 @@ static const struct net_device_ops ice_netdev_ops = {
 	.ndo_change_mtu = ice_change_mtu,
 	.ndo_get_stats64 = ice_get_stats64,
 	.ndo_set_tx_maxrate = ice_set_tx_maxrate,
-	.ndo_do_ioctl = ice_do_ioctl,
+	.ndo_eth_ioctl = ice_eth_ioctl,
 	.ndo_set_vf_spoofchk = ice_set_vf_spoofchk,
 	.ndo_set_vf_mac = ice_set_vf_mac,
 	.ndo_get_vf_config = ice_get_vf_cfg,
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 171a7a629b20..751de06019a0 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2991,7 +2991,7 @@ static const struct net_device_ops igb_netdev_ops = {
 	.ndo_set_rx_mode	= igb_set_rx_mode,
 	.ndo_set_mac_address	= igb_set_mac,
 	.ndo_change_mtu		= igb_change_mtu,
-	.ndo_do_ioctl		= igb_ioctl,
+	.ndo_eth_ioctl		= igb_ioctl,
 	.ndo_tx_timeout		= igb_tx_timeout,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_vlan_rx_add_vid	= igb_vlan_rx_add_vid,
diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c
index 1bbe9862a758..d32e72d953c8 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -2657,7 +2657,7 @@ static const struct net_device_ops igbvf_netdev_ops = {
 	.ndo_set_rx_mode	= igbvf_set_rx_mode,
 	.ndo_set_mac_address	= igbvf_set_mac,
 	.ndo_change_mtu		= igbvf_change_mtu,
-	.ndo_do_ioctl		= igbvf_ioctl,
+	.ndo_eth_ioctl		= igbvf_ioctl,
 	.ndo_tx_timeout		= igbvf_tx_timeout,
 	.ndo_vlan_rx_add_vid	= igbvf_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= igbvf_vlan_rx_kill_vid,
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 5c95bf82eaf7..b7aab35c1132 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -6013,7 +6013,7 @@ static const struct net_device_ops igc_netdev_ops = {
 	.ndo_fix_features	= igc_fix_features,
 	.ndo_set_features	= igc_set_features,
 	.ndo_features_check	= igc_features_check,
-	.ndo_do_ioctl		= igc_ioctl,
+	.ndo_eth_ioctl		= igc_ioctl,
 	.ndo_setup_tc		= igc_setup_tc,
 	.ndo_bpf		= igc_bpf,
 	.ndo_xdp_xmit		= igc_xdp_xmit,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 14aea40da50f..24e06ba6f5e9 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10247,7 +10247,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_set_tx_maxrate	= ixgbe_tx_maxrate,
 	.ndo_vlan_rx_add_vid	= ixgbe_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= ixgbe_vlan_rx_kill_vid,
-	.ndo_do_ioctl		= ixgbe_ioctl,
+	.ndo_eth_ioctl		= ixgbe_ioctl,
 	.ndo_set_vf_mac		= ixgbe_ndo_set_vf_mac,
 	.ndo_set_vf_vlan	= ixgbe_ndo_set_vf_vlan,
 	.ndo_set_vf_rate	= ixgbe_ndo_set_vf_bw,
diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c
index f1b9284e0bea..1251b74fe0e2 100644
--- a/drivers/net/ethernet/jme.c
+++ b/drivers/net/ethernet/jme.c
@@ -2901,7 +2901,7 @@ static const struct net_device_ops jme_netdev_ops = {
 	.ndo_open		= jme_open,
 	.ndo_stop		= jme_close,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= jme_ioctl,
+	.ndo_eth_ioctl		= jme_ioctl,
 	.ndo_start_xmit		= jme_start_xmit,
 	.ndo_set_mac_address	= jme_set_macaddr,
 	.ndo_set_rx_mode	= jme_set_multi,
diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c
index b30a45725374..3e9f324f1061 100644
--- a/drivers/net/ethernet/korina.c
+++ b/drivers/net/ethernet/korina.c
@@ -1272,7 +1272,7 @@ static const struct net_device_ops korina_netdev_ops = {
 	.ndo_start_xmit		= korina_send_packet,
 	.ndo_set_rx_mode	= korina_multicast_list,
 	.ndo_tx_timeout		= korina_tx_timeout,
-	.ndo_do_ioctl		= korina_ioctl,
+	.ndo_eth_ioctl		= korina_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c
index 2d0c52f7106b..62f8c5212182 100644
--- a/drivers/net/ethernet/lantiq_etop.c
+++ b/drivers/net/ethernet/lantiq_etop.c
@@ -609,7 +609,7 @@ static const struct net_device_ops ltq_eth_netdev_ops = {
 	.ndo_stop = ltq_etop_stop,
 	.ndo_start_xmit = ltq_etop_tx,
 	.ndo_change_mtu = ltq_etop_change_mtu,
-	.ndo_do_ioctl = phy_do_ioctl,
+	.ndo_eth_ioctl = phy_do_ioctl,
 	.ndo_set_mac_address = ltq_etop_set_mac_address,
 	.ndo_validate_addr = eth_validate_addr,
 	.ndo_set_rx_mode = ltq_etop_set_multicast_list,
diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index d207bfcaf31d..6502c5c2ebca 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -3060,7 +3060,7 @@ static const struct net_device_ops mv643xx_eth_netdev_ops = {
 	.ndo_set_rx_mode	= mv643xx_eth_set_rx_mode,
 	.ndo_set_mac_address	= mv643xx_eth_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= mv643xx_eth_ioctl,
+	.ndo_eth_ioctl		= mv643xx_eth_ioctl,
 	.ndo_change_mtu		= mv643xx_eth_change_mtu,
 	.ndo_set_features	= mv643xx_eth_set_features,
 	.ndo_tx_timeout		= mv643xx_eth_tx_timeout,
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 975a1a77d445..ff8db311963c 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -4994,7 +4994,7 @@ static const struct net_device_ops mvneta_netdev_ops = {
 	.ndo_change_mtu      = mvneta_change_mtu,
 	.ndo_fix_features    = mvneta_fix_features,
 	.ndo_get_stats64     = mvneta_get_stats64,
-	.ndo_do_ioctl        = mvneta_ioctl,
+	.ndo_eth_ioctl        = mvneta_ioctl,
 	.ndo_bpf	     = mvneta_xdp,
 	.ndo_xdp_xmit        = mvneta_xdp_xmit,
 	.ndo_setup_tc	     = mvneta_setup_tc,
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 878fb17dea41..99bd8b8aa0e2 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -5702,7 +5702,7 @@ static const struct net_device_ops mvpp2_netdev_ops = {
 	.ndo_set_mac_address	= mvpp2_set_mac_address,
 	.ndo_change_mtu		= mvpp2_change_mtu,
 	.ndo_get_stats64	= mvpp2_get_stats64,
-	.ndo_do_ioctl		= mvpp2_ioctl,
+	.ndo_eth_ioctl		= mvpp2_ioctl,
 	.ndo_vlan_rx_add_vid	= mvpp2_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= mvpp2_vlan_rx_kill_vid,
 	.ndo_set_features	= mvpp2_set_features,
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
index f300b807a85b..3f03bbdd8d04 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
@@ -2326,7 +2326,7 @@ static const struct net_device_ops otx2_netdev_ops = {
 	.ndo_set_features	= otx2_set_features,
 	.ndo_tx_timeout		= otx2_tx_timeout,
 	.ndo_get_stats64	= otx2_get_stats64,
-	.ndo_do_ioctl		= otx2_ioctl,
+	.ndo_eth_ioctl		= otx2_ioctl,
 	.ndo_set_vf_mac		= otx2_set_vf_mac,
 	.ndo_set_vf_vlan	= otx2_set_vf_vlan,
 	.ndo_get_vf_config	= otx2_get_vf_config,
diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c
index 9b48ae4bac39..fab53c9b8380 100644
--- a/drivers/net/ethernet/marvell/pxa168_eth.c
+++ b/drivers/net/ethernet/marvell/pxa168_eth.c
@@ -1377,7 +1377,7 @@ static const struct net_device_ops pxa168_eth_netdev_ops = {
 	.ndo_set_rx_mode	= pxa168_eth_set_rx_mode,
 	.ndo_set_mac_address	= pxa168_eth_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= phy_do_ioctl,
+	.ndo_eth_ioctl		= phy_do_ioctl,
 	.ndo_change_mtu		= pxa168_eth_change_mtu,
 	.ndo_tx_timeout		= pxa168_eth_tx_timeout,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c
index d4bb27ba1419..150c06ee3627 100644
--- a/drivers/net/ethernet/marvell/skge.c
+++ b/drivers/net/ethernet/marvell/skge.c
@@ -3787,7 +3787,7 @@ static const struct net_device_ops skge_netdev_ops = {
 	.ndo_open		= skge_up,
 	.ndo_stop		= skge_down,
 	.ndo_start_xmit		= skge_xmit_frame,
-	.ndo_do_ioctl		= skge_ioctl,
+	.ndo_eth_ioctl		= skge_ioctl,
 	.ndo_get_stats		= skge_get_stats,
 	.ndo_tx_timeout		= skge_tx_timeout,
 	.ndo_change_mtu		= skge_change_mtu,
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index 8b8bff59c8fe..743ca96527fa 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -4693,7 +4693,7 @@ static const struct net_device_ops sky2_netdev_ops[2] = {
 	.ndo_open		= sky2_open,
 	.ndo_stop		= sky2_close,
 	.ndo_start_xmit		= sky2_xmit_frame,
-	.ndo_do_ioctl		= sky2_ioctl,
+	.ndo_eth_ioctl		= sky2_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= sky2_set_mac_address,
 	.ndo_set_rx_mode	= sky2_set_multicast,
@@ -4710,7 +4710,7 @@ static const struct net_device_ops sky2_netdev_ops[2] = {
 	.ndo_open		= sky2_open,
 	.ndo_stop		= sky2_close,
 	.ndo_start_xmit		= sky2_xmit_frame,
-	.ndo_do_ioctl		= sky2_ioctl,
+	.ndo_eth_ioctl		= sky2_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= sky2_set_mac_address,
 	.ndo_set_rx_mode	= sky2_set_multicast,
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 64adfd24e134..398c23cec815 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -2933,7 +2933,7 @@ static const struct net_device_ops mtk_netdev_ops = {
 	.ndo_start_xmit		= mtk_start_xmit,
 	.ndo_set_mac_address	= mtk_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= mtk_do_ioctl,
+	.ndo_eth_ioctl		= mtk_do_ioctl,
 	.ndo_change_mtu		= mtk_change_mtu,
 	.ndo_tx_timeout		= mtk_tx_timeout,
 	.ndo_get_stats64        = mtk_get_stats64,
diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c
index 96d2891f1675..1d5dd2015453 100644
--- a/drivers/net/ethernet/mediatek/mtk_star_emac.c
+++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c
@@ -1162,7 +1162,7 @@ static const struct net_device_ops mtk_star_netdev_ops = {
 	.ndo_start_xmit		= mtk_star_netdev_start_xmit,
 	.ndo_get_stats64	= mtk_star_netdev_get_stats64,
 	.ndo_set_rx_mode	= mtk_star_set_rx_mode,
-	.ndo_do_ioctl		= mtk_star_netdev_ioctl,
+	.ndo_eth_ioctl		= mtk_star_netdev_ioctl,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 5d0c9c62382d..a2f61a87cef8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2828,7 +2828,7 @@ static const struct net_device_ops mlx4_netdev_ops = {
 	.ndo_set_mac_address	= mlx4_en_set_mac,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_change_mtu		= mlx4_en_change_mtu,
-	.ndo_do_ioctl		= mlx4_en_ioctl,
+	.ndo_eth_ioctl		= mlx4_en_ioctl,
 	.ndo_tx_timeout		= mlx4_en_tx_timeout,
 	.ndo_vlan_rx_add_vid	= mlx4_en_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= mlx4_en_vlan_rx_kill_vid,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b9a0459b58f1..b6c1e3124f96 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4417,7 +4417,7 @@ const struct net_device_ops mlx5e_netdev_ops = {
 	.ndo_set_features        = mlx5e_set_features,
 	.ndo_fix_features        = mlx5e_fix_features,
 	.ndo_change_mtu          = mlx5e_change_nic_mtu,
-	.ndo_do_ioctl            = mlx5e_ioctl,
+	.ndo_eth_ioctl            = mlx5e_ioctl,
 	.ndo_set_tx_maxrate      = mlx5e_set_tx_maxrate,
 	.ndo_features_check      = mlx5e_features_check,
 	.ndo_tx_timeout          = mlx5e_tx_timeout,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 6535c636ae22..a126cbc6f0d6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -50,7 +50,7 @@ static const struct net_device_ops mlx5i_netdev_ops = {
 	.ndo_init                = mlx5i_dev_init,
 	.ndo_uninit              = mlx5i_dev_cleanup,
 	.ndo_change_mtu          = mlx5i_change_mtu,
-	.ndo_do_ioctl            = mlx5i_ioctl,
+	.ndo_eth_ioctl            = mlx5i_ioctl,
 };
 
 /* IPoIB mlx5 netdev profile */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
index 18ee21b06a00..5308f23702bc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
@@ -149,7 +149,7 @@ static const struct net_device_ops mlx5i_pkey_netdev_ops = {
 	.ndo_get_stats64         = mlx5i_get_stats,
 	.ndo_uninit              = mlx5i_pkey_dev_cleanup,
 	.ndo_change_mtu          = mlx5i_pkey_change_mtu,
-	.ndo_do_ioctl            = mlx5i_pkey_ioctl,
+	.ndo_eth_ioctl            = mlx5i_pkey_ioctl,
 };
 
 /* Child NDOs */
diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
index a0a059e0154f..d22219613719 100644
--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
+++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
@@ -199,7 +199,7 @@ static int mlxbf_gige_stop(struct net_device *netdev)
 	return 0;
 }
 
-static int mlxbf_gige_do_ioctl(struct net_device *netdev,
+static int mlxbf_gige_eth_ioctl(struct net_device *netdev,
 			       struct ifreq *ifr, int cmd)
 {
 	if (!(netif_running(netdev)))
@@ -253,7 +253,7 @@ static const struct net_device_ops mlxbf_gige_netdev_ops = {
 	.ndo_start_xmit		= mlxbf_gige_start_xmit,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= mlxbf_gige_do_ioctl,
+	.ndo_eth_ioctl		= mlxbf_gige_eth_ioctl,
 	.ndo_set_rx_mode        = mlxbf_gige_set_rx_mode,
 	.ndo_get_stats64        = mlxbf_gige_get_stats64,
 };
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 88699e678544..081408e892d5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1207,7 +1207,7 @@ static const struct net_device_ops mlxsw_sp_port_netdev_ops = {
 	.ndo_vlan_rx_kill_vid	= mlxsw_sp_port_kill_vid,
 	.ndo_set_features	= mlxsw_sp_set_features,
 	.ndo_get_devlink_port	= mlxsw_sp_port_get_devlink_port,
-	.ndo_do_ioctl		= mlxsw_sp_port_ioctl,
+	.ndo_eth_ioctl		= mlxsw_sp_port_ioctl,
 };
 
 static int
diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c
index 831518466de2..3f69bb59ba49 100644
--- a/drivers/net/ethernet/micrel/ks8851_common.c
+++ b/drivers/net/ethernet/micrel/ks8851_common.c
@@ -689,7 +689,7 @@ static int ks8851_net_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
 static const struct net_device_ops ks8851_netdev_ops = {
 	.ndo_open		= ks8851_net_open,
 	.ndo_stop		= ks8851_net_stop,
-	.ndo_do_ioctl		= ks8851_net_ioctl,
+	.ndo_eth_ioctl		= ks8851_net_ioctl,
 	.ndo_start_xmit		= ks8851_start_xmit,
 	.ndo_set_mac_address	= ks8851_set_mac_address,
 	.ndo_set_rx_mode	= ks8851_set_rx_mode,
diff --git a/drivers/net/ethernet/micrel/ksz884x.c b/drivers/net/ethernet/micrel/ksz884x.c
index 7945eb5e2fe8..a0ee155f9f51 100644
--- a/drivers/net/ethernet/micrel/ksz884x.c
+++ b/drivers/net/ethernet/micrel/ksz884x.c
@@ -6738,7 +6738,7 @@ static const struct net_device_ops netdev_ops = {
 	.ndo_set_features	= netdev_set_features,
 	.ndo_set_mac_address	= netdev_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= netdev_ioctl,
+	.ndo_eth_ioctl		= netdev_ioctl,
 	.ndo_set_rx_mode	= netdev_set_rx_mode,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= netdev_netpoll,
diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c
index dae10328c6cf..9e8561cdc32a 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -2655,7 +2655,7 @@ static const struct net_device_ops lan743x_netdev_ops = {
 	.ndo_open		= lan743x_netdev_open,
 	.ndo_stop		= lan743x_netdev_close,
 	.ndo_start_xmit		= lan743x_netdev_xmit_frame,
-	.ndo_do_ioctl		= lan743x_netdev_ioctl,
+	.ndo_eth_ioctl		= lan743x_netdev_ioctl,
 	.ndo_set_rx_mode	= lan743x_netdev_set_multicast,
 	.ndo_change_mtu		= lan743x_netdev_change_mtu,
 	.ndo_get_stats64	= lan743x_netdev_get_stats64,
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index c52f175df389..de900ea70fd4 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -823,7 +823,7 @@ static const struct net_device_ops ocelot_port_netdev_ops = {
 	.ndo_vlan_rx_kill_vid		= ocelot_vlan_rx_kill_vid,
 	.ndo_set_features		= ocelot_set_features,
 	.ndo_setup_tc			= ocelot_setup_tc,
-	.ndo_do_ioctl			= ocelot_ioctl,
+	.ndo_eth_ioctl			= ocelot_ioctl,
 	.ndo_get_devlink_port		= ocelot_get_devlink_port,
 };
 
diff --git a/drivers/net/ethernet/natsemi/natsemi.c b/drivers/net/ethernet/natsemi/natsemi.c
index 51b4b25d15ad..bd9d026e609d 100644
--- a/drivers/net/ethernet/natsemi/natsemi.c
+++ b/drivers/net/ethernet/natsemi/natsemi.c
@@ -790,7 +790,7 @@ static const struct net_device_ops natsemi_netdev_ops = {
 	.ndo_get_stats		= get_stats,
 	.ndo_set_rx_mode	= set_rx_mode,
 	.ndo_change_mtu		= natsemi_change_mtu,
-	.ndo_do_ioctl		= netdev_ioctl,
+	.ndo_eth_ioctl		= netdev_ioctl,
 	.ndo_tx_timeout 	= ns_tx_timeout,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
index 0b017d4f5c08..09c0e839cca5 100644
--- a/drivers/net/ethernet/neterion/s2io.c
+++ b/drivers/net/ethernet/neterion/s2io.c
@@ -7625,7 +7625,7 @@ static const struct net_device_ops s2io_netdev_ops = {
 	.ndo_start_xmit    	= s2io_xmit,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= s2io_ndo_set_multicast,
-	.ndo_do_ioctl	   	= s2io_ioctl,
+	.ndo_eth_ioctl		= s2io_ioctl,
 	.ndo_set_mac_address    = s2io_set_mac_addr,
 	.ndo_change_mtu	   	= s2io_change_mtu,
 	.ndo_set_features	= s2io_set_features,
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index 82eef4c72f01..20fb4ad29865 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -3339,7 +3339,7 @@ static const struct net_device_ops vxge_netdev_ops = {
 	.ndo_start_xmit         = vxge_xmit,
 	.ndo_validate_addr      = eth_validate_addr,
 	.ndo_set_rx_mode	= vxge_set_multicast,
-	.ndo_do_ioctl           = vxge_ioctl,
+	.ndo_eth_ioctl           = vxge_ioctl,
 	.ndo_set_mac_address    = vxge_set_mac_addr,
 	.ndo_change_mtu         = vxge_change_mtu,
 	.ndo_fix_features	= vxge_fix_features,
diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c
index 64c6842bd452..d29fe562b3de 100644
--- a/drivers/net/ethernet/nxp/lpc_eth.c
+++ b/drivers/net/ethernet/nxp/lpc_eth.c
@@ -1219,7 +1219,7 @@ static const struct net_device_ops lpc_netdev_ops = {
 	.ndo_stop		= lpc_eth_close,
 	.ndo_start_xmit		= lpc_eth_hard_start_xmit,
 	.ndo_set_rx_mode	= lpc_eth_set_multicast_list,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_set_mac_address	= lpc_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
 };
diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
index e351f3d1608f..bc35d5703bd2 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
@@ -2333,7 +2333,7 @@ static const struct net_device_ops pch_gbe_netdev_ops = {
 	.ndo_tx_timeout = pch_gbe_tx_timeout,
 	.ndo_change_mtu = pch_gbe_change_mtu,
 	.ndo_set_features = pch_gbe_set_features,
-	.ndo_do_ioctl = pch_gbe_ioctl,
+	.ndo_eth_ioctl = pch_gbe_ioctl,
 	.ndo_set_rx_mode = pch_gbe_set_multi,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = pch_gbe_netpoll,
diff --git a/drivers/net/ethernet/packetengines/hamachi.c b/drivers/net/ethernet/packetengines/hamachi.c
index 94823c5f7dff..1a6336a56d3d 100644
--- a/drivers/net/ethernet/packetengines/hamachi.c
+++ b/drivers/net/ethernet/packetengines/hamachi.c
@@ -573,7 +573,7 @@ static const struct net_device_ops hamachi_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_tx_timeout		= hamachi_tx_timeout,
-	.ndo_do_ioctl		= hamachi_ioctl,
+	.ndo_eth_ioctl		= hamachi_ioctl,
 	.ndo_siocdevprivate	= hamachi_siocdevprivate,
 };
 
diff --git a/drivers/net/ethernet/packetengines/yellowfin.c b/drivers/net/ethernet/packetengines/yellowfin.c
index d1dd9bc1bc7f..f5cd8f51be7c 100644
--- a/drivers/net/ethernet/packetengines/yellowfin.c
+++ b/drivers/net/ethernet/packetengines/yellowfin.c
@@ -362,7 +362,7 @@ static const struct net_device_ops netdev_ops = {
 	.ndo_set_rx_mode	= set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address 	= eth_mac_addr,
-	.ndo_do_ioctl 		= netdev_ioctl,
+	.ndo_eth_ioctl		= netdev_ioctl,
 	.ndo_tx_timeout 	= yellowfin_tx_timeout,
 };
 
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c
index af3a5368529c..537c2907b91e 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c
@@ -2264,7 +2264,7 @@ static int ionic_stop(struct net_device *netdev)
 	return 0;
 }
 
-static int ionic_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
+static int ionic_eth_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 {
 	struct ionic_lif *lif = netdev_priv(netdev);
 
@@ -2526,7 +2526,7 @@ static int ionic_set_vf_link_state(struct net_device *netdev, int vf, int set)
 static const struct net_device_ops ionic_netdev_ops = {
 	.ndo_open               = ionic_open,
 	.ndo_stop               = ionic_stop,
-	.ndo_do_ioctl		= ionic_do_ioctl,
+	.ndo_eth_ioctl		= ionic_eth_ioctl,
 	.ndo_start_xmit		= ionic_start_xmit,
 	.ndo_get_stats64	= ionic_get_stats64,
 	.ndo_set_rx_mode	= ionic_ndo_set_rx_mode,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 01ac1e93d27a..173878696143 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -644,7 +644,7 @@ static const struct net_device_ops qede_netdev_ops = {
 	.ndo_set_mac_address	= qede_set_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_change_mtu		= qede_change_mtu,
-	.ndo_do_ioctl		= qede_ioctl,
+	.ndo_eth_ioctl		= qede_ioctl,
 	.ndo_tx_timeout		= qede_tx_timeout,
 #ifdef CONFIG_QED_SRIOV
 	.ndo_set_vf_mac		= qede_set_vf_mac,
diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c
index ad655f0a4965..9015a38eaced 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -377,7 +377,7 @@ static const struct net_device_ops emac_netdev_ops = {
 	.ndo_start_xmit		= emac_start_xmit,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_change_mtu		= emac_change_mtu,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_tx_timeout		= emac_tx_timeout,
 	.ndo_get_stats64	= emac_get_stats64,
 	.ndo_set_features       = emac_set_features,
diff --git a/drivers/net/ethernet/rdc/r6040.c b/drivers/net/ethernet/rdc/r6040.c
index 47e9998b62f0..4b2eca5e08e2 100644
--- a/drivers/net/ethernet/rdc/r6040.c
+++ b/drivers/net/ethernet/rdc/r6040.c
@@ -954,7 +954,7 @@ static const struct net_device_ops r6040_netdev_ops = {
 	.ndo_set_rx_mode	= r6040_multicast_list,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
-	.ndo_do_ioctl		= phy_do_ioctl,
+	.ndo_eth_ioctl		= phy_do_ioctl,
 	.ndo_tx_timeout		= r6040_tx_timeout,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= r6040_poll_controller,
diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c
index 9677e257e9a1..edc61906694f 100644
--- a/drivers/net/ethernet/realtek/8139cp.c
+++ b/drivers/net/ethernet/realtek/8139cp.c
@@ -1869,7 +1869,7 @@ static const struct net_device_ops cp_netdev_ops = {
 	.ndo_set_mac_address 	= cp_set_mac_address,
 	.ndo_set_rx_mode	= cp_set_rx_mode,
 	.ndo_get_stats		= cp_get_stats,
-	.ndo_do_ioctl		= cp_ioctl,
+	.ndo_eth_ioctl		= cp_ioctl,
 	.ndo_start_xmit		= cp_start_xmit,
 	.ndo_tx_timeout		= cp_tx_timeout,
 	.ndo_set_features	= cp_set_features,
diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
index f0608f050050..2e6923cc653e 100644
--- a/drivers/net/ethernet/realtek/8139too.c
+++ b/drivers/net/ethernet/realtek/8139too.c
@@ -932,7 +932,7 @@ static const struct net_device_ops rtl8139_netdev_ops = {
 	.ndo_set_mac_address 	= rtl8139_set_mac_address,
 	.ndo_start_xmit		= rtl8139_start_xmit,
 	.ndo_set_rx_mode	= rtl8139_set_rx_mode,
-	.ndo_do_ioctl		= netdev_ioctl,
+	.ndo_eth_ioctl		= netdev_ioctl,
 	.ndo_tx_timeout		= rtl8139_tx_timeout,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= rtl8139_poll_controller,
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index c7af5bc3b8af..fa2dab6980bb 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -4979,7 +4979,7 @@ static const struct net_device_ops rtl_netdev_ops = {
 	.ndo_fix_features	= rtl8169_fix_features,
 	.ndo_set_features	= rtl8169_set_features,
 	.ndo_set_mac_address	= rtl_set_mac_address,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_set_rx_mode	= rtl_set_rx_mode,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= rtl8169_netpoll,
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 805397088850..f4dfe9f71d06 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -1872,7 +1872,7 @@ static const struct net_device_ops ravb_netdev_ops = {
 	.ndo_get_stats		= ravb_get_stats,
 	.ndo_set_rx_mode	= ravb_set_rx_mode,
 	.ndo_tx_timeout		= ravb_tx_timeout,
-	.ndo_do_ioctl		= ravb_do_ioctl,
+	.ndo_eth_ioctl		= ravb_do_ioctl,
 	.ndo_change_mtu		= ravb_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 840478692a37..6c8ba916d1a6 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -3141,7 +3141,7 @@ static const struct net_device_ops sh_eth_netdev_ops = {
 	.ndo_get_stats		= sh_eth_get_stats,
 	.ndo_set_rx_mode	= sh_eth_set_rx_mode,
 	.ndo_tx_timeout		= sh_eth_tx_timeout,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_change_mtu		= sh_eth_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
@@ -3157,7 +3157,7 @@ static const struct net_device_ops sh_eth_netdev_ops_tsu = {
 	.ndo_vlan_rx_add_vid	= sh_eth_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= sh_eth_vlan_rx_kill_vid,
 	.ndo_tx_timeout		= sh_eth_tx_timeout,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_change_mtu		= sh_eth_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
index 090bcd2fb758..6781aa636d58 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
@@ -1964,7 +1964,7 @@ static const struct net_device_ops sxgbe_netdev_ops = {
 	.ndo_set_features	= sxgbe_set_features,
 	.ndo_set_rx_mode	= sxgbe_set_rx_mode,
 	.ndo_tx_timeout		= sxgbe_tx_timeout,
-	.ndo_do_ioctl		= sxgbe_ioctl,
+	.ndo_eth_ioctl		= sxgbe_ioctl,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= sxgbe_poll_controller,
 #endif
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 37fcf2eb0741..a295e2621cf3 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -591,7 +591,7 @@ static const struct net_device_ops efx_netdev_ops = {
 	.ndo_tx_timeout		= efx_watchdog,
 	.ndo_start_xmit		= efx_hard_start_xmit,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= efx_ioctl,
+	.ndo_eth_ioctl		= efx_ioctl,
 	.ndo_change_mtu		= efx_change_mtu,
 	.ndo_set_mac_address	= efx_set_mac_address,
 	.ndo_set_rx_mode	= efx_set_rx_mode,
diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c
index 9ec752a43c75..c177ea0f301e 100644
--- a/drivers/net/ethernet/sfc/falcon/efx.c
+++ b/drivers/net/ethernet/sfc/falcon/efx.c
@@ -2219,7 +2219,7 @@ static const struct net_device_ops ef4_netdev_ops = {
 	.ndo_tx_timeout		= ef4_watchdog,
 	.ndo_start_xmit		= ef4_hard_start_xmit,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= ef4_ioctl,
+	.ndo_eth_ioctl		= ef4_ioctl,
 	.ndo_change_mtu		= ef4_change_mtu,
 	.ndo_set_mac_address	= ef4_set_mac_address,
 	.ndo_set_rx_mode	= ef4_set_rx_mode,
diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index 2b29fd4cbdf4..062f7844c496 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -820,7 +820,7 @@ static const struct net_device_ops ioc3_netdev_ops = {
 	.ndo_tx_timeout		= ioc3_timeout,
 	.ndo_get_stats		= ioc3_get_stats,
 	.ndo_set_rx_mode	= ioc3_set_multicast_list,
-	.ndo_do_ioctl		= ioc3_ioctl,
+	.ndo_eth_ioctl		= ioc3_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= ioc3_set_mac_address,
 };
diff --git a/drivers/net/ethernet/sgi/meth.c b/drivers/net/ethernet/sgi/meth.c
index 0c396ecd3389..efce834d8ee6 100644
--- a/drivers/net/ethernet/sgi/meth.c
+++ b/drivers/net/ethernet/sgi/meth.c
@@ -812,7 +812,7 @@ static const struct net_device_ops meth_netdev_ops = {
 	.ndo_open		= meth_open,
 	.ndo_stop		= meth_release,
 	.ndo_start_xmit		= meth_tx,
-	.ndo_do_ioctl		= meth_ioctl,
+	.ndo_eth_ioctl		= meth_ioctl,
 	.ndo_tx_timeout		= meth_tx_timeout,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
diff --git a/drivers/net/ethernet/sis/sis190.c b/drivers/net/ethernet/sis/sis190.c
index 676b193833c0..3d1a18a01ce5 100644
--- a/drivers/net/ethernet/sis/sis190.c
+++ b/drivers/net/ethernet/sis/sis190.c
@@ -1841,7 +1841,7 @@ static int sis190_mac_addr(struct net_device  *dev, void *p)
 static const struct net_device_ops sis190_netdev_ops = {
 	.ndo_open		= sis190_open,
 	.ndo_stop		= sis190_close,
-	.ndo_do_ioctl		= sis190_ioctl,
+	.ndo_eth_ioctl		= sis190_ioctl,
 	.ndo_start_xmit		= sis190_start_xmit,
 	.ndo_tx_timeout		= sis190_tx_timeout,
 	.ndo_set_rx_mode	= sis190_set_rx_mode,
diff --git a/drivers/net/ethernet/sis/sis900.c b/drivers/net/ethernet/sis/sis900.c
index ca9c00b7f588..ec6f7f993eb7 100644
--- a/drivers/net/ethernet/sis/sis900.c
+++ b/drivers/net/ethernet/sis/sis900.c
@@ -404,7 +404,7 @@ static const struct net_device_ops sis900_netdev_ops = {
 	.ndo_set_rx_mode	= set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address 	= eth_mac_addr,
-	.ndo_do_ioctl		= mii_ioctl,
+	.ndo_eth_ioctl		= mii_ioctl,
 	.ndo_tx_timeout		= sis900_tx_timeout,
 #ifdef CONFIG_NET_POLL_CONTROLLER
         .ndo_poll_controller	= sis900_poll,
diff --git a/drivers/net/ethernet/smsc/epic100.c b/drivers/net/ethernet/smsc/epic100.c
index 51cd7dca91cd..44daf79a8f97 100644
--- a/drivers/net/ethernet/smsc/epic100.c
+++ b/drivers/net/ethernet/smsc/epic100.c
@@ -312,7 +312,7 @@ static const struct net_device_ops epic_netdev_ops = {
 	.ndo_tx_timeout 	= epic_tx_timeout,
 	.ndo_get_stats		= epic_get_stats,
 	.ndo_set_rx_mode	= set_rx_mode,
-	.ndo_do_ioctl 		= netdev_ioctl,
+	.ndo_eth_ioctl		= netdev_ioctl,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 };
diff --git a/drivers/net/ethernet/smsc/smc91c92_cs.c b/drivers/net/ethernet/smsc/smc91c92_cs.c
index f2a50eb3c1e0..42fc37c7887a 100644
--- a/drivers/net/ethernet/smsc/smc91c92_cs.c
+++ b/drivers/net/ethernet/smsc/smc91c92_cs.c
@@ -294,7 +294,7 @@ static const struct net_device_ops smc_netdev_ops = {
 	.ndo_tx_timeout 	= smc_tx_timeout,
 	.ndo_set_config 	= s9k_config,
 	.ndo_set_rx_mode	= set_rx_mode,
-	.ndo_do_ioctl		= smc_ioctl,
+	.ndo_eth_ioctl		= smc_ioctl,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 };
diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c
index 556a9790cdcf..199a97339280 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -2148,7 +2148,7 @@ static const struct net_device_ops smsc911x_netdev_ops = {
 	.ndo_start_xmit		= smsc911x_hard_start_xmit,
 	.ndo_get_stats		= smsc911x_get_stats,
 	.ndo_set_rx_mode	= smsc911x_set_multicast_list,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address 	= smsc911x_set_mac_address,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/smsc/smsc9420.c b/drivers/net/ethernet/smsc/smsc9420.c
index c1dab009415d..fdbd2a43e267 100644
--- a/drivers/net/ethernet/smsc/smsc9420.c
+++ b/drivers/net/ethernet/smsc/smsc9420.c
@@ -1482,7 +1482,7 @@ static const struct net_device_ops smsc9420_netdev_ops = {
 	.ndo_start_xmit		= smsc9420_hard_start_xmit,
 	.ndo_get_stats		= smsc9420_get_stats,
 	.ndo_set_rx_mode	= smsc9420_set_multicast_list,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address 	= eth_mac_addr,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index 20d148c019d8..d15f7b3a3f10 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -1831,7 +1831,7 @@ static const struct net_device_ops netsec_netdev_ops = {
 	.ndo_set_features	= netsec_netdev_set_features,
 	.ndo_set_mac_address    = eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= phy_do_ioctl,
+	.ndo_eth_ioctl		= phy_do_ioctl,
 	.ndo_xdp_xmit		= netsec_xdp_xmit,
 	.ndo_bpf		= netsec_xdp,
 };
diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c
index 5eb6bb4f7b6c..ae31ed93aaf0 100644
--- a/drivers/net/ethernet/socionext/sni_ave.c
+++ b/drivers/net/ethernet/socionext/sni_ave.c
@@ -1543,7 +1543,7 @@ static const struct net_device_ops ave_netdev_ops = {
 	.ndo_open		= ave_open,
 	.ndo_stop		= ave_stop,
 	.ndo_start_xmit		= ave_start_xmit,
-	.ndo_do_ioctl		= ave_ioctl,
+	.ndo_eth_ioctl		= ave_ioctl,
 	.ndo_set_rx_mode	= ave_set_rx_mode,
 	.ndo_get_stats64	= ave_get_stats64,
 	.ndo_set_mac_address	= ave_set_mac_address,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 7b8404a21544..a2aa75cb184e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -6451,7 +6451,7 @@ static const struct net_device_ops stmmac_netdev_ops = {
 	.ndo_set_features = stmmac_set_features,
 	.ndo_set_rx_mode = stmmac_set_rx_mode,
 	.ndo_tx_timeout = stmmac_tx_timeout,
-	.ndo_do_ioctl = stmmac_ioctl,
+	.ndo_eth_ioctl = stmmac_ioctl,
 	.ndo_setup_tc = stmmac_setup_tc,
 	.ndo_select_queue = stmmac_select_queue,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c
index 981685c88308..287ae4c538aa 100644
--- a/drivers/net/ethernet/sun/cassini.c
+++ b/drivers/net/ethernet/sun/cassini.c
@@ -4876,7 +4876,7 @@ static const struct net_device_ops cas_netdev_ops = {
 	.ndo_start_xmit		= cas_start_xmit,
 	.ndo_get_stats 		= cas_get_stats,
 	.ndo_set_rx_mode	= cas_set_multicast,
-	.ndo_do_ioctl		= cas_ioctl,
+	.ndo_eth_ioctl		= cas_ioctl,
 	.ndo_tx_timeout		= cas_tx_timeout,
 	.ndo_change_mtu		= cas_change_mtu,
 	.ndo_set_mac_address	= eth_mac_addr,
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index 74e748662ec0..006fd4237725 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -9667,7 +9667,7 @@ static const struct net_device_ops niu_netdev_ops = {
 	.ndo_set_rx_mode	= niu_set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= niu_set_mac_addr,
-	.ndo_do_ioctl		= niu_ioctl,
+	.ndo_eth_ioctl		= niu_ioctl,
 	.ndo_tx_timeout		= niu_tx_timeout,
 	.ndo_change_mtu		= niu_change_mtu,
 };
diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c
index cfb9e21b18b7..d72018a60c0f 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -2831,7 +2831,7 @@ static const struct net_device_ops gem_netdev_ops = {
 	.ndo_start_xmit		= gem_start_xmit,
 	.ndo_get_stats		= gem_get_stats,
 	.ndo_set_rx_mode	= gem_set_multicast,
-	.ndo_do_ioctl		= gem_ioctl,
+	.ndo_eth_ioctl		= gem_ioctl,
 	.ndo_tx_timeout		= gem_tx_timeout,
 	.ndo_change_mtu		= gem_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
index 26d178f8616b..1db7104fef3a 100644
--- a/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
+++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
@@ -933,7 +933,7 @@ static const struct net_device_ops xlgmac_netdev_ops = {
 	.ndo_change_mtu		= xlgmac_change_mtu,
 	.ndo_set_mac_address	= xlgmac_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= xlgmac_ioctl,
+	.ndo_eth_ioctl		= xlgmac_ioctl,
 	.ndo_vlan_rx_add_vid	= xlgmac_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= xlgmac_vlan_rx_kill_vid,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index 229e2f09d605..dffb6839f0fa 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -1480,7 +1480,7 @@ static const struct net_device_ops am65_cpsw_nuss_netdev_ops = {
 	.ndo_tx_timeout		= am65_cpsw_nuss_ndo_host_tx_timeout,
 	.ndo_vlan_rx_add_vid	= am65_cpsw_nuss_ndo_slave_add_vid,
 	.ndo_vlan_rx_kill_vid	= am65_cpsw_nuss_ndo_slave_kill_vid,
-	.ndo_do_ioctl		= am65_cpsw_nuss_ndo_slave_ioctl,
+	.ndo_eth_ioctl		= am65_cpsw_nuss_ndo_slave_ioctl,
 	.ndo_setup_tc           = am65_cpsw_qos_ndo_setup_tc,
 	.ndo_get_devlink_port   = am65_cpsw_ndo_get_devlink_port,
 };
diff --git a/drivers/net/ethernet/ti/cpmac.c b/drivers/net/ethernet/ti/cpmac.c
index c20715107075..02d4e51f7306 100644
--- a/drivers/net/ethernet/ti/cpmac.c
+++ b/drivers/net/ethernet/ti/cpmac.c
@@ -1044,7 +1044,7 @@ static const struct net_device_ops cpmac_netdev_ops = {
 	.ndo_start_xmit		= cpmac_start_xmit,
 	.ndo_tx_timeout		= cpmac_tx_timeout,
 	.ndo_set_rx_mode	= cpmac_set_multicast_list,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
 };
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index cbbd0f665796..abf9a2a6f7eb 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1159,7 +1159,7 @@ static const struct net_device_ops cpsw_netdev_ops = {
 	.ndo_stop		= cpsw_ndo_stop,
 	.ndo_start_xmit		= cpsw_ndo_start_xmit,
 	.ndo_set_mac_address	= cpsw_ndo_set_mac_address,
-	.ndo_do_ioctl		= cpsw_ndo_ioctl,
+	.ndo_eth_ioctl		= cpsw_ndo_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_tx_timeout		= cpsw_ndo_tx_timeout,
 	.ndo_set_rx_mode	= cpsw_ndo_set_rx_mode,
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 4448a91cce54..b4f55ff4e84f 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -1128,7 +1128,7 @@ static const struct net_device_ops cpsw_netdev_ops = {
 	.ndo_stop		= cpsw_ndo_stop,
 	.ndo_start_xmit		= cpsw_ndo_start_xmit,
 	.ndo_set_mac_address	= cpsw_ndo_set_mac_address,
-	.ndo_do_ioctl		= cpsw_ndo_ioctl,
+	.ndo_eth_ioctl		= cpsw_ndo_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_tx_timeout		= cpsw_ndo_tx_timeout,
 	.ndo_set_rx_mode	= cpsw_ndo_set_rx_mode,
diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index c674e34b6839..637796670746 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -1670,7 +1670,7 @@ static const struct net_device_ops emac_netdev_ops = {
 	.ndo_start_xmit		= emac_dev_xmit,
 	.ndo_set_rx_mode	= emac_dev_mcast_set,
 	.ndo_set_mac_address	= emac_dev_setmac_addr,
-	.ndo_do_ioctl		= emac_devioctl,
+	.ndo_eth_ioctl		= emac_devioctl,
 	.ndo_tx_timeout		= emac_dev_tx_timeout,
 	.ndo_get_stats		= emac_dev_getnetstats,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 97942b0e3897..eda2961c0fe2 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1944,7 +1944,7 @@ static const struct net_device_ops netcp_netdev_ops = {
 	.ndo_stop		= netcp_ndo_stop,
 	.ndo_start_xmit		= netcp_ndo_start_xmit,
 	.ndo_set_rx_mode	= netcp_set_rx_mode,
-	.ndo_do_ioctl           = netcp_ndo_ioctl,
+	.ndo_eth_ioctl           = netcp_ndo_ioctl,
 	.ndo_get_stats64        = netcp_get_stats,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c
index e0cb713193ea..77c448ad67ce 100644
--- a/drivers/net/ethernet/ti/tlan.c
+++ b/drivers/net/ethernet/ti/tlan.c
@@ -749,7 +749,7 @@ static const struct net_device_ops tlan_netdev_ops = {
 	.ndo_tx_timeout		= tlan_tx_timeout,
 	.ndo_get_stats		= tlan_get_stats,
 	.ndo_set_rx_mode	= tlan_set_multicast_list,
-	.ndo_do_ioctl		= tlan_ioctl,
+	.ndo_eth_ioctl		= tlan_ioctl,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c
index 226a76633e65..087f0af56c50 100644
--- a/drivers/net/ethernet/toshiba/spider_net.c
+++ b/drivers/net/ethernet/toshiba/spider_net.c
@@ -2214,7 +2214,7 @@ static const struct net_device_ops spider_net_ops = {
 	.ndo_start_xmit		= spider_net_xmit,
 	.ndo_set_rx_mode	= spider_net_set_multi,
 	.ndo_set_mac_address	= spider_net_set_mac,
-	.ndo_do_ioctl		= spider_net_do_ioctl,
+	.ndo_eth_ioctl		= spider_net_do_ioctl,
 	.ndo_tx_timeout		= spider_net_tx_timeout,
 	.ndo_validate_addr	= eth_validate_addr,
 	/* HW VLAN */
diff --git a/drivers/net/ethernet/toshiba/tc35815.c b/drivers/net/ethernet/toshiba/tc35815.c
index fedb2bf69261..52245ac60fc7 100644
--- a/drivers/net/ethernet/toshiba/tc35815.c
+++ b/drivers/net/ethernet/toshiba/tc35815.c
@@ -750,7 +750,7 @@ static const struct net_device_ops tc35815_netdev_ops = {
 	.ndo_get_stats		= tc35815_get_stats,
 	.ndo_set_rx_mode	= tc35815_set_multicast_list,
 	.ndo_tx_timeout		= tc35815_tx_timeout,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/tundra/tsi108_eth.c b/drivers/net/ethernet/tundra/tsi108_eth.c
index c62f474b6d08..cf0917b29e30 100644
--- a/drivers/net/ethernet/tundra/tsi108_eth.c
+++ b/drivers/net/ethernet/tundra/tsi108_eth.c
@@ -1538,7 +1538,7 @@ static const struct net_device_ops tsi108_netdev_ops = {
 	.ndo_start_xmit		= tsi108_send_packet,
 	.ndo_set_rx_mode	= tsi108_set_rx_mode,
 	.ndo_get_stats		= tsi108_get_stats,
-	.ndo_do_ioctl		= tsi108_do_ioctl,
+	.ndo_eth_ioctl		= tsi108_do_ioctl,
 	.ndo_set_mac_address	= tsi108_set_mac,
 	.ndo_validate_addr	= eth_validate_addr,
 };
diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c
index 73ca597ebd1b..961b623b7880 100644
--- a/drivers/net/ethernet/via/via-rhine.c
+++ b/drivers/net/ethernet/via/via-rhine.c
@@ -884,7 +884,7 @@ static const struct net_device_ops rhine_netdev_ops = {
 	.ndo_set_rx_mode	 = rhine_set_rx_mode,
 	.ndo_validate_addr	 = eth_validate_addr,
 	.ndo_set_mac_address 	 = eth_mac_addr,
-	.ndo_do_ioctl		 = netdev_ioctl,
+	.ndo_eth_ioctl		 = netdev_ioctl,
 	.ndo_tx_timeout 	 = rhine_tx_timeout,
 	.ndo_vlan_rx_add_vid	 = rhine_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	 = rhine_vlan_rx_kill_vid,
diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c
index 88426b5e410b..278f49518d3f 100644
--- a/drivers/net/ethernet/via/via-velocity.c
+++ b/drivers/net/ethernet/via/via-velocity.c
@@ -2637,7 +2637,7 @@ static const struct net_device_ops velocity_netdev_ops = {
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_set_rx_mode	= velocity_set_multi,
 	.ndo_change_mtu		= velocity_change_mtu,
-	.ndo_do_ioctl		= velocity_ioctl,
+	.ndo_eth_ioctl		= velocity_ioctl,
 	.ndo_vlan_rx_add_vid	= velocity_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= velocity_vlan_rx_kill_vid,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 60a4f79b8fa1..db1994fb51c5 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -1237,7 +1237,7 @@ static const struct net_device_ops temac_netdev_ops = {
 	.ndo_set_rx_mode = temac_set_multicast_list,
 	.ndo_set_mac_address = temac_set_mac_address,
 	.ndo_validate_addr = eth_validate_addr,
-	.ndo_do_ioctl = phy_do_ioctl_running,
+	.ndo_eth_ioctl = phy_do_ioctl_running,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = temac_poll_controller,
 #endif
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 13cd799541aa..348c0ba5edcf 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -1227,7 +1227,7 @@ static const struct net_device_ops axienet_netdev_ops = {
 	.ndo_change_mtu	= axienet_change_mtu,
 	.ndo_set_mac_address = netdev_set_mac_address,
 	.ndo_validate_addr = eth_validate_addr,
-	.ndo_do_ioctl = axienet_ioctl,
+	.ndo_eth_ioctl = axienet_ioctl,
 	.ndo_set_rx_mode = axienet_set_multicast_list,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = axienet_poll_controller,
diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
index b06377fe7293..b780aad3550a 100644
--- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c
+++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
@@ -1263,7 +1263,7 @@ static const struct net_device_ops xemaclite_netdev_ops = {
 	.ndo_start_xmit		= xemaclite_send,
 	.ndo_set_mac_address	= xemaclite_set_mac_address,
 	.ndo_tx_timeout		= xemaclite_tx_timeout,
-	.ndo_do_ioctl		= xemaclite_ioctl,
+	.ndo_eth_ioctl		= xemaclite_ioctl,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = xemaclite_poll_controller,
 #endif
diff --git a/drivers/net/ethernet/xircom/xirc2ps_cs.c b/drivers/net/ethernet/xircom/xirc2ps_cs.c
index 4f6db6f5c272..ae611e46da6a 100644
--- a/drivers/net/ethernet/xircom/xirc2ps_cs.c
+++ b/drivers/net/ethernet/xircom/xirc2ps_cs.c
@@ -464,7 +464,7 @@ static const struct net_device_ops netdev_ops = {
 	.ndo_start_xmit		= do_start_xmit,
 	.ndo_tx_timeout 	= xirc_tx_timeout,
 	.ndo_set_config		= do_config,
-	.ndo_do_ioctl		= do_ioctl,
+	.ndo_eth_ioctl		= do_ioctl,
 	.ndo_set_rx_mode	= set_multicast_list,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c
index 7ae754eadf22..ff50305d6e13 100644
--- a/drivers/net/ethernet/xscale/ixp4xx_eth.c
+++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c
@@ -1357,7 +1357,7 @@ static const struct net_device_ops ixp4xx_netdev_ops = {
 	.ndo_stop = eth_close,
 	.ndo_start_xmit = eth_xmit,
 	.ndo_set_rx_mode = eth_set_mcast_list,
-	.ndo_do_ioctl = eth_ioctl,
+	.ndo_eth_ioctl = eth_ioctl,
 	.ndo_set_mac_address = eth_mac_addr,
 	.ndo_validate_addr = eth_validate_addr,
 };
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 80de9768ecd4..35f46ad040b0 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -829,7 +829,7 @@ static int macvlan_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
-static int macvlan_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int macvlan_eth_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
 	struct net_device *real_dev = macvlan_dev_real_dev(dev);
 	const struct net_device_ops *ops = real_dev->netdev_ops;
@@ -845,8 +845,8 @@ static int macvlan_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			break;
 		fallthrough;
 	case SIOCGHWTSTAMP:
-		if (netif_device_present(real_dev) && ops->ndo_do_ioctl)
-			err = ops->ndo_do_ioctl(real_dev, &ifrr, cmd);
+		if (netif_device_present(real_dev) && ops->ndo_eth_ioctl)
+			err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd);
 		break;
 	}
 
@@ -1151,7 +1151,7 @@ static const struct net_device_ops macvlan_netdev_ops = {
 	.ndo_stop		= macvlan_stop,
 	.ndo_start_xmit		= macvlan_start_xmit,
 	.ndo_change_mtu		= macvlan_change_mtu,
-	.ndo_do_ioctl		= macvlan_do_ioctl,
+	.ndo_eth_ioctl		= macvlan_eth_ioctl,
 	.ndo_fix_features	= macvlan_fix_features,
 	.ndo_change_rx_flags	= macvlan_change_rx_flags,
 	.ndo_set_mac_address	= macvlan_set_mac_address,
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 8eeb26d8aeb7..f124a8a58bd4 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -426,7 +426,7 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd)
 EXPORT_SYMBOL(phy_mii_ioctl);
 
 /**
- * phy_do_ioctl - generic ndo_do_ioctl implementation
+ * phy_do_ioctl - generic ndo_eth_ioctl implementation
  * @dev: the net_device struct
  * @ifr: &struct ifreq for socket ioctl's
  * @cmd: ioctl cmd to execute
@@ -441,7 +441,7 @@ int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 EXPORT_SYMBOL(phy_do_ioctl);
 
 /**
- * phy_do_ioctl_running - generic ndo_do_ioctl implementation but test first
+ * phy_do_ioctl_running - generic ndo_eth_ioctl implementation but test first
  *
  * @dev: the net_device struct
  * @ifr: &struct ifreq for socket ioctl's
diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c
index 9b914765c2de..cb01897c7a5d 100644
--- a/drivers/net/usb/asix_devices.c
+++ b/drivers/net/usb/asix_devices.c
@@ -197,7 +197,7 @@ static const struct net_device_ops ax88172_netdev_ops = {
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= asix_ioctl,
+	.ndo_eth_ioctl		= asix_ioctl,
 	.ndo_set_rx_mode	= ax88172_set_multicast,
 };
 
@@ -589,7 +589,7 @@ static const struct net_device_ops ax88772_netdev_ops = {
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_set_mac_address 	= asix_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_set_rx_mode        = asix_set_multicast,
 };
 
@@ -1095,7 +1095,7 @@ static const struct net_device_ops ax88178_netdev_ops = {
 	.ndo_set_mac_address 	= asix_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= asix_set_multicast,
-	.ndo_do_ioctl 		= asix_ioctl,
+	.ndo_eth_ioctl		= asix_ioctl,
 	.ndo_change_mtu 	= ax88178_change_mtu,
 };
 
diff --git a/drivers/net/usb/ax88172a.c b/drivers/net/usb/ax88172a.c
index 530947d7477b..d9777d9a7c5d 100644
--- a/drivers/net/usb/ax88172a.c
+++ b/drivers/net/usb/ax88172a.c
@@ -109,7 +109,7 @@ static const struct net_device_ops ax88172a_netdev_ops = {
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_set_mac_address	= asix_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_set_rx_mode        = asix_set_multicast,
 };
 
diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c
index c1316718304d..f25448a08870 100644
--- a/drivers/net/usb/ax88179_178a.c
+++ b/drivers/net/usb/ax88179_178a.c
@@ -1035,7 +1035,7 @@ static const struct net_device_ops ax88179_netdev_ops = {
 	.ndo_change_mtu		= ax88179_change_mtu,
 	.ndo_set_mac_address	= ax88179_set_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= ax88179_ioctl,
+	.ndo_eth_ioctl		= ax88179_ioctl,
 	.ndo_set_rx_mode	= ax88179_set_multicast,
 	.ndo_set_features	= ax88179_set_features,
 };
diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c
index 89cc61d7a675..907f98b1eefe 100644
--- a/drivers/net/usb/dm9601.c
+++ b/drivers/net/usb/dm9601.c
@@ -345,7 +345,7 @@ static const struct net_device_ops dm9601_netdev_ops = {
 	.ndo_change_mtu		= usbnet_change_mtu,
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl 		= dm9601_ioctl,
+	.ndo_eth_ioctl		= dm9601_ioctl,
 	.ndo_set_rx_mode	= dm9601_set_multicast,
 	.ndo_set_mac_address	= dm9601_set_mac_address,
 };
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 25489389ea49..13f86368b78a 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -3601,7 +3601,7 @@ static const struct net_device_ops lan78xx_netdev_ops = {
 	.ndo_change_mtu		= lan78xx_change_mtu,
 	.ndo_set_mac_address	= lan78xx_set_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= phy_do_ioctl_running,
+	.ndo_eth_ioctl		= phy_do_ioctl_running,
 	.ndo_set_rx_mode	= lan78xx_set_multicast,
 	.ndo_set_features	= lan78xx_set_features,
 	.ndo_vlan_rx_add_vid	= lan78xx_vlan_rx_add_vid,
diff --git a/drivers/net/usb/mcs7830.c b/drivers/net/usb/mcs7830.c
index 2469bdcb1a04..66866bef25df 100644
--- a/drivers/net/usb/mcs7830.c
+++ b/drivers/net/usb/mcs7830.c
@@ -464,7 +464,7 @@ static const struct net_device_ops mcs7830_netdev_ops = {
 	.ndo_change_mtu		= usbnet_change_mtu,
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl 		= mcs7830_ioctl,
+	.ndo_eth_ioctl		= mcs7830_ioctl,
 	.ndo_set_rx_mode	= mcs7830_set_multicast,
 	.ndo_set_mac_address	= mcs7830_set_mac_address,
 };
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index e09b107b5c99..d7fbc81b518a 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -9173,7 +9173,7 @@ static int rtl8152_change_mtu(struct net_device *dev, int new_mtu)
 static const struct net_device_ops rtl8152_netdev_ops = {
 	.ndo_open		= rtl8152_open,
 	.ndo_stop		= rtl8152_close,
-	.ndo_do_ioctl		= rtl8152_ioctl,
+	.ndo_eth_ioctl		= rtl8152_ioctl,
 	.ndo_start_xmit		= rtl8152_start_xmit,
 	.ndo_tx_timeout		= rtl8152_tx_timeout,
 	.ndo_set_features	= rtl8152_set_features,
diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c
index 13141dbfa3a8..76f7af161313 100644
--- a/drivers/net/usb/smsc75xx.c
+++ b/drivers/net/usb/smsc75xx.c
@@ -1439,7 +1439,7 @@ static const struct net_device_ops smsc75xx_netdev_ops = {
 	.ndo_change_mtu		= smsc75xx_change_mtu,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl 		= smsc75xx_ioctl,
+	.ndo_eth_ioctl		= smsc75xx_ioctl,
 	.ndo_set_rx_mode	= smsc75xx_set_multicast,
 	.ndo_set_features	= smsc75xx_set_features,
 };
diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
index 4c8ee1cff4d4..7d953974eb9b 100644
--- a/drivers/net/usb/smsc95xx.c
+++ b/drivers/net/usb/smsc95xx.c
@@ -1044,7 +1044,7 @@ static const struct net_device_ops smsc95xx_netdev_ops = {
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl 		= smsc95xx_ioctl,
+	.ndo_eth_ioctl		= smsc95xx_ioctl,
 	.ndo_set_rx_mode	= smsc95xx_set_multicast,
 	.ndo_set_features	= smsc95xx_set_features,
 };
diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c
index ce29261263cd..6516a37893e2 100644
--- a/drivers/net/usb/sr9700.c
+++ b/drivers/net/usb/sr9700.c
@@ -310,7 +310,7 @@ static const struct net_device_ops sr9700_netdev_ops = {
 	.ndo_change_mtu		= usbnet_change_mtu,
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= sr9700_ioctl,
+	.ndo_eth_ioctl		= sr9700_ioctl,
 	.ndo_set_rx_mode	= sr9700_set_multicast,
 	.ndo_set_mac_address	= sr9700_set_mac_address,
 };
diff --git a/drivers/net/usb/sr9800.c b/drivers/net/usb/sr9800.c
index a822d81310d5..576401c8b1be 100644
--- a/drivers/net/usb/sr9800.c
+++ b/drivers/net/usb/sr9800.c
@@ -684,7 +684,7 @@ static const struct net_device_ops sr9800_netdev_ops = {
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_set_mac_address	= sr_set_mac_address,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_do_ioctl		= sr_ioctl,
+	.ndo_eth_ioctl		= sr_ioctl,
 	.ndo_set_rx_mode        = sr_set_multicast,
 };
 
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index d50d3cba238e..69afc0311dd1 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -836,7 +836,7 @@ static const struct net_device_ops qeth_l2_netdev_ops = {
 	.ndo_select_queue	= qeth_l2_select_queue,
 	.ndo_validate_addr	= qeth_l2_validate_addr,
 	.ndo_set_rx_mode	= qeth_l2_set_rx_mode,
-	.ndo_do_ioctl		= qeth_do_ioctl,
+	.ndo_eth_ioctl		= qeth_do_ioctl,
 	.ndo_siocdevprivate	= qeth_siocdevprivate,
 	.ndo_set_mac_address    = qeth_l2_set_mac_address,
 	.ndo_vlan_rx_add_vid	= qeth_l2_vlan_rx_add_vid,
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index d7a895372f19..3a523e700a5a 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -1841,7 +1841,7 @@ static const struct net_device_ops qeth_l3_netdev_ops = {
 	.ndo_select_queue	= qeth_l3_iqd_select_queue,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= qeth_l3_set_rx_mode,
-	.ndo_do_ioctl		= qeth_do_ioctl,
+	.ndo_eth_ioctl		= qeth_do_ioctl,
 	.ndo_siocdevprivate	= qeth_siocdevprivate,
 	.ndo_fix_features	= qeth_fix_features,
 	.ndo_set_features	= qeth_set_features,
@@ -1857,7 +1857,7 @@ static const struct net_device_ops qeth_l3_osa_netdev_ops = {
 	.ndo_select_queue	= qeth_l3_osa_select_queue,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= qeth_l3_set_rx_mode,
-	.ndo_do_ioctl		= qeth_do_ioctl,
+	.ndo_eth_ioctl		= qeth_do_ioctl,
 	.ndo_siocdevprivate	= qeth_siocdevprivate,
 	.ndo_fix_features	= qeth_fix_features,
 	.ndo_set_features	= qeth_set_features,
diff --git a/drivers/staging/octeon/ethernet.c b/drivers/staging/octeon/ethernet.c
index dcbba9621b21..5d24c1b6663b 100644
--- a/drivers/staging/octeon/ethernet.c
+++ b/drivers/staging/octeon/ethernet.c
@@ -524,7 +524,7 @@ static const struct net_device_ops cvm_oct_npi_netdev_ops = {
 	.ndo_start_xmit		= cvm_oct_xmit,
 	.ndo_set_rx_mode	= cvm_oct_common_set_multicast_list,
 	.ndo_set_mac_address	= cvm_oct_common_set_mac_address,
-	.ndo_do_ioctl		= cvm_oct_ioctl,
+	.ndo_eth_ioctl		= cvm_oct_ioctl,
 	.ndo_change_mtu		= cvm_oct_common_change_mtu,
 	.ndo_get_stats		= cvm_oct_common_get_stats,
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -540,7 +540,7 @@ static const struct net_device_ops cvm_oct_xaui_netdev_ops = {
 	.ndo_start_xmit		= cvm_oct_xmit,
 	.ndo_set_rx_mode	= cvm_oct_common_set_multicast_list,
 	.ndo_set_mac_address	= cvm_oct_common_set_mac_address,
-	.ndo_do_ioctl		= cvm_oct_ioctl,
+	.ndo_eth_ioctl		= cvm_oct_ioctl,
 	.ndo_change_mtu		= cvm_oct_common_change_mtu,
 	.ndo_get_stats		= cvm_oct_common_get_stats,
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -556,7 +556,7 @@ static const struct net_device_ops cvm_oct_sgmii_netdev_ops = {
 	.ndo_start_xmit		= cvm_oct_xmit,
 	.ndo_set_rx_mode	= cvm_oct_common_set_multicast_list,
 	.ndo_set_mac_address	= cvm_oct_common_set_mac_address,
-	.ndo_do_ioctl		= cvm_oct_ioctl,
+	.ndo_eth_ioctl		= cvm_oct_ioctl,
 	.ndo_change_mtu		= cvm_oct_common_change_mtu,
 	.ndo_get_stats		= cvm_oct_common_get_stats,
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -570,7 +570,7 @@ static const struct net_device_ops cvm_oct_spi_netdev_ops = {
 	.ndo_start_xmit		= cvm_oct_xmit,
 	.ndo_set_rx_mode	= cvm_oct_common_set_multicast_list,
 	.ndo_set_mac_address	= cvm_oct_common_set_mac_address,
-	.ndo_do_ioctl		= cvm_oct_ioctl,
+	.ndo_eth_ioctl		= cvm_oct_ioctl,
 	.ndo_change_mtu		= cvm_oct_common_change_mtu,
 	.ndo_get_stats		= cvm_oct_common_get_stats,
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -586,7 +586,7 @@ static const struct net_device_ops cvm_oct_rgmii_netdev_ops = {
 	.ndo_start_xmit		= cvm_oct_xmit,
 	.ndo_set_rx_mode	= cvm_oct_common_set_multicast_list,
 	.ndo_set_mac_address	= cvm_oct_common_set_mac_address,
-	.ndo_do_ioctl		= cvm_oct_ioctl,
+	.ndo_eth_ioctl		= cvm_oct_ioctl,
 	.ndo_change_mtu		= cvm_oct_common_change_mtu,
 	.ndo_get_stats		= cvm_oct_common_get_stats,
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -599,7 +599,7 @@ static const struct net_device_ops cvm_oct_pow_netdev_ops = {
 	.ndo_start_xmit		= cvm_oct_xmit_pow,
 	.ndo_set_rx_mode	= cvm_oct_common_set_multicast_list,
 	.ndo_set_mac_address	= cvm_oct_common_set_mac_address,
-	.ndo_do_ioctl		= cvm_oct_ioctl,
+	.ndo_eth_ioctl		= cvm_oct_ioctl,
 	.ndo_change_mtu		= cvm_oct_common_change_mtu,
 	.ndo_get_stats		= cvm_oct_common_get_stats,
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 658d8cf57342..b6e062a3b0d4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1090,6 +1090,10 @@ struct netdev_net_notifier {
  *	the generic interface code. If not defined ioctls return
  *	not supported error code.
  *
+ * * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
+ *	Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG,
+ *	SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP.
+ *
  * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
  *	Used to set network devices bus interface parameters. This interface
  *	is retained for legacy reasons; new devices should use the bus
@@ -1361,6 +1365,8 @@ struct net_device_ops {
 	int			(*ndo_validate_addr)(struct net_device *dev);
 	int			(*ndo_do_ioctl)(struct net_device *dev,
 					        struct ifreq *ifr, int cmd);
+	int			(*ndo_eth_ioctl)(struct net_device *dev,
+						 struct ifreq *ifr, int cmd);
 	int			(*ndo_siocdevprivate)(struct net_device *dev,
 						      struct ifreq *ifr,
 						      void __user *data, int cmd);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 55fcac854058..2af6ee2f2bfb 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -106,8 +106,8 @@ struct dsa_device_ops {
  * function pointers.
  */
 struct dsa_netdevice_ops {
-	int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr,
-			    int cmd);
+	int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr,
+			     int cmd);
 };
 
 #define DSA_TAG_DRIVER_ALIAS "dsa_tag-"
@@ -1019,8 +1019,8 @@ static inline int __dsa_netdevice_ops_check(struct net_device *dev)
 	return 0;
 }
 
-static inline int dsa_ndo_do_ioctl(struct net_device *dev, struct ifreq *ifr,
-				   int cmd)
+static inline int dsa_ndo_eth_ioctl(struct net_device *dev, struct ifreq *ifr,
+				    int cmd)
 {
 	const struct dsa_netdevice_ops *ops;
 	int err;
@@ -1031,11 +1031,11 @@ static inline int dsa_ndo_do_ioctl(struct net_device *dev, struct ifreq *ifr,
 
 	ops = dev->dsa_ptr->netdev_ops;
 
-	return ops->ndo_do_ioctl(dev, ifr, cmd);
+	return ops->ndo_eth_ioctl(dev, ifr, cmd);
 }
 #else
-static inline int dsa_ndo_do_ioctl(struct net_device *dev, struct ifreq *ifr,
-				   int cmd)
+static inline int dsa_ndo_eth_ioctl(struct net_device *dev, struct ifreq *ifr,
+				    int cmd)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index a0367b37512d..0c21d1fec852 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -372,8 +372,8 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	case SIOCGMIIREG:
 	case SIOCSMIIREG:
 	case SIOCGHWTSTAMP:
-		if (netif_device_present(real_dev) && ops->ndo_do_ioctl)
-			err = ops->ndo_do_ioctl(real_dev, &ifrr, cmd);
+		if (netif_device_present(real_dev) && ops->ndo_eth_ioctl)
+			err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd);
 		break;
 	}
 
@@ -814,7 +814,7 @@ static const struct net_device_ops vlan_netdev_ops = {
 	.ndo_set_mac_address	= vlan_dev_set_mac_address,
 	.ndo_set_rx_mode	= vlan_dev_set_rx_mode,
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
-	.ndo_do_ioctl		= vlan_dev_ioctl,
+	.ndo_eth_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
 	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if IS_ENABLED(CONFIG_FCOE)
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 3ace1e4f6b80..8e30fe8b5645 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -239,19 +239,19 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
 	return 0;
 }
 
-static int dev_do_ioctl(struct net_device *dev,
-			struct ifreq *ifr, unsigned int cmd)
+static int dev_eth_ioctl(struct net_device *dev,
+			 struct ifreq *ifr, unsigned int cmd)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int err;
 
-	err = dsa_ndo_do_ioctl(dev, ifr, cmd);
+	err = dsa_ndo_eth_ioctl(dev, ifr, cmd);
 	if (err == 0 || err != -EOPNOTSUPP)
 		return err;
 
-	if (ops->ndo_do_ioctl) {
+	if (ops->ndo_eth_ioctl) {
 		if (netif_device_present(dev))
-			err = ops->ndo_do_ioctl(dev, ifr, cmd);
+			err = ops->ndo_eth_ioctl(dev, ifr, cmd);
 		else
 			err = -ENODEV;
 	}
@@ -259,6 +259,21 @@ static int dev_do_ioctl(struct net_device *dev,
 	return err;
 }
 
+static int dev_do_ioctl(struct net_device *dev,
+			struct ifreq *ifr, unsigned int cmd)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (ops->ndo_do_ioctl) {
+		if (netif_device_present(dev))
+			return ops->ndo_do_ioctl(dev, ifr, cmd);
+		else
+			return -ENODEV;
+	}
+
+	return -EOPNOTSUPP;
+}
+
 static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
 			      void __user *data, unsigned int cmd)
 {
@@ -358,19 +373,20 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
 		    cmd <= SIOCDEVPRIVATE + 15)
 			return dev_siocdevprivate(dev, ifr, data, cmd);
 
-		if (cmd == SIOCBONDENSLAVE ||
+		if (cmd == SIOCGMIIPHY ||
+		    cmd == SIOCGMIIREG ||
+		    cmd == SIOCSMIIREG ||
+		    cmd == SIOCSHWTSTAMP ||
+		    cmd == SIOCGHWTSTAMP) {
+			err = dev_eth_ioctl(dev, ifr, cmd);
+		} else if (cmd == SIOCBONDENSLAVE ||
 		    cmd == SIOCBONDRELEASE ||
 		    cmd == SIOCBONDSETHWADDR ||
 		    cmd == SIOCBONDSLAVEINFOQUERY ||
 		    cmd == SIOCBONDINFOQUERY ||
 		    cmd == SIOCBONDCHANGEACTIVE ||
-		    cmd == SIOCGMIIPHY ||
-		    cmd == SIOCGMIIREG ||
-		    cmd == SIOCSMIIREG ||
 		    cmd == SIOCBRADDIF ||
 		    cmd == SIOCBRDELIF ||
-		    cmd == SIOCSHWTSTAMP ||
-		    cmd == SIOCGHWTSTAMP ||
 		    cmd == SIOCWANDEV) {
 			err = dev_do_ioctl(dev, ifr, cmd);
 		} else
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 3fc90e36772d..e8e19857621b 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -210,14 +210,14 @@ static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		break;
 	}
 
-	if (dev->netdev_ops->ndo_do_ioctl)
-		err = dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd);
+	if (dev->netdev_ops->ndo_eth_ioctl)
+		err = dev->netdev_ops->ndo_eth_ioctl(dev, ifr, cmd);
 
 	return err;
 }
 
 static const struct dsa_netdevice_ops dsa_netdev_ops = {
-	.ndo_do_ioctl = dsa_master_ioctl,
+	.ndo_eth_ioctl = dsa_master_ioctl,
 };
 
 static int dsa_master_ethtool_setup(struct net_device *dev)
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8c112d7d5b0a..6e1135d3ee33 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1687,7 +1687,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_set_rx_mode	= dsa_slave_set_rx_mode,
 	.ndo_set_mac_address	= dsa_slave_set_mac_address,
 	.ndo_fdb_dump		= dsa_slave_fdb_dump,
-	.ndo_do_ioctl		= dsa_slave_ioctl,
+	.ndo_eth_ioctl		= dsa_slave_ioctl,
 	.ndo_get_iflink		= dsa_slave_get_iflink,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_netpoll_setup	= dsa_slave_netpoll_setup,
-- 
cgit v1.2.3


From ad7eab2ab014748b062507b7ac69f8e856057717 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 27 Jul 2021 15:45:14 +0200
Subject: net: split out ndo_siowandev ioctl

In order to further reduce the scope of ndo_do_ioctl(), move
out the SIOCWANDEV handling into a new network device operation
function.

Adjust the prototype to only pass the if_settings sub-structure
in place of the ifreq, and remove the redundant 'cmd' argument
in the process.

Cc: Krzysztof Halasa <khc@pm.waw.pl>
Cc: "Jan \"Yenya\" Kasprzak" <kas@fi.muni.cz>
Cc: Kevin Curtis <kevin.curtis@farsite.co.uk>
Cc: Zhao Qiang <qiang.zhao@nxp.com>
Cc: Martin Schiller <ms@dev.tdt.de>
Cc: Jiri Slaby <jirislaby@kernel.org>
Cc: linux-x25@vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdevices.rst |  7 +++
 drivers/char/pcmcia/synclink_cs.c       | 23 ++++-----
 drivers/net/wan/c101.c                  | 19 +++----
 drivers/net/wan/cosa.c                  |  2 +-
 drivers/net/wan/farsync.c               | 92 +++++++++++++++------------------
 drivers/net/wan/fsl_ucc_hdlc.c          | 19 +++----
 drivers/net/wan/hdlc.c                  |  9 ++--
 drivers/net/wan/hdlc_cisco.c            | 14 ++---
 drivers/net/wan/hdlc_fr.c               | 43 +++++++--------
 drivers/net/wan/hdlc_ppp.c              |  8 +--
 drivers/net/wan/hdlc_raw.c              | 14 ++---
 drivers/net/wan/hdlc_raw_eth.c          | 14 ++---
 drivers/net/wan/hdlc_x25.c              | 16 +++---
 drivers/net/wan/hostess_sv11.c          |  7 +--
 drivers/net/wan/ixp4xx_hss.c            | 19 +++----
 drivers/net/wan/lmc/lmc.h               |  2 +-
 drivers/net/wan/lmc/lmc_main.c          | 10 +---
 drivers/net/wan/lmc/lmc_proto.c         |  7 ---
 drivers/net/wan/lmc/lmc_proto.h         |  1 -
 drivers/net/wan/n2.c                    | 19 +++----
 drivers/net/wan/pc300too.c              | 29 +++++------
 drivers/net/wan/pci200syn.c             | 19 +++----
 drivers/net/wan/sealevel.c              | 10 +---
 drivers/net/wan/wanxl.c                 | 21 ++++----
 drivers/tty/synclink_gt.c               | 19 +++----
 include/linux/hdlc.h                    |  4 +-
 include/linux/netdevice.h               |  2 +
 net/core/dev_ioctl.c                    | 20 ++++++-
 28 files changed, 211 insertions(+), 258 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index f57f255f2397..3c42b0b0be93 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -222,6 +222,13 @@ ndo_do_ioctl:
 	Synchronization: rtnl_lock() semaphore.
 	Context: process
 
+ndo_siocwandev:
+	Synchronization: rtnl_lock() semaphore.
+	Context: process
+
+	Used by the drivers/net/wan framework to handle
+	the SIOCWANDEV ioctl with the if_settings structure.
+
 ndo_siocdevprivate:
 	Synchronization: rtnl_lock() semaphore.
 	Context: process
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 6eaefea0520e..5ac53dcb3a6a 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -4050,16 +4050,15 @@ static int hdlcdev_close(struct net_device *dev)
  * called by network layer to process IOCTL call to network device
  *
  * dev  pointer to network device structure
- * ifr  pointer to network interface request structure
- * cmd  IOCTL command code
+ * ifs  pointer to network interface settings structure
  *
  * returns 0 if success, otherwise error code
  */
-static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int hdlcdev_wan_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	const size_t size = sizeof(sync_serial_settings);
 	sync_serial_settings new_line;
-	sync_serial_settings __user *line = ifr->ifr_settings.ifs_ifsu.sync;
+	sync_serial_settings __user *line = ifs->ifs_ifsu.sync;
 	MGSLPC_INFO *info = dev_to_port(dev);
 	unsigned int flags;
 
@@ -4070,17 +4069,14 @@ static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	if (info->port.count)
 		return -EBUSY;
 
-	if (cmd != SIOCWANDEV)
-		return hdlc_ioctl(dev, ifr, cmd);
-
 	memset(&new_line, 0, size);
 
-	switch(ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_IFACE: /* return current sync_serial_settings */
 
-		ifr->ifr_settings.type = IF_IFACE_SYNC_SERIAL;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_IFACE_SYNC_SERIAL;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 
@@ -4148,9 +4144,8 @@ static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			tty_kref_put(tty);
 		}
 		return 0;
-
 	default:
-		return hdlc_ioctl(dev, ifr, cmd);
+		return hdlc_ioctl(dev, ifs);
 	}
 }
 
@@ -4225,7 +4220,7 @@ static const struct net_device_ops hdlcdev_ops = {
 	.ndo_open       = hdlcdev_open,
 	.ndo_stop       = hdlcdev_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = hdlcdev_ioctl,
+	.ndo_siocwandev = hdlcdev_wan_ioctl,
 	.ndo_tx_timeout = hdlcdev_tx_timeout,
 };
 
diff --git a/drivers/net/wan/c101.c b/drivers/net/wan/c101.c
index ca308230500d..8dd14d916c3a 100644
--- a/drivers/net/wan/c101.c
+++ b/drivers/net/wan/c101.c
@@ -228,21 +228,18 @@ static int c101_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
 	return -EOPNOTSUPP;
 }
 
-static int c101_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int c101_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	const size_t size = sizeof(sync_serial_settings);
 	sync_serial_settings new_line;
-	sync_serial_settings __user *line = ifr->ifr_settings.ifs_ifsu.sync;
+	sync_serial_settings __user *line = ifs->ifs_ifsu.sync;
 	port_t *port = dev_to_port(dev);
 
-	if (cmd != SIOCWANDEV)
-		return hdlc_ioctl(dev, ifr, cmd);
-
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_IFACE:
-		ifr->ifr_settings.type = IF_IFACE_SYNC_SERIAL;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_IFACE_SYNC_SERIAL;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		if (copy_to_user(line, &port->settings, size))
@@ -270,7 +267,7 @@ static int c101_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		return 0;
 
 	default:
-		return hdlc_ioctl(dev, ifr, cmd);
+		return hdlc_ioctl(dev, ifs);
 	}
 }
 
@@ -295,7 +292,7 @@ static const struct net_device_ops c101_ops = {
 	.ndo_open       = c101_open,
 	.ndo_stop       = c101_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = c101_ioctl,
+	.ndo_siocwandev = c101_ioctl,
 	.ndo_siocdevprivate = c101_siocdevprivate,
 };
 
diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c
index 4c0e9cf02217..23d2954d9747 100644
--- a/drivers/net/wan/cosa.c
+++ b/drivers/net/wan/cosa.c
@@ -414,7 +414,7 @@ static const struct net_device_ops cosa_ops = {
 	.ndo_open       = cosa_net_open,
 	.ndo_stop       = cosa_net_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = hdlc_ioctl,
+	.ndo_siocwandev = hdlc_ioctl,
 	.ndo_tx_timeout = cosa_net_timeout,
 };
 
diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c
index d0e3cab98645..6a212c085435 100644
--- a/drivers/net/wan/farsync.c
+++ b/drivers/net/wan/farsync.c
@@ -1784,16 +1784,15 @@ gather_conf_info(struct fst_card_info *card, struct fst_port_info *port,
 
 static int
 fst_set_iface(struct fst_card_info *card, struct fst_port_info *port,
-	      struct ifreq *ifr)
+	      struct if_settings *ifs)
 {
 	sync_serial_settings sync;
 	int i;
 
-	if (ifr->ifr_settings.size != sizeof(sync))
+	if (ifs->size != sizeof(sync))
 		return -ENOMEM;
 
-	if (copy_from_user
-	    (&sync, ifr->ifr_settings.ifs_ifsu.sync, sizeof(sync)))
+	if (copy_from_user(&sync, ifs->ifs_ifsu.sync, sizeof(sync)))
 		return -EFAULT;
 
 	if (sync.loopback)
@@ -1801,7 +1800,7 @@ fst_set_iface(struct fst_card_info *card, struct fst_port_info *port,
 
 	i = port->index;
 
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_IFACE_V35:
 		FST_WRW(card, portConfig[i].lineInterface, V35);
 		port->hwif = V35;
@@ -1857,7 +1856,7 @@ fst_set_iface(struct fst_card_info *card, struct fst_port_info *port,
 
 static int
 fst_get_iface(struct fst_card_info *card, struct fst_port_info *port,
-	      struct ifreq *ifr)
+	      struct if_settings *ifs)
 {
 	sync_serial_settings sync;
 	int i;
@@ -1868,29 +1867,29 @@ fst_get_iface(struct fst_card_info *card, struct fst_port_info *port,
 	 */
 	switch (port->hwif) {
 	case E1:
-		ifr->ifr_settings.type = IF_IFACE_E1;
+		ifs->type = IF_IFACE_E1;
 		break;
 	case T1:
-		ifr->ifr_settings.type = IF_IFACE_T1;
+		ifs->type = IF_IFACE_T1;
 		break;
 	case V35:
-		ifr->ifr_settings.type = IF_IFACE_V35;
+		ifs->type = IF_IFACE_V35;
 		break;
 	case V24:
-		ifr->ifr_settings.type = IF_IFACE_V24;
+		ifs->type = IF_IFACE_V24;
 		break;
 	case X21D:
-		ifr->ifr_settings.type = IF_IFACE_X21D;
+		ifs->type = IF_IFACE_X21D;
 		break;
 	case X21:
 	default:
-		ifr->ifr_settings.type = IF_IFACE_X21;
+		ifs->type = IF_IFACE_X21;
 		break;
 	}
-	if (ifr->ifr_settings.size == 0)
+	if (!ifs->size)
 		return 0;	/* only type requested */
 
-	if (ifr->ifr_settings.size < sizeof(sync))
+	if (ifs->size < sizeof(sync))
 		return -ENOMEM;
 
 	i = port->index;
@@ -1901,10 +1900,10 @@ fst_get_iface(struct fst_card_info *card, struct fst_port_info *port,
 	    INTCLK ? CLOCK_INT : CLOCK_EXT;
 	sync.loopback = 0;
 
-	if (copy_to_user(ifr->ifr_settings.ifs_ifsu.sync, &sync, sizeof(sync)))
+	if (copy_to_user(ifs->ifs_ifsu.sync, &sync, sizeof(sync)))
 		return -EFAULT;
 
-	ifr->ifr_settings.size = sizeof(sync);
+	ifs->size = sizeof(sync);
 	return 0;
 }
 
@@ -2020,12 +2019,12 @@ fst_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data,
 }
 
 static int
-fst_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+fst_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	struct fst_card_info *card;
 	struct fst_port_info *port;
 
-	dbg(DBG_IOCTL, "ioctl: %x, %x\n", cmd, ifr->ifr_settings.type);
+	dbg(DBG_IOCTL, "SIOCDEVPRIVATE, %x\n", ifs->type);
 
 	port = dev_to_port(dev);
 	card = port->card;
@@ -2033,42 +2032,35 @@ fst_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
-	switch (cmd) {
-	case SIOCWANDEV:
-		switch (ifr->ifr_settings.type) {
-		case IF_GET_IFACE:
-			return fst_get_iface(card, port, ifr);
-
-		case IF_IFACE_SYNC_SERIAL:
-		case IF_IFACE_V35:
-		case IF_IFACE_V24:
-		case IF_IFACE_X21:
-		case IF_IFACE_X21D:
-		case IF_IFACE_T1:
-		case IF_IFACE_E1:
-			return fst_set_iface(card, port, ifr);
-
-		case IF_PROTO_RAW:
-			port->mode = FST_RAW;
-			return 0;
+	switch (ifs->type) {
+	case IF_GET_IFACE:
+		return fst_get_iface(card, port, ifs);
 
-		case IF_GET_PROTO:
-			if (port->mode == FST_RAW) {
-				ifr->ifr_settings.type = IF_PROTO_RAW;
-				return 0;
-			}
-			return hdlc_ioctl(dev, ifr, cmd);
+	case IF_IFACE_SYNC_SERIAL:
+	case IF_IFACE_V35:
+	case IF_IFACE_V24:
+	case IF_IFACE_X21:
+	case IF_IFACE_X21D:
+	case IF_IFACE_T1:
+	case IF_IFACE_E1:
+		return fst_set_iface(card, port, ifs);
 
-		default:
-			port->mode = FST_GEN_HDLC;
-			dbg(DBG_IOCTL, "Passing this type to hdlc %x\n",
-			    ifr->ifr_settings.type);
-			return hdlc_ioctl(dev, ifr, cmd);
+	case IF_PROTO_RAW:
+		port->mode = FST_RAW;
+		return 0;
+
+	case IF_GET_PROTO:
+		if (port->mode == FST_RAW) {
+			ifs->type = IF_PROTO_RAW;
+			return 0;
 		}
+		return hdlc_ioctl(dev, ifs);
 
 	default:
-		/* Not one of ours. Pass through to HDLC package */
-		return hdlc_ioctl(dev, ifr, cmd);
+		port->mode = FST_GEN_HDLC;
+		dbg(DBG_IOCTL, "Passing this type to hdlc %x\n",
+		    ifs->type);
+		return hdlc_ioctl(dev, ifs);
 	}
 }
 
@@ -2328,7 +2320,7 @@ static const struct net_device_ops fst_ops = {
 	.ndo_open       = fst_open,
 	.ndo_stop       = fst_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl	= fst_ioctl,
+	.ndo_siocwandev	= fst_ioctl,
 	.ndo_siocdevprivate = fst_siocdevprivate,
 	.ndo_tx_timeout = fst_tx_timeout,
 };
diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index 39f05fabbfa4..cda1b4ce6b21 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -674,31 +674,28 @@ static irqreturn_t ucc_hdlc_irq_handler(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static int uhdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int uhdlc_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	const size_t size = sizeof(te1_settings);
 	te1_settings line;
 	struct ucc_hdlc_private *priv = netdev_priv(dev);
 
-	if (cmd != SIOCWANDEV)
-		return hdlc_ioctl(dev, ifr, cmd);
-
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_IFACE:
-		ifr->ifr_settings.type = IF_IFACE_E1;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_IFACE_E1;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		memset(&line, 0, sizeof(line));
 		line.clock_type = priv->clocking;
 
-		if (copy_to_user(ifr->ifr_settings.ifs_ifsu.sync, &line, size))
+		if (copy_to_user(ifs->ifs_ifsu.sync, &line, size))
 			return -EFAULT;
 		return 0;
 
 	default:
-		return hdlc_ioctl(dev, ifr, cmd);
+		return hdlc_ioctl(dev, ifs);
 	}
 }
 
@@ -1053,7 +1050,7 @@ static const struct net_device_ops uhdlc_ops = {
 	.ndo_open       = uhdlc_open,
 	.ndo_stop       = uhdlc_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = uhdlc_ioctl,
+	.ndo_siocwandev = uhdlc_ioctl,
 	.ndo_tx_timeout	= uhdlc_tx_timeout,
 };
 
diff --git a/drivers/net/wan/hdlc.c b/drivers/net/wan/hdlc.c
index dd6312b69861..cbed10b1d862 100644
--- a/drivers/net/wan/hdlc.c
+++ b/drivers/net/wan/hdlc.c
@@ -196,16 +196,13 @@ void hdlc_close(struct net_device *dev)
 }
 EXPORT_SYMBOL(hdlc_close);
 
-int hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+int hdlc_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	struct hdlc_proto *proto = first_proto;
 	int result;
 
-	if (cmd != SIOCWANDEV)
-		return -EINVAL;
-
 	if (dev_to_hdlc(dev)->proto) {
-		result = dev_to_hdlc(dev)->proto->ioctl(dev, ifr);
+		result = dev_to_hdlc(dev)->proto->ioctl(dev, ifs);
 		if (result != -EINVAL)
 			return result;
 	}
@@ -213,7 +210,7 @@ int hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	/* Not handled by currently attached protocol (if any) */
 
 	while (proto) {
-		result = proto->ioctl(dev, ifr);
+		result = proto->ioctl(dev, ifs);
 		if (result != -EINVAL)
 			return result;
 		proto = proto->next;
diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c
index c54fdae950fb..cdebe65a7e2d 100644
--- a/drivers/net/wan/hdlc_cisco.c
+++ b/drivers/net/wan/hdlc_cisco.c
@@ -56,7 +56,7 @@ struct cisco_state {
 	u32 rxseq; /* RX sequence number */
 };
 
-static int cisco_ioctl(struct net_device *dev, struct ifreq *ifr);
+static int cisco_ioctl(struct net_device *dev, struct if_settings *ifs);
 
 static inline struct cisco_state *state(hdlc_device *hdlc)
 {
@@ -306,21 +306,21 @@ static const struct header_ops cisco_header_ops = {
 	.create = cisco_hard_header,
 };
 
-static int cisco_ioctl(struct net_device *dev, struct ifreq *ifr)
+static int cisco_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
-	cisco_proto __user *cisco_s = ifr->ifr_settings.ifs_ifsu.cisco;
+	cisco_proto __user *cisco_s = ifs->ifs_ifsu.cisco;
 	const size_t size = sizeof(cisco_proto);
 	cisco_proto new_settings;
 	hdlc_device *hdlc = dev_to_hdlc(dev);
 	int result;
 
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_PROTO:
 		if (dev_to_hdlc(dev)->proto != &proto)
 			return -EINVAL;
-		ifr->ifr_settings.type = IF_PROTO_CISCO;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_PROTO_CISCO;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		if (copy_to_user(cisco_s, &state(hdlc)->settings, size))
diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index 2910ea25e51d..7637edce443e 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -146,7 +146,7 @@ struct frad_state {
 	u8 rxseq; /* RX sequence number */
 };
 
-static int fr_ioctl(struct net_device *dev, struct ifreq *ifr);
+static int fr_ioctl(struct net_device *dev, struct if_settings *ifs);
 
 static inline u16 q922_to_dlci(u8 *hdr)
 {
@@ -357,29 +357,26 @@ static int pvc_close(struct net_device *dev)
 	return 0;
 }
 
-static int pvc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int pvc_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	struct pvc_device *pvc = dev->ml_priv;
 	fr_proto_pvc_info info;
 
-	if (cmd != SIOCWANDEV)
-		return -EOPNOTSUPP;
-
-	if (ifr->ifr_settings.type == IF_GET_PROTO) {
+	if (ifs->type == IF_GET_PROTO) {
 		if (dev->type == ARPHRD_ETHER)
-			ifr->ifr_settings.type = IF_PROTO_FR_ETH_PVC;
+			ifs->type = IF_PROTO_FR_ETH_PVC;
 		else
-			ifr->ifr_settings.type = IF_PROTO_FR_PVC;
+			ifs->type = IF_PROTO_FR_PVC;
 
-		if (ifr->ifr_settings.size < sizeof(info)) {
+		if (ifs->size < sizeof(info)) {
 			/* data size wanted */
-			ifr->ifr_settings.size = sizeof(info);
+			ifs->size = sizeof(info);
 			return -ENOBUFS;
 		}
 
 		info.dlci = pvc->dlci;
 		memcpy(info.master, pvc->frad->name, IFNAMSIZ);
-		if (copy_to_user(ifr->ifr_settings.ifs_ifsu.fr_pvc_info,
+		if (copy_to_user(ifs->ifs_ifsu.fr_pvc_info,
 				 &info, sizeof(info)))
 			return -EFAULT;
 		return 0;
@@ -1059,7 +1056,7 @@ static const struct net_device_ops pvc_ops = {
 	.ndo_open       = pvc_open,
 	.ndo_stop       = pvc_close,
 	.ndo_start_xmit = pvc_xmit,
-	.ndo_do_ioctl   = pvc_ioctl,
+	.ndo_siocwandev = pvc_ioctl,
 };
 
 static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
@@ -1182,22 +1179,22 @@ static struct hdlc_proto proto = {
 	.module		= THIS_MODULE,
 };
 
-static int fr_ioctl(struct net_device *dev, struct ifreq *ifr)
+static int fr_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
-	fr_proto __user *fr_s = ifr->ifr_settings.ifs_ifsu.fr;
+	fr_proto __user *fr_s = ifs->ifs_ifsu.fr;
 	const size_t size = sizeof(fr_proto);
 	fr_proto new_settings;
 	hdlc_device *hdlc = dev_to_hdlc(dev);
 	fr_proto_pvc pvc;
 	int result;
 
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_PROTO:
 		if (dev_to_hdlc(dev)->proto != &proto) /* Different proto */
 			return -EINVAL;
-		ifr->ifr_settings.type = IF_PROTO_FR;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_PROTO_FR;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		if (copy_to_user(fr_s, &state(hdlc)->settings, size))
@@ -1259,21 +1256,21 @@ static int fr_ioctl(struct net_device *dev, struct ifreq *ifr)
 		if (!capable(CAP_NET_ADMIN))
 			return -EPERM;
 
-		if (copy_from_user(&pvc, ifr->ifr_settings.ifs_ifsu.fr_pvc,
+		if (copy_from_user(&pvc, ifs->ifs_ifsu.fr_pvc,
 				   sizeof(fr_proto_pvc)))
 			return -EFAULT;
 
 		if (pvc.dlci <= 0 || pvc.dlci >= 1024)
 			return -EINVAL;	/* Only 10 bits, DLCI 0 reserved */
 
-		if (ifr->ifr_settings.type == IF_PROTO_FR_ADD_ETH_PVC ||
-		    ifr->ifr_settings.type == IF_PROTO_FR_DEL_ETH_PVC)
+		if (ifs->type == IF_PROTO_FR_ADD_ETH_PVC ||
+		    ifs->type == IF_PROTO_FR_DEL_ETH_PVC)
 			result = ARPHRD_ETHER; /* bridged Ethernet device */
 		else
 			result = ARPHRD_DLCI;
 
-		if (ifr->ifr_settings.type == IF_PROTO_FR_ADD_PVC ||
-		    ifr->ifr_settings.type == IF_PROTO_FR_ADD_ETH_PVC)
+		if (ifs->type == IF_PROTO_FR_ADD_PVC ||
+		    ifs->type == IF_PROTO_FR_ADD_ETH_PVC)
 			return fr_add_pvc(dev, pvc.dlci, result);
 		else
 			return fr_del_pvc(hdlc, pvc.dlci, result);
diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c
index b81ecf432a0c..37a3c989cba1 100644
--- a/drivers/net/wan/hdlc_ppp.c
+++ b/drivers/net/wan/hdlc_ppp.c
@@ -100,7 +100,7 @@ static const char *const event_names[EVENTS] = {
 
 static struct sk_buff_head tx_queue; /* used when holding the spin lock */
 
-static int ppp_ioctl(struct net_device *dev, struct ifreq *ifr);
+static int ppp_ioctl(struct net_device *dev, struct if_settings *ifs);
 
 static inline struct ppp *get_ppp(struct net_device *dev)
 {
@@ -655,17 +655,17 @@ static const struct header_ops ppp_header_ops = {
 	.create = ppp_hard_header,
 };
 
-static int ppp_ioctl(struct net_device *dev, struct ifreq *ifr)
+static int ppp_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	hdlc_device *hdlc = dev_to_hdlc(dev);
 	struct ppp *ppp;
 	int result;
 
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_PROTO:
 		if (dev_to_hdlc(dev)->proto != &proto)
 			return -EINVAL;
-		ifr->ifr_settings.type = IF_PROTO_PPP;
+		ifs->type = IF_PROTO_PPP;
 		return 0; /* return protocol only, no settable parameters */
 
 	case IF_PROTO_PPP:
diff --git a/drivers/net/wan/hdlc_raw.c b/drivers/net/wan/hdlc_raw.c
index 54d28496fefd..4a2f068721bc 100644
--- a/drivers/net/wan/hdlc_raw.c
+++ b/drivers/net/wan/hdlc_raw.c
@@ -19,7 +19,7 @@
 #include <linux/skbuff.h>
 
 
-static int raw_ioctl(struct net_device *dev, struct ifreq *ifr);
+static int raw_ioctl(struct net_device *dev, struct if_settings *ifs);
 
 static __be16 raw_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
@@ -33,21 +33,21 @@ static struct hdlc_proto proto = {
 };
 
 
-static int raw_ioctl(struct net_device *dev, struct ifreq *ifr)
+static int raw_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
-	raw_hdlc_proto __user *raw_s = ifr->ifr_settings.ifs_ifsu.raw_hdlc;
+	raw_hdlc_proto __user *raw_s = ifs->ifs_ifsu.raw_hdlc;
 	const size_t size = sizeof(raw_hdlc_proto);
 	raw_hdlc_proto new_settings;
 	hdlc_device *hdlc = dev_to_hdlc(dev);
 	int result;
 
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_PROTO:
 		if (dev_to_hdlc(dev)->proto != &proto)
 			return -EINVAL;
-		ifr->ifr_settings.type = IF_PROTO_HDLC;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_PROTO_HDLC;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		if (copy_to_user(raw_s, hdlc->state, size))
diff --git a/drivers/net/wan/hdlc_raw_eth.c b/drivers/net/wan/hdlc_raw_eth.c
index 927596276a07..0a66b7356405 100644
--- a/drivers/net/wan/hdlc_raw_eth.c
+++ b/drivers/net/wan/hdlc_raw_eth.c
@@ -20,7 +20,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/skbuff.h>
 
-static int raw_eth_ioctl(struct net_device *dev, struct ifreq *ifr);
+static int raw_eth_ioctl(struct net_device *dev, struct if_settings *ifs);
 
 static netdev_tx_t eth_tx(struct sk_buff *skb, struct net_device *dev)
 {
@@ -48,22 +48,22 @@ static struct hdlc_proto proto = {
 };
 
 
-static int raw_eth_ioctl(struct net_device *dev, struct ifreq *ifr)
+static int raw_eth_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
-	raw_hdlc_proto __user *raw_s = ifr->ifr_settings.ifs_ifsu.raw_hdlc;
+	raw_hdlc_proto __user *raw_s = ifs->ifs_ifsu.raw_hdlc;
 	const size_t size = sizeof(raw_hdlc_proto);
 	raw_hdlc_proto new_settings;
 	hdlc_device *hdlc = dev_to_hdlc(dev);
 	unsigned int old_qlen;
 	int result;
 
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_PROTO:
 		if (dev_to_hdlc(dev)->proto != &proto)
 			return -EINVAL;
-		ifr->ifr_settings.type = IF_PROTO_HDLC_ETH;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_PROTO_HDLC_ETH;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		if (copy_to_user(raw_s, hdlc->state, size))
diff --git a/drivers/net/wan/hdlc_x25.c b/drivers/net/wan/hdlc_x25.c
index 9b7ebf8bd85c..f72c92c24003 100644
--- a/drivers/net/wan/hdlc_x25.c
+++ b/drivers/net/wan/hdlc_x25.c
@@ -29,7 +29,7 @@ struct x25_state {
 	struct tasklet_struct rx_tasklet;
 };
 
-static int x25_ioctl(struct net_device *dev, struct ifreq *ifr);
+static int x25_ioctl(struct net_device *dev, struct if_settings *ifs);
 
 static struct x25_state *state(hdlc_device *hdlc)
 {
@@ -274,21 +274,21 @@ static struct hdlc_proto proto = {
 	.module		= THIS_MODULE,
 };
 
-static int x25_ioctl(struct net_device *dev, struct ifreq *ifr)
+static int x25_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
-	x25_hdlc_proto __user *x25_s = ifr->ifr_settings.ifs_ifsu.x25;
+	x25_hdlc_proto __user *x25_s = ifs->ifs_ifsu.x25;
 	const size_t size = sizeof(x25_hdlc_proto);
 	hdlc_device *hdlc = dev_to_hdlc(dev);
 	x25_hdlc_proto new_settings;
 	int result;
 
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_PROTO:
 		if (dev_to_hdlc(dev)->proto != &proto)
 			return -EINVAL;
-		ifr->ifr_settings.type = IF_PROTO_X25;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_PROTO_X25;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		if (copy_to_user(x25_s, &state(hdlc)->settings, size))
@@ -303,7 +303,7 @@ static int x25_ioctl(struct net_device *dev, struct ifreq *ifr)
 			return -EBUSY;
 
 		/* backward compatibility */
-		if (ifr->ifr_settings.size == 0) {
+		if (ifs->size == 0) {
 			new_settings.dce = 0;
 			new_settings.modulo = 8;
 			new_settings.window = 7;
diff --git a/drivers/net/wan/hostess_sv11.c b/drivers/net/wan/hostess_sv11.c
index fd61a7cc4fdf..15a754310fd7 100644
--- a/drivers/net/wan/hostess_sv11.c
+++ b/drivers/net/wan/hostess_sv11.c
@@ -142,11 +142,6 @@ static int hostess_close(struct net_device *d)
 	return 0;
 }
 
-static int hostess_ioctl(struct net_device *d, struct ifreq *ifr, int cmd)
-{
-	return hdlc_ioctl(d, ifr, cmd);
-}
-
 /*	Passed network frames, fire them downwind.
  */
 
@@ -171,7 +166,7 @@ static const struct net_device_ops hostess_ops = {
 	.ndo_open       = hostess_open,
 	.ndo_stop       = hostess_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = hostess_ioctl,
+	.ndo_siocwandev = hdlc_ioctl,
 };
 
 static struct z8530_dev *sv11_init(int iobase, int irq)
diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c
index 2cebbfca0bd1..88a36a069311 100644
--- a/drivers/net/wan/ixp4xx_hss.c
+++ b/drivers/net/wan/ixp4xx_hss.c
@@ -1254,23 +1254,20 @@ static void find_best_clock(u32 timer_freq, u32 rate, u32 *best, u32 *reg)
 	}
 }
 
-static int hss_hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int hss_hdlc_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	const size_t size = sizeof(sync_serial_settings);
 	sync_serial_settings new_line;
-	sync_serial_settings __user *line = ifr->ifr_settings.ifs_ifsu.sync;
+	sync_serial_settings __user *line = ifs->ifs_ifsu.sync;
 	struct port *port = dev_to_port(dev);
 	unsigned long flags;
 	int clk;
 
-	if (cmd != SIOCWANDEV)
-		return hdlc_ioctl(dev, ifr, cmd);
-
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_IFACE:
-		ifr->ifr_settings.type = IF_IFACE_V35;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_IFACE_V35;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		memset(&new_line, 0, sizeof(new_line));
@@ -1323,7 +1320,7 @@ static int hss_hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		return 0;
 
 	default:
-		return hdlc_ioctl(dev, ifr, cmd);
+		return hdlc_ioctl(dev, ifs);
 	}
 }
 
@@ -1335,7 +1332,7 @@ static const struct net_device_ops hss_hdlc_ops = {
 	.ndo_open       = hss_hdlc_open,
 	.ndo_stop       = hss_hdlc_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = hss_hdlc_ioctl,
+	.ndo_siocwandev = hss_hdlc_ioctl,
 };
 
 static int hss_init_one(struct platform_device *pdev)
diff --git a/drivers/net/wan/lmc/lmc.h b/drivers/net/wan/lmc/lmc.h
index 3bd541c868d5..d7d59b4595f9 100644
--- a/drivers/net/wan/lmc/lmc.h
+++ b/drivers/net/wan/lmc/lmc.h
@@ -19,7 +19,7 @@ void lmc_mii_writereg(lmc_softc_t * const, unsigned, unsigned, unsigned);
 void lmc_gpio_mkinput(lmc_softc_t * const sc, u32 bits);
 void lmc_gpio_mkoutput(lmc_softc_t * const sc, u32 bits);
 
-int lmc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
+int lmc_ioctl(struct net_device *dev, struct if_settings *ifs);
 
 extern lmc_media_t lmc_ds3_media;
 extern lmc_media_t lmc_ssi_media;
diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c
index 26a4ffbff73b..ed687bf6ec47 100644
--- a/drivers/net/wan/lmc/lmc_main.c
+++ b/drivers/net/wan/lmc/lmc_main.c
@@ -616,14 +616,6 @@ static int lmc_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
     return ret;
 }
 
-int lmc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
-{
-	if (cmd != SIOCWANDEV)
-		return -EOPNOTSUPP;
-
-	return lmc_proto_ioctl(dev_to_sc(dev), ifr, cmd);
-}
-
 
 /* the watchdog process that cruises around */
 static void lmc_watchdog(struct timer_list *t) /*fold00*/
@@ -794,7 +786,7 @@ static const struct net_device_ops lmc_ops = {
 	.ndo_open       = lmc_open,
 	.ndo_stop       = lmc_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = lmc_ioctl,
+	.ndo_siocwandev = hdlc_ioctl,
 	.ndo_siocdevprivate = lmc_siocdevprivate,
 	.ndo_tx_timeout = lmc_driver_timeout,
 	.ndo_get_stats  = lmc_get_stats,
diff --git a/drivers/net/wan/lmc/lmc_proto.c b/drivers/net/wan/lmc/lmc_proto.c
index 4e9cc83b615a..e5487616a816 100644
--- a/drivers/net/wan/lmc/lmc_proto.c
+++ b/drivers/net/wan/lmc/lmc_proto.c
@@ -58,13 +58,6 @@ void lmc_proto_attach(lmc_softc_t *sc) /*FOLD00*/
         }
 }
 
-int lmc_proto_ioctl(lmc_softc_t *sc, struct ifreq *ifr, int cmd)
-{
-	if (sc->if_type == LMC_PPP)
-		return hdlc_ioctl(sc->lmc_device, ifr, cmd);
-	return -EOPNOTSUPP;
-}
-
 int lmc_proto_open(lmc_softc_t *sc)
 {
 	int ret = 0;
diff --git a/drivers/net/wan/lmc/lmc_proto.h b/drivers/net/wan/lmc/lmc_proto.h
index bb098e443776..e56e7072de44 100644
--- a/drivers/net/wan/lmc/lmc_proto.h
+++ b/drivers/net/wan/lmc/lmc_proto.h
@@ -5,7 +5,6 @@
 #include <linux/hdlc.h>
 
 void lmc_proto_attach(lmc_softc_t *sc);
-int lmc_proto_ioctl(lmc_softc_t *sc, struct ifreq *ifr, int cmd);
 int lmc_proto_open(lmc_softc_t *sc);
 void lmc_proto_close(lmc_softc_t *sc);
 __be16 lmc_proto_type(lmc_softc_t *sc, struct sk_buff *skb);
diff --git a/drivers/net/wan/n2.c b/drivers/net/wan/n2.c
index 4122ca2cd07d..f3e80722ba1d 100644
--- a/drivers/net/wan/n2.c
+++ b/drivers/net/wan/n2.c
@@ -239,21 +239,18 @@ static int n2_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
 	return -EOPNOTSUPP;
 }
 
-static int n2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int n2_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	const size_t size = sizeof(sync_serial_settings);
 	sync_serial_settings new_line;
-	sync_serial_settings __user *line = ifr->ifr_settings.ifs_ifsu.sync;
+	sync_serial_settings __user *line = ifs->ifs_ifsu.sync;
 	port_t *port = dev_to_port(dev);
 
-	if (cmd != SIOCWANDEV)
-		return hdlc_ioctl(dev, ifr, cmd);
-
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_IFACE:
-		ifr->ifr_settings.type = IF_IFACE_SYNC_SERIAL;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_IFACE_SYNC_SERIAL;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		if (copy_to_user(line, &port->settings, size))
@@ -281,7 +278,7 @@ static int n2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		return 0;
 
 	default:
-		return hdlc_ioctl(dev, ifr, cmd);
+		return hdlc_ioctl(dev, ifs);
 	}
 }
 
@@ -317,7 +314,7 @@ static const struct net_device_ops n2_ops = {
 	.ndo_open       = n2_open,
 	.ndo_stop       = n2_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = n2_ioctl,
+	.ndo_siocwandev = n2_ioctl,
 	.ndo_siocdevprivate = n2_siocdevprivate,
 };
 
diff --git a/drivers/net/wan/pc300too.c b/drivers/net/wan/pc300too.c
index 8cdfd0056c81..4766446f0fa0 100644
--- a/drivers/net/wan/pc300too.c
+++ b/drivers/net/wan/pc300too.c
@@ -186,21 +186,18 @@ static int pc300_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
 	return -EOPNOTSUPP;
 }
 
-static int pc300_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int pc300_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	const size_t size = sizeof(sync_serial_settings);
 	sync_serial_settings new_line;
-	sync_serial_settings __user *line = ifr->ifr_settings.ifs_ifsu.sync;
+	sync_serial_settings __user *line = ifs->ifs_ifsu.sync;
 	int new_type;
 	port_t *port = dev_to_port(dev);
 
-	if (cmd != SIOCWANDEV)
-		return hdlc_ioctl(dev, ifr, cmd);
-
-	if (ifr->ifr_settings.type == IF_GET_IFACE) {
-		ifr->ifr_settings.type = port->iface;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+	if (ifs->type == IF_GET_IFACE) {
+		ifs->type = port->iface;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		if (copy_to_user(line, &port->settings, size))
@@ -209,21 +206,21 @@ static int pc300_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	}
 
 	if (port->card->type == PC300_X21 &&
-	    (ifr->ifr_settings.type == IF_IFACE_SYNC_SERIAL ||
-	     ifr->ifr_settings.type == IF_IFACE_X21))
+	    (ifs->type == IF_IFACE_SYNC_SERIAL ||
+	     ifs->type == IF_IFACE_X21))
 		new_type = IF_IFACE_X21;
 
 	else if (port->card->type == PC300_RSV &&
-		 (ifr->ifr_settings.type == IF_IFACE_SYNC_SERIAL ||
-		  ifr->ifr_settings.type == IF_IFACE_V35))
+		 (ifs->type == IF_IFACE_SYNC_SERIAL ||
+		  ifs->type == IF_IFACE_V35))
 		new_type = IF_IFACE_V35;
 
 	else if (port->card->type == PC300_RSV &&
-		 ifr->ifr_settings.type == IF_IFACE_V24)
+		 ifs->type == IF_IFACE_V24)
 		new_type = IF_IFACE_V24;
 
 	else
-		return hdlc_ioctl(dev, ifr, cmd);
+		return hdlc_ioctl(dev, ifs);
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -278,7 +275,7 @@ static const struct net_device_ops pc300_ops = {
 	.ndo_open       = pc300_open,
 	.ndo_stop       = pc300_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = pc300_ioctl,
+	.ndo_siocwandev = pc300_ioctl,
 	.ndo_siocdevprivate = pc300_siocdevprivate,
 };
 
diff --git a/drivers/net/wan/pci200syn.c b/drivers/net/wan/pci200syn.c
index f4dc3dda25b7..ea86c7035653 100644
--- a/drivers/net/wan/pci200syn.c
+++ b/drivers/net/wan/pci200syn.c
@@ -179,21 +179,18 @@ static int pci200_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
 	return -EOPNOTSUPP;
 }
 
-static int pci200_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int pci200_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	const size_t size = sizeof(sync_serial_settings);
 	sync_serial_settings new_line;
-	sync_serial_settings __user *line = ifr->ifr_settings.ifs_ifsu.sync;
+	sync_serial_settings __user *line = ifs->ifs_ifsu.sync;
 	port_t *port = dev_to_port(dev);
 
-	if (cmd != SIOCWANDEV)
-		return hdlc_ioctl(dev, ifr, cmd);
-
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_IFACE:
-		ifr->ifr_settings.type = IF_IFACE_V35;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_IFACE_V35;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		if (copy_to_user(line, &port->settings, size))
@@ -223,7 +220,7 @@ static int pci200_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		return 0;
 
 	default:
-		return hdlc_ioctl(dev, ifr, cmd);
+		return hdlc_ioctl(dev, ifs);
 	}
 }
 
@@ -259,7 +256,7 @@ static const struct net_device_ops pci200_ops = {
 	.ndo_open       = pci200_open,
 	.ndo_stop       = pci200_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = pci200_ioctl,
+	.ndo_siocwandev = pci200_ioctl,
 	.ndo_siocdevprivate = pci200_siocdevprivate,
 };
 
diff --git a/drivers/net/wan/sealevel.c b/drivers/net/wan/sealevel.c
index 4403e219ca03..eddd20aab691 100644
--- a/drivers/net/wan/sealevel.c
+++ b/drivers/net/wan/sealevel.c
@@ -124,14 +124,6 @@ static int sealevel_close(struct net_device *d)
 	return 0;
 }
 
-static int sealevel_ioctl(struct net_device *d, struct ifreq *ifr, int cmd)
-{
-	/* struct slvl_device *slvl=dev_to_chan(d);
-	 * z8530_ioctl(d,&slvl->sync.chanA,ifr,cmd)
-	 */
-	return hdlc_ioctl(d, ifr, cmd);
-}
-
 /*	Passed network frames, fire them downwind. */
 
 static netdev_tx_t sealevel_queue_xmit(struct sk_buff *skb,
@@ -152,7 +144,7 @@ static const struct net_device_ops sealevel_ops = {
 	.ndo_open       = sealevel_open,
 	.ndo_stop       = sealevel_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = sealevel_ioctl,
+	.ndo_siocwandev = hdlc_ioctl,
 };
 
 static int slvl_setup(struct slvl_device *sv, int iobase, int irq)
diff --git a/drivers/net/wan/wanxl.c b/drivers/net/wan/wanxl.c
index f22e48415e6f..5a9e262188ef 100644
--- a/drivers/net/wan/wanxl.c
+++ b/drivers/net/wan/wanxl.c
@@ -343,20 +343,17 @@ static int wanxl_attach(struct net_device *dev, unsigned short encoding,
 	return 0;
 }
 
-static int wanxl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int wanxl_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	const size_t size = sizeof(sync_serial_settings);
 	sync_serial_settings line;
 	struct port *port = dev_to_port(dev);
 
-	if (cmd != SIOCWANDEV)
-		return hdlc_ioctl(dev, ifr, cmd);
-
-	switch (ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_IFACE:
-		ifr->ifr_settings.type = IF_IFACE_SYNC_SERIAL;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_IFACE_SYNC_SERIAL;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 		memset(&line, 0, sizeof(line));
@@ -364,7 +361,7 @@ static int wanxl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		line.clock_rate = 0;
 		line.loopback = 0;
 
-		if (copy_to_user(ifr->ifr_settings.ifs_ifsu.sync, &line, size))
+		if (copy_to_user(ifs->ifs_ifsu.sync, &line, size))
 			return -EFAULT;
 		return 0;
 
@@ -374,7 +371,7 @@ static int wanxl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		if (dev->flags & IFF_UP)
 			return -EBUSY;
 
-		if (copy_from_user(&line, ifr->ifr_settings.ifs_ifsu.sync,
+		if (copy_from_user(&line, ifs->ifs_ifsu.sync,
 				   size))
 			return -EFAULT;
 
@@ -389,7 +386,7 @@ static int wanxl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		return 0;
 
 	default:
-		return hdlc_ioctl(dev, ifr, cmd);
+		return hdlc_ioctl(dev, ifs);
 	}
 }
 
@@ -545,7 +542,7 @@ static const struct net_device_ops wanxl_ops = {
 	.ndo_open       = wanxl_open,
 	.ndo_stop       = wanxl_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = wanxl_ioctl,
+	.ndo_siocwandev = wanxl_ioctl,
 	.ndo_get_stats  = wanxl_get_stats,
 };
 
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 5bb928b7873e..3e3b8873fa29 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -1524,11 +1524,11 @@ static int hdlcdev_close(struct net_device *dev)
  *
  * Return: 0 if success, otherwise error code
  */
-static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int hdlcdev_ioctl(struct net_device *dev, struct if_settings *ifs)
 {
 	const size_t size = sizeof(sync_serial_settings);
 	sync_serial_settings new_line;
-	sync_serial_settings __user *line = ifr->ifr_settings.ifs_ifsu.sync;
+	sync_serial_settings __user *line = ifs->ifs_ifsu.sync;
 	struct slgt_info *info = dev_to_port(dev);
 	unsigned int flags;
 
@@ -1538,17 +1538,14 @@ static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	if (info->port.count)
 		return -EBUSY;
 
-	if (cmd != SIOCWANDEV)
-		return hdlc_ioctl(dev, ifr, cmd);
-
 	memset(&new_line, 0, sizeof(new_line));
 
-	switch(ifr->ifr_settings.type) {
+	switch (ifs->type) {
 	case IF_GET_IFACE: /* return current sync_serial_settings */
 
-		ifr->ifr_settings.type = IF_IFACE_SYNC_SERIAL;
-		if (ifr->ifr_settings.size < size) {
-			ifr->ifr_settings.size = size; /* data size wanted */
+		ifs->type = IF_IFACE_SYNC_SERIAL;
+		if (ifs->size < size) {
+			ifs->size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
 
@@ -1615,7 +1612,7 @@ static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		return 0;
 
 	default:
-		return hdlc_ioctl(dev, ifr, cmd);
+		return hdlc_ioctl(dev, ifs);
 	}
 }
 
@@ -1688,7 +1685,7 @@ static const struct net_device_ops hdlcdev_ops = {
 	.ndo_open       = hdlcdev_open,
 	.ndo_stop       = hdlcdev_close,
 	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_do_ioctl   = hdlcdev_ioctl,
+	.ndo_siocwandev = hdlcdev_ioctl,
 	.ndo_tx_timeout = hdlcdev_tx_timeout,
 };
 
diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h
index cacc4dd27794..630a388035f1 100644
--- a/include/linux/hdlc.h
+++ b/include/linux/hdlc.h
@@ -22,7 +22,7 @@ struct hdlc_proto {
 	void (*start)(struct net_device *dev); /* if open & DCD */
 	void (*stop)(struct net_device *dev); /* if open & !DCD */
 	void (*detach)(struct net_device *dev);
-	int (*ioctl)(struct net_device *dev, struct ifreq *ifr);
+	int (*ioctl)(struct net_device *dev, struct if_settings *ifs);
 	__be16 (*type_trans)(struct sk_buff *skb, struct net_device *dev);
 	int (*netif_rx)(struct sk_buff *skb);
 	netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev);
@@ -54,7 +54,7 @@ typedef struct hdlc_device {
 /* Exported from hdlc module */
 
 /* Called by hardware driver when a user requests HDLC service */
-int hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
+int hdlc_ioctl(struct net_device *dev, struct if_settings *ifs);
 
 /* Must be used by hardware driver on module startup/exit */
 #define register_hdlc_device(dev)	register_netdev(dev)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b6e062a3b0d4..cc11382f76a3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1367,6 +1367,8 @@ struct net_device_ops {
 					        struct ifreq *ifr, int cmd);
 	int			(*ndo_eth_ioctl)(struct net_device *dev,
 						 struct ifreq *ifr, int cmd);
+	int			(*ndo_siocwandev)(struct net_device *dev,
+						  struct if_settings *ifs);
 	int			(*ndo_siocdevprivate)(struct net_device *dev,
 						      struct ifreq *ifr,
 						      void __user *data, int cmd);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 8e30fe8b5645..e0586bc4d6c6 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -291,6 +291,20 @@ static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
 	return dev_do_ioctl(dev, ifr, cmd);
 }
 
+static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (ops->ndo_siocwandev) {
+		if (netif_device_present(dev))
+			return ops->ndo_siocwandev(dev, ifs);
+		else
+			return -ENODEV;
+	}
+
+	return -EOPNOTSUPP;
+}
+
 /*
  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
  */
@@ -359,6 +373,9 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
 		return dev_change_name(dev, ifr->ifr_newname);
 
+	case SIOCWANDEV:
+		return dev_siocwandev(dev, &ifr->ifr_settings);
+
 	case SIOCSHWTSTAMP:
 		err = net_hwtstamp_validate(ifr);
 		if (err)
@@ -386,8 +403,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
 		    cmd == SIOCBONDINFOQUERY ||
 		    cmd == SIOCBONDCHANGEACTIVE ||
 		    cmd == SIOCBRADDIF ||
-		    cmd == SIOCBRDELIF ||
-		    cmd == SIOCWANDEV) {
+		    cmd == SIOCBRDELIF) {
 			err = dev_do_ioctl(dev, ifr, cmd);
 		} else
 			err = -EINVAL;
-- 
cgit v1.2.3


From ad2f99aedf8fa77f3ae647153284fa63c43d3055 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 27 Jul 2021 15:45:16 +0200
Subject: net: bridge: move bridge ioctls out of .ndo_do_ioctl

Working towards obsoleting the .ndo_do_ioctl operation entirely,
stop passing the SIOCBRADDIF/SIOCBRDELIF device ioctl commands
into this callback.

My first attempt was to add another ndo_siocbr() callback, but
as there is only a single driver that takes these commands and
there is already a hook mechanism to call directly into this
driver, extend this hook instead, and use it for both the
deviceless and the device specific ioctl commands.

Cc: Roopa Prabhu <roopa@nvidia.com>
Cc: Nikolay Aleksandrov <nikolay@nvidia.com>
Cc: bridge@lists.linux-foundation.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h |  7 ++++++-
 net/bridge/br.c           |  2 +-
 net/bridge/br_device.c    |  1 -
 net/bridge/br_ioctl.c     | 15 +++------------
 net/bridge/br_private.h   |  5 ++---
 net/core/dev_ioctl.c      | 11 ++++++++---
 net/socket.c              | 33 +++++++++++++++++++++++----------
 7 files changed, 43 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index b73b4ff749e1..21daed10322e 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -61,7 +61,12 @@ struct br_ip_list {
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
-extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
+struct net_bridge;
+void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br,
+			     unsigned int cmd, struct ifreq *ifr,
+			     void __user *uarg));
+int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd,
+		  struct ifreq *ifr, void __user *uarg);
 
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)
 int br_multicast_list_adjacent(struct net_device *dev,
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 51f2e25c4cd6..8fb5dca5f8e0 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -359,7 +359,7 @@ static int __init br_init(void)
 	if (err)
 		goto err_out5;
 
-	brioctl_set(br_ioctl_deviceless_stub);
+	brioctl_set(br_ioctl_stub);
 
 #if IS_ENABLED(CONFIG_ATM_LANE)
 	br_fdb_test_addr_hook = br_fdb_test_addr;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 1952bb433ca7..8d6bab244c4a 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -454,7 +454,6 @@ static const struct net_device_ops br_netdev_ops = {
 	.ndo_set_rx_mode	 = br_dev_set_multicast_list,
 	.ndo_change_rx_flags	 = br_dev_change_rx_flags,
 	.ndo_change_mtu		 = br_change_mtu,
-	.ndo_do_ioctl		 = br_dev_ioctl,
 	.ndo_siocdevprivate	 = br_dev_siocdevprivate,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_netpoll_setup	 = br_netpoll_setup,
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 9f924fe43641..46a24c20e405 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -366,7 +366,8 @@ static int old_deviceless(struct net *net, void __user *uarg)
 	return -EOPNOTSUPP;
 }
 
-int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
+int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd,
+		  struct ifreq *ifr, void __user *uarg)
 {
 	switch (cmd) {
 	case SIOCGIFBR:
@@ -390,21 +391,11 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uar
 
 		return br_del_bridge(net, buf);
 	}
-	}
-	return -EOPNOTSUPP;
-}
-
-int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
-{
-	struct net_bridge *br = netdev_priv(dev);
 
-	switch (cmd) {
 	case SIOCBRADDIF:
 	case SIOCBRDELIF:
-		return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
+		return add_del_if(br, ifr->ifr_ifindex, cmd == SIOCBRADDIF);
 
 	}
-
-	br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd);
 	return -EOPNOTSUPP;
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 572c28ae41b8..f2d34ea1ea37 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -851,11 +851,10 @@ br_port_get_check_rtnl(const struct net_device *dev)
 }
 
 /* br_ioctl.c */
-int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq,
 			  void __user *data, int cmd);
-int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
-			     void __user *arg);
+int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd,
+		  struct ifreq *ifr, void __user *uarg);
 
 /* br_multicast.c */
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 70a379cee5fd..3166f196b296 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -6,6 +6,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/net_tstamp.h>
 #include <linux/wireless.h>
+#include <linux/if_bridge.h>
 #include <net/dsa.h>
 #include <net/wext.h>
 
@@ -374,6 +375,12 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
 	case SIOCWANDEV:
 		return dev_siocwandev(dev, &ifr->ifr_settings);
 
+	case SIOCBRADDIF:
+	case SIOCBRDELIF:
+		if (!netif_device_present(dev))
+			return -ENODEV;
+		return br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL);
+
 	case SIOCSHWTSTAMP:
 		err = net_hwtstamp_validate(ifr);
 		if (err)
@@ -399,9 +406,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
 		    cmd == SIOCBONDSETHWADDR ||
 		    cmd == SIOCBONDSLAVEINFOQUERY ||
 		    cmd == SIOCBONDINFOQUERY ||
-		    cmd == SIOCBONDCHANGEACTIVE ||
-		    cmd == SIOCBRADDIF ||
-		    cmd == SIOCBRDELIF) {
+		    cmd == SIOCBONDCHANGEACTIVE) {
 			err = dev_do_ioctl(dev, ifr, cmd);
 		} else
 			err = -EINVAL;
diff --git a/net/socket.c b/net/socket.c
index 48471a219c1d..42665bd99ea4 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1064,9 +1064,13 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
  */
 
 static DEFINE_MUTEX(br_ioctl_mutex);
-static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
+static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br,
+			    unsigned int cmd, struct ifreq *ifr,
+			    void __user *uarg);
 
-void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
+void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br,
+			     unsigned int cmd, struct ifreq *ifr,
+			     void __user *uarg))
 {
 	mutex_lock(&br_ioctl_mutex);
 	br_ioctl_hook = hook;
@@ -1074,6 +1078,22 @@ void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
 }
 EXPORT_SYMBOL(brioctl_set);
 
+int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd,
+		  struct ifreq *ifr, void __user *uarg)
+{
+	int err = -ENOPKG;
+
+	if (!br_ioctl_hook)
+		request_module("bridge");
+
+	mutex_lock(&br_ioctl_mutex);
+	if (br_ioctl_hook)
+		err = br_ioctl_hook(net, br, cmd, ifr, uarg);
+	mutex_unlock(&br_ioctl_mutex);
+
+	return err;
+}
+
 static DEFINE_MUTEX(vlan_ioctl_mutex);
 static int (*vlan_ioctl_hook) (struct net *, void __user *arg);
 
@@ -1162,14 +1182,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		case SIOCSIFBR:
 		case SIOCBRADDBR:
 		case SIOCBRDELBR:
-			err = -ENOPKG;
-			if (!br_ioctl_hook)
-				request_module("bridge");
-
-			mutex_lock(&br_ioctl_mutex);
-			if (br_ioctl_hook)
-				err = br_ioctl_hook(net, cmd, argp);
-			mutex_unlock(&br_ioctl_mutex);
+			err = br_ioctl_call(net, NULL, cmd, NULL, argp);
 			break;
 		case SIOCGIFVLAN:
 		case SIOCSIFVLAN:
-- 
cgit v1.2.3


From 3d9d00bd1885afa6b2c766cf9bab7b54b1a951ed Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 27 Jul 2021 15:45:17 +0200
Subject: net: bonding: move ioctl handling to private ndo operation

All other user triggered operations are gone from ndo_ioctl, so move
the SIOCBOND family into a custom operation as well.

The .ndo_ioctl() helper is no longer called by the dev_ioctl.c code now,
but there are still a few definitions in obsolete wireless drivers as well
as the appletalk and ieee802154 layers to call SIOCSIFADDR/SIOCGIFADDR
helpers from inside the kernel.

Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdevices.rst | 11 +++++++++++
 drivers/net/bonding/bond_main.c         |  2 +-
 include/linux/netdevice.h               | 13 ++++++++++---
 net/core/dev_ioctl.c                    |  8 ++++----
 4 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index 3c42b0b0be93..9e4cccb90b87 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -222,6 +222,17 @@ ndo_do_ioctl:
 	Synchronization: rtnl_lock() semaphore.
 	Context: process
 
+        This is only called by network subsystems internally,
+        not by user space calling ioctl as it was in before
+        linux-5.14.
+
+ndo_siocbond:
+        Synchronization: rtnl_lock() semaphore.
+        Context: process
+
+        Used by the bonding driver for the SIOCBOND family of
+        ioctl commands.
+
 ndo_siocwandev:
 	Synchronization: rtnl_lock() semaphore.
 	Context: process
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 23769e937c28..bec8ceaff98f 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4988,7 +4988,7 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_select_queue	= bond_select_queue,
 	.ndo_get_stats64	= bond_get_stats,
 	.ndo_eth_ioctl		= bond_eth_ioctl,
-	.ndo_do_ioctl		= bond_do_ioctl,
+	.ndo_siocbond		= bond_do_ioctl,
 	.ndo_siocdevprivate	= bond_siocdevprivate,
 	.ndo_change_rx_flags	= bond_change_rx_flags,
 	.ndo_set_rx_mode	= bond_set_rx_mode,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cc11382f76a3..226bbee06730 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1086,9 +1086,14 @@ struct netdev_net_notifier {
  *	Test if Media Access Control address is valid for the device.
  *
  * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
- *	Called when a user requests an ioctl which can't be handled by
- *	the generic interface code. If not defined ioctls return
- *	not supported error code.
+ *	Old-style ioctl entry point. This is used internally by the
+ *	appletalk and ieee802154 subsystems but is no longer called by
+ *	the device ioctl handler.
+ *
+ * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd);
+ *	Used by the bonding driver for its device specific ioctls:
+ *	SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE,
+ *	SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY
  *
  * * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
  *	Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG,
@@ -1367,6 +1372,8 @@ struct net_device_ops {
 					        struct ifreq *ifr, int cmd);
 	int			(*ndo_eth_ioctl)(struct net_device *dev,
 						 struct ifreq *ifr, int cmd);
+	int			(*ndo_siocbond)(struct net_device *dev,
+						struct ifreq *ifr, int cmd);
 	int			(*ndo_siocwandev)(struct net_device *dev,
 						  struct if_settings *ifs);
 	int			(*ndo_siocdevprivate)(struct net_device *dev,
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 3166f196b296..4035bce06bf8 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -260,14 +260,14 @@ static int dev_eth_ioctl(struct net_device *dev,
 	return err;
 }
 
-static int dev_do_ioctl(struct net_device *dev,
+static int dev_siocbond(struct net_device *dev,
 			struct ifreq *ifr, unsigned int cmd)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
-	if (ops->ndo_do_ioctl) {
+	if (ops->ndo_siocbond) {
 		if (netif_device_present(dev))
-			return ops->ndo_do_ioctl(dev, ifr, cmd);
+			return ops->ndo_siocbond(dev, ifr, cmd);
 		else
 			return -ENODEV;
 	}
@@ -407,7 +407,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
 		    cmd == SIOCBONDSLAVEINFOQUERY ||
 		    cmd == SIOCBONDINFOQUERY ||
 		    cmd == SIOCBONDCHANGEACTIVE) {
-			err = dev_do_ioctl(dev, ifr, cmd);
+			err = dev_siocbond(dev, ifr, cmd);
 		} else
 			err = -EINVAL;
 
-- 
cgit v1.2.3


From d7907a2b1a3b89bea136025f885035a083525e41 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 28 Jul 2021 10:33:47 +0300
Subject: devlink: Remove duplicated registration check

Both registered flag and devlink pointer are set at the same time
and indicate the same thing - devlink/devlink_port are ready. Instead
of checking ->registered use devlink pointer as an indication.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |  4 +---
 net/core/devlink.c    | 19 ++++++++++---------
 2 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 57b738b78073..e48a62320407 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -55,8 +55,7 @@ struct devlink {
 			    * port, sb, dpipe, resource, params, region, traps and more.
 			    */
 	u8 reload_failed:1,
-	   reload_enabled:1,
-	   registered:1;
+	   reload_enabled:1;
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
@@ -158,7 +157,6 @@ struct devlink_port {
 	struct list_head region_list;
 	struct devlink *devlink;
 	unsigned int index;
-	bool registered;
 	spinlock_t type_lock; /* Protects type and type_dev
 			       * pointer consistency.
 			       */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 8fdd04f00fd7..b596a971b473 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -115,7 +115,7 @@ static void __devlink_net_set(struct devlink *devlink, struct net *net)
 
 void devlink_net_set(struct devlink *devlink, struct net *net)
 {
-	if (WARN_ON(devlink->registered))
+	if (WARN_ON(devlink->dev))
 		return;
 	__devlink_net_set(devlink, net);
 }
@@ -1043,7 +1043,7 @@ static void devlink_port_notify(struct devlink_port *devlink_port,
 	struct sk_buff *msg;
 	int err;
 
-	if (!devlink_port->registered)
+	if (!devlink_port->devlink)
 		return;
 
 	WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
@@ -8817,8 +8817,8 @@ EXPORT_SYMBOL_GPL(devlink_alloc);
  */
 int devlink_register(struct devlink *devlink, struct device *dev)
 {
+	WARN_ON(devlink->dev);
 	devlink->dev = dev;
-	devlink->registered = true;
 	mutex_lock(&devlink_mutex);
 	list_add_tail(&devlink->list, &devlink_list);
 	devlink_notify(devlink, DEVLINK_CMD_NEW);
@@ -8960,9 +8960,10 @@ int devlink_port_register(struct devlink *devlink,
 		mutex_unlock(&devlink->lock);
 		return -EEXIST;
 	}
+
+	WARN_ON(devlink_port->devlink);
 	devlink_port->devlink = devlink;
 	devlink_port->index = port_index;
-	devlink_port->registered = true;
 	spin_lock_init(&devlink_port->type_lock);
 	INIT_LIST_HEAD(&devlink_port->reporter_list);
 	mutex_init(&devlink_port->reporters_lock);
@@ -9001,7 +9002,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
 				    enum devlink_port_type type,
 				    void *type_dev)
 {
-	if (WARN_ON(!devlink_port->registered))
+	if (WARN_ON(!devlink_port->devlink))
 		return;
 	devlink_port_type_warn_cancel(devlink_port);
 	spin_lock_bh(&devlink_port->type_lock);
@@ -9121,7 +9122,7 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 {
 	int ret;
 
-	if (WARN_ON(devlink_port->registered))
+	if (WARN_ON(devlink_port->devlink))
 		return;
 	devlink_port->attrs = *attrs;
 	ret = __devlink_port_attrs_set(devlink_port, attrs->flavour);
@@ -9145,7 +9146,7 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro
 	struct devlink_port_attrs *attrs = &devlink_port->attrs;
 	int ret;
 
-	if (WARN_ON(devlink_port->registered))
+	if (WARN_ON(devlink_port->devlink))
 		return;
 	ret = __devlink_port_attrs_set(devlink_port,
 				       DEVLINK_PORT_FLAVOUR_PCI_PF);
@@ -9172,7 +9173,7 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro
 	struct devlink_port_attrs *attrs = &devlink_port->attrs;
 	int ret;
 
-	if (WARN_ON(devlink_port->registered))
+	if (WARN_ON(devlink_port->devlink))
 		return;
 	ret = __devlink_port_attrs_set(devlink_port,
 				       DEVLINK_PORT_FLAVOUR_PCI_VF);
@@ -9200,7 +9201,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
 	struct devlink_port_attrs *attrs = &devlink_port->attrs;
 	int ret;
 
-	if (WARN_ON(devlink_port->registered))
+	if (WARN_ON(devlink_port->devlink))
 		return;
 	ret = __devlink_port_attrs_set(devlink_port,
 				       DEVLINK_PORT_FLAVOUR_PCI_SF);
-- 
cgit v1.2.3


From 56af5e749f20c3a540310c207dcc373f4f09156e Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Tue, 27 Jul 2021 18:33:15 -0700
Subject: net/sched: act_skbmod: Add SKBMOD_F_ECN option support

Currently, when doing rate limiting using the tc-police(8) action, the
easiest way is to simply drop the packets which exceed or conform the
configured bandwidth limit.  Add a new option to tc-skbmod(8), so that
users may use the ECN [1] extension to explicitly inform the receiver
about the congestion instead of dropping packets "on the floor".

The 2 least significant bits of the Traffic Class field in IPv4 and IPv6
headers are used to represent different ECN states [2]:

	0b00: "Non ECN-Capable Transport", Non-ECT
	0b10: "ECN Capable Transport", ECT(0)
	0b01: "ECN Capable Transport", ECT(1)
	0b11: "Congestion Encountered", CE

As an example:

	$ tc filter add dev eth0 parent 1: protocol ip prio 10 \
		matchall action skbmod ecn

Doing the above marks all ECT(0) and ECT(1) packets as CE.  It does NOT
affect Non-ECT or non-IP packets.  In the tc-police scenario mentioned
above, users may pipe a tc-police action and a tc-skbmod "ecn" action
together to achieve ECN-based rate limiting.

For TCP connections, upon receiving a CE packet, the receiver will respond
with an ECE packet, asking the sender to reduce their congestion window.
However ECN also works with other L4 protocols e.g. DCCP and SCTP [2], and
our implementation does not touch or care about L4 headers.

The updated tc-skbmod SYNOPSIS looks like the following:

	tc ... action skbmod { set SETTABLE | swap SWAPPABLE | ecn } ...

Only one of "set", "swap" or "ecn" shall be used in a single tc-skbmod
command.  Trying to use more than one of them at a time is considered
undefined behavior; pipe multiple tc-skbmod commands together instead.
"set" and "swap" only affect Ethernet packets, while "ecn" only affects
IPv{4,6} packets.

It is also worth mentioning that, in theory, the same effect could be
achieved by piping a "police" action and a "bpf" action using the
bpf_skb_ecn_set_ce() helper, but this requires eBPF programming from the
user, thus impractical.

Depends on patch "net/sched: act_skbmod: Skip non-Ethernet packets".

[1] https://datatracker.ietf.org/doc/html/rfc3168
[2] https://en.wikipedia.org/wiki/Explicit_Congestion_Notification

Reviewed-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_skbmod.h |  1 +
 net/sched/act_skbmod.c                | 44 +++++++++++++++++++++++++----------
 2 files changed, 33 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h
index c525b3503797..af6ef2cfbf3d 100644
--- a/include/uapi/linux/tc_act/tc_skbmod.h
+++ b/include/uapi/linux/tc_act/tc_skbmod.h
@@ -17,6 +17,7 @@
 #define SKBMOD_F_SMAC	0x2
 #define SKBMOD_F_ETYPE	0x4
 #define SKBMOD_F_SWAPMAC 0x8
+#define SKBMOD_F_ECN	0x10
 
 struct tc_skbmod {
 	tc_gen;
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 8d17a543cc9f..762ceec3e6f6 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
+#include <net/inet_ecn.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
@@ -21,15 +22,13 @@
 static unsigned int skbmod_net_id;
 static struct tc_action_ops act_skbmod_ops;
 
-#define MAX_EDIT_LEN ETH_HLEN
 static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
 	struct tcf_skbmod *d = to_skbmod(a);
-	int action;
+	int action, max_edit_len, err;
 	struct tcf_skbmod_params *p;
 	u64 flags;
-	int err;
 
 	tcf_lastuse_update(&d->tcf_tm);
 	bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
@@ -38,19 +37,34 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a,
 	if (unlikely(action == TC_ACT_SHOT))
 		goto drop;
 
-	if (!skb->dev || skb->dev->type != ARPHRD_ETHER)
-		return action;
+	max_edit_len = skb_mac_header_len(skb);
+	p = rcu_dereference_bh(d->skbmod_p);
+	flags = p->flags;
+
+	/* tcf_skbmod_init() guarantees "flags" to be one of the following:
+	 *	1. a combination of SKBMOD_F_{DMAC,SMAC,ETYPE}
+	 *	2. SKBMOD_F_SWAPMAC
+	 *	3. SKBMOD_F_ECN
+	 * SKBMOD_F_ECN only works with IP packets; all other flags only work with Ethernet
+	 * packets.
+	 */
+	if (flags == SKBMOD_F_ECN) {
+		switch (skb_protocol(skb, true)) {
+		case cpu_to_be16(ETH_P_IP):
+		case cpu_to_be16(ETH_P_IPV6):
+			max_edit_len += skb_network_header_len(skb);
+			break;
+		default:
+			goto out;
+		}
+	} else if (!skb->dev || skb->dev->type != ARPHRD_ETHER) {
+		goto out;
+	}
 
-	/* XXX: if you are going to edit more fields beyond ethernet header
-	 * (example when you add IP header replacement or vlan swap)
-	 * then MAX_EDIT_LEN needs to change appropriately
-	*/
-	err = skb_ensure_writable(skb, MAX_EDIT_LEN);
+	err = skb_ensure_writable(skb, max_edit_len);
 	if (unlikely(err)) /* best policy is to drop on the floor */
 		goto drop;
 
-	p = rcu_dereference_bh(d->skbmod_p);
-	flags = p->flags;
 	if (flags & SKBMOD_F_DMAC)
 		ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst);
 	if (flags & SKBMOD_F_SMAC)
@@ -66,6 +80,10 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a,
 		ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr);
 	}
 
+	if (flags & SKBMOD_F_ECN)
+		INET_ECN_set_ce(skb);
+
+out:
 	return action;
 
 drop:
@@ -129,6 +147,8 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 	index = parm->index;
 	if (parm->flags & SKBMOD_F_SWAPMAC)
 		lflags = SKBMOD_F_SWAPMAC;
+	if (parm->flags & SKBMOD_F_ECN)
+		lflags = SKBMOD_F_ECN;
 
 	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0)
-- 
cgit v1.2.3


From 58ce6d5b271ab25fb2056f84a8e5546945eb5fc9 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 27 Jul 2021 06:12:04 +0900
Subject: Bluetooth: defer cleanup of resources in hci_unregister_dev()

syzbot is hitting might_sleep() warning at hci_sock_dev_event()
due to calling lock_sock() with rw spinlock held [1].

It seems that history of this locking problem is a trial and error.

Commit b40df5743ee8aed8 ("[PATCH] bluetooth: fix socket locking in
hci_sock_dev_event()") in 2.6.21-rc4 changed bh_lock_sock() to lock_sock()
as an attempt to fix lockdep warning.

Then, commit 4ce61d1c7a8ef4c1 ("[BLUETOOTH]: Fix locking in
hci_sock_dev_event().") in 2.6.22-rc2 changed lock_sock() to
local_bh_disable() + bh_lock_sock_nested() as an attempt to fix
sleep in atomic context warning.

Then, commit 4b5dd696f81b210c ("Bluetooth: Remove local_bh_disable() from
hci_sock.c") in 3.3-rc1 removed local_bh_disable().

Then, commit e305509e678b3a4a ("Bluetooth: use correct lock to prevent UAF
of hdev object") in 5.13-rc5 again changed bh_lock_sock_nested() to
lock_sock() as an attempt to fix CVE-2021-3573.

This difficulty comes from current implementation that
hci_sock_dev_event(HCI_DEV_UNREG) is responsible for dropping all
references from sockets because hci_unregister_dev() immediately reclaims
resources as soon as returning from hci_sock_dev_event(HCI_DEV_UNREG).
But the history suggests that hci_sock_dev_event(HCI_DEV_UNREG) was not
doing what it should do.

Therefore, instead of trying to detach sockets from device, let's accept
not detaching sockets from device at hci_sock_dev_event(HCI_DEV_UNREG),
by moving actual cleanup of resources from hci_unregister_dev() to
hci_release_dev() which is called by bt_host_release when all references
to this unregistered device (which is a kobject) are gone.

Link: https://syzkaller.appspot.com/bug?extid=a5df189917e79d5e59c9 [1]
Reported-by: syzbot <syzbot+a5df189917e79d5e59c9@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot <syzbot+a5df189917e79d5e59c9@syzkaller.appspotmail.com>
Fixes: e305509e678b3a4a ("Bluetooth: use correct lock to prevent UAF of hdev object")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  1 +
 net/bluetooth/hci_core.c         | 17 +++++++++--------
 net/bluetooth/hci_sock.c         | 20 +++++++++++++-------
 net/bluetooth/hci_sysfs.c        |  2 +-
 4 files changed, 24 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index a53e94459ecd..4abe3c494002 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1230,6 +1230,7 @@ struct hci_dev *hci_alloc_dev(void);
 void hci_free_dev(struct hci_dev *hdev);
 int hci_register_dev(struct hci_dev *hdev);
 void hci_unregister_dev(struct hci_dev *hdev);
+void hci_release_dev(struct hci_dev *hdev);
 int hci_suspend_dev(struct hci_dev *hdev);
 int hci_resume_dev(struct hci_dev *hdev);
 int hci_reset_dev(struct hci_dev *hdev);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 2560ed2f144d..2b78e1336c53 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3996,14 +3996,10 @@ EXPORT_SYMBOL(hci_register_dev);
 /* Unregister HCI device */
 void hci_unregister_dev(struct hci_dev *hdev)
 {
-	int id;
-
 	BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
 
 	hci_dev_set_flag(hdev, HCI_UNREGISTER);
 
-	id = hdev->id;
-
 	write_lock(&hci_dev_list_lock);
 	list_del(&hdev->list);
 	write_unlock(&hci_dev_list_lock);
@@ -4038,7 +4034,13 @@ void hci_unregister_dev(struct hci_dev *hdev)
 	}
 
 	device_del(&hdev->dev);
+	hci_dev_put(hdev);
+}
+EXPORT_SYMBOL(hci_unregister_dev);
 
+/* Release HCI device */
+void hci_release_dev(struct hci_dev *hdev)
+{
 	debugfs_remove_recursive(hdev->debugfs);
 	kfree_const(hdev->hw_info);
 	kfree_const(hdev->fw_info);
@@ -4063,11 +4065,10 @@ void hci_unregister_dev(struct hci_dev *hdev)
 	hci_blocked_keys_clear(hdev);
 	hci_dev_unlock(hdev);
 
-	hci_dev_put(hdev);
-
-	ida_simple_remove(&hci_index_ida, id);
+	ida_simple_remove(&hci_index_ida, hdev->id);
+	kfree(hdev);
 }
-EXPORT_SYMBOL(hci_unregister_dev);
+EXPORT_SYMBOL(hci_release_dev);
 
 /* Suspend HCI device */
 int hci_suspend_dev(struct hci_dev *hdev)
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index b04a5a02ecf3..d810a5adf064 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -759,19 +759,13 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event)
 	if (event == HCI_DEV_UNREG) {
 		struct sock *sk;
 
-		/* Detach sockets from device */
+		/* Wake up sockets using this dead device */
 		read_lock(&hci_sk_list.lock);
 		sk_for_each(sk, &hci_sk_list.head) {
-			lock_sock(sk);
 			if (hci_pi(sk)->hdev == hdev) {
-				hci_pi(sk)->hdev = NULL;
 				sk->sk_err = EPIPE;
-				sk->sk_state = BT_OPEN;
 				sk->sk_state_change(sk);
-
-				hci_dev_put(hdev);
 			}
-			release_sock(sk);
 		}
 		read_unlock(&hci_sk_list.lock);
 	}
@@ -1103,6 +1097,18 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
 
 	lock_sock(sk);
 
+	/* Allow detaching from dead device and attaching to alive device, if
+	 * the caller wants to re-bind (instead of close) this socket in
+	 * response to hci_sock_dev_event(HCI_DEV_UNREG) notification.
+	 */
+	hdev = hci_pi(sk)->hdev;
+	if (hdev && hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+		hci_pi(sk)->hdev = NULL;
+		sk->sk_state = BT_OPEN;
+		hci_dev_put(hdev);
+	}
+	hdev = NULL;
+
 	if (sk->sk_state == BT_BOUND) {
 		err = -EALREADY;
 		goto done;
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 9874844a95a9..ebf282d1eb2b 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -83,7 +83,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn)
 static void bt_host_release(struct device *dev)
 {
 	struct hci_dev *hdev = to_hci_dev(dev);
-	kfree(hdev);
+	hci_release_dev(hdev);
 	module_put(THIS_MODULE);
 }
 
-- 
cgit v1.2.3


From 5d8dbb7fb82b8661c16d496644b931c0e2e3a12e Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin@gmail.com>
Date: Wed, 28 Jul 2021 19:38:18 +0300
Subject: net: xfrm: fix shift-out-of-bounce

We need to check up->dirmask to avoid shift-out-of-bounce bug,
since up->dirmask comes from userspace.

Also, added XFRM_USERPOLICY_DIRMASK_MAX constant to uapi to inform
user-space that up->dirmask has maximum possible value

Fixes: 2d151d39073a ("xfrm: Add possibility to set the default to block if we have no policy")
Reported-and-tested-by: syzbot+9cd5837a045bbee5b810@syzkaller.appspotmail.com
Signed-off-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/uapi/linux/xfrm.h | 1 +
 net/xfrm/xfrm_user.c      | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 6e8095106192..b96c1ea7166d 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -514,6 +514,7 @@ struct xfrm_user_offload {
 #define XFRM_OFFLOAD_INBOUND	2
 
 struct xfrm_userpolicy_default {
+#define XFRM_USERPOLICY_DIRMASK_MAX	(sizeof(__u8) * 8)
 	__u8				dirmask;
 	__u8				action;
 };
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 4eafd1130c3e..127c99f71c99 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1966,9 +1966,14 @@ static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh,
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_userpolicy_default *up = nlmsg_data(nlh);
-	u8 dirmask = (1 << up->dirmask) & XFRM_POL_DEFAULT_MASK;
+	u8 dirmask;
 	u8 old_default = net->xfrm.policy_default;
 
+	if (up->dirmask >= XFRM_USERPOLICY_DIRMASK_MAX)
+		return -EINVAL;
+
+	dirmask = (1 << up->dirmask) & XFRM_POL_DEFAULT_MASK;
+
 	net->xfrm.policy_default = (old_default & (0xff ^ dirmask))
 				    | (up->action << up->dirmask);
 
-- 
cgit v1.2.3


From 5fc88f93edf2f797f1aa63334cc6c86f9c15d585 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 28 Jul 2021 18:23:59 +0200
Subject: sk_buff: introduce 'slow_gro' flags

The new flag tracks if any state field is set, so that
GRO requires 'unusual'/slow prepare steps.

Set such flag when a ct entry is attached to the skb,
and never clear it.

The new bit uses an existing hole into the sk_buff struct

RFC -> v1:
 - use a single state bit, never clear it
 - avoid moving the _nfct field

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f19190820e63..3ff18300d210 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -689,6 +689,7 @@ typedef unsigned char *sk_buff_data_t;
  *		CHECKSUM_UNNECESSARY (max 3)
  *	@dst_pending_confirm: need to confirm neighbour
  *	@decrypted: Decrypted SKB
+ *	@slow_gro: state present at GRO time, slower prepare step required
  *	@napi_id: id of the NAPI struct this skb came from
  *	@sender_cpu: (aka @napi_id) source CPU in XPS
  *	@secmark: security marking
@@ -870,6 +871,7 @@ struct sk_buff {
 #ifdef CONFIG_TLS_DEVICE
 	__u8			decrypted:1;
 #endif
+	__u8			slow_gro:1;
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
@@ -4216,6 +4218,7 @@ static inline unsigned long skb_get_nfct(const struct sk_buff *skb)
 static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct)
 {
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	skb->slow_gro |= !!nfct;
 	skb->_nfct = nfct;
 #endif
 }
@@ -4375,6 +4378,7 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	nf_conntrack_put(skb_nfct(dst));
 #endif
+	dst->slow_gro = src->slow_gro;
 	__nf_copy(dst, src, true);
 }
 
-- 
cgit v1.2.3


From 8a886b142bd03d36612747e9aefdf0282c8b02dd Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 28 Jul 2021 18:24:00 +0200
Subject: sk_buff: track dst status in slow_gro

Similar to the previous patch, but covering the dst field:
the slow_gro flag is additionally set when a dst is attached
to the skb

RFC -> v1:
 - use the existing flag instead of adding a new one

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 ++
 include/net/dst.h      | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3ff18300d210..b1e5bbfcc926 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -992,6 +992,7 @@ static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
  */
 static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
 {
+	skb->slow_gro |= !!dst;
 	skb->_skb_refdst = (unsigned long)dst;
 }
 
@@ -1008,6 +1009,7 @@ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
 static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
 {
 	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+	skb->slow_gro = !!dst;
 	skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
 }
 
diff --git a/include/net/dst.h b/include/net/dst.h
index 75b1e734e9c2..a057319aabef 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -277,6 +277,7 @@ static inline void skb_dst_drop(struct sk_buff *skb)
 
 static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst)
 {
+	nskb->slow_gro |= !!refdst;
 	nskb->_skb_refdst = refdst;
 	if (!(nskb->_skb_refdst & SKB_DST_NOREF))
 		dst_clone(skb_dst(nskb));
@@ -316,6 +317,7 @@ static inline bool skb_dst_force(struct sk_buff *skb)
 			dst = NULL;
 
 		skb->_skb_refdst = (unsigned long)dst;
+		skb->slow_gro |= !!dst;
 	}
 
 	return skb->_skb_refdst != 0UL;
-- 
cgit v1.2.3


From 5e10da5385d20c4bae587bc2921e5fdd9655d5fc Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 28 Jul 2021 18:24:03 +0200
Subject: skbuff: allow 'slow_gro' for skb carring sock reference

This change leverages the infrastructure introduced by the previous
patches to allow soft devices passing to the GRO engine owned skbs
without impacting the fast-path.

It's up to the GRO caller ensuring the slow_gro bit validity before
invoking the GRO engine. The new helper skb_prepare_for_gro() is
introduced for that goal.

On slow_gro, skbs are aggregated only with equal sk.
Additionally, skb truesize on GRO recycle and free is correctly
updated so that sk wmem is not changed by the GRO processing.

rfc-> v1:
 - fixed bad truesize on dev_gro_receive NAPI_FREE
 - use the existing state bit

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h |  9 +++++++++
 net/core/dev.c     |  2 ++
 net/core/skbuff.c  | 17 +++++++++++++----
 3 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index f23cb259b0e2..ff1be7e7e90b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2249,6 +2249,15 @@ static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struc
 	return false;
 }
 
+static inline void skb_prepare_for_gro(struct sk_buff *skb)
+{
+	if (skb->destructor != sock_wfree) {
+		skb_orphan(skb);
+		return;
+	}
+	skb->slow_gro = 1;
+}
+
 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
 		    unsigned long expires);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 19565f7497ee..dcc87fcd64ba 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6034,6 +6034,7 @@ static void gro_list_prepare(const struct list_head *head,
 			struct tc_skb_ext *p_ext;
 #endif
 
+			diffs |= p->sk != skb->sk;
 			diffs |= skb_metadata_dst_cmp(p, skb);
 			diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
 
@@ -6311,6 +6312,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 	skb_shinfo(skb)->gso_type = 0;
 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 	if (unlikely(skb->slow_gro)) {
+		skb_orphan(skb);
 		skb_ext_reset(skb);
 		nf_reset_ct(skb);
 		skb->slow_gro = 0;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d04e286149cc..fcbd977186b0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -958,6 +958,7 @@ void napi_skb_free_stolen_head(struct sk_buff *skb)
 		nf_reset_ct(skb);
 		skb_dst_drop(skb);
 		skb_ext_put(skb);
+		skb_orphan(skb);
 		skb->slow_gro = 0;
 	}
 	napi_skb_cache_put(skb);
@@ -3892,6 +3893,9 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
 	NAPI_GRO_CB(p)->last = skb;
 	NAPI_GRO_CB(p)->count++;
 	p->data_len += skb->len;
+
+	/* sk owenrship - if any - completely transferred to the aggregated packet */
+	skb->destructor = NULL;
 	p->truesize += skb->truesize;
 	p->len += skb->len;
 
@@ -4259,6 +4263,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 	unsigned int headlen = skb_headlen(skb);
 	unsigned int len = skb_gro_len(skb);
 	unsigned int delta_truesize;
+	unsigned int new_truesize;
 	struct sk_buff *lp;
 
 	if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
@@ -4290,10 +4295,10 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 		skb_frag_size_sub(frag, offset);
 
 		/* all fragments truesize : remove (head size + sk_buff) */
-		delta_truesize = skb->truesize -
-				 SKB_TRUESIZE(skb_end_offset(skb));
+		new_truesize = SKB_TRUESIZE(skb_end_offset(skb));
+		delta_truesize = skb->truesize - new_truesize;
 
-		skb->truesize -= skb->data_len;
+		skb->truesize = new_truesize;
 		skb->len -= skb->data_len;
 		skb->data_len = 0;
 
@@ -4322,12 +4327,16 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
 		/* We dont need to clear skbinfo->nr_frags here */
 
-		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+		new_truesize = SKB_TRUESIZE(sizeof(struct sk_buff));
+		delta_truesize = skb->truesize - new_truesize;
+		skb->truesize = new_truesize;
 		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
 		goto done;
 	}
 
 merge:
+	/* sk owenrship - if any - completely transferred to the aggregated packet */
+	skb->destructor = NULL;
 	delta_truesize = skb->truesize;
 	if (offset > headlen) {
 		unsigned int eat = offset - headlen;
-- 
cgit v1.2.3


From dd8987a394c0730380167e0b0aebd766cf3511e1 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Thu, 29 Jul 2021 12:40:11 +0200
Subject: nfc: constify passed nfc_dev

The struct nfc_dev is not modified by nfc_get_drvdata() and
nfc_device_name() so it can be made a const.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nfc/nfc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 85b698794b14..901779138e2b 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -245,7 +245,7 @@ static inline void nfc_set_drvdata(struct nfc_dev *dev, void *data)
  *
  * @dev: The nfc device
  */
-static inline void *nfc_get_drvdata(struct nfc_dev *dev)
+static inline void *nfc_get_drvdata(const struct nfc_dev *dev)
 {
 	return dev_get_drvdata(&dev->dev);
 }
@@ -255,7 +255,7 @@ static inline void *nfc_get_drvdata(struct nfc_dev *dev)
  *
  * @dev: The nfc device whose name to return
  */
-static inline const char *nfc_device_name(struct nfc_dev *dev)
+static inline const char *nfc_device_name(const struct nfc_dev *dev)
 {
 	return dev_name(&dev->dev);
 }
-- 
cgit v1.2.3


From bc49d8169aa72295104f1558830c568efb946315 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Thu, 29 Jul 2021 10:20:39 +0800
Subject: mctp: Add MCTP base

Add basic Kconfig, an initial (empty) af_mctp source object, and
{AF,PF}_MCTP definitions, and the required definitions for a new
protocol type.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                         |  7 +++++++
 include/linux/socket.h              |  6 +++++-
 include/uapi/linux/mctp.h           | 15 +++++++++++++++
 net/Kconfig                         |  1 +
 net/Makefile                        |  1 +
 net/core/sock.c                     |  1 +
 net/mctp/Kconfig                    | 13 +++++++++++++
 net/mctp/Makefile                   |  3 +++
 net/mctp/af_mctp.c                  | 13 +++++++++++++
 net/socket.c                        |  1 +
 security/selinux/hooks.c            |  4 +++-
 security/selinux/include/classmap.h |  4 +++-
 12 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 include/uapi/linux/mctp.h
 create mode 100644 net/mctp/Kconfig
 create mode 100644 net/mctp/Makefile
 create mode 100644 net/mctp/af_mctp.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 4c32a9c532b7..22a1ff9afd9d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11032,6 +11032,13 @@ F:	drivers/mailbox/arm_mhuv2.c
 F:	include/linux/mailbox/arm_mhuv2_message.h
 F:	Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml
 
+MANAGEMENT COMPONENT TRANSPORT PROTOCOL (MCTP)
+M:	Jeremy Kerr <jk@codeconstruct.com.au>
+M:	Matt Johnston <matt@codeconstruct.com.au>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	net/mctp/
+
 MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7
 M:	Michael Kerrisk <mtk.manpages@gmail.com>
 L:	linux-man@vger.kernel.org
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 0d8e3dcb7f88..fd9ce51582d8 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -223,8 +223,11 @@ struct ucred {
 				 * reuses AF_INET address family
 				 */
 #define AF_XDP		44	/* XDP sockets			*/
+#define AF_MCTP		45	/* Management component
+				 * transport protocol
+				 */
 
-#define AF_MAX		45	/* For now.. */
+#define AF_MAX		46	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -274,6 +277,7 @@ struct ucred {
 #define PF_QIPCRTR	AF_QIPCRTR
 #define PF_SMC		AF_SMC
 #define PF_XDP		AF_XDP
+#define PF_MCTP		AF_MCTP
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h
new file mode 100644
index 000000000000..2640a589c14c
--- /dev/null
+++ b/include/uapi/linux/mctp.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Management Component Transport Protocol (MCTP)
+ *
+ * Copyright (c) 2021 Code Construct
+ * Copyright (c) 2021 Google
+ */
+
+#ifndef __UAPI_MCTP_H
+#define __UAPI_MCTP_H
+
+struct sockaddr_mctp {
+};
+
+#endif /* __UAPI_MCTP_H */
diff --git a/net/Kconfig b/net/Kconfig
index c7392c449b25..fb13460c6dab 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -363,6 +363,7 @@ source "net/bluetooth/Kconfig"
 source "net/rxrpc/Kconfig"
 source "net/kcm/Kconfig"
 source "net/strparser/Kconfig"
+source "net/mctp/Kconfig"
 
 config FIB_RULES
 	bool
diff --git a/net/Makefile b/net/Makefile
index 9ca9572188fe..fbfeb8a0bb37 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_QRTR)		+= qrtr/
 obj-$(CONFIG_NET_NCSI)		+= ncsi/
 obj-$(CONFIG_XDP_SOCKETS)	+= xdp/
 obj-$(CONFIG_MPTCP)		+= mptcp/
+obj-$(CONFIG_MCTP)		+= mctp/
diff --git a/net/core/sock.c b/net/core/sock.c
index a3eea6e0b30a..9671c32e6ef5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -226,6 +226,7 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
+  x "AF_MCTP"  , \
   x "AF_MAX"
 
 static const char *const af_family_key_strings[AF_MAX+1] = {
diff --git a/net/mctp/Kconfig b/net/mctp/Kconfig
new file mode 100644
index 000000000000..2cdf3d0a28c9
--- /dev/null
+++ b/net/mctp/Kconfig
@@ -0,0 +1,13 @@
+
+menuconfig MCTP
+	depends on NET
+	tristate "MCTP core protocol support"
+	help
+	  Management Component Transport Protocol (MCTP) is an in-system
+	  protocol for communicating between management controllers and
+	  their managed devices (peripherals, host processors, etc.). The
+	  protocol is defined by DMTF specification DSP0236.
+
+	  This option enables core MCTP support. For communicating with other
+	  devices, you'll want to enable a driver for a specific hardware
+	  channel.
diff --git a/net/mctp/Makefile b/net/mctp/Makefile
new file mode 100644
index 000000000000..7c056b1b7939
--- /dev/null
+++ b/net/mctp/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_MCTP) += mctp.o
+mctp-objs := af_mctp.o
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
new file mode 100644
index 000000000000..8f9c77e97357
--- /dev/null
+++ b/net/mctp/af_mctp.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Management Component Transport Protocol (MCTP)
+ *
+ * Copyright (c) 2021 Code Construct
+ * Copyright (c) 2021 Google
+ */
+
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("MCTP core");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jeremy Kerr <jk@codeconstruct.com.au>");
diff --git a/net/socket.c b/net/socket.c
index 42665bd99ea4..3c10504e46d9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -212,6 +212,7 @@ static const char * const pf_family_names[] = {
 	[PF_QIPCRTR]	= "PF_QIPCRTR",
 	[PF_SMC]	= "PF_SMC",
 	[PF_XDP]	= "PF_XDP",
+	[PF_MCTP]	= "PF_MCTP",
 };
 
 /*
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index b0032c42333e..2143f590e3d6 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1330,7 +1330,9 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc
 			return SECCLASS_SMC_SOCKET;
 		case PF_XDP:
 			return SECCLASS_XDP_SOCKET;
-#if PF_MAX > 45
+		case PF_MCTP:
+			return SECCLASS_MCTP_SOCKET;
+#if PF_MAX > 46
 #error New address family defined, please update this function.
 #endif
 		}
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 62d19bccf3de..084757ff4390 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -246,6 +246,8 @@ struct security_class_mapping secclass_map[] = {
 	    NULL } },
 	{ "xdp_socket",
 	  { COMMON_SOCK_PERMS, NULL } },
+	{ "mctp_socket",
+	  { COMMON_SOCK_PERMS, NULL } },
 	{ "perf_event",
 	  { "open", "cpu", "kernel", "tracepoint", "read", "write", NULL } },
 	{ "lockdown",
@@ -255,6 +257,6 @@ struct security_class_mapping secclass_map[] = {
 	{ NULL }
   };
 
-#if PF_MAX > 45
+#if PF_MAX > 46
 #error New address family defined, please update secclass_map.
 #endif
-- 
cgit v1.2.3


From 2c8e2e9aec7927b387540a88351b8405ee82b34a Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Thu, 29 Jul 2021 10:20:41 +0800
Subject: mctp: Add base packet definitions

Simple packet header format as defined by DMTF DSP0236.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS        |  1 +
 include/net/mctp.h | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 include/net/mctp.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 22a1ff9afd9d..770b986d10f0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11037,6 +11037,7 @@ M:	Jeremy Kerr <jk@codeconstruct.com.au>
 M:	Matt Johnston <matt@codeconstruct.com.au>
 L:	netdev@vger.kernel.org
 S:	Maintained
+F:	include/net/mctp.h
 F:	net/mctp/
 
 MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7
diff --git a/include/net/mctp.h b/include/net/mctp.h
new file mode 100644
index 000000000000..4c01e083be45
--- /dev/null
+++ b/include/net/mctp.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Management Component Transport Protocol (MCTP)
+ *
+ * Copyright (c) 2021 Code Construct
+ * Copyright (c) 2021 Google
+ */
+
+#ifndef __NET_MCTP_H
+#define __NET_MCTP_H
+
+#include <linux/bits.h>
+
+/* MCTP packet definitions */
+struct mctp_hdr {
+	u8	ver;
+	u8	dest;
+	u8	src;
+	u8	flags_seq_tag;
+};
+
+#define MCTP_VER_MIN	1
+#define MCTP_VER_MAX	1
+
+/* Definitions for flags_seq_tag field */
+#define MCTP_HDR_FLAG_SOM	BIT(7)
+#define MCTP_HDR_FLAG_EOM	BIT(6)
+#define MCTP_HDR_FLAG_TO	BIT(3)
+#define MCTP_HDR_FLAGS		GENMASK(5, 3)
+#define MCTP_HDR_SEQ_SHIFT	4
+#define MCTP_HDR_SEQ_MASK	GENMASK(1, 0)
+#define MCTP_HDR_TAG_SHIFT	0
+#define MCTP_HDR_TAG_MASK	GENMASK(2, 0)
+
+#endif /* __NET_MCTP_H */
-- 
cgit v1.2.3


From 60fc63981693f807baa0e404104dedea0e8b4e61 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Thu, 29 Jul 2021 10:20:42 +0800
Subject: mctp: Add sockaddr_mctp to uapi

This change introduces the user-visible MCTP header, containing the
protocol-specific addressing definitions.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/mctp.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h
index 2640a589c14c..52b54d13f385 100644
--- a/include/uapi/linux/mctp.h
+++ b/include/uapi/linux/mctp.h
@@ -9,7 +9,28 @@
 #ifndef __UAPI_MCTP_H
 #define __UAPI_MCTP_H
 
+#include <linux/types.h>
+
+typedef __u8			mctp_eid_t;
+
+struct mctp_addr {
+	mctp_eid_t		s_addr;
+};
+
 struct sockaddr_mctp {
+	unsigned short int	smctp_family;
+	int			smctp_network;
+	struct mctp_addr	smctp_addr;
+	__u8			smctp_type;
+	__u8			smctp_tag;
 };
 
+#define MCTP_NET_ANY		0x0
+
+#define MCTP_ADDR_NULL		0x00
+#define MCTP_ADDR_ANY		0xff
+
+#define MCTP_TAG_MASK		0x07
+#define MCTP_TAG_OWNER		0x08
+
 #endif /* __UAPI_MCTP_H */
-- 
cgit v1.2.3


From 4b2e69305cbbc7c32ecbd946110b505c4ff6071a Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Thu, 29 Jul 2021 10:20:43 +0800
Subject: mctp: Add initial driver infrastructure

Add an empty drivers/net/mctp/, for future interface drivers.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                 | 1 +
 drivers/net/Kconfig         | 2 ++
 drivers/net/Makefile        | 1 +
 drivers/net/mctp/Kconfig    | 8 ++++++++
 drivers/net/mctp/Makefile   | 0
 include/uapi/linux/if_arp.h | 1 +
 6 files changed, 13 insertions(+)
 create mode 100644 drivers/net/mctp/Kconfig
 create mode 100644 drivers/net/mctp/Makefile

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 770b986d10f0..e95eb3b00cd2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11037,6 +11037,7 @@ M:	Jeremy Kerr <jk@codeconstruct.com.au>
 M:	Matt Johnston <matt@codeconstruct.com.au>
 L:	netdev@vger.kernel.org
 S:	Maintained
+F:	drivers/net/mctp/
 F:	include/net/mctp.h
 F:	net/mctp/
 
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 6977f8248df7..56213a8a1ec5 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -483,6 +483,8 @@ config NET_SB1000
 
 source "drivers/net/phy/Kconfig"
 
+source "drivers/net/mctp/Kconfig"
+
 source "drivers/net/mdio/Kconfig"
 
 source "drivers/net/pcs/Kconfig"
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 7ffd2d03efaf..a48a664605a3 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_WAN) += wan/
 obj-$(CONFIG_WLAN) += wireless/
 obj-$(CONFIG_IEEE802154) += ieee802154/
 obj-$(CONFIG_WWAN) += wwan/
+obj-$(CONFIG_MCTP) += mctp/
 
 obj-$(CONFIG_VMXNET3) += vmxnet3/
 obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
diff --git a/drivers/net/mctp/Kconfig b/drivers/net/mctp/Kconfig
new file mode 100644
index 000000000000..d8f966cedc89
--- /dev/null
+++ b/drivers/net/mctp/Kconfig
@@ -0,0 +1,8 @@
+
+if MCTP
+
+menu "MCTP Device Drivers"
+
+endmenu
+
+endif
diff --git a/drivers/net/mctp/Makefile b/drivers/net/mctp/Makefile
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h
index c3cc5a9e5eaf..4783af9fe520 100644
--- a/include/uapi/linux/if_arp.h
+++ b/include/uapi/linux/if_arp.h
@@ -54,6 +54,7 @@
 #define ARPHRD_X25	271		/* CCITT X.25			*/
 #define ARPHRD_HWX25	272		/* Boards with X.25 in firmware	*/
 #define ARPHRD_CAN	280		/* Controller Area Network      */
+#define ARPHRD_MCTP	290
 #define ARPHRD_PPP	512
 #define ARPHRD_CISCO	513		/* Cisco HDLC	 		*/
 #define ARPHRD_HDLC	ARPHRD_CISCO
-- 
cgit v1.2.3


From 583be982d93479ea3d85091b0fd0b01201ede87d Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Thu, 29 Jul 2021 10:20:44 +0800
Subject: mctp: Add device handling and netlink interface

This change adds the infrastructure for managing MCTP netdevices; we add
a pointer to the AF_MCTP-specific data to struct netdevice, and hook up
the rtnetlink operations for adding and removing addresses.

Includes changes from Matt Johnston <matt@codeconstruct.com.au>.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                   |   1 +
 include/linux/netdevice.h     |   4 +
 include/net/mctp.h            |  14 ++
 include/net/mctpdevice.h      |  35 ++++
 include/uapi/linux/if_ether.h |   3 +
 include/uapi/linux/if_link.h  |  10 +
 include/uapi/linux/mctp.h     |   1 +
 net/mctp/Makefile             |   2 +-
 net/mctp/af_mctp.c            |   8 +
 net/mctp/device.c             | 414 ++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 491 insertions(+), 1 deletion(-)
 create mode 100644 include/net/mctpdevice.h
 create mode 100644 net/mctp/device.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index e95eb3b00cd2..c2943c227ee6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11039,6 +11039,7 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/mctp/
 F:	include/net/mctp.h
+F:	include/net/mctpdevice.h
 F:	net/mctp/
 
 MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 226bbee06730..d63a94ecbf3b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1823,6 +1823,7 @@ enum netdev_ml_priv_type {
  *	@ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network
  *			 device struct
  *	@mpls_ptr:	mpls_dev struct pointer
+ *	@mctp_ptr:	MCTP specific data
  *
  *	@dev_addr:	Hw address (before bcast,
  *			because most packets are unicast)
@@ -2110,6 +2111,9 @@ struct net_device {
 #if IS_ENABLED(CONFIG_MPLS_ROUTING)
 	struct mpls_dev __rcu	*mpls_ptr;
 #endif
+#if IS_ENABLED(CONFIG_MCTP)
+	struct mctp_dev __rcu	*mctp_ptr;
+#endif
 
 /*
  * Cache lines mostly used on receive path (including eth_type_trans())
diff --git a/include/net/mctp.h b/include/net/mctp.h
index 4c01e083be45..61452e03aa85 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -10,6 +10,7 @@
 #define __NET_MCTP_H
 
 #include <linux/bits.h>
+#include <linux/mctp.h>
 
 /* MCTP packet definitions */
 struct mctp_hdr {
@@ -32,4 +33,17 @@ struct mctp_hdr {
 #define MCTP_HDR_TAG_SHIFT	0
 #define MCTP_HDR_TAG_MASK	GENMASK(2, 0)
 
+static inline bool mctp_address_ok(mctp_eid_t eid)
+{
+	return eid >= 8 && eid < 255;
+}
+
+static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb)
+{
+	return (struct mctp_hdr *)skb_network_header(skb);
+}
+
+void mctp_device_init(void);
+void mctp_device_exit(void);
+
 #endif /* __NET_MCTP_H */
diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h
new file mode 100644
index 000000000000..71a11012fac7
--- /dev/null
+++ b/include/net/mctpdevice.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Management Component Transport Protocol (MCTP) - device
+ * definitions.
+ *
+ * Copyright (c) 2021 Code Construct
+ * Copyright (c) 2021 Google
+ */
+
+#ifndef __NET_MCTPDEVICE_H
+#define __NET_MCTPDEVICE_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/refcount.h>
+
+struct mctp_dev {
+	struct net_device	*dev;
+
+	unsigned int		net;
+
+	/* Only modified under RTNL. Reads have addrs_lock held */
+	u8			*addrs;
+	size_t			num_addrs;
+	spinlock_t		addrs_lock;
+
+	struct rcu_head		rcu;
+};
+
+#define MCTP_INITIAL_DEFAULT_NET	1
+
+struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev);
+struct mctp_dev *__mctp_dev_get(const struct net_device *dev);
+
+#endif /* __NET_MCTPDEVICE_H */
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index a0b637911d3c..5f589c7a8382 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -151,6 +151,9 @@
 #define ETH_P_MAP	0x00F9		/* Qualcomm multiplexing and
 					 * aggregation protocol
 					 */
+#define ETH_P_MCTP	0x00FA		/* Management component transport
+					 * protocol packets
+					 */
 
 /*
  *	This is an Ethernet frame header.
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 4882e81514b6..49b22afab78f 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1260,4 +1260,14 @@ struct ifla_rmnet_flags {
 	__u32	mask;
 };
 
+/* MCTP section */
+
+enum {
+	IFLA_MCTP_UNSPEC,
+	IFLA_MCTP_NET,
+	__IFLA_MCTP_MAX,
+};
+
+#define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1)
+
 #endif /* _UAPI_LINUX_IF_LINK_H */
diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h
index 52b54d13f385..a9d8edb3402b 100644
--- a/include/uapi/linux/mctp.h
+++ b/include/uapi/linux/mctp.h
@@ -26,6 +26,7 @@ struct sockaddr_mctp {
 };
 
 #define MCTP_NET_ANY		0x0
+#define MCTP_NET_DEFAULT	0x0
 
 #define MCTP_ADDR_NULL		0x00
 #define MCTP_ADDR_ANY		0xff
diff --git a/net/mctp/Makefile b/net/mctp/Makefile
index 7c056b1b7939..2ea98c27b262 100644
--- a/net/mctp/Makefile
+++ b/net/mctp/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_MCTP) += mctp.o
-mctp-objs := af_mctp.o
+mctp-objs := af_mctp.o device.o
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index b3aeca6486e3..401b4fa141a5 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -6,13 +6,18 @@
  * Copyright (c) 2021 Google
  */
 
+#include <linux/if_arp.h>
 #include <linux/net.h>
 #include <linux/mctp.h>
 #include <linux/module.h>
 #include <linux/socket.h>
 
+#include <net/mctp.h>
+#include <net/mctpdevice.h>
 #include <net/sock.h>
 
+/* socket implementation */
+
 struct mctp_sock {
 	struct sock	sk;
 };
@@ -152,6 +157,8 @@ static __init int mctp_init(void)
 	if (rc)
 		goto err_unreg_sock;
 
+	mctp_device_init();
+
 	return 0;
 
 err_unreg_sock:
@@ -162,6 +169,7 @@ err_unreg_sock:
 
 static __exit void mctp_exit(void)
 {
+	mctp_device_exit();
 	proto_unregister(&mctp_proto);
 	sock_unregister(PF_MCTP);
 }
diff --git a/net/mctp/device.c b/net/mctp/device.c
new file mode 100644
index 000000000000..877abe5312cd
--- /dev/null
+++ b/net/mctp/device.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Management Component Transport Protocol (MCTP) - device implementation.
+ *
+ * Copyright (c) 2021 Code Construct
+ * Copyright (c) 2021 Google
+ */
+
+#include <linux/if_link.h>
+#include <linux/mctp.h>
+#include <linux/netdevice.h>
+#include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
+
+#include <net/addrconf.h>
+#include <net/netlink.h>
+#include <net/mctp.h>
+#include <net/mctpdevice.h>
+#include <net/sock.h>
+
+struct mctp_dump_cb {
+	int h;
+	int idx;
+	size_t a_idx;
+};
+
+/* unlocked: caller must hold rcu_read_lock */
+struct mctp_dev *__mctp_dev_get(const struct net_device *dev)
+{
+	return rcu_dereference(dev->mctp_ptr);
+}
+
+struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev)
+{
+	return rtnl_dereference(dev->mctp_ptr);
+}
+
+static void mctp_dev_destroy(struct mctp_dev *mdev)
+{
+	struct net_device *dev = mdev->dev;
+
+	dev_put(dev);
+	kfree_rcu(mdev, rcu);
+}
+
+static int mctp_fill_addrinfo(struct sk_buff *skb, struct netlink_callback *cb,
+			      struct mctp_dev *mdev, mctp_eid_t eid)
+{
+	struct ifaddrmsg *hdr;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			RTM_NEWADDR, sizeof(*hdr), NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	hdr = nlmsg_data(nlh);
+	hdr->ifa_family = AF_MCTP;
+	hdr->ifa_prefixlen = 0;
+	hdr->ifa_flags = 0;
+	hdr->ifa_scope = 0;
+	hdr->ifa_index = mdev->dev->ifindex;
+
+	if (nla_put_u8(skb, IFA_LOCAL, eid))
+		goto cancel;
+
+	if (nla_put_u8(skb, IFA_ADDRESS, eid))
+		goto cancel;
+
+	nlmsg_end(skb, nlh);
+
+	return 0;
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int mctp_dump_dev_addrinfo(struct mctp_dev *mdev, struct sk_buff *skb,
+				  struct netlink_callback *cb)
+{
+	struct mctp_dump_cb *mcb = (void *)cb->ctx;
+	int rc = 0;
+
+	for (; mcb->a_idx < mdev->num_addrs; mcb->a_idx++) {
+		rc = mctp_fill_addrinfo(skb, cb, mdev, mdev->addrs[mcb->a_idx]);
+		if (rc < 0)
+			break;
+	}
+
+	return rc;
+}
+
+static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct mctp_dump_cb *mcb = (void *)cb->ctx;
+	struct net *net = sock_net(skb->sk);
+	struct hlist_head *head;
+	struct net_device *dev;
+	struct ifaddrmsg *hdr;
+	struct mctp_dev *mdev;
+	int ifindex;
+	int idx, rc;
+
+	hdr = nlmsg_data(cb->nlh);
+	// filter by ifindex if requested
+	ifindex = hdr->ifa_index;
+
+	rcu_read_lock();
+	for (; mcb->h < NETDEV_HASHENTRIES; mcb->h++, mcb->idx = 0) {
+		idx = 0;
+		head = &net->dev_index_head[mcb->h];
+		hlist_for_each_entry_rcu(dev, head, index_hlist) {
+			if (idx >= mcb->idx &&
+			    (ifindex == 0 || ifindex == dev->ifindex)) {
+				mdev = __mctp_dev_get(dev);
+				if (mdev) {
+					rc = mctp_dump_dev_addrinfo(mdev,
+								    skb, cb);
+					// Error indicates full buffer, this
+					// callback will get retried.
+					if (rc < 0)
+						goto out;
+				}
+			}
+			idx++;
+			// reset for next iteration
+			mcb->a_idx = 0;
+		}
+	}
+out:
+	rcu_read_unlock();
+	mcb->idx = idx;
+
+	return skb->len;
+}
+
+static const struct nla_policy ifa_mctp_policy[IFA_MAX + 1] = {
+	[IFA_ADDRESS]		= { .type = NLA_U8 },
+	[IFA_LOCAL]		= { .type = NLA_U8 },
+};
+
+static int mctp_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
+			    struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFA_MAX + 1];
+	struct net_device *dev;
+	struct mctp_addr *addr;
+	struct mctp_dev *mdev;
+	struct ifaddrmsg *ifm;
+	unsigned long flags;
+	u8 *tmp_addrs;
+	int rc;
+
+	rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy,
+			 extack);
+	if (rc < 0)
+		return rc;
+
+	ifm = nlmsg_data(nlh);
+
+	if (tb[IFA_LOCAL])
+		addr = nla_data(tb[IFA_LOCAL]);
+	else if (tb[IFA_ADDRESS])
+		addr = nla_data(tb[IFA_ADDRESS]);
+	else
+		return -EINVAL;
+
+	/* find device */
+	dev = __dev_get_by_index(net, ifm->ifa_index);
+	if (!dev)
+		return -ENODEV;
+
+	mdev = mctp_dev_get_rtnl(dev);
+	if (!mdev)
+		return -ENODEV;
+
+	if (!mctp_address_ok(addr->s_addr))
+		return -EINVAL;
+
+	/* Prevent duplicates. Under RTNL so don't need to lock for reading */
+	if (memchr(mdev->addrs, addr->s_addr, mdev->num_addrs))
+		return -EEXIST;
+
+	tmp_addrs = kmalloc(mdev->num_addrs + 1, GFP_KERNEL);
+	if (!tmp_addrs)
+		return -ENOMEM;
+	memcpy(tmp_addrs, mdev->addrs, mdev->num_addrs);
+	tmp_addrs[mdev->num_addrs] = addr->s_addr;
+
+	/* Lock to write */
+	spin_lock_irqsave(&mdev->addrs_lock, flags);
+	mdev->num_addrs++;
+	swap(mdev->addrs, tmp_addrs);
+	spin_unlock_irqrestore(&mdev->addrs_lock, flags);
+
+	kfree(tmp_addrs);
+
+	return 0;
+}
+
+static int mctp_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
+			    struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFA_MAX + 1];
+	struct net_device *dev;
+	struct mctp_addr *addr;
+	struct mctp_dev *mdev;
+	struct ifaddrmsg *ifm;
+	unsigned long flags;
+	u8 *pos;
+	int rc;
+
+	rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy,
+			 extack);
+	if (rc < 0)
+		return rc;
+
+	ifm = nlmsg_data(nlh);
+
+	if (tb[IFA_LOCAL])
+		addr = nla_data(tb[IFA_LOCAL]);
+	else if (tb[IFA_ADDRESS])
+		addr = nla_data(tb[IFA_ADDRESS]);
+	else
+		return -EINVAL;
+
+	/* find device */
+	dev = __dev_get_by_index(net, ifm->ifa_index);
+	if (!dev)
+		return -ENODEV;
+
+	mdev = mctp_dev_get_rtnl(dev);
+	if (!mdev)
+		return -ENODEV;
+
+	pos = memchr(mdev->addrs, addr->s_addr, mdev->num_addrs);
+	if (!pos)
+		return -ENOENT;
+
+	spin_lock_irqsave(&mdev->addrs_lock, flags);
+	memmove(pos, pos + 1, mdev->num_addrs - 1 - (pos - mdev->addrs));
+	mdev->num_addrs--;
+	spin_unlock_irqrestore(&mdev->addrs_lock, flags);
+
+	return 0;
+}
+
+static struct mctp_dev *mctp_add_dev(struct net_device *dev)
+{
+	struct mctp_dev *mdev;
+
+	ASSERT_RTNL();
+
+	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+	if (!mdev)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&mdev->addrs_lock);
+
+	mdev->net = MCTP_INITIAL_DEFAULT_NET;
+
+	/* associate to net_device */
+	rcu_assign_pointer(dev->mctp_ptr, mdev);
+	dev_hold(dev);
+	mdev->dev = dev;
+
+	return mdev;
+}
+
+static int mctp_fill_link_af(struct sk_buff *skb,
+			     const struct net_device *dev, u32 ext_filter_mask)
+{
+	struct mctp_dev *mdev;
+
+	mdev = mctp_dev_get_rtnl(dev);
+	if (!mdev)
+		return -ENODATA;
+	if (nla_put_u32(skb, IFLA_MCTP_NET, mdev->net))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static size_t mctp_get_link_af_size(const struct net_device *dev,
+				    u32 ext_filter_mask)
+{
+	struct mctp_dev *mdev;
+	unsigned int ret;
+
+	/* caller holds RCU */
+	mdev = __mctp_dev_get(dev);
+	if (!mdev)
+		return 0;
+	ret = nla_total_size(4); /* IFLA_MCTP_NET */
+	return ret;
+}
+
+static const struct nla_policy ifla_af_mctp_policy[IFLA_MCTP_MAX + 1] = {
+	[IFLA_MCTP_NET]		= { .type = NLA_U32 },
+};
+
+static int mctp_set_link_af(struct net_device *dev, const struct nlattr *attr,
+			    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[IFLA_MCTP_MAX + 1];
+	struct mctp_dev *mdev;
+	int rc;
+
+	rc = nla_parse_nested(tb, IFLA_MCTP_MAX, attr, ifla_af_mctp_policy,
+			      NULL);
+	if (rc)
+		return rc;
+
+	mdev = mctp_dev_get_rtnl(dev);
+	if (!mdev)
+		return 0;
+
+	if (tb[IFLA_MCTP_NET])
+		WRITE_ONCE(mdev->net, nla_get_u32(tb[IFLA_MCTP_NET]));
+
+	return 0;
+}
+
+static void mctp_unregister(struct net_device *dev)
+{
+	struct mctp_dev *mdev;
+
+	mdev = mctp_dev_get_rtnl(dev);
+
+	if (!mdev)
+		return;
+
+	RCU_INIT_POINTER(mdev->dev->mctp_ptr, NULL);
+
+	kfree(mdev->addrs);
+
+	mctp_dev_destroy(mdev);
+}
+
+static int mctp_register(struct net_device *dev)
+{
+	struct mctp_dev *mdev;
+
+	/* Already registered? */
+	if (rtnl_dereference(dev->mctp_ptr))
+		return 0;
+
+	/* only register specific types; MCTP-specific and loopback for now */
+	if (dev->type != ARPHRD_MCTP && dev->type != ARPHRD_LOOPBACK)
+		return 0;
+
+	mdev = mctp_add_dev(dev);
+	if (IS_ERR(mdev))
+		return PTR_ERR(mdev);
+
+	return 0;
+}
+
+static int mctp_dev_notify(struct notifier_block *this, unsigned long event,
+			   void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	int rc;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		rc = mctp_register(dev);
+		if (rc)
+			return notifier_from_errno(rc);
+		break;
+	case NETDEV_UNREGISTER:
+		mctp_unregister(dev);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct rtnl_af_ops mctp_af_ops = {
+	.family = AF_MCTP,
+	.fill_link_af = mctp_fill_link_af,
+	.get_link_af_size = mctp_get_link_af_size,
+	.set_link_af = mctp_set_link_af,
+};
+
+static struct notifier_block mctp_dev_nb = {
+	.notifier_call = mctp_dev_notify,
+	.priority = ADDRCONF_NOTIFY_PRIORITY,
+};
+
+void __init mctp_device_init(void)
+{
+	register_netdevice_notifier(&mctp_dev_nb);
+
+	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETADDR,
+			     NULL, mctp_dump_addrinfo, 0);
+	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWADDR,
+			     mctp_rtm_newaddr, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELADDR,
+			     mctp_rtm_deladdr, NULL, 0);
+	rtnl_af_register(&mctp_af_ops);
+}
+
+void __exit mctp_device_exit(void)
+{
+	rtnl_af_unregister(&mctp_af_ops);
+	rtnl_unregister(PF_MCTP, RTM_DELADDR);
+	rtnl_unregister(PF_MCTP, RTM_NEWADDR);
+	rtnl_unregister(PF_MCTP, RTM_GETADDR);
+
+	unregister_netdevice_notifier(&mctp_dev_nb);
+}
-- 
cgit v1.2.3


From 889b7da23abf92faf34491df95733bda63639e32 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Thu, 29 Jul 2021 10:20:45 +0800
Subject: mctp: Add initial routing framework

Add a simple routing table, and a couple of route output handlers, and
the mctp packet_type & handler.

Includes changes from Matt Johnston <matt@codeconstruct.com.au>.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                 |   1 +
 include/net/mctp.h          |  75 ++++++++++
 include/net/net_namespace.h |   4 +
 include/net/netns/mctp.h    |  16 +++
 net/mctp/Makefile           |   2 +-
 net/mctp/af_mctp.c          |   7 +
 net/mctp/device.c           |   8 ++
 net/mctp/route.c            | 329 ++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 441 insertions(+), 1 deletion(-)
 create mode 100644 include/net/netns/mctp.h
 create mode 100644 net/mctp/route.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index c2943c227ee6..4ca73465e690 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11040,6 +11040,7 @@ S:	Maintained
 F:	drivers/net/mctp/
 F:	include/net/mctp.h
 F:	include/net/mctpdevice.h
+F:	include/net/netns/mctp.h
 F:	net/mctp/
 
 MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7
diff --git a/include/net/mctp.h b/include/net/mctp.h
index 61452e03aa85..4094bec5e5db 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -11,6 +11,7 @@
 
 #include <linux/bits.h>
 #include <linux/mctp.h>
+#include <net/net_namespace.h>
 
 /* MCTP packet definitions */
 struct mctp_hdr {
@@ -33,6 +34,8 @@ struct mctp_hdr {
 #define MCTP_HDR_TAG_SHIFT	0
 #define MCTP_HDR_TAG_MASK	GENMASK(2, 0)
 
+#define MCTP_HEADER_MAXLEN	4
+
 static inline bool mctp_address_ok(mctp_eid_t eid)
 {
 	return eid >= 8 && eid < 255;
@@ -43,6 +46,78 @@ static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb)
 	return (struct mctp_hdr *)skb_network_header(skb);
 }
 
+struct mctp_skb_cb {
+	unsigned int	magic;
+	unsigned int	net;
+	mctp_eid_t	src;
+};
+
+/* skb control-block accessors with a little extra debugging for initial
+ * development.
+ *
+ * TODO: remove checks & mctp_skb_cb->magic; replace callers of __mctp_cb
+ * with mctp_cb().
+ *
+ * __mctp_cb() is only for the initial ingress code; we should see ->magic set
+ * at all times after this.
+ */
+static inline struct mctp_skb_cb *__mctp_cb(struct sk_buff *skb)
+{
+	struct mctp_skb_cb *cb = (void *)skb->cb;
+
+	cb->magic = 0x4d435450;
+	return cb;
+}
+
+static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb)
+{
+	struct mctp_skb_cb *cb = (void *)skb->cb;
+
+	WARN_ON(cb->magic != 0x4d435450);
+	return (void *)(skb->cb);
+}
+
+/* Route definition.
+ *
+ * These are held in the pernet->mctp.routes list, with RCU protection for
+ * removed routes. We hold a reference to the netdev; routes need to be
+ * dropped on NETDEV_UNREGISTER events.
+ *
+ * Updates to the route table are performed under rtnl; all reads under RCU,
+ * so routes cannot be referenced over a RCU grace period. Specifically: A
+ * caller cannot block between mctp_route_lookup and passing the route to
+ * mctp_do_route.
+ */
+struct mctp_route {
+	mctp_eid_t		min, max;
+
+	struct mctp_dev		*dev;
+	unsigned int		mtu;
+	int			(*output)(struct mctp_route *route,
+					  struct sk_buff *skb);
+
+	struct list_head	list;
+	refcount_t		refs;
+	struct rcu_head		rcu;
+};
+
+/* route interfaces */
+struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet,
+				     mctp_eid_t daddr);
+
+int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb);
+
+int mctp_local_output(struct sock *sk, struct mctp_route *rt,
+		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag);
+
+/* routing <--> device interface */
+int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr);
+int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr);
+void mctp_route_remove_dev(struct mctp_dev *mdev);
+
+int mctp_routes_init(void);
+void mctp_routes_exit(void);
+
 void mctp_device_init(void);
 void mctp_device_exit(void);
 
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 12cf6d7ea62c..cc54750dd3db 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -34,6 +34,7 @@
 #include <net/netns/xdp.h>
 #include <net/netns/smc.h>
 #include <net/netns/bpf.h>
+#include <net/netns/mctp.h>
 #include <linux/ns_common.h>
 #include <linux/idr.h>
 #include <linux/skbuff.h>
@@ -167,6 +168,9 @@ struct net {
 #ifdef CONFIG_XDP_SOCKETS
 	struct netns_xdp	xdp;
 #endif
+#if IS_ENABLED(CONFIG_MCTP)
+	struct netns_mctp	mctp;
+#endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	struct sock		*crypto_nlsk;
 #endif
diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h
new file mode 100644
index 000000000000..508459b08a59
--- /dev/null
+++ b/include/net/netns/mctp.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * MCTP per-net structures
+ */
+
+#ifndef __NETNS_MCTP_H__
+#define __NETNS_MCTP_H__
+
+#include <linux/types.h>
+
+struct netns_mctp {
+	/* Only updated under RTNL, entries freed via RCU */
+	struct list_head routes;
+};
+
+#endif /* __NETNS_MCTP_H__ */
diff --git a/net/mctp/Makefile b/net/mctp/Makefile
index 2ea98c27b262..b1a330e9d82a 100644
--- a/net/mctp/Makefile
+++ b/net/mctp/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_MCTP) += mctp.o
-mctp-objs := af_mctp.o device.o
+mctp-objs := af_mctp.o device.o route.o
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 401b4fa141a5..8085f5912101 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -157,10 +157,16 @@ static __init int mctp_init(void)
 	if (rc)
 		goto err_unreg_sock;
 
+	rc = mctp_routes_init();
+	if (rc)
+		goto err_unreg_proto;
+
 	mctp_device_init();
 
 	return 0;
 
+err_unreg_proto:
+	proto_unregister(&mctp_proto);
 err_unreg_sock:
 	sock_unregister(PF_MCTP);
 
@@ -170,6 +176,7 @@ err_unreg_sock:
 static __exit void mctp_exit(void)
 {
 	mctp_device_exit();
+	mctp_routes_exit();
 	proto_unregister(&mctp_proto);
 	sock_unregister(PF_MCTP);
 }
diff --git a/net/mctp/device.c b/net/mctp/device.c
index 877abe5312cd..5f1b18c17351 100644
--- a/net/mctp/device.c
+++ b/net/mctp/device.c
@@ -197,6 +197,8 @@ static int mctp_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	kfree(tmp_addrs);
 
+	mctp_route_add_local(mdev, addr->s_addr);
+
 	return 0;
 }
 
@@ -240,6 +242,11 @@ static int mctp_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (!pos)
 		return -ENOENT;
 
+	rc = mctp_route_remove_local(mdev, addr->s_addr);
+	// we can ignore -ENOENT in the case a route was already removed
+	if (rc < 0 && rc != -ENOENT)
+		return rc;
+
 	spin_lock_irqsave(&mdev->addrs_lock, flags);
 	memmove(pos, pos + 1, mdev->num_addrs - 1 - (pos - mdev->addrs));
 	mdev->num_addrs--;
@@ -334,6 +341,7 @@ static void mctp_unregister(struct net_device *dev)
 
 	RCU_INIT_POINTER(mdev->dev->mctp_ptr, NULL);
 
+	mctp_route_remove_dev(mdev);
 	kfree(mdev->addrs);
 
 	mctp_dev_destroy(mdev);
diff --git a/net/mctp/route.c b/net/mctp/route.c
new file mode 100644
index 000000000000..2bcc13175a9e
--- /dev/null
+++ b/net/mctp/route.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Management Component Transport Protocol (MCTP) - routing
+ * implementation.
+ *
+ * This is currently based on a simple routing table, with no dst cache. The
+ * number of routes should stay fairly small, so the lookup cost is small.
+ *
+ * Copyright (c) 2021 Code Construct
+ * Copyright (c) 2021 Google
+ */
+
+#include <linux/idr.h>
+#include <linux/mctp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+
+#include <uapi/linux/if_arp.h>
+
+#include <net/mctp.h>
+#include <net/mctpdevice.h>
+
+/* route output callbacks */
+static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb)
+{
+	kfree_skb(skb);
+	return 0;
+}
+
+static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
+{
+	/* -> to local stack */
+	/* TODO: socket lookup, reassemble */
+	kfree_skb(skb);
+	return 0;
+}
+
+static int __always_unused mctp_route_output(struct mctp_route *route,
+					     struct sk_buff *skb)
+{
+	unsigned int mtu;
+	int rc;
+
+	skb->protocol = htons(ETH_P_MCTP);
+
+	mtu = READ_ONCE(skb->dev->mtu);
+	if (skb->len > mtu) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	/* TODO: daddr (from rt->neigh), saddr (from device?)  */
+	rc = dev_hard_header(skb, skb->dev, ntohs(skb->protocol),
+			     NULL, NULL, skb->len);
+	if (rc) {
+		kfree_skb(skb);
+		return -EHOSTUNREACH;
+	}
+
+	rc = dev_queue_xmit(skb);
+	if (rc)
+		rc = net_xmit_errno(rc);
+
+	return rc;
+}
+
+/* route alloc/release */
+static void mctp_route_release(struct mctp_route *rt)
+{
+	if (refcount_dec_and_test(&rt->refs)) {
+		dev_put(rt->dev->dev);
+		kfree_rcu(rt, rcu);
+	}
+}
+
+/* returns a route with the refcount at 1 */
+static struct mctp_route *mctp_route_alloc(void)
+{
+	struct mctp_route *rt;
+
+	rt = kzalloc(sizeof(*rt), GFP_KERNEL);
+	if (!rt)
+		return NULL;
+
+	INIT_LIST_HEAD(&rt->list);
+	refcount_set(&rt->refs, 1);
+	rt->output = mctp_route_discard;
+
+	return rt;
+}
+
+/* routing lookups */
+static bool mctp_rt_match_eid(struct mctp_route *rt,
+			      unsigned int net, mctp_eid_t eid)
+{
+	return READ_ONCE(rt->dev->net) == net &&
+		rt->min <= eid && rt->max >= eid;
+}
+
+/* compares match, used for duplicate prevention */
+static bool mctp_rt_compare_exact(struct mctp_route *rt1,
+				  struct mctp_route *rt2)
+{
+	ASSERT_RTNL();
+	return rt1->dev->net == rt2->dev->net &&
+		rt1->min == rt2->min &&
+		rt1->max == rt2->max;
+}
+
+struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet,
+				     mctp_eid_t daddr)
+{
+	struct mctp_route *tmp, *rt = NULL;
+
+	list_for_each_entry_rcu(tmp, &net->mctp.routes, list) {
+		/* TODO: add metrics */
+		if (mctp_rt_match_eid(tmp, dnet, daddr)) {
+			if (refcount_inc_not_zero(&tmp->refs)) {
+				rt = tmp;
+				break;
+			}
+		}
+	}
+
+	return rt;
+}
+
+/* sends a skb to rt and releases the route. */
+int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb)
+{
+	int rc;
+
+	rc = rt->output(rt, skb);
+	mctp_route_release(rt);
+	return rc;
+}
+
+int mctp_local_output(struct sock *sk, struct mctp_route *rt,
+		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag)
+{
+	struct mctp_skb_cb *cb = mctp_cb(skb);
+	struct mctp_hdr *hdr;
+	unsigned long flags;
+	mctp_eid_t saddr;
+	int rc;
+
+	if (WARN_ON(!rt->dev))
+		return -EINVAL;
+
+	spin_lock_irqsave(&rt->dev->addrs_lock, flags);
+	if (rt->dev->num_addrs == 0) {
+		rc = -EHOSTUNREACH;
+	} else {
+		/* use the outbound interface's first address as our source */
+		saddr = rt->dev->addrs[0];
+		rc = 0;
+	}
+	spin_unlock_irqrestore(&rt->dev->addrs_lock, flags);
+
+	if (rc)
+		return rc;
+
+	/* TODO: we have the route MTU here; packetise */
+
+	skb_reset_transport_header(skb);
+	skb_push(skb, sizeof(struct mctp_hdr));
+	skb_reset_network_header(skb);
+	hdr = mctp_hdr(skb);
+	hdr->ver = 1;
+	hdr->dest = daddr;
+	hdr->src = saddr;
+	hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM; /* TODO */
+
+	skb->protocol = htons(ETH_P_MCTP);
+	skb->priority = 0;
+
+	/* cb->net will have been set on initial ingress */
+	cb->src = saddr;
+
+	return mctp_do_route(rt, skb);
+}
+
+/* route management */
+int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr)
+{
+	struct net *net = dev_net(mdev->dev);
+	struct mctp_route *rt, *ert;
+
+	rt = mctp_route_alloc();
+	if (!rt)
+		return -ENOMEM;
+
+	rt->min = addr;
+	rt->max = addr;
+	rt->dev = mdev;
+	dev_hold(rt->dev->dev);
+	rt->output = mctp_route_input;
+
+	ASSERT_RTNL();
+	/* Prevent duplicate identical routes. */
+	list_for_each_entry(ert, &net->mctp.routes, list) {
+		if (mctp_rt_compare_exact(rt, ert)) {
+			mctp_route_release(rt);
+			return -EEXIST;
+		}
+	}
+
+	list_add_rcu(&rt->list, &net->mctp.routes);
+
+	return 0;
+}
+
+int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr)
+{
+	struct net *net = dev_net(mdev->dev);
+	struct mctp_route *rt, *tmp;
+
+	ASSERT_RTNL();
+
+	list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) {
+		if (rt->dev == mdev && rt->min == addr && rt->max == addr) {
+			list_del_rcu(&rt->list);
+			/* TODO: immediate RTM_DELROUTE */
+			mctp_route_release(rt);
+		}
+	}
+
+	return 0;
+}
+
+/* removes all entries for a given device */
+void mctp_route_remove_dev(struct mctp_dev *mdev)
+{
+	struct net *net = dev_net(mdev->dev);
+	struct mctp_route *rt, *tmp;
+
+	ASSERT_RTNL();
+	list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) {
+		if (rt->dev == mdev) {
+			list_del_rcu(&rt->list);
+			/* TODO: immediate RTM_DELROUTE */
+			mctp_route_release(rt);
+		}
+	}
+}
+
+/* Incoming packet-handling */
+
+static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev,
+				struct packet_type *pt,
+				struct net_device *orig_dev)
+{
+	struct net *net = dev_net(dev);
+	struct mctp_skb_cb *cb;
+	struct mctp_route *rt;
+	struct mctp_hdr *mh;
+
+	/* basic non-data sanity checks */
+	if (dev->type != ARPHRD_MCTP)
+		goto err_drop;
+
+	if (!pskb_may_pull(skb, sizeof(struct mctp_hdr)))
+		goto err_drop;
+
+	skb_reset_transport_header(skb);
+	skb_reset_network_header(skb);
+
+	/* We have enough for a header; decode and route */
+	mh = mctp_hdr(skb);
+	if (mh->ver < MCTP_VER_MIN || mh->ver > MCTP_VER_MAX)
+		goto err_drop;
+
+	cb = __mctp_cb(skb);
+	rcu_read_lock();
+	cb->net = READ_ONCE(__mctp_dev_get(dev)->net);
+	rcu_read_unlock();
+
+	rt = mctp_route_lookup(net, cb->net, mh->dest);
+	if (!rt)
+		goto err_drop;
+
+	mctp_do_route(rt, skb);
+
+	return NET_RX_SUCCESS;
+
+err_drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+static struct packet_type mctp_packet_type = {
+	.type = cpu_to_be16(ETH_P_MCTP),
+	.func = mctp_pkttype_receive,
+};
+
+/* net namespace implementation */
+static int __net_init mctp_routes_net_init(struct net *net)
+{
+	struct netns_mctp *ns = &net->mctp;
+
+	INIT_LIST_HEAD(&ns->routes);
+	return 0;
+}
+
+static void __net_exit mctp_routes_net_exit(struct net *net)
+{
+	struct mctp_route *rt;
+
+	list_for_each_entry_rcu(rt, &net->mctp.routes, list)
+		mctp_route_release(rt);
+}
+
+static struct pernet_operations mctp_net_ops = {
+	.init = mctp_routes_net_init,
+	.exit = mctp_routes_net_exit,
+};
+
+int __init mctp_routes_init(void)
+{
+	dev_add_pack(&mctp_packet_type);
+	return register_pernet_subsys(&mctp_net_ops);
+}
+
+void __exit mctp_routes_exit(void)
+{
+	unregister_pernet_subsys(&mctp_net_ops);
+	dev_remove_pack(&mctp_packet_type);
+}
-- 
cgit v1.2.3


From 06d2f4c583a7d892300920fc85d654d48a15e914 Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@codeconstruct.com.au>
Date: Thu, 29 Jul 2021 10:20:46 +0800
Subject: mctp: Add netlink route management

This change adds RTM_GETROUTE, RTM_NEWROUTE & RTM_DELROUTE handlers,
allowing management of the MCTP route table.

Includes changes from Jeremy Kerr <jk@codeconstruct.com.au>.

Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mctp.h |   2 +
 net/mctp/route.c   | 258 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 251 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index 4094bec5e5db..bc36e37e8198 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -111,6 +111,8 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag);
 
 /* routing <--> device interface */
+unsigned int mctp_default_net(struct net *net);
+int mctp_default_net_set(struct net *net, unsigned int index);
 int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr);
 int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr);
 void mctp_route_remove_dev(struct mctp_dev *mdev);
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 2bcc13175a9e..31568203f9d0 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -20,6 +20,8 @@
 
 #include <net/mctp.h>
 #include <net/mctpdevice.h>
+#include <net/netlink.h>
+#include <net/sock.h>
 
 /* route output callbacks */
 static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb)
@@ -36,8 +38,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
 	return 0;
 }
 
-static int __always_unused mctp_route_output(struct mctp_route *route,
-					     struct sk_buff *skb)
+static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb)
 {
 	unsigned int mtu;
 	int rc;
@@ -182,20 +183,29 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 }
 
 /* route management */
-int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr)
+static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start,
+			  unsigned int daddr_extent, unsigned int mtu,
+			  bool is_local)
 {
 	struct net *net = dev_net(mdev->dev);
 	struct mctp_route *rt, *ert;
 
+	if (!mctp_address_ok(daddr_start))
+		return -EINVAL;
+
+	if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255)
+		return -EINVAL;
+
 	rt = mctp_route_alloc();
 	if (!rt)
 		return -ENOMEM;
 
-	rt->min = addr;
-	rt->max = addr;
+	rt->min = daddr_start;
+	rt->max = daddr_start + daddr_extent;
+	rt->mtu = mtu;
 	rt->dev = mdev;
 	dev_hold(rt->dev->dev);
-	rt->output = mctp_route_input;
+	rt->output = is_local ? mctp_route_input : mctp_route_output;
 
 	ASSERT_RTNL();
 	/* Prevent duplicate identical routes. */
@@ -211,22 +221,43 @@ int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr)
 	return 0;
 }
 
-int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr)
+static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start,
+			     unsigned int daddr_extent)
 {
 	struct net *net = dev_net(mdev->dev);
 	struct mctp_route *rt, *tmp;
+	mctp_eid_t daddr_end;
+	bool dropped;
+
+	if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255)
+		return -EINVAL;
+
+	daddr_end = daddr_start + daddr_extent;
+	dropped = false;
 
 	ASSERT_RTNL();
 
 	list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) {
-		if (rt->dev == mdev && rt->min == addr && rt->max == addr) {
+		if (rt->dev == mdev &&
+		    rt->min == daddr_start && rt->max == daddr_end) {
 			list_del_rcu(&rt->list);
 			/* TODO: immediate RTM_DELROUTE */
 			mctp_route_release(rt);
+			dropped = true;
 		}
 	}
 
-	return 0;
+	return dropped ? 0 : -ENOENT;
+}
+
+int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr)
+{
+	return mctp_route_add(mdev, addr, 0, 0, true);
+}
+
+int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr)
+{
+	return mctp_route_remove(mdev, addr, 0);
 }
 
 /* removes all entries for a given device */
@@ -294,6 +325,204 @@ static struct packet_type mctp_packet_type = {
 	.func = mctp_pkttype_receive,
 };
 
+/* netlink interface */
+
+static const struct nla_policy rta_mctp_policy[RTA_MAX + 1] = {
+	[RTA_DST]		= { .type = NLA_U8 },
+	[RTA_METRICS]		= { .type = NLA_NESTED },
+	[RTA_OIF]		= { .type = NLA_U32 },
+};
+
+/* Common part for RTM_NEWROUTE and RTM_DELROUTE parsing.
+ * tb must hold RTA_MAX+1 elements.
+ */
+static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh,
+			      struct netlink_ext_ack *extack,
+			      struct nlattr **tb, struct rtmsg **rtm,
+			      struct mctp_dev **mdev, mctp_eid_t *daddr_start)
+{
+	struct net *net = sock_net(skb->sk);
+	struct net_device *dev;
+	unsigned int ifindex;
+	int rc;
+
+	rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX,
+			 rta_mctp_policy, extack);
+	if (rc < 0) {
+		NL_SET_ERR_MSG(extack, "incorrect format");
+		return rc;
+	}
+
+	if (!tb[RTA_DST]) {
+		NL_SET_ERR_MSG(extack, "dst EID missing");
+		return -EINVAL;
+	}
+	*daddr_start = nla_get_u8(tb[RTA_DST]);
+
+	if (!tb[RTA_OIF]) {
+		NL_SET_ERR_MSG(extack, "ifindex missing");
+		return -EINVAL;
+	}
+	ifindex = nla_get_u32(tb[RTA_OIF]);
+
+	*rtm = nlmsg_data(nlh);
+	if ((*rtm)->rtm_family != AF_MCTP) {
+		NL_SET_ERR_MSG(extack, "route family must be AF_MCTP");
+		return -EINVAL;
+	}
+
+	dev = __dev_get_by_index(net, ifindex);
+	if (!dev) {
+		NL_SET_ERR_MSG(extack, "bad ifindex");
+		return -ENODEV;
+	}
+	*mdev = mctp_dev_get_rtnl(dev);
+	if (!*mdev)
+		return -ENODEV;
+
+	if (dev->flags & IFF_LOOPBACK) {
+		NL_SET_ERR_MSG(extack, "no routes to loopback");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
+			 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RTA_MAX + 1];
+	mctp_eid_t daddr_start;
+	struct mctp_dev *mdev;
+	struct rtmsg *rtm;
+	unsigned int mtu;
+	int rc;
+
+	rc = mctp_route_nlparse(skb, nlh, extack, tb,
+				&rtm, &mdev, &daddr_start);
+	if (rc < 0)
+		return rc;
+
+	if (rtm->rtm_type != RTN_UNICAST) {
+		NL_SET_ERR_MSG(extack, "rtm_type must be RTN_UNICAST");
+		return -EINVAL;
+	}
+
+	/* TODO: parse mtu from nlparse */
+	mtu = 0;
+
+	rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu, false);
+	return rc;
+}
+
+static int mctp_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
+			 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RTA_MAX + 1];
+	mctp_eid_t daddr_start;
+	struct mctp_dev *mdev;
+	struct rtmsg *rtm;
+	int rc;
+
+	rc = mctp_route_nlparse(skb, nlh, extack, tb,
+				&rtm, &mdev, &daddr_start);
+	if (rc < 0)
+		return rc;
+
+	/* we only have unicast routes */
+	if (rtm->rtm_type != RTN_UNICAST)
+		return -EINVAL;
+
+	rc = mctp_route_remove(mdev, daddr_start, rtm->rtm_dst_len);
+	return rc;
+}
+
+static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt,
+			    u32 portid, u32 seq, int event, unsigned int flags)
+{
+	struct nlmsghdr *nlh;
+	struct rtmsg *hdr;
+	void *metrics;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	hdr = nlmsg_data(nlh);
+	hdr->rtm_family = AF_MCTP;
+
+	/* we use the _len fields as a number of EIDs, rather than
+	 * a number of bits in the address
+	 */
+	hdr->rtm_dst_len = rt->max - rt->min;
+	hdr->rtm_src_len = 0;
+	hdr->rtm_tos = 0;
+	hdr->rtm_table = RT_TABLE_DEFAULT;
+	hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */
+	hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? */
+	hdr->rtm_type = RTN_ANYCAST; /* TODO: type from route */
+
+	if (nla_put_u8(skb, RTA_DST, rt->min))
+		goto cancel;
+
+	metrics = nla_nest_start_noflag(skb, RTA_METRICS);
+	if (!metrics)
+		goto cancel;
+
+	if (rt->mtu) {
+		if (nla_put_u32(skb, RTAX_MTU, rt->mtu))
+			goto cancel;
+	}
+
+	nla_nest_end(skb, metrics);
+
+	if (rt->dev) {
+		if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex))
+			goto cancel;
+	}
+
+	/* TODO: conditional neighbour physaddr? */
+
+	nlmsg_end(skb, nlh);
+
+	return 0;
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int mctp_dump_rtinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct mctp_route *rt;
+	int s_idx, idx;
+
+	/* TODO: allow filtering on route data, possibly under
+	 * cb->strict_check
+	 */
+
+	/* TODO: change to struct overlay */
+	s_idx = cb->args[0];
+	idx = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rt, &net->mctp.routes, list) {
+		if (idx++ < s_idx)
+			continue;
+		if (mctp_fill_rtinfo(skb, rt,
+				     NETLINK_CB(cb->skb).portid,
+				     cb->nlh->nlmsg_seq,
+				     RTM_NEWROUTE, NLM_F_MULTI) < 0)
+			break;
+	}
+
+	rcu_read_unlock();
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
 /* net namespace implementation */
 static int __net_init mctp_routes_net_init(struct net *net)
 {
@@ -319,11 +548,22 @@ static struct pernet_operations mctp_net_ops = {
 int __init mctp_routes_init(void)
 {
 	dev_add_pack(&mctp_packet_type);
+
+	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETROUTE,
+			     NULL, mctp_dump_rtinfo, 0);
+	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWROUTE,
+			     mctp_newroute, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELROUTE,
+			     mctp_delroute, NULL, 0);
+
 	return register_pernet_subsys(&mctp_net_ops);
 }
 
 void __exit mctp_routes_exit(void)
 {
 	unregister_pernet_subsys(&mctp_net_ops);
+	rtnl_unregister(PF_MCTP, RTM_DELROUTE);
+	rtnl_unregister(PF_MCTP, RTM_NEWROUTE);
+	rtnl_unregister(PF_MCTP, RTM_GETROUTE);
 	dev_remove_pack(&mctp_packet_type);
 }
-- 
cgit v1.2.3


From 4d8b9319282ae84f5a17b28d8b5b5d1e7e537312 Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@codeconstruct.com.au>
Date: Thu, 29 Jul 2021 10:20:47 +0800
Subject: mctp: Add neighbour implementation

Add an initial neighbour table implementation, to be used in the route
output path.

Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mctp.h       |  25 +++++++++
 include/net/mctpdevice.h |   1 +
 include/net/netns/mctp.h |   4 ++
 net/mctp/Makefile        |   2 +-
 net/mctp/af_mctp.c       |   5 ++
 net/mctp/device.c        |   1 +
 net/mctp/neigh.c         | 141 +++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 178 insertions(+), 1 deletion(-)
 create mode 100644 net/mctp/neigh.c

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index bc36e37e8198..53f035c8b59c 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -117,6 +117,31 @@ int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr);
 int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr);
 void mctp_route_remove_dev(struct mctp_dev *mdev);
 
+/* neighbour definitions */
+enum mctp_neigh_source {
+	MCTP_NEIGH_STATIC,
+	MCTP_NEIGH_DISCOVER,
+};
+
+struct mctp_neigh {
+	struct mctp_dev		*dev;
+	mctp_eid_t		eid;
+	enum mctp_neigh_source	source;
+
+	unsigned char		ha[MAX_ADDR_LEN];
+
+	struct list_head	list;
+	struct rcu_head		rcu;
+};
+
+int mctp_neigh_init(void);
+void mctp_neigh_exit(void);
+
+// ret_hwaddr may be NULL, otherwise must have space for MAX_ADDR_LEN
+int mctp_neigh_lookup(struct mctp_dev *dev, mctp_eid_t eid,
+		      void *ret_hwaddr);
+void mctp_neigh_remove_dev(struct mctp_dev *mdev);
+
 int mctp_routes_init(void);
 void mctp_routes_exit(void);
 
diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h
index 71a11012fac7..57e773ff08bb 100644
--- a/include/net/mctpdevice.h
+++ b/include/net/mctpdevice.h
@@ -31,5 +31,6 @@ struct mctp_dev {
 
 struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev);
 struct mctp_dev *__mctp_dev_get(const struct net_device *dev);
+struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev);
 
 #endif /* __NET_MCTPDEVICE_H */
diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h
index 508459b08a59..2f5ebeeb320e 100644
--- a/include/net/netns/mctp.h
+++ b/include/net/netns/mctp.h
@@ -11,6 +11,10 @@
 struct netns_mctp {
 	/* Only updated under RTNL, entries freed via RCU */
 	struct list_head routes;
+
+	/* neighbour table */
+	struct mutex neigh_lock;
+	struct list_head neighbours;
 };
 
 #endif /* __NETNS_MCTP_H__ */
diff --git a/net/mctp/Makefile b/net/mctp/Makefile
index b1a330e9d82a..0171333384d7 100644
--- a/net/mctp/Makefile
+++ b/net/mctp/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_MCTP) += mctp.o
-mctp-objs := af_mctp.o device.o route.o
+mctp-objs := af_mctp.o device.o route.o neigh.o
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 8085f5912101..58701e6b282c 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -161,6 +161,10 @@ static __init int mctp_init(void)
 	if (rc)
 		goto err_unreg_proto;
 
+	rc = mctp_neigh_init();
+	if (rc)
+		goto err_unreg_proto;
+
 	mctp_device_init();
 
 	return 0;
@@ -176,6 +180,7 @@ err_unreg_sock:
 static __exit void mctp_exit(void)
 {
 	mctp_device_exit();
+	mctp_neigh_exit();
 	mctp_routes_exit();
 	proto_unregister(&mctp_proto);
 	sock_unregister(PF_MCTP);
diff --git a/net/mctp/device.c b/net/mctp/device.c
index 5f1b18c17351..aa049590acda 100644
--- a/net/mctp/device.c
+++ b/net/mctp/device.c
@@ -342,6 +342,7 @@ static void mctp_unregister(struct net_device *dev)
 	RCU_INIT_POINTER(mdev->dev->mctp_ptr, NULL);
 
 	mctp_route_remove_dev(mdev);
+	mctp_neigh_remove_dev(mdev);
 	kfree(mdev->addrs);
 
 	mctp_dev_destroy(mdev);
diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c
new file mode 100644
index 000000000000..8603f0c45a8f
--- /dev/null
+++ b/net/mctp/neigh.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Management Component Transport Protocol (MCTP) - routing
+ * implementation.
+ *
+ * This is currently based on a simple routing table, with no dst cache. The
+ * number of routes should stay fairly small, so the lookup cost is small.
+ *
+ * Copyright (c) 2021 Code Construct
+ * Copyright (c) 2021 Google
+ */
+
+#include <linux/idr.h>
+#include <linux/mctp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+
+#include <net/mctp.h>
+#include <net/mctpdevice.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+
+static int __always_unused mctp_neigh_add(struct mctp_dev *mdev, mctp_eid_t eid,
+					  enum mctp_neigh_source source,
+					  size_t lladdr_len, const void *lladdr)
+{
+	struct net *net = dev_net(mdev->dev);
+	struct mctp_neigh *neigh;
+	int rc;
+
+	mutex_lock(&net->mctp.neigh_lock);
+	if (mctp_neigh_lookup(mdev, eid, NULL) == 0) {
+		rc = -EEXIST;
+		goto out;
+	}
+
+	if (lladdr_len > sizeof(neigh->ha)) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	neigh = kzalloc(sizeof(*neigh), GFP_KERNEL);
+	if (!neigh) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	INIT_LIST_HEAD(&neigh->list);
+	neigh->dev = mdev;
+	dev_hold(neigh->dev->dev);
+	neigh->eid = eid;
+	neigh->source = source;
+	memcpy(neigh->ha, lladdr, lladdr_len);
+
+	list_add_rcu(&neigh->list, &net->mctp.neighbours);
+	rc = 0;
+out:
+	mutex_unlock(&net->mctp.neigh_lock);
+	return rc;
+}
+
+static void __mctp_neigh_free(struct rcu_head *rcu)
+{
+	struct mctp_neigh *neigh = container_of(rcu, struct mctp_neigh, rcu);
+
+	dev_put(neigh->dev->dev);
+	kfree(neigh);
+}
+
+/* Removes all neighbour entries referring to a device */
+void mctp_neigh_remove_dev(struct mctp_dev *mdev)
+{
+	struct net *net = dev_net(mdev->dev);
+	struct mctp_neigh *neigh, *tmp;
+
+	mutex_lock(&net->mctp.neigh_lock);
+	list_for_each_entry_safe(neigh, tmp, &net->mctp.neighbours, list) {
+		if (neigh->dev == mdev) {
+			list_del_rcu(&neigh->list);
+			/* TODO: immediate RTM_DELNEIGH */
+			call_rcu(&neigh->rcu, __mctp_neigh_free);
+		}
+	}
+
+	mutex_unlock(&net->mctp.neigh_lock);
+}
+
+int mctp_neigh_lookup(struct mctp_dev *mdev, mctp_eid_t eid, void *ret_hwaddr)
+{
+	struct net *net = dev_net(mdev->dev);
+	struct mctp_neigh *neigh;
+	int rc = -EHOSTUNREACH; // TODO: or ENOENT?
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(neigh, &net->mctp.neighbours, list) {
+		if (mdev == neigh->dev && eid == neigh->eid) {
+			if (ret_hwaddr)
+				memcpy(ret_hwaddr, neigh->ha,
+				       sizeof(neigh->ha));
+			rc = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return rc;
+}
+
+/* namespace registration */
+static int __net_init mctp_neigh_net_init(struct net *net)
+{
+	struct netns_mctp *ns = &net->mctp;
+
+	INIT_LIST_HEAD(&ns->neighbours);
+	return 0;
+}
+
+static void __net_exit mctp_neigh_net_exit(struct net *net)
+{
+	struct netns_mctp *ns = &net->mctp;
+	struct mctp_neigh *neigh;
+
+	list_for_each_entry(neigh, &ns->neighbours, list)
+		call_rcu(&neigh->rcu, __mctp_neigh_free);
+}
+
+/* net namespace implementation */
+
+static struct pernet_operations mctp_net_ops = {
+	.init = mctp_neigh_net_init,
+	.exit = mctp_neigh_net_exit,
+};
+
+int __init mctp_neigh_init(void)
+{
+	return register_pernet_subsys(&mctp_net_ops);
+}
+
+void __exit mctp_neigh_exit(void)
+{
+	unregister_pernet_subsys(&mctp_net_ops);
+}
-- 
cgit v1.2.3


From 833ef3b91de692ef33b800bca6b1569c39dece74 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Thu, 29 Jul 2021 10:20:49 +0800
Subject: mctp: Populate socket implementation

Start filling-out the socket syscalls: bind, sendmsg & recvmsg.

This requires an input route implementation, so we add to
mctp_route_input, allowing lookups on binds & message tags. This just
handles single-packet messages at present, we will add fragmentation in
a future change.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mctp.h       |  59 +++++++++++++
 include/net/netns/mctp.h |  13 +++
 net/mctp/af_mctp.c       | 203 ++++++++++++++++++++++++++++++++++++++++--
 net/mctp/route.c         | 226 ++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 491 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index 53f035c8b59c..f2d98f6993c0 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -12,6 +12,7 @@
 #include <linux/bits.h>
 #include <linux/mctp.h>
 #include <net/net_namespace.h>
+#include <net/sock.h>
 
 /* MCTP packet definitions */
 struct mctp_hdr {
@@ -46,6 +47,64 @@ static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb)
 	return (struct mctp_hdr *)skb_network_header(skb);
 }
 
+/* socket implementation */
+struct mctp_sock {
+	struct sock	sk;
+
+	/* bind() params */
+	int		bind_net;
+	mctp_eid_t	bind_addr;
+	__u8		bind_type;
+
+	/* list of mctp_sk_key, for incoming tag lookup. updates protected
+	 * by sk->net->keys_lock
+	 */
+	struct hlist_head keys;
+};
+
+/* Key for matching incoming packets to sockets or reassembly contexts.
+ * Packets are matched on (src,dest,tag).
+ *
+ * Lifetime requirements:
+ *
+ *  - keys are free()ed via RCU
+ *
+ *  - a mctp_sk_key contains a reference to a struct sock; this is valid
+ *    for the life of the key. On sock destruction (through unhash), the key is
+ *    removed from lists (see below), and will not be observable after a RCU
+ *    grace period.
+ *
+ *    any RX occurring within that grace period may still queue to the socket,
+ *    but will hit the SOCK_DEAD case before the socket is freed.
+ *
+ * - these mctp_sk_keys appear on two lists:
+ *     1) the struct mctp_sock->keys list
+ *     2) the struct netns_mctp->keys list
+ *
+ *        updates to either list are performed under the netns_mctp->keys
+ *        lock.
+ *
+ * - there is a single destruction path for a mctp_sk_key - through socket
+ *   unhash (see mctp_sk_unhash). This performs the list removal under
+ *   keys_lock.
+ */
+struct mctp_sk_key {
+	mctp_eid_t	peer_addr;
+	mctp_eid_t	local_addr;
+	__u8		tag; /* incoming tag match; invert TO for local */
+
+	/* we hold a ref to sk when set */
+	struct sock	*sk;
+
+	/* routing lookup list */
+	struct hlist_node hlist;
+
+	/* per-socket list */
+	struct hlist_node sklist;
+
+	struct rcu_head	rcu;
+};
+
 struct mctp_skb_cb {
 	unsigned int	magic;
 	unsigned int	net;
diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h
index 2f5ebeeb320e..14ae6d37e52a 100644
--- a/include/net/netns/mctp.h
+++ b/include/net/netns/mctp.h
@@ -12,6 +12,19 @@ struct netns_mctp {
 	/* Only updated under RTNL, entries freed via RCU */
 	struct list_head routes;
 
+	/* Bound sockets: list of sockets bound by type.
+	 * This list is updated from non-atomic contexts (under bind_lock),
+	 * and read (under rcu) in packet rx
+	 */
+	struct mutex bind_lock;
+	struct hlist_head binds;
+
+	/* tag allocations. This list is read and updated from atomic contexts,
+	 * but elements are free()ed after a RCU grace-period
+	 */
+	spinlock_t keys_lock;
+	struct hlist_head keys;
+
 	/* neighbour table */
 	struct mutex neigh_lock;
 	struct list_head neighbours;
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 58701e6b282c..52bd7f2b78db 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -18,10 +18,6 @@
 
 /* socket implementation */
 
-struct mctp_sock {
-	struct sock	sk;
-};
-
 static int mctp_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -36,18 +32,160 @@ static int mctp_release(struct socket *sock)
 
 static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
 {
-	return 0;
+	struct sock *sk = sock->sk;
+	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
+	struct sockaddr_mctp *smctp;
+	int rc;
+
+	if (addrlen < sizeof(*smctp))
+		return -EINVAL;
+
+	if (addr->sa_family != AF_MCTP)
+		return -EAFNOSUPPORT;
+
+	if (!capable(CAP_NET_BIND_SERVICE))
+		return -EACCES;
+
+	/* it's a valid sockaddr for MCTP, cast and do protocol checks */
+	smctp = (struct sockaddr_mctp *)addr;
+
+	lock_sock(sk);
+
+	/* TODO: allow rebind */
+	if (sk_hashed(sk)) {
+		rc = -EADDRINUSE;
+		goto out_release;
+	}
+	msk->bind_net = smctp->smctp_network;
+	msk->bind_addr = smctp->smctp_addr.s_addr;
+	msk->bind_type = smctp->smctp_type & 0x7f; /* ignore the IC bit */
+
+	rc = sk->sk_prot->hash(sk);
+
+out_release:
+	release_sock(sk);
+
+	return rc;
 }
 
 static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 {
-	return 0;
+	DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name);
+	const int hlen = MCTP_HEADER_MAXLEN + sizeof(struct mctp_hdr);
+	int rc, addrlen = msg->msg_namelen;
+	struct sock *sk = sock->sk;
+	struct mctp_skb_cb *cb;
+	struct mctp_route *rt;
+	struct sk_buff *skb;
+
+	if (addr) {
+		if (addrlen < sizeof(struct sockaddr_mctp))
+			return -EINVAL;
+		if (addr->smctp_family != AF_MCTP)
+			return -EINVAL;
+		if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER))
+			return -EINVAL;
+
+	} else {
+		/* TODO: connect()ed sockets */
+		return -EDESTADDRREQ;
+	}
+
+	if (!capable(CAP_NET_RAW))
+		return -EACCES;
+
+	rt = mctp_route_lookup(sock_net(sk), addr->smctp_network,
+			       addr->smctp_addr.s_addr);
+	if (!rt)
+		return -EHOSTUNREACH;
+
+	skb = sock_alloc_send_skb(sk, hlen + 1 + len,
+				  msg->msg_flags & MSG_DONTWAIT, &rc);
+	if (!skb)
+		return rc;
+
+	skb_reserve(skb, hlen);
+
+	/* set type as fist byte in payload */
+	*(u8 *)skb_put(skb, 1) = addr->smctp_type;
+
+	rc = memcpy_from_msg((void *)skb_put(skb, len), msg, len);
+	if (rc < 0) {
+		kfree_skb(skb);
+		return rc;
+	}
+
+	/* set up cb */
+	cb = __mctp_cb(skb);
+	cb->net = addr->smctp_network;
+
+	rc = mctp_local_output(sk, rt, skb, addr->smctp_addr.s_addr,
+			       addr->smctp_tag);
+
+	return rc ? : len;
 }
 
 static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			int flags)
 {
-	return 0;
+	DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name);
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	size_t msglen;
+	u8 type;
+	int rc;
+
+	if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK))
+		return -EOPNOTSUPP;
+
+	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &rc);
+	if (!skb)
+		return rc;
+
+	if (!skb->len) {
+		rc = 0;
+		goto out_free;
+	}
+
+	/* extract message type, remove from data */
+	type = *((u8 *)skb->data);
+	msglen = skb->len - 1;
+
+	if (len < msglen)
+		msg->msg_flags |= MSG_TRUNC;
+	else
+		len = msglen;
+
+	rc = skb_copy_datagram_msg(skb, 1, msg, len);
+	if (rc < 0)
+		goto out_free;
+
+	sock_recv_ts_and_drops(msg, sk, skb);
+
+	if (addr) {
+		struct mctp_skb_cb *cb = mctp_cb(skb);
+		/* TODO: expand mctp_skb_cb for header fields? */
+		struct mctp_hdr *hdr = mctp_hdr(skb);
+
+		hdr = mctp_hdr(skb);
+		addr = msg->msg_name;
+		addr->smctp_family = AF_MCTP;
+		addr->smctp_network = cb->net;
+		addr->smctp_addr.s_addr = hdr->src;
+		addr->smctp_type = type;
+		addr->smctp_tag = hdr->flags_seq_tag &
+					(MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);
+		msg->msg_namelen = sizeof(*addr);
+	}
+
+	rc = len;
+
+	if (flags & MSG_TRUNC)
+		rc = msglen;
+
+out_free:
+	skb_free_datagram(sk, skb);
+	return rc;
 }
 
 static int mctp_setsockopt(struct socket *sock, int level, int optname,
@@ -83,16 +221,63 @@ static const struct proto_ops mctp_dgram_ops = {
 	.sendpage	= sock_no_sendpage,
 };
 
+static int mctp_sk_init(struct sock *sk)
+{
+	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
+
+	INIT_HLIST_HEAD(&msk->keys);
+	return 0;
+}
+
 static void mctp_sk_close(struct sock *sk, long timeout)
 {
 	sk_common_release(sk);
 }
 
+static int mctp_sk_hash(struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+
+	mutex_lock(&net->mctp.bind_lock);
+	sk_add_node_rcu(sk, &net->mctp.binds);
+	mutex_unlock(&net->mctp.bind_lock);
+
+	return 0;
+}
+
+static void mctp_sk_unhash(struct sock *sk)
+{
+	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
+	struct net *net = sock_net(sk);
+	struct mctp_sk_key *key;
+	struct hlist_node *tmp;
+	unsigned long flags;
+
+	/* remove from any type-based binds */
+	mutex_lock(&net->mctp.bind_lock);
+	sk_del_node_init_rcu(sk);
+	mutex_unlock(&net->mctp.bind_lock);
+
+	/* remove tag allocations */
+	spin_lock_irqsave(&net->mctp.keys_lock, flags);
+	hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
+		hlist_del_rcu(&key->sklist);
+		hlist_del_rcu(&key->hlist);
+		kfree_rcu(key, rcu);
+	}
+	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
+
+	synchronize_rcu();
+}
+
 static struct proto mctp_proto = {
 	.name		= "MCTP",
 	.owner		= THIS_MODULE,
 	.obj_size	= sizeof(struct mctp_sock),
+	.init		= mctp_sk_init,
 	.close		= mctp_sk_close,
+	.hash		= mctp_sk_hash,
+	.unhash		= mctp_sk_unhash,
 };
 
 static int mctp_pf_create(struct net *net, struct socket *sock,
@@ -147,6 +332,10 @@ static __init int mctp_init(void)
 {
 	int rc;
 
+	/* ensure our uapi tag definitions match the header format */
+	BUILD_BUG_ON(MCTP_TAG_OWNER != MCTP_HDR_FLAG_TO);
+	BUILD_BUG_ON(MCTP_TAG_MASK != MCTP_HDR_TAG_MASK);
+
 	pr_info("mctp: management component transport protocol core\n");
 
 	rc = sock_register(&mctp_pf);
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 31568203f9d0..cc9891672eaa 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -30,10 +30,139 @@ static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb)
 	return 0;
 }
 
+static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
+{
+	struct mctp_skb_cb *cb = mctp_cb(skb);
+	struct mctp_hdr *mh;
+	struct sock *sk;
+	u8 type;
+
+	WARN_ON(!rcu_read_lock_held());
+
+	/* TODO: look up in skb->cb? */
+	mh = mctp_hdr(skb);
+
+	if (!skb_headlen(skb))
+		return NULL;
+
+	type = (*(u8 *)skb->data) & 0x7f;
+
+	sk_for_each_rcu(sk, &net->mctp.binds) {
+		struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
+
+		if (msk->bind_net != MCTP_NET_ANY && msk->bind_net != cb->net)
+			continue;
+
+		if (msk->bind_type != type)
+			continue;
+
+		if (msk->bind_addr != MCTP_ADDR_ANY &&
+		    msk->bind_addr != mh->dest)
+			continue;
+
+		return msk;
+	}
+
+	return NULL;
+}
+
+static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local,
+			   mctp_eid_t peer, u8 tag)
+{
+	if (key->local_addr != local)
+		return false;
+
+	if (key->peer_addr != peer)
+		return false;
+
+	if (key->tag != tag)
+		return false;
+
+	return true;
+}
+
+static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb,
+					   mctp_eid_t peer)
+{
+	struct mctp_sk_key *key, *ret;
+	struct mctp_hdr *mh;
+	u8 tag;
+
+	WARN_ON(!rcu_read_lock_held());
+
+	mh = mctp_hdr(skb);
+	tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);
+
+	ret = NULL;
+
+	hlist_for_each_entry_rcu(key, &net->mctp.keys, hlist) {
+		if (mctp_key_match(key, mh->dest, peer, tag)) {
+			ret = key;
+			break;
+		}
+	}
+
+	return ret;
+}
+
 static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
 {
-	/* -> to local stack */
-	/* TODO: socket lookup, reassemble */
+	struct net *net = dev_net(skb->dev);
+	struct mctp_sk_key *key;
+	struct mctp_sock *msk;
+	struct mctp_hdr *mh;
+
+	msk = NULL;
+
+	/* we may be receiving a locally-routed packet; drop source sk
+	 * accounting
+	 */
+	skb_orphan(skb);
+
+	/* ensure we have enough data for a header and a type */
+	if (skb->len < sizeof(struct mctp_hdr) + 1)
+		goto drop;
+
+	/* grab header, advance data ptr */
+	mh = mctp_hdr(skb);
+	skb_pull(skb, sizeof(struct mctp_hdr));
+
+	if (mh->ver != 1)
+		goto drop;
+
+	/* TODO: reassembly */
+	if ((mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM))
+				!= (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM))
+		goto drop;
+
+	rcu_read_lock();
+	/* 1. lookup socket matching (src,dest,tag) */
+	key = mctp_lookup_key(net, skb, mh->src);
+
+	/* 2. lookup socket macthing (BCAST,dest,tag) */
+	if (!key)
+		key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY);
+
+	/* 3. SOM? -> lookup bound socket, conditionally (!EOM) create
+	 * mapping for future (1)/(2).
+	 */
+	if (key)
+		msk = container_of(key->sk, struct mctp_sock, sk);
+	else if (!msk && (mh->flags_seq_tag & MCTP_HDR_FLAG_SOM))
+		msk = mctp_lookup_bind(net, skb);
+
+	if (!msk)
+		goto unlock_drop;
+
+	sock_queue_rcv_skb(&msk->sk, skb);
+
+	rcu_read_unlock();
+
+	return 0;
+
+unlock_drop:
+	rcu_read_unlock();
+drop:
 	kfree_skb(skb);
 	return 0;
 }
@@ -91,6 +220,80 @@ static struct mctp_route *mctp_route_alloc(void)
 	return rt;
 }
 
+/* tag management */
+static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key,
+			     struct mctp_sock *msk)
+{
+	struct netns_mctp *mns = &net->mctp;
+
+	lockdep_assert_held(&mns->keys_lock);
+
+	key->sk = &msk->sk;
+
+	/* we hold the net->key_lock here, allowing updates to both
+	 * then net and sk
+	 */
+	hlist_add_head_rcu(&key->hlist, &mns->keys);
+	hlist_add_head_rcu(&key->sklist, &msk->keys);
+}
+
+/* Allocate a locally-owned tag value for (saddr, daddr), and reserve
+ * it for the socket msk
+ */
+static int mctp_alloc_local_tag(struct mctp_sock *msk,
+				mctp_eid_t saddr, mctp_eid_t daddr, u8 *tagp)
+{
+	struct net *net = sock_net(&msk->sk);
+	struct netns_mctp *mns = &net->mctp;
+	struct mctp_sk_key *key, *tmp;
+	unsigned long flags;
+	int rc = -EAGAIN;
+	u8 tagbits;
+
+	/* be optimistic, alloc now */
+	key = kzalloc(sizeof(*key), GFP_KERNEL);
+	if (!key)
+		return -ENOMEM;
+	key->local_addr = saddr;
+	key->peer_addr = daddr;
+
+	/* 8 possible tag values */
+	tagbits = 0xff;
+
+	spin_lock_irqsave(&mns->keys_lock, flags);
+
+	/* Walk through the existing keys, looking for potential conflicting
+	 * tags. If we find a conflict, clear that bit from tagbits
+	 */
+	hlist_for_each_entry(tmp, &mns->keys, hlist) {
+		/* if we don't own the tag, it can't conflict */
+		if (tmp->tag & MCTP_HDR_FLAG_TO)
+			continue;
+
+		if ((tmp->peer_addr == daddr ||
+		     tmp->peer_addr == MCTP_ADDR_ANY) &&
+		    tmp->local_addr == saddr)
+			tagbits &= ~(1 << tmp->tag);
+
+		if (!tagbits)
+			break;
+	}
+
+	if (tagbits) {
+		key->tag = __ffs(tagbits);
+		mctp_reserve_tag(net, key, msk);
+		*tagp = key->tag;
+		rc = 0;
+	}
+
+	spin_unlock_irqrestore(&mns->keys_lock, flags);
+
+	if (!tagbits)
+		kfree(key);
+
+	return rc;
+}
+
 /* routing lookups */
 static bool mctp_rt_match_eid(struct mctp_route *rt,
 			      unsigned int net, mctp_eid_t eid)
@@ -140,11 +343,13 @@ int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb)
 int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag)
 {
+	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
 	struct mctp_skb_cb *cb = mctp_cb(skb);
 	struct mctp_hdr *hdr;
 	unsigned long flags;
 	mctp_eid_t saddr;
 	int rc;
+	u8 tag;
 
 	if (WARN_ON(!rt->dev))
 		return -EINVAL;
@@ -162,6 +367,15 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 	if (rc)
 		return rc;
 
+	if (req_tag & MCTP_HDR_FLAG_TO) {
+		rc = mctp_alloc_local_tag(msk, saddr, daddr, &tag);
+		if (rc)
+			return rc;
+		tag |= MCTP_HDR_FLAG_TO;
+	} else {
+		tag = req_tag;
+	}
+
 	/* TODO: we have the route MTU here; packetise */
 
 	skb_reset_transport_header(skb);
@@ -171,8 +385,10 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 	hdr->ver = 1;
 	hdr->dest = daddr;
 	hdr->src = saddr;
-	hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM; /* TODO */
+	hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | /* TODO */
+		tag;
 
+	skb->dev = rt->dev->dev;
 	skb->protocol = htons(ETH_P_MCTP);
 	skb->priority = 0;
 
@@ -529,6 +745,10 @@ static int __net_init mctp_routes_net_init(struct net *net)
 	struct netns_mctp *ns = &net->mctp;
 
 	INIT_LIST_HEAD(&ns->routes);
+	INIT_HLIST_HEAD(&ns->binds);
+	mutex_init(&ns->bind_lock);
+	INIT_HLIST_HEAD(&ns->keys);
+	spin_lock_init(&ns->keys_lock);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 4a992bbd365094730a31bae1e12a6ca695336d57 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Thu, 29 Jul 2021 10:20:50 +0800
Subject: mctp: Implement message fragmentation & reassembly

This change implements MCTP fragmentation (based on route & device MTU),
and corresponding reassembly.

The MCTP specification only allows for fragmentation on the originating
message endpoint, and reassembly on the destination endpoint -
intermediate nodes do not need to reassemble/refragment.  Consequently,
we only fragment in the local transmit path, and reassemble
locally-bound packets. Messages are required to be in-order, so we
simply cancel reassembly on out-of-order or missing packets.

In the fragmentation path, we just break up the message into MTU-sized
fragments; the skb structure is a simple copy for now, which we can later
improve with a shared data implementation.

For reassembly, we keep track of incoming message fragments using the
existing tag infrastructure, allocating a key on the (src,dest,tag)
tuple, and reassembles matching fragments into a skb->frag_list.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mctp.h |  25 +++-
 net/mctp/af_mctp.c |   8 ++
 net/mctp/route.c   | 372 +++++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 361 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index f2d98f6993c0..0a460ba185b8 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -84,9 +84,21 @@ struct mctp_sock {
  *        updates to either list are performed under the netns_mctp->keys
  *        lock.
  *
- * - there is a single destruction path for a mctp_sk_key - through socket
- *   unhash (see mctp_sk_unhash). This performs the list removal under
- *   keys_lock.
+ * - a key may have a sk_buff attached as part of an in-progress message
+ *   reassembly (->reasm_head). The reassembly context is protected by
+ *   reasm_lock, which may be acquired with the keys lock (above) held, if
+ *   necessary. Consequently, keys lock *cannot* be acquired with the
+ *   reasm_lock held.
+ *
+ * - there are two destruction paths for a mctp_sk_key:
+ *
+ *    - through socket unhash (see mctp_sk_unhash). This performs the list
+ *      removal under keys_lock.
+ *
+ *    - where a key is established to receive a reply message: after receiving
+ *      the (complete) reply, or during reassembly errors. Here, we clean up
+ *      the reassembly context (marking reasm_dead, to prevent another from
+ *      starting), and remove the socket from the netns & socket lists.
  */
 struct mctp_sk_key {
 	mctp_eid_t	peer_addr;
@@ -102,6 +114,13 @@ struct mctp_sk_key {
 	/* per-socket list */
 	struct hlist_node sklist;
 
+	/* incoming fragment reassembly context */
+	spinlock_t	reasm_lock;
+	struct sk_buff	*reasm_head;
+	struct sk_buff	**reasm_tailp;
+	bool		reasm_dead;
+	u8		last_seq;
+
 	struct rcu_head	rcu;
 };
 
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 52bd7f2b78db..9ca836df19d0 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -263,6 +263,14 @@ static void mctp_sk_unhash(struct sock *sk)
 	hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
 		hlist_del_rcu(&key->sklist);
 		hlist_del_rcu(&key->hlist);
+
+		spin_lock(&key->reasm_lock);
+		if (key->reasm_head)
+			kfree_skb(key->reasm_head);
+		key->reasm_head = NULL;
+		key->reasm_dead = true;
+		spin_unlock(&key->reasm_lock);
+
 		kfree_rcu(key, rcu);
 	}
 	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
diff --git a/net/mctp/route.c b/net/mctp/route.c
index cc9891672eaa..160220e6f241 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -23,6 +23,8 @@
 #include <net/netlink.h>
 #include <net/sock.h>
 
+static const unsigned int mctp_message_maxlen = 64 * 1024;
+
 /* route output callbacks */
 static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb)
 {
@@ -105,14 +107,125 @@ static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb,
 	return ret;
 }
 
+static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk,
+					  mctp_eid_t local, mctp_eid_t peer,
+					  u8 tag, gfp_t gfp)
+{
+	struct mctp_sk_key *key;
+
+	key = kzalloc(sizeof(*key), gfp);
+	if (!key)
+		return NULL;
+
+	key->peer_addr = peer;
+	key->local_addr = local;
+	key->tag = tag;
+	key->sk = &msk->sk;
+	spin_lock_init(&key->reasm_lock);
+
+	return key;
+}
+
+static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk)
+{
+	struct net *net = sock_net(&msk->sk);
+	struct mctp_sk_key *tmp;
+	unsigned long flags;
+	int rc = 0;
+
+	spin_lock_irqsave(&net->mctp.keys_lock, flags);
+
+	hlist_for_each_entry(tmp, &net->mctp.keys, hlist) {
+		if (mctp_key_match(tmp, key->local_addr, key->peer_addr,
+				   key->tag)) {
+			rc = -EEXIST;
+			break;
+		}
+	}
+
+	if (!rc) {
+		hlist_add_head(&key->hlist, &net->mctp.keys);
+		hlist_add_head(&key->sklist, &msk->keys);
+	}
+
+	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
+
+	return rc;
+}
+
+/* Must be called with key->reasm_lock, which it will release. Will schedule
+ * the key for an RCU free.
+ */
+static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net,
+				   unsigned long flags)
+	__releases(&key->reasm_lock)
+{
+	struct sk_buff *skb;
+
+	skb = key->reasm_head;
+	key->reasm_head = NULL;
+	key->reasm_dead = true;
+	spin_unlock_irqrestore(&key->reasm_lock, flags);
+
+	spin_lock_irqsave(&net->mctp.keys_lock, flags);
+	hlist_del_rcu(&key->hlist);
+	hlist_del_rcu(&key->sklist);
+	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
+	kfree_rcu(key, rcu);
+
+	if (skb)
+		kfree_skb(skb);
+}
+
+static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb)
+{
+	struct mctp_hdr *hdr = mctp_hdr(skb);
+	u8 exp_seq, this_seq;
+
+	this_seq = (hdr->flags_seq_tag >> MCTP_HDR_SEQ_SHIFT)
+		& MCTP_HDR_SEQ_MASK;
+
+	if (!key->reasm_head) {
+		key->reasm_head = skb;
+		key->reasm_tailp = &(skb_shinfo(skb)->frag_list);
+		key->last_seq = this_seq;
+		return 0;
+	}
+
+	exp_seq = (key->last_seq + 1) & MCTP_HDR_SEQ_MASK;
+
+	if (this_seq != exp_seq)
+		return -EINVAL;
+
+	if (key->reasm_head->len + skb->len > mctp_message_maxlen)
+		return -EINVAL;
+
+	skb->next = NULL;
+	skb->sk = NULL;
+	*key->reasm_tailp = skb;
+	key->reasm_tailp = &skb->next;
+
+	key->last_seq = this_seq;
+
+	key->reasm_head->data_len += skb->len;
+	key->reasm_head->len += skb->len;
+	key->reasm_head->truesize += skb->truesize;
+
+	return 0;
+}
+
 static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	struct mctp_sk_key *key;
 	struct mctp_sock *msk;
 	struct mctp_hdr *mh;
+	unsigned long f;
+	u8 tag, flags;
+	int rc;
 
 	msk = NULL;
+	rc = -EINVAL;
 
 	/* we may be receiving a locally-routed packet; drop source sk
 	 * accounting
@@ -121,50 +234,144 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
 
 	/* ensure we have enough data for a header and a type */
 	if (skb->len < sizeof(struct mctp_hdr) + 1)
-		goto drop;
+		goto out;
 
 	/* grab header, advance data ptr */
 	mh = mctp_hdr(skb);
 	skb_pull(skb, sizeof(struct mctp_hdr));
 
 	if (mh->ver != 1)
-		goto drop;
+		goto out;
 
-	/* TODO: reassembly */
-	if ((mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM))
-				!= (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM))
-		goto drop;
+	flags = mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM);
+	tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);
 
 	rcu_read_lock();
-	/* 1. lookup socket matching (src,dest,tag) */
+
+	/* lookup socket / reasm context, exactly matching (src,dest,tag) */
 	key = mctp_lookup_key(net, skb, mh->src);
 
-	/* 2. lookup socket macthing (BCAST,dest,tag) */
-	if (!key)
-		key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY);
+	if (flags & MCTP_HDR_FLAG_SOM) {
+		if (key) {
+			msk = container_of(key->sk, struct mctp_sock, sk);
+		} else {
+			/* first response to a broadcast? do a more general
+			 * key lookup to find the socket, but don't use this
+			 * key for reassembly - we'll create a more specific
+			 * one for future packets if required (ie, !EOM).
+			 */
+			key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY);
+			if (key) {
+				msk = container_of(key->sk,
+						   struct mctp_sock, sk);
+				key = NULL;
+			}
+		}
 
-	/* 3. SOM? -> lookup bound socket, conditionally (!EOM) create
-	 * mapping for future (1)/(2).
-	 */
-	if (key)
-		msk = container_of(key->sk, struct mctp_sock, sk);
-	else if (!msk && (mh->flags_seq_tag & MCTP_HDR_FLAG_SOM))
-		msk = mctp_lookup_bind(net, skb);
+		if (!key && !msk && (tag & MCTP_HDR_FLAG_TO))
+			msk = mctp_lookup_bind(net, skb);
 
-	if (!msk)
-		goto unlock_drop;
+		if (!msk) {
+			rc = -ENOENT;
+			goto out_unlock;
+		}
 
-	sock_queue_rcv_skb(&msk->sk, skb);
+		/* single-packet message? deliver to socket, clean up any
+		 * pending key.
+		 */
+		if (flags & MCTP_HDR_FLAG_EOM) {
+			sock_queue_rcv_skb(&msk->sk, skb);
+			if (key) {
+				spin_lock_irqsave(&key->reasm_lock, f);
+				/* we've hit a pending reassembly; not much we
+				 * can do but drop it
+				 */
+				__mctp_key_unlock_drop(key, net, f);
+			}
+			rc = 0;
+			goto out_unlock;
+		}
 
-	rcu_read_unlock();
+		/* broadcast response or a bind() - create a key for further
+		 * packets for this message
+		 */
+		if (!key) {
+			key = mctp_key_alloc(msk, mh->dest, mh->src,
+					     tag, GFP_ATOMIC);
+			if (!key) {
+				rc = -ENOMEM;
+				goto out_unlock;
+			}
 
-	return 0;
+			/* we can queue without the reasm lock here, as the
+			 * key isn't observable yet
+			 */
+			mctp_frag_queue(key, skb);
+
+			/* if the key_add fails, we've raced with another
+			 * SOM packet with the same src, dest and tag. There's
+			 * no way to distinguish future packets, so all we
+			 * can do is drop; we'll free the skb on exit from
+			 * this function.
+			 */
+			rc = mctp_key_add(key, msk);
+			if (rc)
+				kfree(key);
+
+		} else {
+			/* existing key: start reassembly */
+			spin_lock_irqsave(&key->reasm_lock, f);
+
+			if (key->reasm_head || key->reasm_dead) {
+				/* duplicate start? drop everything */
+				__mctp_key_unlock_drop(key, net, f);
+				rc = -EEXIST;
+			} else {
+				rc = mctp_frag_queue(key, skb);
+				spin_unlock_irqrestore(&key->reasm_lock, f);
+			}
+		}
+
+	} else if (key) {
+		/* this packet continues a previous message; reassemble
+		 * using the message-specific key
+		 */
+
+		spin_lock_irqsave(&key->reasm_lock, f);
+
+		/* we need to be continuing an existing reassembly... */
+		if (!key->reasm_head)
+			rc = -EINVAL;
+		else
+			rc = mctp_frag_queue(key, skb);
+
+		/* end of message? deliver to socket, and we're done with
+		 * the reassembly/response key
+		 */
+		if (!rc && flags & MCTP_HDR_FLAG_EOM) {
+			sock_queue_rcv_skb(key->sk, key->reasm_head);
+			key->reasm_head = NULL;
+			__mctp_key_unlock_drop(key, net, f);
+		} else {
+			spin_unlock_irqrestore(&key->reasm_lock, f);
+		}
+
+	} else {
+		/* not a start, no matching key */
+		rc = -ENOENT;
+	}
 
-unlock_drop:
+out_unlock:
 	rcu_read_unlock();
-drop:
-	kfree_skb(skb);
-	return 0;
+out:
+	if (rc)
+		kfree_skb(skb);
+	return rc;
+}
+
+static unsigned int mctp_route_mtu(struct mctp_route *rt)
+{
+	return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu);
 }
 
 static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb)
@@ -228,8 +435,6 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key,
 
 	lockdep_assert_held(&mns->keys_lock);
 
-	key->sk = &msk->sk;
-
 	/* we hold the net->key_lock here, allowing updates to both
 	 * then net and sk
 	 */
@@ -251,11 +456,9 @@ static int mctp_alloc_local_tag(struct mctp_sock *msk,
 	u8 tagbits;
 
 	/* be optimistic, alloc now */
-	key = kzalloc(sizeof(*key), GFP_KERNEL);
+	key = mctp_key_alloc(msk, saddr, daddr, 0, GFP_KERNEL);
 	if (!key)
 		return -ENOMEM;
-	key->local_addr = saddr;
-	key->peer_addr = daddr;
 
 	/* 8 possible tag values */
 	tagbits = 0xff;
@@ -340,6 +543,86 @@ int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb)
 	return rc;
 }
 
+static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb,
+				  unsigned int mtu, u8 tag)
+{
+	const unsigned int hlen = sizeof(struct mctp_hdr);
+	struct mctp_hdr *hdr, *hdr2;
+	unsigned int pos, size;
+	struct sk_buff *skb2;
+	int rc;
+	u8 seq;
+
+	hdr = mctp_hdr(skb);
+	seq = 0;
+	rc = 0;
+
+	if (mtu < hlen + 1) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	/* we've got the header */
+	skb_pull(skb, hlen);
+
+	for (pos = 0; pos < skb->len;) {
+		/* size of message payload */
+		size = min(mtu - hlen, skb->len - pos);
+
+		skb2 = alloc_skb(MCTP_HEADER_MAXLEN + hlen + size, GFP_KERNEL);
+		if (!skb2) {
+			rc = -ENOMEM;
+			break;
+		}
+
+		/* generic skb copy */
+		skb2->protocol = skb->protocol;
+		skb2->priority = skb->priority;
+		skb2->dev = skb->dev;
+		memcpy(skb2->cb, skb->cb, sizeof(skb2->cb));
+
+		if (skb->sk)
+			skb_set_owner_w(skb2, skb->sk);
+
+		/* establish packet */
+		skb_reserve(skb2, MCTP_HEADER_MAXLEN);
+		skb_reset_network_header(skb2);
+		skb_put(skb2, hlen + size);
+		skb2->transport_header = skb2->network_header + hlen;
+
+		/* copy header fields, calculate SOM/EOM flags & seq */
+		hdr2 = mctp_hdr(skb2);
+		hdr2->ver = hdr->ver;
+		hdr2->dest = hdr->dest;
+		hdr2->src = hdr->src;
+		hdr2->flags_seq_tag = tag &
+			(MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);
+
+		if (pos == 0)
+			hdr2->flags_seq_tag |= MCTP_HDR_FLAG_SOM;
+
+		if (pos + size == skb->len)
+			hdr2->flags_seq_tag |= MCTP_HDR_FLAG_EOM;
+
+		hdr2->flags_seq_tag |= seq << MCTP_HDR_SEQ_SHIFT;
+
+		/* copy message payload */
+		skb_copy_bits(skb, pos, skb_transport_header(skb2), size);
+
+		/* do route, but don't drop the rt reference */
+		rc = rt->output(rt, skb2);
+		if (rc)
+			break;
+
+		seq = (seq + 1) & MCTP_HDR_SEQ_MASK;
+		pos += size;
+	}
+
+	mctp_route_release(rt);
+	consume_skb(skb);
+	return rc;
+}
+
 int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag)
 {
@@ -347,6 +630,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 	struct mctp_skb_cb *cb = mctp_cb(skb);
 	struct mctp_hdr *hdr;
 	unsigned long flags;
+	unsigned int mtu;
 	mctp_eid_t saddr;
 	int rc;
 	u8 tag;
@@ -376,26 +660,32 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 		tag = req_tag;
 	}
 
-	/* TODO: we have the route MTU here; packetise */
 
+	skb->protocol = htons(ETH_P_MCTP);
+	skb->priority = 0;
 	skb_reset_transport_header(skb);
 	skb_push(skb, sizeof(struct mctp_hdr));
 	skb_reset_network_header(skb);
+	skb->dev = rt->dev->dev;
+
+	/* cb->net will have been set on initial ingress */
+	cb->src = saddr;
+
+	/* set up common header fields */
 	hdr = mctp_hdr(skb);
 	hdr->ver = 1;
 	hdr->dest = daddr;
 	hdr->src = saddr;
-	hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | /* TODO */
-		tag;
 
-	skb->dev = rt->dev->dev;
-	skb->protocol = htons(ETH_P_MCTP);
-	skb->priority = 0;
+	mtu = mctp_route_mtu(rt);
 
-	/* cb->net will have been set on initial ingress */
-	cb->src = saddr;
-
-	return mctp_do_route(rt, skb);
+	if (skb->len + sizeof(struct mctp_hdr) <= mtu) {
+		hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM |
+			tag;
+		return mctp_do_route(rt, skb);
+	} else {
+		return mctp_do_fragment_route(rt, skb, mtu, tag);
+	}
 }
 
 /* route management */
-- 
cgit v1.2.3


From 03f2bbc4ee57ca53b2fa1d9caabc5006e0b8f375 Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@codeconstruct.com.au>
Date: Thu, 29 Jul 2021 10:20:52 +0800
Subject: mctp: Allow per-netns default networks

Currently we have a compile-time default network
(MCTP_INITIAL_DEFAULT_NET). This change introduces a default_net field
on the net namespace, allowing future configuration for new interfaces.

Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mctp.h        |  2 ++
 include/net/netns/mctp.h  |  3 +++
 include/uapi/linux/mctp.h |  1 -
 net/mctp/af_mctp.c        |  3 +++
 net/mctp/device.c         |  2 +-
 net/mctp/route.c          | 14 ++++++++++++++
 6 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index 0a460ba185b8..54bbe042c973 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -37,6 +37,8 @@ struct mctp_hdr {
 
 #define MCTP_HEADER_MAXLEN	4
 
+#define MCTP_INITIAL_DEFAULT_NET	1
+
 static inline bool mctp_address_ok(mctp_eid_t eid)
 {
 	return eid >= 8 && eid < 255;
diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h
index 14ae6d37e52a..acedef12a35e 100644
--- a/include/net/netns/mctp.h
+++ b/include/net/netns/mctp.h
@@ -25,6 +25,9 @@ struct netns_mctp {
 	spinlock_t keys_lock;
 	struct hlist_head keys;
 
+	/* MCTP network */
+	unsigned int default_net;
+
 	/* neighbour table */
 	struct mutex neigh_lock;
 	struct list_head neighbours;
diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h
index a9d8edb3402b..52b54d13f385 100644
--- a/include/uapi/linux/mctp.h
+++ b/include/uapi/linux/mctp.h
@@ -26,7 +26,6 @@ struct sockaddr_mctp {
 };
 
 #define MCTP_NET_ANY		0x0
-#define MCTP_NET_DEFAULT	0x0
 
 #define MCTP_ADDR_NULL		0x00
 #define MCTP_ADDR_ANY		0xff
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 9ca836df19d0..84f722d31fd7 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -94,6 +94,9 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	if (!capable(CAP_NET_RAW))
 		return -EACCES;
 
+	if (addr->smctp_network == MCTP_NET_ANY)
+		addr->smctp_network = mctp_default_net(sock_net(sk));
+
 	rt = mctp_route_lookup(sock_net(sk), addr->smctp_network,
 			       addr->smctp_addr.s_addr);
 	if (!rt)
diff --git a/net/mctp/device.c b/net/mctp/device.c
index aa049590acda..b9f38e765f61 100644
--- a/net/mctp/device.c
+++ b/net/mctp/device.c
@@ -267,7 +267,7 @@ static struct mctp_dev *mctp_add_dev(struct net_device *dev)
 
 	spin_lock_init(&mdev->addrs_lock);
 
-	mdev->net = MCTP_INITIAL_DEFAULT_NET;
+	mdev->net = mctp_default_net(dev_net(dev));
 
 	/* associate to net_device */
 	rcu_assign_pointer(dev->mctp_ptr, mdev);
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 38f0a7278520..b3101375c8e7 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -433,6 +433,19 @@ static struct mctp_route *mctp_route_alloc(void)
 	return rt;
 }
 
+unsigned int mctp_default_net(struct net *net)
+{
+	return READ_ONCE(net->mctp.default_net);
+}
+
+int mctp_default_net_set(struct net *net, unsigned int index)
+{
+	if (index == 0)
+		return -EINVAL;
+	WRITE_ONCE(net->mctp.default_net, index);
+	return 0;
+}
+
 /* tag management */
 static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key,
 			     struct mctp_sock *msk)
@@ -1045,6 +1058,7 @@ static int __net_init mctp_routes_net_init(struct net *net)
 	mutex_init(&ns->bind_lock);
 	INIT_HLIST_HEAD(&ns->keys);
 	spin_lock_init(&ns->keys_lock);
+	WARN_ON(mctp_default_net_set(net, MCTP_INITIAL_DEFAULT_NET));
 	return 0;
 }
 
-- 
cgit v1.2.3


From 3aa2605594556c676fb88744bd9845acae60683d Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Wed, 28 Jul 2021 20:08:00 +0200
Subject: net/sched: store the last executed chain also for clsact egress

currently, only 'ingress' and 'clsact ingress' qdiscs store the tc 'chain
id' in the skb extension. However, userspace programs (like ovs) are able
to setup egress rules, and datapath gets confused in case it doesn't find
the 'chain id' for a packet that's "recirculated" by tc.
Change tcf_classify() to have the same semantic as tcf_classify_ingress()
so that a single function can be called in ingress / egress, using the tc
ingress / egress block respectively.

Suggested-by: Alaa Hleilel <alaa@nvidia.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h    | 22 +++++++---------------
 net/core/dev.c           |  5 ++---
 net/sched/cls_api.c      | 42 +++++++++++++++++-------------------------
 net/sched/sch_atm.c      |  2 +-
 net/sched/sch_cake.c     |  2 +-
 net/sched/sch_cbq.c      |  2 +-
 net/sched/sch_drr.c      |  2 +-
 net/sched/sch_dsmark.c   |  2 +-
 net/sched/sch_ets.c      |  2 +-
 net/sched/sch_fq_codel.c |  2 +-
 net/sched/sch_fq_pie.c   |  2 +-
 net/sched/sch_hfsc.c     |  2 +-
 net/sched/sch_htb.c      |  2 +-
 net/sched/sch_multiq.c   |  2 +-
 net/sched/sch_prio.c     |  2 +-
 net/sched/sch_qfq.c      |  2 +-
 net/sched/sch_sfb.c      |  2 +-
 net/sched/sch_sfq.c      |  2 +-
 18 files changed, 41 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index ec7823921bd2..dc28fcb6f0a2 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -76,12 +76,10 @@ static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 	return block->q;
 }
 
-int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-		 struct tcf_result *res, bool compat_mode);
-int tcf_classify_ingress(struct sk_buff *skb,
-			 const struct tcf_block *ingress_block,
-			 const struct tcf_proto *tp, struct tcf_result *res,
-			 bool compat_mode);
+int tcf_classify(struct sk_buff *skb,
+		 const struct tcf_block *block,
+		 const struct tcf_proto *tp, struct tcf_result *res,
+		 bool compat_mode);
 
 #else
 static inline bool tcf_block_shared(struct tcf_block *block)
@@ -138,20 +136,14 @@ void tc_setup_cb_block_unregister(struct tcf_block *block, flow_setup_cb_t *cb,
 {
 }
 
-static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+static inline int tcf_classify(struct sk_buff *skb,
+			       const struct tcf_block *block,
+			       const struct tcf_proto *tp,
 			       struct tcf_result *res, bool compat_mode)
 {
 	return TC_ACT_UNSPEC;
 }
 
-static inline int tcf_classify_ingress(struct sk_buff *skb,
-				       const struct tcf_block *ingress_block,
-				       const struct tcf_proto *tp,
-				       struct tcf_result *res, bool compat_mode)
-{
-	return TC_ACT_UNSPEC;
-}
-
 #endif
 
 static inline unsigned long
diff --git a/net/core/dev.c b/net/core/dev.c
index dcc87fcd64ba..b51e41d0a7fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4012,7 +4012,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 	qdisc_skb_cb(skb)->post_ct = false;
 	mini_qdisc_bstats_cpu_update(miniq, skb);
 
-	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
+	switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
 	case TC_ACT_OK:
 	case TC_ACT_RECLASSIFY:
 		skb->tc_index = TC_H_MIN(cl_res.classid);
@@ -5164,8 +5164,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 	skb->tc_at_ingress = 1;
 	mini_qdisc_bstats_cpu_update(miniq, skb);
 
-	switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
-				     &cl_res, false)) {
+	switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
 	case TC_ACT_OK:
 	case TC_ACT_RECLASSIFY:
 		skb->tc_index = TC_H_MIN(cl_res.classid);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1167cd0be179..7be5b9d2aead 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1577,20 +1577,10 @@ reset:
 #endif
 }
 
-int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+int tcf_classify(struct sk_buff *skb,
+		 const struct tcf_block *block,
+		 const struct tcf_proto *tp,
 		 struct tcf_result *res, bool compat_mode)
-{
-	u32 last_executed_chain = 0;
-
-	return __tcf_classify(skb, tp, tp, res, compat_mode,
-			      &last_executed_chain);
-}
-EXPORT_SYMBOL(tcf_classify);
-
-int tcf_classify_ingress(struct sk_buff *skb,
-			 const struct tcf_block *ingress_block,
-			 const struct tcf_proto *tp,
-			 struct tcf_result *res, bool compat_mode)
 {
 #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
 	u32 last_executed_chain = 0;
@@ -1603,20 +1593,22 @@ int tcf_classify_ingress(struct sk_buff *skb,
 	struct tc_skb_ext *ext;
 	int ret;
 
-	ext = skb_ext_find(skb, TC_SKB_EXT);
+	if (block) {
+		ext = skb_ext_find(skb, TC_SKB_EXT);
 
-	if (ext && ext->chain) {
-		struct tcf_chain *fchain;
+		if (ext && ext->chain) {
+			struct tcf_chain *fchain;
 
-		fchain = tcf_chain_lookup_rcu(ingress_block, ext->chain);
-		if (!fchain)
-			return TC_ACT_SHOT;
+			fchain = tcf_chain_lookup_rcu(block, ext->chain);
+			if (!fchain)
+				return TC_ACT_SHOT;
 
-		/* Consume, so cloned/redirect skbs won't inherit ext */
-		skb_ext_del(skb, TC_SKB_EXT);
+			/* Consume, so cloned/redirect skbs won't inherit ext */
+			skb_ext_del(skb, TC_SKB_EXT);
 
-		tp = rcu_dereference_bh(fchain->filter_chain);
-		last_executed_chain = fchain->index;
+			tp = rcu_dereference_bh(fchain->filter_chain);
+			last_executed_chain = fchain->index;
+		}
 	}
 
 	ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode,
@@ -1635,7 +1627,7 @@ int tcf_classify_ingress(struct sk_buff *skb,
 	return ret;
 #endif
 }
-EXPORT_SYMBOL(tcf_classify_ingress);
+EXPORT_SYMBOL(tcf_classify);
 
 struct tcf_chain_info {
 	struct tcf_proto __rcu **pprev;
@@ -3825,7 +3817,7 @@ struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, stru
 
 	fl = rcu_dereference_bh(qe->filter_chain);
 
-	switch (tcf_classify(skb, fl, &cl_res, false)) {
+	switch (tcf_classify(skb, NULL, fl, &cl_res, false)) {
 	case TC_ACT_SHOT:
 		qdisc_qstats_drop(sch);
 		__qdisc_drop(skb, to_free);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index d0c9a57398fc..7d8518176b45 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -394,7 +394,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		list_for_each_entry(flow, &p->flows, list) {
 			fl = rcu_dereference_bh(flow->filter_list);
 			if (fl) {
-				result = tcf_classify(skb, fl, &res, true);
+				result = tcf_classify(skb, NULL, fl, &res, true);
 				if (result < 0)
 					continue;
 				flow = (struct atm_flow_data *)res.class;
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 951542843cab..ecc5c4d93779 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1665,7 +1665,7 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
 		goto hash;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tcf_classify(skb, filter, &res, false);
+	result = tcf_classify(skb, NULL, filter, &res, false);
 
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index b79a7e27bb31..2dabaffd39d0 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -228,7 +228,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 		/*
 		 * Step 2+n. Apply classifier.
 		 */
-		result = tcf_classify(skb, fl, &res, true);
+		result = tcf_classify(skb, NULL, fl, &res, true);
 		if (!fl || result < 0)
 			goto fallback;
 
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index fc1e47069593..642cd179b7a7 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -317,7 +317,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	fl = rcu_dereference_bh(q->filter_list);
-	result = tcf_classify(skb, fl, &res, false);
+	result = tcf_classify(skb, NULL, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index d320bcfb2da2..4c100d105269 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -242,7 +242,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	else {
 		struct tcf_result res;
 		struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
-		int result = tcf_classify(skb, fl, &res, false);
+		int result = tcf_classify(skb, NULL, fl, &res, false);
 
 		pr_debug("result %d class 0x%04x\n", result, res.classid);
 
diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
index c1e84d1eeaba..925924fab1ab 100644
--- a/net/sched/sch_ets.c
+++ b/net/sched/sch_ets.c
@@ -390,7 +390,7 @@ static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch,
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	if (TC_H_MAJ(skb->priority) != sch->handle) {
 		fl = rcu_dereference_bh(q->filter_list);
-		err = tcf_classify(skb, fl, &res, false);
+		err = tcf_classify(skb, NULL, fl, &res, false);
 #ifdef CONFIG_NET_CLS_ACT
 		switch (err) {
 		case TC_ACT_STOLEN:
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index bbd5f8753600..c4afdd026f51 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -91,7 +91,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
 		return fq_codel_hash(q, skb) + 1;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tcf_classify(skb, filter, &res, false);
+	result = tcf_classify(skb, NULL, filter, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index cac684952edc..830f3559f727 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -94,7 +94,7 @@ static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
 		return fq_pie_hash(q, skb) + 1;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tcf_classify(skb, filter, &res, false);
+	result = tcf_classify(skb, NULL, filter, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index bf0034c66e35..b7ac30cca035 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1130,7 +1130,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	head = &q->root;
 	tcf = rcu_dereference_bh(q->root.filter_list);
-	while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
+	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
 		case TC_ACT_QUEUED:
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 5f7ac27a5264..81ea8332547a 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -238,7 +238,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 	}
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
+	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
 		case TC_ACT_QUEUED:
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 5c27b4270b90..e282e7382117 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -36,7 +36,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	int err;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	err = tcf_classify(skb, fl, &res, false);
+	err = tcf_classify(skb, NULL, fl, &res, false);
 #ifdef CONFIG_NET_CLS_ACT
 	switch (err) {
 	case TC_ACT_STOLEN:
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 3eabb871a1d5..03fdf31ccb6a 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -39,7 +39,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	if (TC_H_MAJ(skb->priority) != sch->handle) {
 		fl = rcu_dereference_bh(q->filter_list);
-		err = tcf_classify(skb, fl, &res, false);
+		err = tcf_classify(skb, NULL, fl, &res, false);
 #ifdef CONFIG_NET_CLS_ACT
 		switch (err) {
 		case TC_ACT_STOLEN:
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index b692a0de1ad5..58a9d42b52b8 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -690,7 +690,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	fl = rcu_dereference_bh(q->filter_list);
-	result = tcf_classify(skb, fl, &res, false);
+	result = tcf_classify(skb, NULL, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index dde829d4b9f8..3d061a13d7ed 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -257,7 +257,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
 	struct tcf_result res;
 	int result;
 
-	result = tcf_classify(skb, fl, &res, false);
+	result = tcf_classify(skb, NULL, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 066754a18569..f8e569f79f13 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -178,7 +178,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 		return sfq_hash(q, skb) + 1;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tcf_classify(skb, fl, &res, false);
+	result = tcf_classify(skb, NULL, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
-- 
cgit v1.2.3


From 79976892f7ea37f44f8bcfb6d266954e8ae0124d Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Thu, 29 Jul 2021 15:13:50 +0800
Subject: net: convert fib_treeref from int to refcount_t

refcount_t type should be used instead of int when fib_treeref is used as
a reference counter,and avoid use-after-free risks.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20210729071350.28919-1-yajun.deng@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dn_fib.h     | 2 +-
 include/net/ip_fib.h     | 2 +-
 net/decnet/dn_fib.c      | 6 +++---
 net/ipv4/fib_semantics.c | 8 ++++----
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/dn_fib.h b/include/net/dn_fib.h
index ccc6e9df178b..ddd6565957b3 100644
--- a/include/net/dn_fib.h
+++ b/include/net/dn_fib.h
@@ -29,7 +29,7 @@ struct dn_fib_nh {
 struct dn_fib_info {
 	struct dn_fib_info	*fib_next;
 	struct dn_fib_info	*fib_prev;
-	int 			fib_treeref;
+	refcount_t		fib_treeref;
 	refcount_t		fib_clntref;
 	int			fib_dead;
 	unsigned int		fib_flags;
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 3ab2563b1a23..21c5386d4a6d 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -133,7 +133,7 @@ struct fib_info {
 	struct hlist_node	fib_lhash;
 	struct list_head	nh_list;
 	struct net		*fib_net;
-	int			fib_treeref;
+	refcount_t		fib_treeref;
 	refcount_t		fib_clntref;
 	unsigned int		fib_flags;
 	unsigned char		fib_dead;
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 77fbf8e9df4b..387a7e81dd00 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -102,7 +102,7 @@ void dn_fib_free_info(struct dn_fib_info *fi)
 void dn_fib_release_info(struct dn_fib_info *fi)
 {
 	spin_lock(&dn_fib_info_lock);
-	if (fi && --fi->fib_treeref == 0) {
+	if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
 		if (fi->fib_next)
 			fi->fib_next->fib_prev = fi->fib_prev;
 		if (fi->fib_prev)
@@ -385,11 +385,11 @@ link_it:
 	if ((ofi = dn_fib_find_info(fi)) != NULL) {
 		fi->fib_dead = 1;
 		dn_fib_free_info(fi);
-		ofi->fib_treeref++;
+		refcount_inc(&ofi->fib_treeref);
 		return ofi;
 	}
 
-	fi->fib_treeref++;
+	refcount_inc(&fi->fib_treeref);
 	refcount_set(&fi->fib_clntref, 1);
 	spin_lock(&dn_fib_info_lock);
 	fi->fib_next = dn_fib_info_list;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4c0c33e4710d..fa19f4cdf3a4 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(free_fib_info);
 void fib_release_info(struct fib_info *fi)
 {
 	spin_lock_bh(&fib_info_lock);
-	if (fi && --fi->fib_treeref == 0) {
+	if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
 		hlist_del(&fi->fib_hash);
 		if (fi->fib_prefsrc)
 			hlist_del(&fi->fib_lhash);
@@ -1373,7 +1373,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 		if (!cfg->fc_mx) {
 			fi = fib_find_info_nh(net, cfg);
 			if (fi) {
-				fi->fib_treeref++;
+				refcount_inc(&fi->fib_treeref);
 				return fi;
 			}
 		}
@@ -1547,11 +1547,11 @@ link_it:
 	if (ofi) {
 		fi->fib_dead = 1;
 		free_fib_info(fi);
-		ofi->fib_treeref++;
+		refcount_inc(&ofi->fib_treeref);
 		return ofi;
 	}
 
-	fi->fib_treeref++;
+	refcount_inc(&fi->fib_treeref);
 	refcount_set(&fi->fib_clntref, 1);
 	spin_lock_bh(&fib_info_lock);
 	hlist_add_head(&fi->fib_hash,
-- 
cgit v1.2.3


From 3df40eb3a2ea58bf404a38f15a7a2768e4762cb0 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Fri, 30 Jul 2021 16:41:59 +0200
Subject: nfc: constify several pointers to u8, char and sk_buff

Several functions receive pointers to u8, char or sk_buff but do not
modify the contents so make them const.  This allows doing the same for
local variables and in total makes the code a little bit safer.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/nfc/nfc.h   |  4 ++--
 net/nfc/core.c          |  4 ++--
 net/nfc/hci/llc_shdlc.c | 10 +++++-----
 net/nfc/llcp.h          |  8 ++++----
 net/nfc/llcp_commands.c | 46 +++++++++++++++++++++++++---------------------
 net/nfc/llcp_core.c     | 44 ++++++++++++++++++++++++--------------------
 net/nfc/nfc.h           |  2 +-
 7 files changed, 63 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 901779138e2b..5dee575fbe86 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -266,7 +266,7 @@ struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk,
 struct sk_buff *nfc_alloc_recv_skb(unsigned int size, gfp_t gfp);
 
 int nfc_set_remote_general_bytes(struct nfc_dev *dev,
-				 u8 *gt, u8 gt_len);
+				 const u8 *gt, u8 gt_len);
 u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len);
 
 int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
@@ -280,7 +280,7 @@ int nfc_dep_link_is_up(struct nfc_dev *dev, u32 target_idx,
 		       u8 comm_mode, u8 rf_mode);
 
 int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode,
-		     u8 *gb, size_t gb_len);
+		     const u8 *gb, size_t gb_len);
 int nfc_tm_deactivated(struct nfc_dev *dev);
 int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb);
 
diff --git a/net/nfc/core.c b/net/nfc/core.c
index 6ade54149b73..08182e209144 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -636,7 +636,7 @@ error:
 	return rc;
 }
 
-int nfc_set_remote_general_bytes(struct nfc_dev *dev, u8 *gb, u8 gb_len)
+int nfc_set_remote_general_bytes(struct nfc_dev *dev, const u8 *gb, u8 gb_len)
 {
 	pr_debug("dev_name=%s gb_len=%d\n", dev_name(&dev->dev), gb_len);
 
@@ -665,7 +665,7 @@ int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb)
 EXPORT_SYMBOL(nfc_tm_data_received);
 
 int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode,
-		     u8 *gb, size_t gb_len)
+		     const u8 *gb, size_t gb_len)
 {
 	int rc;
 
diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c
index 6b747856d095..aef750d7787c 100644
--- a/net/nfc/hci/llc_shdlc.c
+++ b/net/nfc/hci/llc_shdlc.c
@@ -123,7 +123,7 @@ static bool llc_shdlc_x_lteq_y_lt_z(int x, int y, int z)
 		return ((y >= x) || (y < z)) ? true : false;
 }
 
-static struct sk_buff *llc_shdlc_alloc_skb(struct llc_shdlc *shdlc,
+static struct sk_buff *llc_shdlc_alloc_skb(const struct llc_shdlc *shdlc,
 					   int payload_len)
 {
 	struct sk_buff *skb;
@@ -137,7 +137,7 @@ static struct sk_buff *llc_shdlc_alloc_skb(struct llc_shdlc *shdlc,
 }
 
 /* immediately sends an S frame. */
-static int llc_shdlc_send_s_frame(struct llc_shdlc *shdlc,
+static int llc_shdlc_send_s_frame(const struct llc_shdlc *shdlc,
 				  enum sframe_type sframe_type, int nr)
 {
 	int r;
@@ -159,7 +159,7 @@ static int llc_shdlc_send_s_frame(struct llc_shdlc *shdlc,
 }
 
 /* immediately sends an U frame. skb may contain optional payload */
-static int llc_shdlc_send_u_frame(struct llc_shdlc *shdlc,
+static int llc_shdlc_send_u_frame(const struct llc_shdlc *shdlc,
 				  struct sk_buff *skb,
 				  enum uframe_modifier uframe_modifier)
 {
@@ -361,7 +361,7 @@ static void llc_shdlc_connect_complete(struct llc_shdlc *shdlc, int r)
 	wake_up(shdlc->connect_wq);
 }
 
-static int llc_shdlc_connect_initiate(struct llc_shdlc *shdlc)
+static int llc_shdlc_connect_initiate(const struct llc_shdlc *shdlc)
 {
 	struct sk_buff *skb;
 
@@ -377,7 +377,7 @@ static int llc_shdlc_connect_initiate(struct llc_shdlc *shdlc)
 	return llc_shdlc_send_u_frame(shdlc, skb, U_FRAME_RSET);
 }
 
-static int llc_shdlc_connect_send_ua(struct llc_shdlc *shdlc)
+static int llc_shdlc_connect_send_ua(const struct llc_shdlc *shdlc)
 {
 	struct sk_buff *skb;
 
diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h
index 97853c9cefc7..d49d4bf2e37c 100644
--- a/net/nfc/llcp.h
+++ b/net/nfc/llcp.h
@@ -221,15 +221,15 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *sk, struct socket *newsock);
 
 /* TLV API */
 int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local,
-			  u8 *tlv_array, u16 tlv_array_len);
+			  const u8 *tlv_array, u16 tlv_array_len);
 int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock,
-				  u8 *tlv_array, u16 tlv_array_len);
+				  const u8 *tlv_array, u16 tlv_array_len);
 
 /* Commands API */
 void nfc_llcp_recv(void *data, struct sk_buff *skb, int err);
-u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length);
+u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length);
 struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap);
-struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri,
+struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri,
 						  size_t uri_len);
 void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp);
 void nfc_llcp_free_sdp_tlv_list(struct hlist_head *sdp_head);
diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c
index 475061c79c44..3c4172a5aeb5 100644
--- a/net/nfc/llcp_commands.c
+++ b/net/nfc/llcp_commands.c
@@ -15,7 +15,7 @@
 #include "nfc.h"
 #include "llcp.h"
 
-static u8 llcp_tlv_length[LLCP_TLV_MAX] = {
+static const u8 llcp_tlv_length[LLCP_TLV_MAX] = {
 	0,
 	1, /* VERSION */
 	2, /* MIUX */
@@ -29,7 +29,7 @@ static u8 llcp_tlv_length[LLCP_TLV_MAX] = {
 
 };
 
-static u8 llcp_tlv8(u8 *tlv, u8 type)
+static u8 llcp_tlv8(const u8 *tlv, u8 type)
 {
 	if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]])
 		return 0;
@@ -37,7 +37,7 @@ static u8 llcp_tlv8(u8 *tlv, u8 type)
 	return tlv[2];
 }
 
-static u16 llcp_tlv16(u8 *tlv, u8 type)
+static u16 llcp_tlv16(const u8 *tlv, u8 type)
 {
 	if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]])
 		return 0;
@@ -46,37 +46,37 @@ static u16 llcp_tlv16(u8 *tlv, u8 type)
 }
 
 
-static u8 llcp_tlv_version(u8 *tlv)
+static u8 llcp_tlv_version(const u8 *tlv)
 {
 	return llcp_tlv8(tlv, LLCP_TLV_VERSION);
 }
 
-static u16 llcp_tlv_miux(u8 *tlv)
+static u16 llcp_tlv_miux(const u8 *tlv)
 {
 	return llcp_tlv16(tlv, LLCP_TLV_MIUX) & 0x7ff;
 }
 
-static u16 llcp_tlv_wks(u8 *tlv)
+static u16 llcp_tlv_wks(const u8 *tlv)
 {
 	return llcp_tlv16(tlv, LLCP_TLV_WKS);
 }
 
-static u16 llcp_tlv_lto(u8 *tlv)
+static u16 llcp_tlv_lto(const u8 *tlv)
 {
 	return llcp_tlv8(tlv, LLCP_TLV_LTO);
 }
 
-static u8 llcp_tlv_opt(u8 *tlv)
+static u8 llcp_tlv_opt(const u8 *tlv)
 {
 	return llcp_tlv8(tlv, LLCP_TLV_OPT);
 }
 
-static u8 llcp_tlv_rw(u8 *tlv)
+static u8 llcp_tlv_rw(const u8 *tlv)
 {
 	return llcp_tlv8(tlv, LLCP_TLV_RW) & 0xf;
 }
 
-u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length)
+u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length)
 {
 	u8 *tlv, length;
 
@@ -130,7 +130,7 @@ struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap)
 	return sdres;
 }
 
-struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri,
+struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri,
 						  size_t uri_len)
 {
 	struct nfc_llcp_sdp_tlv *sdreq;
@@ -190,9 +190,10 @@ void nfc_llcp_free_sdp_tlv_list(struct hlist_head *head)
 }
 
 int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local,
-			  u8 *tlv_array, u16 tlv_array_len)
+			  const u8 *tlv_array, u16 tlv_array_len)
 {
-	u8 *tlv = tlv_array, type, length, offset = 0;
+	const u8 *tlv = tlv_array;
+	u8 type, length, offset = 0;
 
 	pr_debug("TLV array length %d\n", tlv_array_len);
 
@@ -239,9 +240,10 @@ int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local,
 }
 
 int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock,
-				  u8 *tlv_array, u16 tlv_array_len)
+				  const u8 *tlv_array, u16 tlv_array_len)
 {
-	u8 *tlv = tlv_array, type, length, offset = 0;
+	const u8 *tlv = tlv_array;
+	u8 type, length, offset = 0;
 
 	pr_debug("TLV array length %d\n", tlv_array_len);
 
@@ -295,7 +297,7 @@ static struct sk_buff *llcp_add_header(struct sk_buff *pdu,
 	return pdu;
 }
 
-static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, u8 *tlv,
+static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, const u8 *tlv,
 				    u8 tlv_length)
 {
 	/* XXX Add an skb length check */
@@ -389,9 +391,10 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock)
 {
 	struct nfc_llcp_local *local;
 	struct sk_buff *skb;
-	u8 *service_name_tlv = NULL, service_name_tlv_length;
-	u8 *miux_tlv = NULL, miux_tlv_length;
-	u8 *rw_tlv = NULL, rw_tlv_length, rw;
+	const u8 *service_name_tlv = NULL;
+	const u8 *miux_tlv = NULL;
+	const u8 *rw_tlv = NULL;
+	u8 service_name_tlv_length, miux_tlv_length,  rw_tlv_length, rw;
 	int err;
 	u16 size = 0;
 	__be16 miux;
@@ -465,8 +468,9 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock)
 {
 	struct nfc_llcp_local *local;
 	struct sk_buff *skb;
-	u8 *miux_tlv = NULL, miux_tlv_length;
-	u8 *rw_tlv = NULL, rw_tlv_length, rw;
+	const u8 *miux_tlv = NULL;
+	const u8 *rw_tlv = NULL;
+	u8 miux_tlv_length, rw_tlv_length, rw;
 	int err;
 	u16 size = 0;
 	__be16 miux;
diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c
index cc997518f79d..eaeb2b1cfa6a 100644
--- a/net/nfc/llcp_core.c
+++ b/net/nfc/llcp_core.c
@@ -301,7 +301,7 @@ static char *wks[] = {
 	"urn:nfc:sn:snep",
 };
 
-static int nfc_llcp_wks_sap(char *service_name, size_t service_name_len)
+static int nfc_llcp_wks_sap(const char *service_name, size_t service_name_len)
 {
 	int sap, num_wks;
 
@@ -325,7 +325,7 @@ static int nfc_llcp_wks_sap(char *service_name, size_t service_name_len)
 
 static
 struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local,
-					    u8 *sn, size_t sn_len)
+					    const u8 *sn, size_t sn_len)
 {
 	struct sock *sk;
 	struct nfc_llcp_sock *llcp_sock, *tmp_sock;
@@ -522,7 +522,7 @@ static int nfc_llcp_build_gb(struct nfc_llcp_local *local)
 {
 	u8 *gb_cur, version, version_length;
 	u8 lto_length, wks_length, miux_length;
-	u8 *version_tlv = NULL, *lto_tlv = NULL,
+	const u8 *version_tlv = NULL, *lto_tlv = NULL,
 	   *wks_tlv = NULL, *miux_tlv = NULL;
 	__be16 wks = cpu_to_be16(local->local_wks);
 	u8 gb_len = 0;
@@ -612,7 +612,7 @@ u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len)
 	return local->gb;
 }
 
-int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len)
+int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len)
 {
 	struct nfc_llcp_local *local;
 
@@ -639,27 +639,27 @@ int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len)
 				     local->remote_gb_len - 3);
 }
 
-static u8 nfc_llcp_dsap(struct sk_buff *pdu)
+static u8 nfc_llcp_dsap(const struct sk_buff *pdu)
 {
 	return (pdu->data[0] & 0xfc) >> 2;
 }
 
-static u8 nfc_llcp_ptype(struct sk_buff *pdu)
+static u8 nfc_llcp_ptype(const struct sk_buff *pdu)
 {
 	return ((pdu->data[0] & 0x03) << 2) | ((pdu->data[1] & 0xc0) >> 6);
 }
 
-static u8 nfc_llcp_ssap(struct sk_buff *pdu)
+static u8 nfc_llcp_ssap(const struct sk_buff *pdu)
 {
 	return pdu->data[1] & 0x3f;
 }
 
-static u8 nfc_llcp_ns(struct sk_buff *pdu)
+static u8 nfc_llcp_ns(const struct sk_buff *pdu)
 {
 	return pdu->data[2] >> 4;
 }
 
-static u8 nfc_llcp_nr(struct sk_buff *pdu)
+static u8 nfc_llcp_nr(const struct sk_buff *pdu)
 {
 	return pdu->data[2] & 0xf;
 }
@@ -801,7 +801,7 @@ out:
 }
 
 static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local,
-						  u8 *sn, size_t sn_len)
+						  const u8 *sn, size_t sn_len)
 {
 	struct nfc_llcp_sock *llcp_sock;
 
@@ -815,9 +815,10 @@ static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local,
 	return llcp_sock;
 }
 
-static u8 *nfc_llcp_connect_sn(struct sk_buff *skb, size_t *sn_len)
+static const u8 *nfc_llcp_connect_sn(const struct sk_buff *skb, size_t *sn_len)
 {
-	u8 *tlv = &skb->data[2], type, length;
+	u8 type, length;
+	const u8 *tlv = &skb->data[2];
 	size_t tlv_array_len = skb->len - LLCP_HEADER_SIZE, offset = 0;
 
 	while (offset < tlv_array_len) {
@@ -875,7 +876,7 @@ static void nfc_llcp_recv_ui(struct nfc_llcp_local *local,
 }
 
 static void nfc_llcp_recv_connect(struct nfc_llcp_local *local,
-				  struct sk_buff *skb)
+				  const struct sk_buff *skb)
 {
 	struct sock *new_sk, *parent;
 	struct nfc_llcp_sock *sock, *new_sock;
@@ -893,7 +894,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local,
 			goto fail;
 		}
 	} else {
-		u8 *sn;
+		const u8 *sn;
 		size_t sn_len;
 
 		sn = nfc_llcp_connect_sn(skb, &sn_len);
@@ -1112,7 +1113,7 @@ static void nfc_llcp_recv_hdlc(struct nfc_llcp_local *local,
 }
 
 static void nfc_llcp_recv_disc(struct nfc_llcp_local *local,
-			       struct sk_buff *skb)
+			       const struct sk_buff *skb)
 {
 	struct nfc_llcp_sock *llcp_sock;
 	struct sock *sk;
@@ -1155,7 +1156,8 @@ static void nfc_llcp_recv_disc(struct nfc_llcp_local *local,
 	nfc_llcp_sock_put(llcp_sock);
 }
 
-static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, struct sk_buff *skb)
+static void nfc_llcp_recv_cc(struct nfc_llcp_local *local,
+			     const struct sk_buff *skb)
 {
 	struct nfc_llcp_sock *llcp_sock;
 	struct sock *sk;
@@ -1188,7 +1190,8 @@ static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, struct sk_buff *skb)
 	nfc_llcp_sock_put(llcp_sock);
 }
 
-static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, struct sk_buff *skb)
+static void nfc_llcp_recv_dm(struct nfc_llcp_local *local,
+			     const struct sk_buff *skb)
 {
 	struct nfc_llcp_sock *llcp_sock;
 	struct sock *sk;
@@ -1226,12 +1229,13 @@ static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, struct sk_buff *skb)
 }
 
 static void nfc_llcp_recv_snl(struct nfc_llcp_local *local,
-			      struct sk_buff *skb)
+			      const struct sk_buff *skb)
 {
 	struct nfc_llcp_sock *llcp_sock;
-	u8 dsap, ssap, *tlv, type, length, tid, sap;
+	u8 dsap, ssap, type, length, tid, sap;
+	const u8 *tlv;
 	u16 tlv_len, offset;
-	char *service_name;
+	const char *service_name;
 	size_t service_name_len;
 	struct nfc_llcp_sdp_tlv *sdp;
 	HLIST_HEAD(llc_sdres_list);
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index 889fefd64e56..de2ec66d7e83 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -48,7 +48,7 @@ void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx,
 			u8 comm_mode, u8 rf_mode);
 int nfc_llcp_register_device(struct nfc_dev *dev);
 void nfc_llcp_unregister_device(struct nfc_dev *dev);
-int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len);
+int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len);
 u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len);
 int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb);
 struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev);
-- 
cgit v1.2.3


From ddecf5556f7fdf871fa8ce26b376e3e8ae6213b6 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Fri, 30 Jul 2021 16:42:01 +0200
Subject: nfc: nci: constify several pointers to u8, sk_buff and other structs

Several functions receive pointers to u8, sk_buff or other structs but
do not modify the contents so make them const.  This allows doing the
same for local variables and in total makes the code a little bit safer.

This makes const also data passed as "unsigned long opt" argument to
nci_request() function.  Usual flow for such functions is:
1. Receive "u8 *" and store it (the pointer) in a structure
   allocated on stack (e.g. struct nci_set_config_param),
2. Call nci_request() or __nci_request() passing a callback function an
   the pointer to the structure via an "unsigned long opt",
3. nci_request() calls the callback which dereferences "unsigned long
   opt" in a read-only way.

This converts all above paths to use proper pointer to const data, so
entire flow is safer.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/nfc/nci_core.h | 14 ++++----
 net/nfc/nci/core.c         | 40 +++++++++++-----------
 net/nfc/nci/data.c         | 12 +++----
 net/nfc/nci/hci.c          | 24 +++++++-------
 net/nfc/nci/ntf.c          | 83 +++++++++++++++++++++++++---------------------
 net/nfc/nci/rsp.c          | 46 +++++++++++++------------
 net/nfc/nci/spi.c          |  2 +-
 7 files changed, 118 insertions(+), 103 deletions(-)

(limited to 'include')

diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h
index 00f2c60971d7..4770a81f4aa7 100644
--- a/include/net/nfc/nci_core.h
+++ b/include/net/nfc/nci_core.h
@@ -278,23 +278,25 @@ int nci_request(struct nci_dev *ndev,
 		void (*req)(struct nci_dev *ndev,
 			    unsigned long opt),
 		unsigned long opt, __u32 timeout);
-int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload);
-int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, __u8 *payload);
+int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len,
+		 const __u8 *payload);
+int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len,
+		 const __u8 *payload);
 int nci_core_reset(struct nci_dev *ndev);
 int nci_core_init(struct nci_dev *ndev);
 
 int nci_recv_frame(struct nci_dev *ndev, struct sk_buff *skb);
 int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb);
-int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val);
+int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, const __u8 *val);
 
 int nci_nfcee_discover(struct nci_dev *ndev, u8 action);
 int nci_nfcee_mode_set(struct nci_dev *ndev, u8 nfcee_id, u8 nfcee_mode);
 int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type,
 			 u8 number_destination_params,
 			 size_t params_len,
-			 struct core_conn_create_dest_spec_params *params);
+			 const struct core_conn_create_dest_spec_params *params);
 int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id);
-int nci_nfcc_loopback(struct nci_dev *ndev, void *data, size_t data_len,
+int nci_nfcc_loopback(struct nci_dev *ndev, const void *data, size_t data_len,
 		      struct sk_buff **resp);
 
 struct nci_hci_dev *nci_hci_allocate(struct nci_dev *ndev);
@@ -378,7 +380,7 @@ void nci_req_complete(struct nci_dev *ndev, int result);
 struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev,
 						   int conn_id);
 int nci_get_conn_info_by_dest_type_params(struct nci_dev *ndev, u8 dest_type,
-					  struct dest_spec_params *params);
+					  const struct dest_spec_params *params);
 
 /* ----- NCI status code ----- */
 int nci_to_errno(__u8 code);
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 400d66c4e210..774ddf957388 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -53,9 +53,9 @@ struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev,
 }
 
 int nci_get_conn_info_by_dest_type_params(struct nci_dev *ndev, u8 dest_type,
-					  struct dest_spec_params *params)
+					  const struct dest_spec_params *params)
 {
-	struct nci_conn_info *conn_info;
+	const struct nci_conn_info *conn_info;
 
 	list_for_each_entry(conn_info, &ndev->conn_info_list, list) {
 		if (conn_info->dest_type == dest_type) {
@@ -210,14 +210,15 @@ static void nci_init_complete_req(struct nci_dev *ndev, unsigned long opt)
 }
 
 struct nci_set_config_param {
-	__u8	id;
-	size_t	len;
-	__u8	*val;
+	__u8		id;
+	size_t		len;
+	const __u8	*val;
 };
 
 static void nci_set_config_req(struct nci_dev *ndev, unsigned long opt)
 {
-	struct nci_set_config_param *param = (struct nci_set_config_param *)opt;
+	const struct nci_set_config_param *param =
+		(struct nci_set_config_param *)opt;
 	struct nci_core_set_config_cmd cmd;
 
 	BUG_ON(param->len > NCI_MAX_PARAM_LEN);
@@ -237,7 +238,7 @@ struct nci_rf_discover_param {
 
 static void nci_rf_discover_req(struct nci_dev *ndev, unsigned long opt)
 {
-	struct nci_rf_discover_param *param =
+	const struct nci_rf_discover_param *param =
 		(struct nci_rf_discover_param *)opt;
 	struct nci_rf_disc_cmd cmd;
 
@@ -303,7 +304,7 @@ struct nci_rf_discover_select_param {
 
 static void nci_rf_discover_select_req(struct nci_dev *ndev, unsigned long opt)
 {
-	struct nci_rf_discover_select_param *param =
+	const struct nci_rf_discover_select_param *param =
 		(struct nci_rf_discover_select_param *)opt;
 	struct nci_rf_discover_select_cmd cmd;
 
@@ -341,18 +342,18 @@ static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt)
 struct nci_cmd_param {
 	__u16 opcode;
 	size_t len;
-	__u8 *payload;
+	const __u8 *payload;
 };
 
 static void nci_generic_req(struct nci_dev *ndev, unsigned long opt)
 {
-	struct nci_cmd_param *param =
+	const struct nci_cmd_param *param =
 		(struct nci_cmd_param *)opt;
 
 	nci_send_cmd(ndev, param->opcode, param->len, param->payload);
 }
 
-int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload)
+int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, const __u8 *payload)
 {
 	struct nci_cmd_param param;
 
@@ -365,7 +366,8 @@ int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload)
 }
 EXPORT_SYMBOL(nci_prop_cmd);
 
-int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, __u8 *payload)
+int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len,
+		 const __u8 *payload)
 {
 	struct nci_cmd_param param;
 
@@ -399,7 +401,7 @@ struct nci_loopback_data {
 
 static void nci_send_data_req(struct nci_dev *ndev, unsigned long opt)
 {
-	struct nci_loopback_data *data = (struct nci_loopback_data *)opt;
+	const struct nci_loopback_data *data = (struct nci_loopback_data *)opt;
 
 	nci_send_data(ndev, data->conn_id, data->data);
 }
@@ -420,7 +422,7 @@ static void nci_nfcc_loopback_cb(void *context, struct sk_buff *skb, int err)
 	nci_req_complete(ndev, NCI_STATUS_OK);
 }
 
-int nci_nfcc_loopback(struct nci_dev *ndev, void *data, size_t data_len,
+int nci_nfcc_loopback(struct nci_dev *ndev, const void *data, size_t data_len,
 		      struct sk_buff **resp)
 {
 	int r;
@@ -624,7 +626,7 @@ static int nci_dev_down(struct nfc_dev *nfc_dev)
 	return nci_close_device(ndev);
 }
 
-int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val)
+int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, const __u8 *val)
 {
 	struct nci_set_config_param param;
 
@@ -659,7 +661,7 @@ EXPORT_SYMBOL(nci_nfcee_discover);
 
 static void nci_nfcee_mode_set_req(struct nci_dev *ndev, unsigned long opt)
 {
-	struct nci_nfcee_mode_set_cmd *cmd =
+	const struct nci_nfcee_mode_set_cmd *cmd =
 					(struct nci_nfcee_mode_set_cmd *)opt;
 
 	nci_send_cmd(ndev, NCI_OP_NFCEE_MODE_SET_CMD,
@@ -681,7 +683,7 @@ EXPORT_SYMBOL(nci_nfcee_mode_set);
 
 static void nci_core_conn_create_req(struct nci_dev *ndev, unsigned long opt)
 {
-	struct core_conn_create_data *data =
+	const struct core_conn_create_data *data =
 					(struct core_conn_create_data *)opt;
 
 	nci_send_cmd(ndev, NCI_OP_CORE_CONN_CREATE_CMD, data->length, data->cmd);
@@ -690,7 +692,7 @@ static void nci_core_conn_create_req(struct nci_dev *ndev, unsigned long opt)
 int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type,
 			 u8 number_destination_params,
 			 size_t params_len,
-			 struct core_conn_create_dest_spec_params *params)
+			 const struct core_conn_create_dest_spec_params *params)
 {
 	int r;
 	struct nci_core_conn_create_cmd *cmd;
@@ -863,7 +865,7 @@ static int nci_activate_target(struct nfc_dev *nfc_dev,
 {
 	struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
 	struct nci_rf_discover_select_param param;
-	struct nfc_target *nci_target = NULL;
+	const struct nfc_target *nci_target = NULL;
 	int i;
 	int rc = 0;
 
diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c
index ce3382be937f..6055dc9a82aa 100644
--- a/net/nfc/nci/data.c
+++ b/net/nfc/nci/data.c
@@ -26,7 +26,7 @@
 void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb,
 				__u8 conn_id, int err)
 {
-	struct nci_conn_info    *conn_info;
+	const struct nci_conn_info *conn_info;
 	data_exchange_cb_t cb;
 	void *cb_context;
 
@@ -80,7 +80,7 @@ static inline void nci_push_data_hdr(struct nci_dev *ndev,
 
 int nci_conn_max_data_pkt_payload_size(struct nci_dev *ndev, __u8 conn_id)
 {
-	struct nci_conn_info *conn_info;
+	const struct nci_conn_info *conn_info;
 
 	conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id);
 	if (!conn_info)
@@ -93,9 +93,9 @@ EXPORT_SYMBOL(nci_conn_max_data_pkt_payload_size);
 static int nci_queue_tx_data_frags(struct nci_dev *ndev,
 				   __u8 conn_id,
 				   struct sk_buff *skb) {
-	struct nci_conn_info    *conn_info;
+	const struct nci_conn_info *conn_info;
 	int total_len = skb->len;
-	unsigned char *data = skb->data;
+	const unsigned char *data = skb->data;
 	unsigned long flags;
 	struct sk_buff_head frags_q;
 	struct sk_buff *skb_frag;
@@ -166,7 +166,7 @@ exit:
 /* Send NCI data */
 int nci_send_data(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb)
 {
-	struct nci_conn_info    *conn_info;
+	const struct nci_conn_info *conn_info;
 	int rc = 0;
 
 	pr_debug("conn_id 0x%x, plen %d\n", conn_id, skb->len);
@@ -269,7 +269,7 @@ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb)
 	__u8 pbf = nci_pbf(skb->data);
 	__u8 status = 0;
 	__u8 conn_id = nci_conn_id(skb->data);
-	struct nci_conn_info    *conn_info;
+	const struct nci_conn_info *conn_info;
 
 	pr_debug("len %d\n", skb->len);
 
diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c
index d6732e5e8958..71a306b29735 100644
--- a/net/nfc/nci/hci.c
+++ b/net/nfc/nci/hci.c
@@ -142,7 +142,7 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe,
 			     const u8 data_type, const u8 *data,
 			     size_t data_len)
 {
-	struct nci_conn_info    *conn_info;
+	const struct nci_conn_info *conn_info;
 	struct sk_buff *skb;
 	int len, i, r;
 	u8 cb = pipe;
@@ -197,7 +197,7 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe,
 
 static void nci_hci_send_data_req(struct nci_dev *ndev, unsigned long opt)
 {
-	struct nci_data *data = (struct nci_data *)opt;
+	const struct nci_data *data = (struct nci_data *)opt;
 
 	nci_hci_send_data(ndev, data->pipe, data->cmd,
 			  data->data, data->data_len);
@@ -221,8 +221,8 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd,
 		     const u8 *param, size_t param_len,
 		     struct sk_buff **skb)
 {
-	struct nci_hcp_message *message;
-	struct nci_conn_info   *conn_info;
+	const struct nci_hcp_message *message;
+	const struct nci_conn_info *conn_info;
 	struct nci_data data;
 	int r;
 	u8 pipe = ndev->hci_dev->gate2pipe[gate];
@@ -406,7 +406,7 @@ static void nci_hci_msg_rx_work(struct work_struct *work)
 	struct nci_hci_dev *hdev =
 		container_of(work, struct nci_hci_dev, msg_rx_work);
 	struct sk_buff *skb;
-	struct nci_hcp_message *message;
+	const struct nci_hcp_message *message;
 	u8 pipe, type, instruction;
 
 	while ((skb = skb_dequeue(&hdev->msg_rx_queue)) != NULL) {
@@ -498,7 +498,7 @@ void nci_hci_data_received_cb(void *context,
 int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe)
 {
 	struct nci_data data;
-	struct nci_conn_info    *conn_info;
+	const struct nci_conn_info *conn_info;
 
 	conn_info = ndev->hci_dev->conn_info;
 	if (!conn_info)
@@ -523,7 +523,7 @@ static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host,
 	u8 pipe;
 	struct sk_buff *skb;
 	struct nci_hci_create_pipe_params params;
-	struct nci_hci_create_pipe_resp *resp;
+	const struct nci_hci_create_pipe_resp *resp;
 
 	pr_debug("gate=%d\n", dest_gate);
 
@@ -557,8 +557,8 @@ static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe)
 int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx,
 		      const u8 *param, size_t param_len)
 {
-	struct nci_hcp_message *message;
-	struct nci_conn_info *conn_info;
+	const struct nci_hcp_message *message;
+	const struct nci_conn_info *conn_info;
 	struct nci_data data;
 	int r;
 	u8 *tmp;
@@ -605,8 +605,8 @@ EXPORT_SYMBOL(nci_hci_set_param);
 int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx,
 		      struct sk_buff **skb)
 {
-	struct nci_hcp_message *message;
-	struct nci_conn_info    *conn_info;
+	const struct nci_hcp_message *message;
+	const struct nci_conn_info *conn_info;
 	struct nci_data data;
 	int r;
 	u8 pipe = ndev->hci_dev->gate2pipe[gate];
@@ -697,7 +697,7 @@ EXPORT_SYMBOL(nci_hci_connect_gate);
 
 static int nci_hci_dev_connect_gates(struct nci_dev *ndev,
 				     u8 gate_count,
-				     struct nci_hci_gate *gates)
+				     const struct nci_hci_gate *gates)
 {
 	int r;
 
diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c
index 98af04c86b2c..d6251363b72b 100644
--- a/net/nfc/nci/ntf.c
+++ b/net/nfc/nci/ntf.c
@@ -28,10 +28,10 @@
 /* Handle NCI Notification packets */
 
 static void nci_core_reset_ntf_packet(struct nci_dev *ndev,
-				      struct sk_buff *skb)
+				      const struct sk_buff *skb)
 {
 	/* Handle NCI 2.x core reset notification */
-	struct nci_core_reset_ntf *ntf = (void *)skb->data;
+	const struct nci_core_reset_ntf *ntf = (void *)skb->data;
 
 	ndev->nci_ver = ntf->nci_ver;
 	pr_debug("nci_ver 0x%x, config_status 0x%x\n",
@@ -80,7 +80,7 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev,
 }
 
 static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev,
-					      struct sk_buff *skb)
+					      const struct sk_buff *skb)
 {
 	__u8 status = skb->data[0];
 
@@ -107,9 +107,10 @@ static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev,
 		nci_data_exchange_complete(ndev, NULL, ntf->conn_id, -EIO);
 }
 
-static __u8 *nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev,
-			struct rf_tech_specific_params_nfca_poll *nfca_poll,
-						     __u8 *data)
+static const __u8 *
+nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev,
+					struct rf_tech_specific_params_nfca_poll *nfca_poll,
+					const __u8 *data)
 {
 	nfca_poll->sens_res = __le16_to_cpu(*((__le16 *)data));
 	data += 2;
@@ -134,9 +135,10 @@ static __u8 *nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev,
 	return data;
 }
 
-static __u8 *nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev,
-			struct rf_tech_specific_params_nfcb_poll *nfcb_poll,
-						     __u8 *data)
+static const __u8 *
+nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev,
+					struct rf_tech_specific_params_nfcb_poll *nfcb_poll,
+					const __u8 *data)
 {
 	nfcb_poll->sensb_res_len = min_t(__u8, *data++, NFC_SENSB_RES_MAXSIZE);
 
@@ -148,9 +150,10 @@ static __u8 *nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev,
 	return data;
 }
 
-static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev,
-			struct rf_tech_specific_params_nfcf_poll *nfcf_poll,
-						     __u8 *data)
+static const __u8 *
+nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev,
+					struct rf_tech_specific_params_nfcf_poll *nfcf_poll,
+					const __u8 *data)
 {
 	nfcf_poll->bit_rate = *data++;
 	nfcf_poll->sensf_res_len = min_t(__u8, *data++, NFC_SENSF_RES_MAXSIZE);
@@ -164,9 +167,10 @@ static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev,
 	return data;
 }
 
-static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev,
-			struct rf_tech_specific_params_nfcv_poll *nfcv_poll,
-						     __u8 *data)
+static const __u8 *
+nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev,
+					struct rf_tech_specific_params_nfcv_poll *nfcv_poll,
+					const __u8 *data)
 {
 	++data;
 	nfcv_poll->dsfid = *data++;
@@ -175,9 +179,10 @@ static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev,
 	return data;
 }
 
-static __u8 *nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev,
-			struct rf_tech_specific_params_nfcf_listen *nfcf_listen,
-						     __u8 *data)
+static const __u8 *
+nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev,
+					  struct rf_tech_specific_params_nfcf_listen *nfcf_listen,
+					  const __u8 *data)
 {
 	nfcf_listen->local_nfcid2_len = min_t(__u8, *data++,
 					      NFC_NFCID2_MAXSIZE);
@@ -198,12 +203,12 @@ static int nci_add_new_protocol(struct nci_dev *ndev,
 				struct nfc_target *target,
 				__u8 rf_protocol,
 				__u8 rf_tech_and_mode,
-				void *params)
+				const void *params)
 {
-	struct rf_tech_specific_params_nfca_poll *nfca_poll;
-	struct rf_tech_specific_params_nfcb_poll *nfcb_poll;
-	struct rf_tech_specific_params_nfcf_poll *nfcf_poll;
-	struct rf_tech_specific_params_nfcv_poll *nfcv_poll;
+	const struct rf_tech_specific_params_nfca_poll *nfca_poll;
+	const struct rf_tech_specific_params_nfcb_poll *nfcb_poll;
+	const struct rf_tech_specific_params_nfcf_poll *nfcf_poll;
+	const struct rf_tech_specific_params_nfcv_poll *nfcv_poll;
 	__u32 protocol;
 
 	if (rf_protocol == NCI_RF_PROTOCOL_T1T)
@@ -274,7 +279,7 @@ static int nci_add_new_protocol(struct nci_dev *ndev,
 }
 
 static void nci_add_new_target(struct nci_dev *ndev,
-			       struct nci_rf_discover_ntf *ntf)
+			       const struct nci_rf_discover_ntf *ntf)
 {
 	struct nfc_target *target;
 	int i, rc;
@@ -319,10 +324,10 @@ void nci_clear_target_list(struct nci_dev *ndev)
 }
 
 static void nci_rf_discover_ntf_packet(struct nci_dev *ndev,
-				       struct sk_buff *skb)
+				       const struct sk_buff *skb)
 {
 	struct nci_rf_discover_ntf ntf;
-	__u8 *data = skb->data;
+	const __u8 *data = skb->data;
 	bool add_target = true;
 
 	ntf.rf_discovery_id = *data++;
@@ -382,7 +387,8 @@ static void nci_rf_discover_ntf_packet(struct nci_dev *ndev,
 }
 
 static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev,
-			struct nci_rf_intf_activated_ntf *ntf, __u8 *data)
+						 struct nci_rf_intf_activated_ntf *ntf,
+						 const __u8 *data)
 {
 	struct activation_params_nfca_poll_iso_dep *nfca_poll;
 	struct activation_params_nfcb_poll_iso_dep *nfcb_poll;
@@ -418,7 +424,8 @@ static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev,
 }
 
 static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev,
-			struct nci_rf_intf_activated_ntf *ntf, __u8 *data)
+						 struct nci_rf_intf_activated_ntf *ntf,
+						 const __u8 *data)
 {
 	struct activation_params_poll_nfc_dep *poll;
 	struct activation_params_listen_nfc_dep *listen;
@@ -454,7 +461,7 @@ static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev,
 }
 
 static void nci_target_auto_activated(struct nci_dev *ndev,
-				      struct nci_rf_intf_activated_ntf *ntf)
+				      const struct nci_rf_intf_activated_ntf *ntf)
 {
 	struct nfc_target *target;
 	int rc;
@@ -477,7 +484,7 @@ static void nci_target_auto_activated(struct nci_dev *ndev,
 }
 
 static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev,
-		struct nci_rf_intf_activated_ntf *ntf)
+					   const struct nci_rf_intf_activated_ntf *ntf)
 {
 	ndev->remote_gb_len = 0;
 
@@ -519,11 +526,11 @@ static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev,
 }
 
 static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev,
-					     struct sk_buff *skb)
+					     const struct sk_buff *skb)
 {
 	struct nci_conn_info    *conn_info;
 	struct nci_rf_intf_activated_ntf ntf;
-	__u8 *data = skb->data;
+	const __u8 *data = skb->data;
 	int err = NCI_STATUS_OK;
 
 	ntf.rf_discovery_id = *data++;
@@ -681,10 +688,10 @@ listen:
 }
 
 static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev,
-					 struct sk_buff *skb)
+					 const struct sk_buff *skb)
 {
-	struct nci_conn_info    *conn_info;
-	struct nci_rf_deactivate_ntf *ntf = (void *) skb->data;
+	const struct nci_conn_info *conn_info;
+	const struct nci_rf_deactivate_ntf *ntf = (void *)skb->data;
 
 	pr_debug("entry, type 0x%x, reason 0x%x\n", ntf->type, ntf->reason);
 
@@ -725,10 +732,10 @@ static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev,
 }
 
 static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev,
-					  struct sk_buff *skb)
+					  const struct sk_buff *skb)
 {
 	u8 status = NCI_STATUS_OK;
-	struct nci_nfcee_discover_ntf   *nfcee_ntf =
+	const struct nci_nfcee_discover_ntf *nfcee_ntf =
 				(struct nci_nfcee_discover_ntf *)skb->data;
 
 	pr_debug("\n");
@@ -745,7 +752,7 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev,
 }
 
 static void nci_nfcee_action_ntf_packet(struct nci_dev *ndev,
-					struct sk_buff *skb)
+					const struct sk_buff *skb)
 {
 	pr_debug("\n");
 }
diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c
index e9605922a322..dbb0b55a1757 100644
--- a/net/nfc/nci/rsp.c
+++ b/net/nfc/nci/rsp.c
@@ -25,9 +25,10 @@
 
 /* Handle NCI Response packets */
 
-static void nci_core_reset_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
+static void nci_core_reset_rsp_packet(struct nci_dev *ndev,
+				      const struct sk_buff *skb)
 {
-	struct nci_core_reset_rsp *rsp = (void *) skb->data;
+	const struct nci_core_reset_rsp *rsp = (void *)skb->data;
 
 	pr_debug("status 0x%x\n", rsp->status);
 
@@ -43,10 +44,11 @@ static void nci_core_reset_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
 	}
 }
 
-static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, struct sk_buff *skb)
+static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev,
+				      const struct sk_buff *skb)
 {
-	struct nci_core_init_rsp_1 *rsp_1 = (void *) skb->data;
-	struct nci_core_init_rsp_2 *rsp_2;
+	const struct nci_core_init_rsp_1 *rsp_1 = (void *)skb->data;
+	const struct nci_core_init_rsp_2 *rsp_2;
 
 	pr_debug("status 0x%x\n", rsp_1->status);
 
@@ -81,10 +83,11 @@ static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, struct sk_buff *skb)
 	return NCI_STATUS_OK;
 }
 
-static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, struct sk_buff *skb)
+static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev,
+				      const struct sk_buff *skb)
 {
-	struct nci_core_init_rsp_nci_ver2 *rsp = (void *)skb->data;
-	u8 *supported_rf_interface = rsp->supported_rf_interfaces;
+	const struct nci_core_init_rsp_nci_ver2 *rsp = (void *)skb->data;
+	const u8 *supported_rf_interface = rsp->supported_rf_interfaces;
 	u8 rf_interface_idx = 0;
 	u8 rf_extension_cnt = 0;
 
@@ -118,7 +121,7 @@ static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, struct sk_buff *skb)
 	return NCI_STATUS_OK;
 }
 
-static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
+static void nci_core_init_rsp_packet(struct nci_dev *ndev, const struct sk_buff *skb)
 {
 	u8 status = 0;
 
@@ -160,9 +163,9 @@ exit:
 }
 
 static void nci_core_set_config_rsp_packet(struct nci_dev *ndev,
-					   struct sk_buff *skb)
+					   const struct sk_buff *skb)
 {
-	struct nci_core_set_config_rsp *rsp = (void *) skb->data;
+	const struct nci_core_set_config_rsp *rsp = (void *)skb->data;
 
 	pr_debug("status 0x%x\n", rsp->status);
 
@@ -170,7 +173,7 @@ static void nci_core_set_config_rsp_packet(struct nci_dev *ndev,
 }
 
 static void nci_rf_disc_map_rsp_packet(struct nci_dev *ndev,
-				       struct sk_buff *skb)
+				       const struct sk_buff *skb)
 {
 	__u8 status = skb->data[0];
 
@@ -179,7 +182,8 @@ static void nci_rf_disc_map_rsp_packet(struct nci_dev *ndev,
 	nci_req_complete(ndev, status);
 }
 
-static void nci_rf_disc_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
+static void nci_rf_disc_rsp_packet(struct nci_dev *ndev,
+				   const struct sk_buff *skb)
 {
 	struct nci_conn_info    *conn_info;
 	__u8 status = skb->data[0];
@@ -210,7 +214,7 @@ exit:
 }
 
 static void nci_rf_disc_select_rsp_packet(struct nci_dev *ndev,
-					  struct sk_buff *skb)
+					  const struct sk_buff *skb)
 {
 	__u8 status = skb->data[0];
 
@@ -222,7 +226,7 @@ static void nci_rf_disc_select_rsp_packet(struct nci_dev *ndev,
 }
 
 static void nci_rf_deactivate_rsp_packet(struct nci_dev *ndev,
-					 struct sk_buff *skb)
+					 const struct sk_buff *skb)
 {
 	__u8 status = skb->data[0];
 
@@ -238,9 +242,9 @@ static void nci_rf_deactivate_rsp_packet(struct nci_dev *ndev,
 }
 
 static void nci_nfcee_discover_rsp_packet(struct nci_dev *ndev,
-					  struct sk_buff *skb)
+					  const struct sk_buff *skb)
 {
-	struct nci_nfcee_discover_rsp *discover_rsp;
+	const struct nci_nfcee_discover_rsp *discover_rsp;
 
 	if (skb->len != 2) {
 		nci_req_complete(ndev, NCI_STATUS_NFCEE_PROTOCOL_ERROR);
@@ -255,7 +259,7 @@ static void nci_nfcee_discover_rsp_packet(struct nci_dev *ndev,
 }
 
 static void nci_nfcee_mode_set_rsp_packet(struct nci_dev *ndev,
-					  struct sk_buff *skb)
+					  const struct sk_buff *skb)
 {
 	__u8 status = skb->data[0];
 
@@ -264,11 +268,11 @@ static void nci_nfcee_mode_set_rsp_packet(struct nci_dev *ndev,
 }
 
 static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev,
-					    struct sk_buff *skb)
+					    const struct sk_buff *skb)
 {
 	__u8 status = skb->data[0];
 	struct nci_conn_info *conn_info = NULL;
-	struct nci_core_conn_create_rsp *rsp;
+	const struct nci_core_conn_create_rsp *rsp;
 
 	pr_debug("status 0x%x\n", status);
 
@@ -319,7 +323,7 @@ exit:
 }
 
 static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev,
-					   struct sk_buff *skb)
+					   const struct sk_buff *skb)
 {
 	struct nci_conn_info *conn_info;
 	__u8 status = skb->data[0];
diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c
index 7d8e10e27c20..0935527d1d12 100644
--- a/net/nfc/nci/spi.c
+++ b/net/nfc/nci/spi.c
@@ -27,7 +27,7 @@
 
 #define CRC_INIT		0xFFFF
 
-static int __nci_spi_send(struct nci_spi *nspi, struct sk_buff *skb,
+static int __nci_spi_send(struct nci_spi *nspi, const struct sk_buff *skb,
 			  int cs_change)
 {
 	struct spi_message m;
-- 
cgit v1.2.3


From bc830525615df6b6b1793ac23750f32695903fd0 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Thu, 29 Jul 2021 15:48:54 +0800
Subject: net: netlink: Remove unused function

lockdep_genl_is_held() and its caller arm not used now, just remove them.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Link: https://lore.kernel.org/r/20210729074854.8968-1-yajun.deng@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/genetlink.h | 23 -----------------------
 net/netlink/genetlink.c   |  8 --------
 2 files changed, 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h
index bc738504ab4a..c285968e437a 100644
--- a/include/linux/genetlink.h
+++ b/include/linux/genetlink.h
@@ -8,34 +8,11 @@
 /* All generic netlink requests are serialized by a global lock.  */
 extern void genl_lock(void);
 extern void genl_unlock(void);
-#ifdef CONFIG_LOCKDEP
-extern bool lockdep_genl_is_held(void);
-#endif
 
 /* for synchronisation between af_netlink and genetlink */
 extern atomic_t genl_sk_destructing_cnt;
 extern wait_queue_head_t genl_sk_destructing_waitq;
 
-/**
- * rcu_dereference_genl - rcu_dereference with debug checking
- * @p: The pointer to read, prior to dereferencing
- *
- * Do an rcu_dereference(p), but check caller either holds rcu_read_lock()
- * or genl mutex. Note : Please prefer genl_dereference() or rcu_dereference()
- */
-#define rcu_dereference_genl(p)					\
-	rcu_dereference_check(p, lockdep_genl_is_held())
-
-/**
- * genl_dereference - fetch RCU pointer when updates are prevented by genl mutex
- * @p: The pointer to read, prior to dereferencing
- *
- * Return the value of the specified RCU-protected pointer, but omit
- * the READ_ONCE(), because caller holds genl mutex.
- */
-#define genl_dereference(p)					\
-	rcu_dereference_protected(p, lockdep_genl_is_held())
-
 #define MODULE_ALIAS_GENL_FAMILY(family)\
  MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family)
 
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index ae58da608a31..1afca2a6c2ac 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -40,14 +40,6 @@ void genl_unlock(void)
 }
 EXPORT_SYMBOL(genl_unlock);
 
-#ifdef CONFIG_LOCKDEP
-bool lockdep_genl_is_held(void)
-{
-	return lockdep_is_held(&genl_mutex);
-}
-EXPORT_SYMBOL(lockdep_genl_is_held);
-#endif
-
 static void genl_lock_all(void)
 {
 	down_write(&cb_lock);
-- 
cgit v1.2.3


From a432934a30679c0e3c47b87f13e4901bc1a3fc03 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 30 Jul 2021 18:30:53 +0200
Subject: sk_buff: avoid potentially clearing 'slow_gro' field

If skb_dst_set_noref() is invoked with a NULL dst, the 'slow_gro'
field is cleared, too. That could lead to wrong behavior if
the skb later enters the GRO stage.

Fix the potential issue replacing preserving a non-zero value of
the 'slow_gro' field.

Additionally, fix a comment typo.

Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Reported-by: Jakub Kicinski <kuba@kernel.org>
Fixes: 8a886b142bd0 ("sk_buff: track dst status in slow_gro")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/aa42529252dc8bb02bd42e8629427040d1058537.1627662501.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 2 +-
 net/core/dev.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b1e5bbfcc926..2bcdc8cd38be 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1009,7 +1009,7 @@ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
 static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
 {
 	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
-	skb->slow_gro = !!dst;
+	skb->slow_gro |= !!dst;
 	skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index b51e41d0a7fe..64e1a5f63f93 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6022,7 +6022,7 @@ static void gro_list_prepare(const struct list_head *head,
 				       skb_mac_header(skb),
 				       maclen);
 
-		/* in most common scenarions _state is 0
+		/* in most common scenarions 'slow_gro' is 0
 		 * otherwise we are already on some slower paths
 		 * either skip all the infrequent tests altogether or
 		 * avoid trying too hard to skip each of them individually
-- 
cgit v1.2.3


From 26713455048eb19122b1561b471d30710177ef97 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 29 Jul 2021 20:19:25 +0300
Subject: devlink: Allocate devlink directly in requested net namespace

There is no need in extra call indirection and check from impossible
flow where someone tries to set namespace without prior call
to devlink_alloc().

Instead of this extra logic and additional EXPORT_SYMBOL, use specialized
devlink allocation function that receives net namespace as an argument.

Such specialized API allows clear view when devlink initialized in wrong
net namespace and/or kernel users don't try to change devlink namespace
under the hood.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/netdevsim/dev.c |  4 ++--
 include/net/devlink.h       | 14 ++++++++++++--
 net/core/devlink.c          | 26 ++++++++------------------
 3 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 6348307bfa84..d538a39d4225 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -1431,10 +1431,10 @@ int nsim_dev_probe(struct nsim_bus_dev *nsim_bus_dev)
 	struct devlink *devlink;
 	int err;
 
-	devlink = devlink_alloc(&nsim_dev_devlink_ops, sizeof(*nsim_dev));
+	devlink = devlink_alloc_ns(&nsim_dev_devlink_ops, sizeof(*nsim_dev),
+				   nsim_bus_dev->initial_net);
 	if (!devlink)
 		return -ENOMEM;
-	devlink_net_set(devlink, nsim_bus_dev->initial_net);
 	nsim_dev = devlink_priv(devlink);
 	nsim_dev->nsim_bus_dev = nsim_bus_dev;
 	nsim_dev->switch_id.id_len = sizeof(nsim_dev->switch_id.id);
diff --git a/include/net/devlink.h b/include/net/devlink.h
index e48a62320407..08f4c6191e72 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1540,8 +1540,18 @@ static inline struct devlink *netdev_to_devlink(struct net_device *dev)
 struct ib_device;
 
 struct net *devlink_net(const struct devlink *devlink);
-void devlink_net_set(struct devlink *devlink, struct net *net);
-struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size);
+/* This call is intended for software devices that can create
+ * devlink instances in other namespaces than init_net.
+ *
+ * Drivers that operate on real HW must use devlink_alloc() instead.
+ */
+struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
+				 size_t priv_size, struct net *net);
+static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
+					    size_t priv_size)
+{
+	return devlink_alloc_ns(ops, priv_size, &init_net);
+}
 int devlink_register(struct devlink *devlink, struct device *dev);
 void devlink_unregister(struct devlink *devlink);
 void devlink_reload_enable(struct devlink *devlink);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index fbca61ad9bbc..fd2fc2befba9 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -108,19 +108,6 @@ struct net *devlink_net(const struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devlink_net);
 
-static void __devlink_net_set(struct devlink *devlink, struct net *net)
-{
-	write_pnet(&devlink->_net, net);
-}
-
-void devlink_net_set(struct devlink *devlink, struct net *net)
-{
-	if (WARN_ON(devlink->dev))
-		return;
-	__devlink_net_set(devlink, net);
-}
-EXPORT_SYMBOL_GPL(devlink_net_set);
-
 static struct devlink *devlink_get_from_attrs(struct net *net,
 					      struct nlattr **attrs)
 {
@@ -3921,7 +3908,7 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net,
 		return err;
 
 	if (dest_net && !net_eq(dest_net, curr_net))
-		__devlink_net_set(devlink, dest_net);
+		write_pnet(&devlink->_net, dest_net);
 
 	err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack);
 	devlink_reload_failed_set(devlink, !!err);
@@ -8776,15 +8763,18 @@ static bool devlink_reload_actions_valid(const struct devlink_ops *ops)
 }
 
 /**
- *	devlink_alloc - Allocate new devlink instance resources
+ *	devlink_alloc_ns - Allocate new devlink instance resources
+ *	in specific namespace
  *
  *	@ops: ops
  *	@priv_size: size of user private data
+ *	@net: net namespace
  *
  *	Allocate new devlink instance resources, including devlink index
  *	and name.
  */
-struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
+struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
+				 size_t priv_size, struct net *net)
 {
 	struct devlink *devlink;
 
@@ -8799,7 +8789,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 		return NULL;
 	devlink->ops = ops;
 	xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
-	__devlink_net_set(devlink, &init_net);
+	write_pnet(&devlink->_net, net);
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->rate_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
@@ -8815,7 +8805,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	mutex_init(&devlink->reporters_lock);
 	return devlink;
 }
-EXPORT_SYMBOL_GPL(devlink_alloc);
+EXPORT_SYMBOL_GPL(devlink_alloc_ns);
 
 /**
  *	devlink_register - Register devlink instance
-- 
cgit v1.2.3


From f2e3778db7e13fd9a83c309dd8c9fbe69ba8efbe Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 22 Jul 2021 10:48:34 +0200
Subject: netfilter: remove xt pernet data

clusterip is now handled via net_generic.

NOTRACK is tiny compared to rest of xt_CT feature set, even the existing
deprecation warning is bigger than the actual functionality.

Just remove the warning, its not worth keeping/adding a net_generic one.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/net_namespace.h  |  2 --
 include/net/netns/x_tables.h | 12 ------------
 net/netfilter/xt_CT.c        | 11 -----------
 3 files changed, 25 deletions(-)
 delete mode 100644 include/net/netns/x_tables.h

(limited to 'include')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index cc54750dd3db..bb5fa5914032 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -23,7 +23,6 @@
 #include <net/netns/ieee802154_6lowpan.h>
 #include <net/netns/sctp.h>
 #include <net/netns/netfilter.h>
-#include <net/netns/x_tables.h>
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #include <net/netns/conntrack.h>
 #endif
@@ -133,7 +132,6 @@ struct net {
 #endif
 #ifdef CONFIG_NETFILTER
 	struct netns_nf		nf;
-	struct netns_xt		xt;
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	struct netns_ct		ct;
 #endif
diff --git a/include/net/netns/x_tables.h b/include/net/netns/x_tables.h
deleted file mode 100644
index d02316ec2906..000000000000
--- a/include/net/netns/x_tables.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __NETNS_X_TABLES_H
-#define __NETNS_X_TABLES_H
-
-#include <linux/list.h>
-#include <linux/netfilter_defs.h>
-
-struct netns_xt {
-	bool notrack_deprecated_warning;
-	bool clusterip_deprecated_warning;
-};
-#endif
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 12404d221026..0a913ce07425 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -351,21 +351,10 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static int notrack_chk(const struct xt_tgchk_param *par)
-{
-	if (!par->net->xt.notrack_deprecated_warning) {
-		pr_info("netfilter: NOTRACK target is deprecated, "
-			"use CT instead or upgrade iptables\n");
-		par->net->xt.notrack_deprecated_warning = true;
-	}
-	return 0;
-}
-
 static struct xt_target notrack_tg_reg __read_mostly = {
 	.name		= "NOTRACK",
 	.revision	= 0,
 	.family		= NFPROTO_UNSPEC,
-	.checkentry	= notrack_chk,
 	.target		= notrack_tg,
 	.table		= "raw",
 	.me		= THIS_MODULE,
-- 
cgit v1.2.3


From 695176bfe5dec2051f950bdac0ae0b21e29e6de3 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Thu, 29 Jul 2021 16:12:14 -0700
Subject: net_sched: refactor TC action init API

TC action ->init() API has 10 parameters, it becomes harder
to read. Some of them are just boolean and can be replaced
by flags. Similarly for the internal API tcf_action_init()
and tcf_exts_validate().

This patch converts them to flags and fold them into
the upper 16 bits of "flags", whose lower 16 bits are still
reserved for user-space. More specifically, the following
kernel flags are introduced:

TCA_ACT_FLAGS_POLICE replace 'name' in a few contexts, to
distinguish whether it is compatible with policer.

TCA_ACT_FLAGS_BIND replaces 'bind', to indicate whether
this action is bound to a filter.

TCA_ACT_FLAGS_REPLACE  replaces 'ovr' in most contexts,
means we are replacing an existing action.

TCA_ACT_FLAGS_NO_RTNL replaces 'rtnl_held' but has the
opposite meaning, because we still hold RTNL in most
cases.

The only user-space flag TCA_ACT_FLAGS_NO_PERCPU_STATS is
untouched and still stored as before.

I have tested this patch with tdc and I do not see any
failure related to this patch.

Tested-by: Vlad Buslov <vladbu@nvidia.com>
Acked-by: Jamal Hadi Salim<jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h        | 22 ++++++++++------
 include/net/pkt_cls.h        |  2 +-
 include/net/sch_generic.h    |  2 +-
 include/uapi/linux/pkt_cls.h |  1 +
 net/sched/act_api.c          | 61 ++++++++++++++++++++++----------------------
 net/sched/act_bpf.c          |  4 +--
 net/sched/act_connmark.c     |  4 +--
 net/sched/act_csum.c         |  7 ++---
 net/sched/act_ct.c           |  4 +--
 net/sched/act_ctinfo.c       |  4 +--
 net/sched/act_gact.c         |  4 +--
 net/sched/act_gate.c         |  4 +--
 net/sched/act_ife.c          |  9 ++++---
 net/sched/act_ipt.c          | 21 +++++++--------
 net/sched/act_mirred.c       |  4 +--
 net/sched/act_mpls.c         |  4 +--
 net/sched/act_nat.c          |  6 ++---
 net/sched/act_pedit.c        |  4 +--
 net/sched/act_police.c       |  4 +--
 net/sched/act_sample.c       |  7 ++---
 net/sched/act_simple.c       |  4 +--
 net/sched/act_skbedit.c      |  4 +--
 net/sched/act_skbmod.c       |  3 ++-
 net/sched/act_tunnel_key.c   |  4 +--
 net/sched/act_vlan.c         |  4 +--
 net/sched/cls_api.c          | 28 ++++++++++++--------
 net/sched/cls_basic.c        | 10 ++++----
 net/sched/cls_bpf.c          |  8 +++---
 net/sched/cls_cgroup.c       |  6 ++---
 net/sched/cls_flow.c         |  6 ++---
 net/sched/cls_flower.c       | 18 ++++++-------
 net/sched/cls_fw.c           | 13 +++++-----
 net/sched/cls_matchall.c     | 17 ++++++------
 net/sched/cls_route.c        | 10 ++++----
 net/sched/cls_rsvp.h         |  7 +++--
 net/sched/cls_tcindex.c      | 10 ++++----
 net/sched/cls_u32.c          | 24 ++++++++---------
 37 files changed, 185 insertions(+), 169 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 086b291e9530..f19f7f4a463c 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -58,6 +58,14 @@ struct tc_action {
 #define TCA_ACT_HW_STATS_ANY (TCA_ACT_HW_STATS_IMMEDIATE | \
 			      TCA_ACT_HW_STATS_DELAYED)
 
+/* Reserve 16 bits for user-space. See TCA_ACT_FLAGS_NO_PERCPU_STATS. */
+#define TCA_ACT_FLAGS_USER_BITS 16
+#define TCA_ACT_FLAGS_USER_MASK 0xffff
+#define TCA_ACT_FLAGS_POLICE	(1U << TCA_ACT_FLAGS_USER_BITS)
+#define TCA_ACT_FLAGS_BIND	(1U << (TCA_ACT_FLAGS_USER_BITS + 1))
+#define TCA_ACT_FLAGS_REPLACE	(1U << (TCA_ACT_FLAGS_USER_BITS + 2))
+#define TCA_ACT_FLAGS_NO_RTNL	(1U << (TCA_ACT_FLAGS_USER_BITS + 3))
+
 /* Update lastuse only if needed, to avoid dirtying a cache line.
  * We use a temp variable to avoid fetching jiffies twice.
  */
@@ -99,8 +107,8 @@ struct tc_action_ops {
 	void	(*cleanup)(struct tc_action *);
 	int     (*lookup)(struct net *net, struct tc_action **a, u32 index);
 	int     (*init)(struct net *net, struct nlattr *nla,
-			struct nlattr *est, struct tc_action **act, int ovr,
-			int bind, bool rtnl_held, struct tcf_proto *tp,
+			struct nlattr *est, struct tc_action **act,
+			struct tcf_proto *tp,
 			u32 flags, struct netlink_ext_ack *extack);
 	int     (*walk)(struct net *, struct sk_buff *,
 			struct netlink_callback *, int,
@@ -179,18 +187,16 @@ int tcf_action_destroy(struct tc_action *actions[], int bind);
 int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
 		    int nr_actions, struct tcf_result *res);
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
-		    struct nlattr *est, char *name, int ovr, int bind,
+		    struct nlattr *est,
 		    struct tc_action *actions[], int init_res[], size_t *attr_size,
-		    bool rtnl_held, struct netlink_ext_ack *extack);
-struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla,
+		    u32 flags, struct netlink_ext_ack *extack);
+struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police,
 					 bool rtnl_held,
 					 struct netlink_ext_ack *extack);
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 				    struct nlattr *nla, struct nlattr *est,
-				    char *name, int ovr, int bind,
 				    struct tc_action_ops *a_o, int *init_res,
-				    bool rtnl_held,
-				    struct netlink_ext_ack *extack);
+				    u32 flags, struct netlink_ext_ack *extack);
 int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind,
 		    int ref, bool terse);
 int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index dc28fcb6f0a2..64de26b7ad39 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -319,7 +319,7 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts,
 
 int tcf_exts_validate(struct net *net, struct tcf_proto *tp,
 		      struct nlattr **tb, struct nlattr *rate_tlv,
-		      struct tcf_exts *exts, bool ovr, bool rtnl_held,
+		      struct tcf_exts *exts, u32 flags,
 		      struct netlink_ext_ack *extack);
 void tcf_exts_destroy(struct tcf_exts *exts);
 void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9ed33e6840bd..c0069ac00e62 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -357,7 +357,7 @@ struct tcf_proto_ops {
 	int			(*change)(struct net *net, struct sk_buff *,
 					struct tcf_proto*, unsigned long,
 					u32 handle, struct nlattr **,
-					void **, bool, bool,
+					void **, u32,
 					struct netlink_ext_ack *);
 	int			(*delete)(struct tcf_proto *tp, void *arg,
 					  bool *last, bool rtnl_held,
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 025c40fef93d..6836ccb9c45d 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -22,6 +22,7 @@ enum {
 	__TCA_ACT_MAX
 };
 
+/* See other TCA_ACT_FLAGS_ * flags in include/net/act_api.h. */
 #define TCA_ACT_FLAGS_NO_PERCPU_STATS 1 /* Don't use percpu allocator for
 					 * actions stats.
 					 */
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 998a2374f7ae..7dd3a2dc5fa4 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -495,7 +495,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 	p->tcfa_tm.install = jiffies;
 	p->tcfa_tm.lastuse = jiffies;
 	p->tcfa_tm.firstuse = 0;
-	p->tcfa_flags = flags;
+	p->tcfa_flags = flags & TCA_ACT_FLAGS_USER_MASK;
 	if (est) {
 		err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
 					&p->tcfa_rate_est,
@@ -941,7 +941,7 @@ void tcf_idr_insert_many(struct tc_action *actions[])
 	}
 }
 
-struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla,
+struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police,
 					 bool rtnl_held,
 					 struct netlink_ext_ack *extack)
 {
@@ -951,7 +951,7 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla,
 	struct nlattr *kind;
 	int err;
 
-	if (name == NULL) {
+	if (!police) {
 		err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
 						  tcf_action_policy, extack);
 		if (err < 0)
@@ -967,7 +967,7 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla,
 			return ERR_PTR(err);
 		}
 	} else {
-		if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) {
+		if (strlcpy(act_name, "police", IFNAMSIZ) >= IFNAMSIZ) {
 			NL_SET_ERR_MSG(extack, "TC action name too long");
 			return ERR_PTR(-EINVAL);
 		}
@@ -1004,12 +1004,11 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla,
 
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 				    struct nlattr *nla, struct nlattr *est,
-				    char *name, int ovr, int bind,
 				    struct tc_action_ops *a_o, int *init_res,
-				    bool rtnl_held,
-				    struct netlink_ext_ack *extack)
+				    u32 flags, struct netlink_ext_ack *extack)
 {
-	struct nla_bitfield32 flags = { 0, 0 };
+	bool police = flags & TCA_ACT_FLAGS_POLICE;
+	struct nla_bitfield32 userflags = { 0, 0 };
 	u8 hw_stats = TCA_ACT_HW_STATS_ANY;
 	struct nlattr *tb[TCA_ACT_MAX + 1];
 	struct tc_cookie *cookie = NULL;
@@ -1017,7 +1016,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 	int err;
 
 	/* backward compatibility for policer */
-	if (name == NULL) {
+	if (!police) {
 		err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
 						  tcf_action_policy, extack);
 		if (err < 0)
@@ -1032,22 +1031,22 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 		}
 		hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]);
 		if (tb[TCA_ACT_FLAGS])
-			flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]);
+			userflags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]);
 
-		err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
-				rtnl_held, tp, flags.value, extack);
+		err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, tp,
+				userflags.value | flags, extack);
 	} else {
-		err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
-				tp, flags.value, extack);
+		err = a_o->init(net, nla, est, &a, tp, userflags.value | flags,
+				extack);
 	}
 	if (err < 0)
 		goto err_out;
 	*init_res = err;
 
-	if (!name && tb[TCA_ACT_COOKIE])
+	if (!police && tb[TCA_ACT_COOKIE])
 		tcf_set_action_cookie(&a->act_cookie, cookie);
 
-	if (!name)
+	if (!police)
 		a->hw_stats = hw_stats;
 
 	return a;
@@ -1063,9 +1062,9 @@ err_out:
 /* Returns numbers of initialized actions or negative error. */
 
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
-		    struct nlattr *est, char *name, int ovr, int bind,
-		    struct tc_action *actions[], int init_res[], size_t *attr_size,
-		    bool rtnl_held, struct netlink_ext_ack *extack)
+		    struct nlattr *est, struct tc_action *actions[],
+		    int init_res[], size_t *attr_size, u32 flags,
+		    struct netlink_ext_ack *extack)
 {
 	struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {};
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
@@ -1082,7 +1081,9 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
 		struct tc_action_ops *a_o;
 
-		a_o = tc_action_load_ops(name, tb[i], rtnl_held, extack);
+		a_o = tc_action_load_ops(tb[i], flags & TCA_ACT_FLAGS_POLICE,
+					 !(flags & TCA_ACT_FLAGS_NO_RTNL),
+					 extack);
 		if (IS_ERR(a_o)) {
 			err = PTR_ERR(a_o);
 			goto err_mod;
@@ -1091,9 +1092,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 	}
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
-		act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind,
-					ops[i - 1], &init_res[i - 1], rtnl_held,
-					extack);
+		act = tcf_action_init_1(net, tp, tb[i], est, ops[i - 1],
+					&init_res[i - 1], flags, extack);
 		if (IS_ERR(act)) {
 			err = PTR_ERR(act);
 			goto err;
@@ -1113,7 +1113,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 	goto err_mod;
 
 err:
-	tcf_action_destroy(actions, bind);
+	tcf_action_destroy(actions, flags & TCA_ACT_FLAGS_BIND);
 err_mod:
 	for (i = 0; i < TCA_ACT_MAX_PRIO; i++) {
 		if (ops[i])
@@ -1495,7 +1495,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
 }
 
 static int tcf_action_add(struct net *net, struct nlattr *nla,
-			  struct nlmsghdr *n, u32 portid, int ovr,
+			  struct nlmsghdr *n, u32 portid, u32 flags,
 			  struct netlink_ext_ack *extack)
 {
 	size_t attr_size = 0;
@@ -1504,8 +1504,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
 	int init_res[TCA_ACT_MAX_PRIO] = {};
 
 	for (loop = 0; loop < 10; loop++) {
-		ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0,
-				      actions, init_res, &attr_size, true, extack);
+		ret = tcf_action_init(net, NULL, nla, NULL, actions, init_res,
+				      &attr_size, flags, extack);
 		if (ret != -EAGAIN)
 			break;
 	}
@@ -1535,7 +1535,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *tca[TCA_ROOT_MAX + 1];
 	u32 portid = NETLINK_CB(skb).portid;
-	int ret = 0, ovr = 0;
+	u32 flags = 0;
+	int ret = 0;
 
 	if ((n->nlmsg_type != RTM_GETACTION) &&
 	    !netlink_capable(skb, CAP_NET_ADMIN))
@@ -1561,8 +1562,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 		 * is zero) then just set this
 		 */
 		if (n->nlmsg_flags & NLM_F_REPLACE)
-			ovr = 1;
-		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr,
+			flags = TCA_ACT_FLAGS_REPLACE;
+		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, flags,
 				     extack);
 		break;
 	case RTM_DELACTION:
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index e409a0005717..040807aa15b9 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -275,11 +275,11 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
 
 static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **act,
-			int replace, int bind, bool rtnl_held,
 			struct tcf_proto *tp, u32 flags,
 			struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, bpf_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
 	struct tcf_bpf_cfg cfg, old;
@@ -317,7 +317,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 		if (bind)
 			return 0;
 
-		if (!replace) {
+		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 			tcf_idr_release(*act, bind);
 			return -EEXIST;
 		}
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index e19885d7fe2c..94e78ac7a748 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -96,12 +96,12 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
 
 static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 			     struct nlattr *est, struct tc_action **a,
-			     int ovr, int bind, bool rtnl_held,
 			     struct tcf_proto *tp, u32 flags,
 			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, connmark_net_id);
 	struct nlattr *tb[TCA_CONNMARK_MAX + 1];
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct tcf_chain *goto_ch = NULL;
 	struct tcf_connmark_info *ci;
 	struct tc_connmark *parm;
@@ -144,7 +144,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 		ci = to_connmark(*a);
 		if (bind)
 			return 0;
-		if (!ovr) {
+		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 			tcf_idr_release(*a, bind);
 			return -EEXIST;
 		}
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 4fa4fcb842ba..a15ec95e69c3 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -41,11 +41,12 @@ static unsigned int csum_net_id;
 static struct tc_action_ops act_csum_ops;
 
 static int tcf_csum_init(struct net *net, struct nlattr *nla,
-			 struct nlattr *est, struct tc_action **a, int ovr,
-			 int bind, bool rtnl_held, struct tcf_proto *tp,
+			 struct nlattr *est, struct tc_action **a,
+			 struct tcf_proto *tp,
 			 u32 flags, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct tcf_csum_params *params_new;
 	struct nlattr *tb[TCA_CSUM_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
@@ -78,7 +79,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 	} else if (err > 0) {
 		if (bind)/* dont override defaults */
 			return 0;
-		if (!ovr) {
+		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 			tcf_idr_release(*a, bind);
 			return -EEXIST;
 		}
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 1b4b3514c94f..ad9df0cb4b98 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1235,11 +1235,11 @@ static int tcf_ct_fill_params(struct net *net,
 
 static int tcf_ct_init(struct net *net, struct nlattr *nla,
 		       struct nlattr *est, struct tc_action **a,
-		       int replace, int bind, bool rtnl_held,
 		       struct tcf_proto *tp, u32 flags,
 		       struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ct_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct tcf_ct_params *params = NULL;
 	struct nlattr *tb[TCA_CT_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
@@ -1279,7 +1279,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
 		if (bind)
 			return 0;
 
-		if (!replace) {
+		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 			tcf_idr_release(*a, bind);
 			return -EEXIST;
 		}
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index b20c8ce59905..549374a2d008 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -154,11 +154,11 @@ static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
 
 static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
-			   int ovr, int bind, bool rtnl_held,
 			   struct tcf_proto *tp, u32 flags,
 			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	u32 dscpmask = 0, dscpstatemask, index;
 	struct nlattr *tb[TCA_CTINFO_MAX + 1];
 	struct tcf_ctinfo_params *cp_new;
@@ -221,7 +221,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
 	} else if (err > 0) {
 		if (bind) /* don't override defaults */
 			return 0;
-		if (!ovr) {
+		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 			tcf_idr_release(*a, bind);
 			return -EEXIST;
 		}
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 73c3926358a0..d8dce173df37 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -52,11 +52,11 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
 
 static int tcf_gact_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
-			 int ovr, int bind, bool rtnl_held,
 			 struct tcf_proto *tp, u32 flags,
 			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, gact_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_GACT_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
 	struct tc_gact *parm;
@@ -109,7 +109,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 	} else if (err > 0) {
 		if (bind)/* dont override defaults */
 			return 0;
-		if (!ovr) {
+		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 			tcf_idr_release(*a, bind);
 			return -EEXIST;
 		}
diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c
index a78cb7965718..7df72a4197a3 100644
--- a/net/sched/act_gate.c
+++ b/net/sched/act_gate.c
@@ -295,12 +295,12 @@ static void gate_setup_timer(struct tcf_gate *gact, u64 basetime,
 
 static int tcf_gate_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
-			 int ovr, int bind, bool rtnl_held,
 			 struct tcf_proto *tp, u32 flags,
 			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, gate_net_id);
 	enum tk_offsets tk_offset = TK_OFFS_TAI;
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_GATE_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
 	u64 cycletime = 0, basetime = 0;
@@ -364,7 +364,7 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 		}
 
 		ret = ACT_P_CREATED;
-	} else if (!ovr) {
+	} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 		tcf_idr_release(*a, bind);
 		return -EEXIST;
 	}
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index a2ddea04183a..7064a365a1a9 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -479,11 +479,11 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
 
 static int tcf_ife_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **a,
-			int ovr, int bind, bool rtnl_held,
 			struct tcf_proto *tp, u32 flags,
 			struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ife_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_IFE_MAX + 1];
 	struct nlattr *tb2[IFE_META_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
@@ -532,7 +532,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 			kfree(p);
 			return err;
 		}
-		err = load_metalist(tb2, rtnl_held);
+		err = load_metalist(tb2, !(flags & TCA_ACT_FLAGS_NO_RTNL));
 		if (err) {
 			kfree(p);
 			return err;
@@ -560,7 +560,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 			return ret;
 		}
 		ret = ACT_P_CREATED;
-	} else if (!ovr) {
+	} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 		tcf_idr_release(*a, bind);
 		kfree(p);
 		return -EEXIST;
@@ -600,7 +600,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (tb[TCA_IFE_METALST]) {
-		err = populate_metalist(ife, tb2, exists, rtnl_held);
+		err = populate_metalist(ife, tb2, exists,
+					!(flags & TCA_ACT_FLAGS_NO_RTNL));
 		if (err)
 			goto metadata_parse_err;
 	} else {
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index ac7297f42355..265b1443e252 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -94,10 +94,11 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
 
 static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
 			  struct nlattr *est, struct tc_action **a,
-			  const struct tc_action_ops *ops, int ovr, int bind,
+			  const struct tc_action_ops *ops,
 			  struct tcf_proto *tp, u32 flags)
 {
 	struct tc_action_net *tn = net_generic(net, id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_IPT_MAX + 1];
 	struct tcf_ipt *ipt;
 	struct xt_entry_target *td, *t;
@@ -154,7 +155,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
 		if (bind)/* dont override defaults */
 			return 0;
 
-		if (!ovr) {
+		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 			tcf_idr_release(*a, bind);
 			return -EEXIST;
 		}
@@ -201,21 +202,21 @@ err1:
 }
 
 static int tcf_ipt_init(struct net *net, struct nlattr *nla,
-			struct nlattr *est, struct tc_action **a, int ovr,
-			int bind, bool rtnl_held, struct tcf_proto *tp,
+			struct nlattr *est, struct tc_action **a,
+			struct tcf_proto *tp,
 			u32 flags, struct netlink_ext_ack *extack)
 {
-	return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
-			      bind, tp, flags);
+	return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops,
+			      tp, flags);
 }
 
 static int tcf_xt_init(struct net *net, struct nlattr *nla,
-		       struct nlattr *est, struct tc_action **a, int ovr,
-		       int bind, bool unlocked, struct tcf_proto *tp,
+		       struct nlattr *est, struct tc_action **a,
+		       struct tcf_proto *tp,
 		       u32 flags, struct netlink_ext_ack *extack)
 {
-	return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
-			      bind, tp, flags);
+	return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops,
+			      tp, flags);
 }
 
 static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 7153c67f641e..77ee80e3effc 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -91,11 +91,11 @@ static struct tc_action_ops act_mirred_ops;
 
 static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
-			   int ovr, int bind, bool rtnl_held,
 			   struct tcf_proto *tp,
 			   u32 flags, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, mirred_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_MIRRED_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
 	bool mac_header_xmit = false;
@@ -155,7 +155,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 			return ret;
 		}
 		ret = ACT_P_CREATED;
-	} else if (!ovr) {
+	} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 		tcf_idr_release(*a, bind);
 		return -EEXIST;
 	}
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index d1486ea496a2..e4529b428cf4 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -152,11 +152,11 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
 
 static int tcf_mpls_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
-			 int ovr, int bind, bool rtnl_held,
 			 struct tcf_proto *tp, u32 flags,
 			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, mpls_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_MPLS_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
 	struct tcf_mpls_params *p;
@@ -255,7 +255,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
 		}
 
 		ret = ACT_P_CREATED;
-	} else if (!ovr) {
+	} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 		tcf_idr_release(*a, bind);
 		return -EEXIST;
 	}
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 1ebd2a86d980..7dd6b586ba7f 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -34,11 +34,11 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
 };
 
 static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
-			struct tc_action **a, int ovr, int bind,
-			bool rtnl_held,	struct tcf_proto *tp,
+			struct tc_action **a, struct tcf_proto *tp,
 			u32 flags, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, nat_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_NAT_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
 	struct tc_nat *parm;
@@ -70,7 +70,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	} else if (err > 0) {
 		if (bind)
 			return 0;
-		if (!ovr) {
+		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 			tcf_idr_release(*a, bind);
 			return -EEXIST;
 		}
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index b45304446e13..c6c862c459cc 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -136,11 +136,11 @@ nla_failure:
 
 static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 			  struct nlattr *est, struct tc_action **a,
-			  int ovr, int bind, bool rtnl_held,
 			  struct tcf_proto *tp, u32 flags,
 			  struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_PEDIT_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
 	struct tc_pedit_key *keys = NULL;
@@ -198,7 +198,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	} else if (err > 0) {
 		if (bind)
 			goto out_free;
-		if (!ovr) {
+		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 			ret = -EEXIST;
 			goto out_release;
 		}
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 0fab8de176d2..832157a840fc 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -48,11 +48,11 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
 
 static int tcf_police_init(struct net *net, struct nlattr *nla,
 			       struct nlattr *est, struct tc_action **a,
-			       int ovr, int bind, bool rtnl_held,
 			       struct tcf_proto *tp, u32 flags,
 			       struct netlink_ext_ack *extack)
 {
 	int ret = 0, tcfp_result = TC_ACT_OK, err, size;
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_POLICE_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
 	struct tc_police *parm;
@@ -97,7 +97,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 		}
 		ret = ACT_P_CREATED;
 		spin_lock_init(&(to_police(*a)->tcfp_lock));
-	} else if (!ovr) {
+	} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 		tcf_idr_release(*a, bind);
 		return -EEXIST;
 	}
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 6a0c16e4351d..230501eb9e06 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -34,11 +34,12 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
 };
 
 static int tcf_sample_init(struct net *net, struct nlattr *nla,
-			   struct nlattr *est, struct tc_action **a, int ovr,
-			   int bind, bool rtnl_held, struct tcf_proto *tp,
+			   struct nlattr *est, struct tc_action **a,
+			   struct tcf_proto *tp,
 			   u32 flags, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, sample_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_SAMPLE_MAX + 1];
 	struct psample_group *psample_group;
 	u32 psample_group_num, rate, index;
@@ -75,7 +76,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 			return ret;
 		}
 		ret = ACT_P_CREATED;
-	} else if (!ovr) {
+	} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 		tcf_idr_release(*a, bind);
 		return -EEXIST;
 	}
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 726cc956d06f..cbbe1861d3a2 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -85,11 +85,11 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
 
 static int tcf_simp_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
-			 int ovr, int bind, bool rtnl_held,
 			 struct tcf_proto *tp, u32 flags,
 			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, simp_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_DEF_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
 	struct tc_defact *parm;
@@ -147,7 +147,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 		tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 		ret = ACT_P_CREATED;
 	} else {
-		if (!ovr) {
+		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 			err = -EEXIST;
 			goto release_idr;
 		}
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index e5f3fb8b00e3..605418538347 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -96,11 +96,11 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 
 static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 			    struct nlattr *est, struct tc_action **a,
-			    int ovr, int bind, bool rtnl_held,
 			    struct tcf_proto *tp, u32 act_flags,
 			    struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+	bool bind = act_flags & TCA_ACT_FLAGS_BIND;
 	struct tcf_skbedit_params *params_new;
 	struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
@@ -186,7 +186,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 		ret = ACT_P_CREATED;
 	} else {
 		d = to_skbedit(*a);
-		if (!ovr) {
+		if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) {
 			tcf_idr_release(*a, bind);
 			return -EEXIST;
 		}
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 762ceec3e6f6..ecb9ee666095 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -100,11 +100,12 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
 
 static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
-			   int ovr, int bind, bool rtnl_held,
 			   struct tcf_proto *tp, u32 flags,
 			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+	bool ovr = flags & TCA_ACT_FLAGS_REPLACE;
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_SKBMOD_MAX + 1];
 	struct tcf_skbmod_params *p, *p_old;
 	struct tcf_chain *goto_ch = NULL;
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 85c0d0d5b9da..d9cd174eecb7 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -355,11 +355,11 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p)
 
 static int tunnel_key_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
-			   int ovr, int bind, bool rtnl_held,
 			   struct tcf_proto *tp, u32 act_flags,
 			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+	bool bind = act_flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
 	struct tcf_tunnel_key_params *params_new;
 	struct metadata_dst *metadata = NULL;
@@ -504,7 +504,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 		}
 
 		ret = ACT_P_CREATED;
-	} else if (!ovr) {
+	} else if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) {
 		NL_SET_ERR_MSG(extack, "TC IDR already exists");
 		ret = -EEXIST;
 		goto release_tun_meta;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 71f2015c70ca..e4dc5a555bd8 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -114,11 +114,11 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
 
 static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
-			 int ovr, int bind, bool rtnl_held,
 			 struct tcf_proto *tp, u32 flags,
 			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, vlan_net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_VLAN_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
 	bool push_prio_exists = false;
@@ -223,7 +223,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 		}
 
 		ret = ACT_P_CREATED;
-	} else if (!ovr) {
+	} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
 		tcf_idr_release(*a, bind);
 		return -EEXIST;
 	}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 7be5b9d2aead..69185e311422 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1949,6 +1949,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 	int err;
 	int tp_created;
 	bool rtnl_held = false;
+	u32 flags = 0;
 
 	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
@@ -2112,9 +2113,12 @@ replay:
 		goto errout;
 	}
 
+	if (!(n->nlmsg_flags & NLM_F_CREATE))
+		flags |= TCA_ACT_FLAGS_REPLACE;
+	if (!rtnl_held)
+		flags |= TCA_ACT_FLAGS_NO_RTNL;
 	err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
-			      n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE,
-			      rtnl_held, extack);
+			      flags, extack);
 	if (err == 0) {
 		tfilter_notify(net, skb, n, tp, block, q, parent, fh,
 			       RTM_NEWTFILTER, false, rtnl_held);
@@ -3020,8 +3024,8 @@ void tcf_exts_destroy(struct tcf_exts *exts)
 EXPORT_SYMBOL(tcf_exts_destroy);
 
 int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
-		      struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr,
-		      bool rtnl_held, struct netlink_ext_ack *extack)
+		      struct nlattr *rate_tlv, struct tcf_exts *exts,
+		      u32 flags, struct netlink_ext_ack *extack)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	{
@@ -3032,13 +3036,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 		if (exts->police && tb[exts->police]) {
 			struct tc_action_ops *a_o;
 
-			a_o = tc_action_load_ops("police", tb[exts->police], rtnl_held, extack);
+			a_o = tc_action_load_ops(tb[exts->police], true,
+						 !(flags & TCA_ACT_FLAGS_NO_RTNL),
+						 extack);
 			if (IS_ERR(a_o))
 				return PTR_ERR(a_o);
+			flags |= TCA_ACT_FLAGS_POLICE | TCA_ACT_FLAGS_BIND;
 			act = tcf_action_init_1(net, tp, tb[exts->police],
-						rate_tlv, "police", ovr,
-						TCA_ACT_BIND, a_o, init_res,
-						rtnl_held, extack);
+						rate_tlv, a_o, init_res, flags,
+						extack);
 			module_put(a_o->owner);
 			if (IS_ERR(act))
 				return PTR_ERR(act);
@@ -3050,10 +3056,10 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 		} else if (exts->action && tb[exts->action]) {
 			int err;
 
+			flags |= TCA_ACT_FLAGS_BIND;
 			err = tcf_action_init(net, tp, tb[exts->action],
-					      rate_tlv, NULL, ovr, TCA_ACT_BIND,
-					      exts->actions, init_res,
-					      &attr_size, rtnl_held, extack);
+					      rate_tlv, exts->actions, init_res,
+					      &attr_size, flags, extack);
 			if (err < 0)
 				return err;
 			exts->nr_actions = err;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index f256a7c69093..8158fc9ee1ab 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -145,12 +145,12 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
 static int basic_set_parms(struct net *net, struct tcf_proto *tp,
 			   struct basic_filter *f, unsigned long base,
 			   struct nlattr **tb,
-			   struct nlattr *est, bool ovr,
+			   struct nlattr *est, u32 flags,
 			   struct netlink_ext_ack *extack)
 {
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack);
+	err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack);
 	if (err < 0)
 		return err;
 
@@ -169,8 +169,8 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp,
 
 static int basic_change(struct net *net, struct sk_buff *in_skb,
 			struct tcf_proto *tp, unsigned long base, u32 handle,
-			struct nlattr **tca, void **arg, bool ovr,
-			bool rtnl_held, struct netlink_ext_ack *extack)
+			struct nlattr **tca, void **arg,
+			u32 flags, struct netlink_ext_ack *extack)
 {
 	int err;
 	struct basic_head *head = rtnl_dereference(tp->root);
@@ -216,7 +216,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 		goto errout;
 	}
 
-	err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr,
+	err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], flags,
 			      extack);
 	if (err < 0) {
 		if (!fold)
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index fa739efa59f4..3b472bafdc9d 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -404,7 +404,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
 
 static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
 			     struct cls_bpf_prog *prog, unsigned long base,
-			     struct nlattr **tb, struct nlattr *est, bool ovr,
+			     struct nlattr **tb, struct nlattr *est, u32 flags,
 			     struct netlink_ext_ack *extack)
 {
 	bool is_bpf, is_ebpf, have_exts = false;
@@ -416,7 +416,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
 	if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf))
 		return -EINVAL;
 
-	ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, true,
+	ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, flags,
 				extack);
 	if (ret < 0)
 		return ret;
@@ -455,7 +455,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
 static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 			  struct tcf_proto *tp, unsigned long base,
 			  u32 handle, struct nlattr **tca,
-			  void **arg, bool ovr, bool rtnl_held,
+			  void **arg, u32 flags,
 			  struct netlink_ext_ack *extack)
 {
 	struct cls_bpf_head *head = rtnl_dereference(tp->root);
@@ -500,7 +500,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 		goto errout;
 	prog->handle = handle;
 
-	ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr,
+	ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], flags,
 				extack);
 	if (ret < 0)
 		goto errout_idr;
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index fb881144fa01..ed00001b528a 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -76,7 +76,7 @@ static void cls_cgroup_destroy_work(struct work_struct *work)
 static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 			     struct tcf_proto *tp, unsigned long base,
 			     u32 handle, struct nlattr **tca,
-			     void **arg, bool ovr, bool rtnl_held,
+			     void **arg, u32 flags,
 			     struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[TCA_CGROUP_MAX + 1];
@@ -108,8 +108,8 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		goto errout;
 
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr,
-				true, extack);
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, flags,
+				extack);
 	if (err < 0)
 		goto errout;
 
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 87398af2715a..972303aa8edd 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -387,7 +387,7 @@ static void flow_destroy_filter_work(struct work_struct *work)
 static int flow_change(struct net *net, struct sk_buff *in_skb,
 		       struct tcf_proto *tp, unsigned long base,
 		       u32 handle, struct nlattr **tca,
-		       void **arg, bool ovr, bool rtnl_held,
+		       void **arg, u32 flags,
 		       struct netlink_ext_ack *extack)
 {
 	struct flow_head *head = rtnl_dereference(tp->root);
@@ -442,8 +442,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		goto err2;
 
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr,
-				true, extack);
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, flags,
+				extack);
 	if (err < 0)
 		goto err2;
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index d7869a984881..23b21253b3c3 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1915,23 +1915,22 @@ errout_cleanup:
 static int fl_set_parms(struct net *net, struct tcf_proto *tp,
 			struct cls_fl_filter *f, struct fl_flow_mask *mask,
 			unsigned long base, struct nlattr **tb,
-			struct nlattr *est, bool ovr,
-			struct fl_flow_tmplt *tmplt, bool rtnl_held,
+			struct nlattr *est,
+			struct fl_flow_tmplt *tmplt, u32 flags,
 			struct netlink_ext_ack *extack)
 {
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, rtnl_held,
-				extack);
+	err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack);
 	if (err < 0)
 		return err;
 
 	if (tb[TCA_FLOWER_CLASSID]) {
 		f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]);
-		if (!rtnl_held)
+		if (flags & TCA_ACT_FLAGS_NO_RTNL)
 			rtnl_lock();
 		tcf_bind_filter(tp, &f->res, base);
-		if (!rtnl_held)
+		if (flags & TCA_ACT_FLAGS_NO_RTNL)
 			rtnl_unlock();
 	}
 
@@ -1975,10 +1974,11 @@ static int fl_ht_insert_unique(struct cls_fl_filter *fnew,
 static int fl_change(struct net *net, struct sk_buff *in_skb,
 		     struct tcf_proto *tp, unsigned long base,
 		     u32 handle, struct nlattr **tca,
-		     void **arg, bool ovr, bool rtnl_held,
+		     void **arg, u32 flags,
 		     struct netlink_ext_ack *extack)
 {
 	struct cls_fl_head *head = fl_head_dereference(tp);
+	bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL);
 	struct cls_fl_filter *fold = *arg;
 	struct cls_fl_filter *fnew;
 	struct fl_flow_mask *mask;
@@ -2034,8 +2034,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		}
 	}
 
-	err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], ovr,
-			   tp->chain->tmplt_priv, rtnl_held, extack);
+	err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE],
+			   tp->chain->tmplt_priv, flags, extack);
 	if (err)
 		goto errout;
 
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index ec945294626a..8654b0ce997c 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -198,15 +198,15 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
 
 static int fw_set_parms(struct net *net, struct tcf_proto *tp,
 			struct fw_filter *f, struct nlattr **tb,
-			struct nlattr **tca, unsigned long base, bool ovr,
+			struct nlattr **tca, unsigned long base, u32 flags,
 			struct netlink_ext_ack *extack)
 {
 	struct fw_head *head = rtnl_dereference(tp->root);
 	u32 mask;
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr,
-				true, extack);
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, flags,
+				extack);
 	if (err < 0)
 		return err;
 
@@ -237,8 +237,7 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
 static int fw_change(struct net *net, struct sk_buff *in_skb,
 		     struct tcf_proto *tp, unsigned long base,
 		     u32 handle, struct nlattr **tca, void **arg,
-		     bool ovr, bool rtnl_held,
-		     struct netlink_ext_ack *extack)
+		     u32 flags, struct netlink_ext_ack *extack)
 {
 	struct fw_head *head = rtnl_dereference(tp->root);
 	struct fw_filter *f = *arg;
@@ -277,7 +276,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 			return err;
 		}
 
-		err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr, extack);
+		err = fw_set_parms(net, tp, fnew, tb, tca, base, flags, extack);
 		if (err < 0) {
 			tcf_exts_destroy(&fnew->exts);
 			kfree(fnew);
@@ -326,7 +325,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 	f->id = handle;
 	f->tp = tp;
 
-	err = fw_set_parms(net, tp, f, tb, tca, base, ovr, extack);
+	err = fw_set_parms(net, tp, f, tb, tca, base, flags, extack);
 	if (err < 0)
 		goto errout;
 
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index cafb84480bab..24f0046ce0b3 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -163,13 +163,12 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
 static int mall_set_parms(struct net *net, struct tcf_proto *tp,
 			  struct cls_mall_head *head,
 			  unsigned long base, struct nlattr **tb,
-			  struct nlattr *est, bool ovr,
+			  struct nlattr *est, u32 flags,
 			  struct netlink_ext_ack *extack)
 {
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, true,
-				extack);
+	err = tcf_exts_validate(net, tp, tb, est, &head->exts, flags, extack);
 	if (err < 0)
 		return err;
 
@@ -183,13 +182,13 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp,
 static int mall_change(struct net *net, struct sk_buff *in_skb,
 		       struct tcf_proto *tp, unsigned long base,
 		       u32 handle, struct nlattr **tca,
-		       void **arg, bool ovr, bool rtnl_held,
+		       void **arg, u32 flags,
 		       struct netlink_ext_ack *extack)
 {
 	struct cls_mall_head *head = rtnl_dereference(tp->root);
 	struct nlattr *tb[TCA_MATCHALL_MAX + 1];
 	struct cls_mall_head *new;
-	u32 flags = 0;
+	u32 userflags = 0;
 	int err;
 
 	if (!tca[TCA_OPTIONS])
@@ -204,8 +203,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 		return err;
 
 	if (tb[TCA_MATCHALL_FLAGS]) {
-		flags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]);
-		if (!tc_flags_valid(flags))
+		userflags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]);
+		if (!tc_flags_valid(userflags))
 			return -EINVAL;
 	}
 
@@ -220,14 +219,14 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 	if (!handle)
 		handle = 1;
 	new->handle = handle;
-	new->flags = flags;
+	new->flags = userflags;
 	new->pf = alloc_percpu(struct tc_matchall_pcnt);
 	if (!new->pf) {
 		err = -ENOMEM;
 		goto err_alloc_percpu;
 	}
 
-	err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr,
+	err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], flags,
 			     extack);
 	if (err)
 		goto err_set_parms;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 5efa3e7ace15..a35ab8c27866 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -382,7 +382,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
 			    unsigned long base, struct route4_filter *f,
 			    u32 handle, struct route4_head *head,
 			    struct nlattr **tb, struct nlattr *est, int new,
-			    bool ovr, struct netlink_ext_ack *extack)
+			    u32 flags, struct netlink_ext_ack *extack)
 {
 	u32 id = 0, to = 0, nhandle = 0x8000;
 	struct route4_filter *fp;
@@ -390,7 +390,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
 	struct route4_bucket *b;
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack);
+	err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack);
 	if (err < 0)
 		return err;
 
@@ -464,8 +464,8 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
 
 static int route4_change(struct net *net, struct sk_buff *in_skb,
 			 struct tcf_proto *tp, unsigned long base, u32 handle,
-			 struct nlattr **tca, void **arg, bool ovr,
-			 bool rtnl_held, struct netlink_ext_ack *extack)
+			 struct nlattr **tca, void **arg, u32 flags,
+			 struct netlink_ext_ack *extack)
 {
 	struct route4_head *head = rtnl_dereference(tp->root);
 	struct route4_filter __rcu **fp;
@@ -510,7 +510,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
 	}
 
 	err = route4_set_parms(net, tp, base, f, handle, head, tb,
-			       tca[TCA_RATE], new, ovr, extack);
+			       tca[TCA_RATE], new, flags, extack);
 	if (err < 0)
 		goto errout;
 
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 27a4b6dbcf57..5cd9d6b143c4 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -470,9 +470,8 @@ static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
 
 static int rsvp_change(struct net *net, struct sk_buff *in_skb,
 		       struct tcf_proto *tp, unsigned long base,
-		       u32 handle,
-		       struct nlattr **tca,
-		       void **arg, bool ovr, bool rtnl_held,
+		       u32 handle, struct nlattr **tca,
+		       void **arg, u32 flags,
 		       struct netlink_ext_ack *extack)
 {
 	struct rsvp_head *data = rtnl_dereference(tp->root);
@@ -499,7 +498,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
 	err = tcf_exts_init(&e, net, TCA_RSVP_ACT, TCA_RSVP_POLICE);
 	if (err < 0)
 		return err;
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, true,
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, flags,
 				extack);
 	if (err < 0)
 		goto errout2;
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index e9a8a2c86bbd..742c7d49a958 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -330,7 +330,7 @@ static int
 tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
 		  u32 handle, struct tcindex_data *p,
 		  struct tcindex_filter_result *r, struct nlattr **tb,
-		  struct nlattr *est, bool ovr, struct netlink_ext_ack *extack)
+		  struct nlattr *est, u32 flags, struct netlink_ext_ack *extack)
 {
 	struct tcindex_filter_result new_filter_result, *old_r = r;
 	struct tcindex_data *cp = NULL, *oldp;
@@ -342,7 +342,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
 	err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
 	if (err < 0)
 		return err;
-	err = tcf_exts_validate(net, tp, tb, est, &e, ovr, true, extack);
+	err = tcf_exts_validate(net, tp, tb, est, &e, flags, extack);
 	if (err < 0)
 		goto errout;
 
@@ -529,8 +529,8 @@ errout:
 static int
 tcindex_change(struct net *net, struct sk_buff *in_skb,
 	       struct tcf_proto *tp, unsigned long base, u32 handle,
-	       struct nlattr **tca, void **arg, bool ovr,
-	       bool rtnl_held, struct netlink_ext_ack *extack)
+	       struct nlattr **tca, void **arg, u32 flags,
+	       struct netlink_ext_ack *extack)
 {
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_TCINDEX_MAX + 1];
@@ -551,7 +551,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb,
 		return err;
 
 	return tcindex_set_parms(net, tp, base, handle, p, r, tb,
-				 tca[TCA_RATE], ovr, extack);
+				 tca[TCA_RATE], flags, extack);
 }
 
 static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker,
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 6e1abe805448..4272814487f0 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -709,12 +709,12 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
 static int u32_set_parms(struct net *net, struct tcf_proto *tp,
 			 unsigned long base,
 			 struct tc_u_knode *n, struct nlattr **tb,
-			 struct nlattr *est, bool ovr,
+			 struct nlattr *est, u32 flags,
 			 struct netlink_ext_ack *extack)
 {
 	int err;
 
-	err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, true, extack);
+	err = tcf_exts_validate(net, tp, tb, est, &n->exts, flags, extack);
 	if (err < 0)
 		return err;
 
@@ -840,7 +840,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp,
 
 static int u32_change(struct net *net, struct sk_buff *in_skb,
 		      struct tcf_proto *tp, unsigned long base, u32 handle,
-		      struct nlattr **tca, void **arg, bool ovr, bool rtnl_held,
+		      struct nlattr **tca, void **arg, u32 flags,
 		      struct netlink_ext_ack *extack)
 {
 	struct tc_u_common *tp_c = tp->data;
@@ -849,7 +849,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	struct tc_u32_sel *s;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_U32_MAX + 1];
-	u32 htid, flags = 0;
+	u32 htid, userflags = 0;
 	size_t sel_size;
 	int err;
 
@@ -868,8 +868,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		return err;
 
 	if (tb[TCA_U32_FLAGS]) {
-		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
-		if (!tc_flags_valid(flags)) {
+		userflags = nla_get_u32(tb[TCA_U32_FLAGS]);
+		if (!tc_flags_valid(userflags)) {
 			NL_SET_ERR_MSG_MOD(extack, "Invalid filter flags");
 			return -EINVAL;
 		}
@@ -884,7 +884,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			return -EINVAL;
 		}
 
-		if ((n->flags ^ flags) &
+		if ((n->flags ^ userflags) &
 		    ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) {
 			NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags");
 			return -EINVAL;
@@ -895,7 +895,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			return -ENOMEM;
 
 		err = u32_set_parms(net, tp, base, new, tb,
-				    tca[TCA_RATE], ovr, extack);
+				    tca[TCA_RATE], flags, extack);
 
 		if (err) {
 			u32_destroy_key(new, false);
@@ -955,9 +955,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		ht->handle = handle;
 		ht->prio = tp->prio;
 		idr_init(&ht->handle_idr);
-		ht->flags = flags;
+		ht->flags = userflags;
 
-		err = u32_replace_hw_hnode(tp, ht, flags, extack);
+		err = u32_replace_hw_hnode(tp, ht, userflags, extack);
 		if (err) {
 			idr_remove(&tp_c->handle_idr, handle);
 			kfree(ht);
@@ -1038,7 +1038,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	RCU_INIT_POINTER(n->ht_up, ht);
 	n->handle = handle;
 	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
-	n->flags = flags;
+	n->flags = userflags;
 
 	err = tcf_exts_init(&n->exts, net, TCA_U32_ACT, TCA_U32_POLICE);
 	if (err < 0)
@@ -1060,7 +1060,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	}
 #endif
 
-	err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], ovr,
+	err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], flags,
 			    extack);
 	if (err == 0) {
 		struct tc_u_knode __rcu **ins;
-- 
cgit v1.2.3


From 87663c39f898b18905499126548da61450628682 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 23 Jul 2021 15:18:01 +0200
Subject: netfilter: ebtables: do not hook tables by default

If any of these modules is loaded, hooks get registered in all netns:

Before: 'unshare -n nft list hooks' shows:
family bridge hook prerouting {
	-2147483648 ebt_broute
	-0000000300 ebt_nat_hook
}
family bridge hook input {
	-0000000200 ebt_filter_hook
}
family bridge hook forward {
	-0000000200 ebt_filter_hook
}
family bridge hook output {
	+0000000100 ebt_nat_hook
	+0000000200 ebt_filter_hook
}
family bridge hook postrouting {
	+0000000300 ebt_nat_hook
}

This adds 'template 'tables' for ebtables.

Each ebtable_foo registers the table as a template, with an init function
that gets called once the first get/setsockopt call is made.

ebtables core then searches the (per netns) list of tables.
If no table is found, it searches the list of templates instead.
If a template entry exists, the init function is called which will
enable the table and register the hooks (so packets are diverted
to the table).

If no entry is found in the template list, request_module is called.

After this, hook registration is delayed until the 'ebtables'
(set/getsockopt) request is made for a given table and will only
happen in the specific namespace.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge/ebtables.h |   2 +
 net/bridge/netfilter/ebtable_broute.c     |  17 ++++-
 net/bridge/netfilter/ebtable_filter.c     |  17 ++++-
 net/bridge/netfilter/ebtable_nat.c        |  17 ++++-
 net/bridge/netfilter/ebtables.c           | 109 ++++++++++++++++++++++++++----
 5 files changed, 139 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index a8178253ce53..10a01978bc0d 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -127,4 +127,6 @@ static inline bool ebt_invalid_target(int target)
 	return (target < -NUM_STANDARD_TARGETS || target >= 0);
 }
 
+int ebt_register_template(const struct ebt_table *t, int(*table_init)(struct net *net));
+void ebt_unregister_template(const struct ebt_table *t);
 #endif
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 020b1487ee0c..a7af4eaff17d 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -98,7 +98,7 @@ static const struct nf_hook_ops ebt_ops_broute = {
 	.priority	= NF_BR_PRI_FIRST,
 };
 
-static int __net_init broute_net_init(struct net *net)
+static int broute_table_init(struct net *net)
 {
 	return ebt_register_table(net, &broute_table, &ebt_ops_broute);
 }
@@ -114,19 +114,30 @@ static void __net_exit broute_net_exit(struct net *net)
 }
 
 static struct pernet_operations broute_net_ops = {
-	.init = broute_net_init,
 	.exit = broute_net_exit,
 	.pre_exit = broute_net_pre_exit,
 };
 
 static int __init ebtable_broute_init(void)
 {
-	return register_pernet_subsys(&broute_net_ops);
+	int ret = ebt_register_template(&broute_table, broute_table_init);
+
+	if (ret)
+		return ret;
+
+	ret = register_pernet_subsys(&broute_net_ops);
+	if (ret) {
+		ebt_unregister_template(&broute_table);
+		return ret;
+	}
+
+	return 0;
 }
 
 static void __exit ebtable_broute_fini(void)
 {
 	unregister_pernet_subsys(&broute_net_ops);
+	ebt_unregister_template(&broute_table);
 }
 
 module_init(ebtable_broute_init);
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index 8ec0b3736803..c0b121df4a9a 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -86,7 +86,7 @@ static const struct nf_hook_ops ebt_ops_filter[] = {
 	},
 };
 
-static int __net_init frame_filter_net_init(struct net *net)
+static int frame_filter_table_init(struct net *net)
 {
 	return ebt_register_table(net, &frame_filter, ebt_ops_filter);
 }
@@ -102,19 +102,30 @@ static void __net_exit frame_filter_net_exit(struct net *net)
 }
 
 static struct pernet_operations frame_filter_net_ops = {
-	.init = frame_filter_net_init,
 	.exit = frame_filter_net_exit,
 	.pre_exit = frame_filter_net_pre_exit,
 };
 
 static int __init ebtable_filter_init(void)
 {
-	return register_pernet_subsys(&frame_filter_net_ops);
+	int ret = ebt_register_template(&frame_filter, frame_filter_table_init);
+
+	if (ret)
+		return ret;
+
+	ret = register_pernet_subsys(&frame_filter_net_ops);
+	if (ret) {
+		ebt_unregister_template(&frame_filter);
+		return ret;
+	}
+
+	return 0;
 }
 
 static void __exit ebtable_filter_fini(void)
 {
 	unregister_pernet_subsys(&frame_filter_net_ops);
+	ebt_unregister_template(&frame_filter);
 }
 
 module_init(ebtable_filter_init);
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 7c8a1064a531..4078151c224f 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -85,7 +85,7 @@ static const struct nf_hook_ops ebt_ops_nat[] = {
 	},
 };
 
-static int __net_init frame_nat_net_init(struct net *net)
+static int frame_nat_table_init(struct net *net)
 {
 	return ebt_register_table(net, &frame_nat, ebt_ops_nat);
 }
@@ -101,19 +101,30 @@ static void __net_exit frame_nat_net_exit(struct net *net)
 }
 
 static struct pernet_operations frame_nat_net_ops = {
-	.init = frame_nat_net_init,
 	.exit = frame_nat_net_exit,
 	.pre_exit = frame_nat_net_pre_exit,
 };
 
 static int __init ebtable_nat_init(void)
 {
-	return register_pernet_subsys(&frame_nat_net_ops);
+	int ret = ebt_register_template(&frame_nat, frame_nat_table_init);
+
+	if (ret)
+		return ret;
+
+	ret = register_pernet_subsys(&frame_nat_net_ops);
+	if (ret) {
+		ebt_unregister_template(&frame_nat);
+		return ret;
+	}
+
+	return ret;
 }
 
 static void __exit ebtable_nat_fini(void)
 {
 	unregister_pernet_subsys(&frame_nat_net_ops);
+	ebt_unregister_template(&frame_nat);
 }
 
 module_init(ebtable_nat_init);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index f022deb3721e..83d1798dfbb4 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -44,7 +44,16 @@ struct ebt_pernet {
 	struct list_head tables;
 };
 
+struct ebt_template {
+	struct list_head list;
+	char name[EBT_TABLE_MAXNAMELEN];
+	struct module *owner;
+	/* called when table is needed in the given netns */
+	int (*table_init)(struct net *net);
+};
+
 static unsigned int ebt_pernet_id __read_mostly;
+static LIST_HEAD(template_tables);
 static DEFINE_MUTEX(ebt_mutex);
 
 #ifdef CONFIG_NETFILTER_XTABLES_COMPAT
@@ -309,30 +318,57 @@ letscontinue:
 
 /* If it succeeds, returns element and locks mutex */
 static inline void *
-find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
+find_inlist_lock_noload(struct net *net, const char *name, int *error,
 			struct mutex *mutex)
 {
-	struct {
-		struct list_head list;
-		char name[EBT_FUNCTION_MAXNAMELEN];
-	} *e;
+	struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
+	struct ebt_template *tmpl;
+	struct ebt_table *table;
 
 	mutex_lock(mutex);
-	list_for_each_entry(e, head, list) {
-		if (strcmp(e->name, name) == 0)
-			return e;
+	list_for_each_entry(table, &ebt_net->tables, list) {
+		if (strcmp(table->name, name) == 0)
+			return table;
 	}
+
+	list_for_each_entry(tmpl, &template_tables, list) {
+		if (strcmp(name, tmpl->name) == 0) {
+			struct module *owner = tmpl->owner;
+
+			if (!try_module_get(owner))
+				goto out;
+
+			mutex_unlock(mutex);
+
+			*error = tmpl->table_init(net);
+			if (*error) {
+				module_put(owner);
+				return NULL;
+			}
+
+			mutex_lock(mutex);
+			module_put(owner);
+			break;
+		}
+	}
+
+	list_for_each_entry(table, &ebt_net->tables, list) {
+		if (strcmp(table->name, name) == 0)
+			return table;
+	}
+
+out:
 	*error = -ENOENT;
 	mutex_unlock(mutex);
 	return NULL;
 }
 
 static void *
-find_inlist_lock(struct list_head *head, const char *name, const char *prefix,
+find_inlist_lock(struct net *net, const char *name, const char *prefix,
 		 int *error, struct mutex *mutex)
 {
 	return try_then_request_module(
-			find_inlist_lock_noload(head, name, error, mutex),
+			find_inlist_lock_noload(net, name, error, mutex),
 			"%s%s", prefix, name);
 }
 
@@ -340,10 +376,7 @@ static inline struct ebt_table *
 find_table_lock(struct net *net, const char *name, int *error,
 		struct mutex *mutex)
 {
-	struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
-
-	return find_inlist_lock(&ebt_net->tables, name,
-				"ebtable_", error, mutex);
+	return find_inlist_lock(net, name, "ebtable_", error, mutex);
 }
 
 static inline void ebt_free_table_info(struct ebt_table_info *info)
@@ -1258,6 +1291,54 @@ out:
 	return ret;
 }
 
+int ebt_register_template(const struct ebt_table *t, int (*table_init)(struct net *net))
+{
+	struct ebt_template *tmpl;
+
+	mutex_lock(&ebt_mutex);
+	list_for_each_entry(tmpl, &template_tables, list) {
+		if (WARN_ON_ONCE(strcmp(t->name, tmpl->name) == 0)) {
+			mutex_unlock(&ebt_mutex);
+			return -EEXIST;
+		}
+	}
+
+	tmpl = kzalloc(sizeof(*tmpl), GFP_KERNEL);
+	if (!tmpl) {
+		mutex_unlock(&ebt_mutex);
+		return -ENOMEM;
+	}
+
+	tmpl->table_init = table_init;
+	strscpy(tmpl->name, t->name, sizeof(tmpl->name));
+	tmpl->owner = t->me;
+	list_add(&tmpl->list, &template_tables);
+
+	mutex_unlock(&ebt_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ebt_register_template);
+
+void ebt_unregister_template(const struct ebt_table *t)
+{
+	struct ebt_template *tmpl;
+
+	mutex_lock(&ebt_mutex);
+	list_for_each_entry(tmpl, &template_tables, list) {
+		if (strcmp(t->name, tmpl->name))
+			continue;
+
+		list_del(&tmpl->list);
+		mutex_unlock(&ebt_mutex);
+		kfree(tmpl);
+		return;
+	}
+
+	mutex_unlock(&ebt_mutex);
+	WARN_ON_ONCE(1);
+}
+EXPORT_SYMBOL(ebt_unregister_template);
+
 static struct ebt_table *__ebt_find_table(struct net *net, const char *name)
 {
 	struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
-- 
cgit v1.2.3


From 35d7a6f1fb53479965e9f99e8c87edc642336eba Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Sat, 31 Jul 2021 12:21:44 +0200
Subject: nfc: hci: pass callback data param as pointer in nci_request()

The nci_request() receives a callback function and unsigned long data
argument "opt" which is passed to the callback.  Almost all of the
nci_request() callers pass pointer to a stack variable as data argument.
Only few pass scalar value (e.g. u8).

All such callbacks do not modify passed data argument and in previous
commit they were made as const.  However passing pointers via unsigned
long removes the const annotation.  The callback could simply cast
unsigned long to a pointer to writeable memory.

Use "const void *" as type of this "opt" argument to solve this and
prevent modifying the pointed contents.  This is also consistent with
generic pattern of passing data arguments - via "void *".  In few places
which pass scalar values, use casts via "unsigned long" to suppress any
warnings.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nfc/nci_core.h |   4 +-
 net/nfc/nci/core.c         | 119 ++++++++++++++++++++++-----------------------
 net/nfc/nci/hci.c          |  16 +++---
 3 files changed, 67 insertions(+), 72 deletions(-)

(limited to 'include')

diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h
index 4770a81f4aa7..a964daedc17b 100644
--- a/include/net/nfc/nci_core.h
+++ b/include/net/nfc/nci_core.h
@@ -276,8 +276,8 @@ int nci_register_device(struct nci_dev *ndev);
 void nci_unregister_device(struct nci_dev *ndev);
 int nci_request(struct nci_dev *ndev,
 		void (*req)(struct nci_dev *ndev,
-			    unsigned long opt),
-		unsigned long opt, __u32 timeout);
+			    const void *opt),
+		const void *opt, __u32 timeout);
 int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len,
 		 const __u8 *payload);
 int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len,
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 80a5c2a8e9fa..82ab39d80726 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -95,8 +95,8 @@ static void nci_req_cancel(struct nci_dev *ndev, int err)
 
 /* Execute request and wait for completion. */
 static int __nci_request(struct nci_dev *ndev,
-			 void (*req)(struct nci_dev *ndev, unsigned long opt),
-			 unsigned long opt, __u32 timeout)
+			 void (*req)(struct nci_dev *ndev, const void *opt),
+			 const void *opt, __u32 timeout)
 {
 	int rc = 0;
 	long completion_rc;
@@ -139,8 +139,8 @@ static int __nci_request(struct nci_dev *ndev,
 
 inline int nci_request(struct nci_dev *ndev,
 		       void (*req)(struct nci_dev *ndev,
-				   unsigned long opt),
-		       unsigned long opt, __u32 timeout)
+				   const void *opt),
+		       const void *opt, __u32 timeout)
 {
 	int rc;
 
@@ -155,7 +155,7 @@ inline int nci_request(struct nci_dev *ndev,
 	return rc;
 }
 
-static void nci_reset_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_reset_req(struct nci_dev *ndev, const void *opt)
 {
 	struct nci_core_reset_cmd cmd;
 
@@ -163,17 +163,17 @@ static void nci_reset_req(struct nci_dev *ndev, unsigned long opt)
 	nci_send_cmd(ndev, NCI_OP_CORE_RESET_CMD, 1, &cmd);
 }
 
-static void nci_init_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_init_req(struct nci_dev *ndev, const void *opt)
 {
 	u8 plen = 0;
 
 	if (opt)
 		plen = sizeof(struct nci_core_init_v2_cmd);
 
-	nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, plen, (void *)opt);
+	nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, plen, opt);
 }
 
-static void nci_init_complete_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_init_complete_req(struct nci_dev *ndev, const void *opt)
 {
 	struct nci_rf_disc_map_cmd cmd;
 	struct disc_map_config *cfg = cmd.mapping_configs;
@@ -215,10 +215,9 @@ struct nci_set_config_param {
 	const __u8	*val;
 };
 
-static void nci_set_config_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_set_config_req(struct nci_dev *ndev, const void *opt)
 {
-	const struct nci_set_config_param *param =
-		(struct nci_set_config_param *)opt;
+	const struct nci_set_config_param *param = opt;
 	struct nci_core_set_config_cmd cmd;
 
 	BUG_ON(param->len > NCI_MAX_PARAM_LEN);
@@ -236,10 +235,9 @@ struct nci_rf_discover_param {
 	__u32	tm_protocols;
 };
 
-static void nci_rf_discover_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_rf_discover_req(struct nci_dev *ndev, const void *opt)
 {
-	const struct nci_rf_discover_param *param =
-		(struct nci_rf_discover_param *)opt;
+	const struct nci_rf_discover_param *param = opt;
 	struct nci_rf_disc_cmd cmd;
 
 	cmd.num_disc_configs = 0;
@@ -302,10 +300,9 @@ struct nci_rf_discover_select_param {
 	__u8	rf_protocol;
 };
 
-static void nci_rf_discover_select_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_rf_discover_select_req(struct nci_dev *ndev, const void *opt)
 {
-	const struct nci_rf_discover_select_param *param =
-		(struct nci_rf_discover_select_param *)opt;
+	const struct nci_rf_discover_select_param *param = opt;
 	struct nci_rf_discover_select_cmd cmd;
 
 	cmd.rf_discovery_id = param->rf_discovery_id;
@@ -329,11 +326,11 @@ static void nci_rf_discover_select_req(struct nci_dev *ndev, unsigned long opt)
 		     sizeof(struct nci_rf_discover_select_cmd), &cmd);
 }
 
-static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_rf_deactivate_req(struct nci_dev *ndev, const void *opt)
 {
 	struct nci_rf_deactivate_cmd cmd;
 
-	cmd.type = opt;
+	cmd.type = (unsigned long)opt;
 
 	nci_send_cmd(ndev, NCI_OP_RF_DEACTIVATE_CMD,
 		     sizeof(struct nci_rf_deactivate_cmd), &cmd);
@@ -345,10 +342,9 @@ struct nci_cmd_param {
 	const __u8 *payload;
 };
 
-static void nci_generic_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_generic_req(struct nci_dev *ndev, const void *opt)
 {
-	const struct nci_cmd_param *param =
-		(struct nci_cmd_param *)opt;
+	const struct nci_cmd_param *param = opt;
 
 	nci_send_cmd(ndev, param->opcode, param->len, param->payload);
 }
@@ -361,7 +357,7 @@ int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, const __u8 *payload
 	param.len = len;
 	param.payload = payload;
 
-	return __nci_request(ndev, nci_generic_req, (unsigned long)&param,
+	return __nci_request(ndev, nci_generic_req, &param,
 			     msecs_to_jiffies(NCI_CMD_TIMEOUT));
 }
 EXPORT_SYMBOL(nci_prop_cmd);
@@ -375,21 +371,21 @@ int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len,
 	param.len = len;
 	param.payload = payload;
 
-	return __nci_request(ndev, nci_generic_req, (unsigned long)&param,
+	return __nci_request(ndev, nci_generic_req, &param,
 			     msecs_to_jiffies(NCI_CMD_TIMEOUT));
 }
 EXPORT_SYMBOL(nci_core_cmd);
 
 int nci_core_reset(struct nci_dev *ndev)
 {
-	return __nci_request(ndev, nci_reset_req, 0,
+	return __nci_request(ndev, nci_reset_req, (void *)0,
 			     msecs_to_jiffies(NCI_RESET_TIMEOUT));
 }
 EXPORT_SYMBOL(nci_core_reset);
 
 int nci_core_init(struct nci_dev *ndev)
 {
-	return __nci_request(ndev, nci_init_req, 0,
+	return __nci_request(ndev, nci_init_req, (void *)0,
 			     msecs_to_jiffies(NCI_INIT_TIMEOUT));
 }
 EXPORT_SYMBOL(nci_core_init);
@@ -399,9 +395,9 @@ struct nci_loopback_data {
 	struct sk_buff *data;
 };
 
-static void nci_send_data_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_send_data_req(struct nci_dev *ndev, const void *opt)
 {
-	const struct nci_loopback_data *data = (struct nci_loopback_data *)opt;
+	const struct nci_loopback_data *data = opt;
 
 	nci_send_data(ndev, data->conn_id, data->data);
 }
@@ -462,7 +458,7 @@ int nci_nfcc_loopback(struct nci_dev *ndev, const void *data, size_t data_len,
 	loopback_data.data = skb;
 
 	ndev->cur_conn_id = conn_id;
-	r = nci_request(ndev, nci_send_data_req, (unsigned long)&loopback_data,
+	r = nci_request(ndev, nci_send_data_req, &loopback_data,
 			msecs_to_jiffies(NCI_DATA_TIMEOUT));
 	if (r == NCI_STATUS_OK && resp)
 		*resp = conn_info->rx_skb;
@@ -495,7 +491,7 @@ static int nci_open_device(struct nci_dev *ndev)
 		rc = ndev->ops->init(ndev);
 
 	if (!rc) {
-		rc = __nci_request(ndev, nci_reset_req, 0,
+		rc = __nci_request(ndev, nci_reset_req, (void *)0,
 				   msecs_to_jiffies(NCI_RESET_TIMEOUT));
 	}
 
@@ -508,10 +504,10 @@ static int nci_open_device(struct nci_dev *ndev)
 			.feature1 = NCI_FEATURE_DISABLE,
 			.feature2 = NCI_FEATURE_DISABLE
 		};
-		unsigned long opt = 0;
+		const void *opt = NULL;
 
 		if (ndev->nci_ver & NCI_VER_2_MASK)
-			opt = (unsigned long)&nci_init_v2_cmd;
+			opt = &nci_init_v2_cmd;
 
 		rc = __nci_request(ndev, nci_init_req, opt,
 				   msecs_to_jiffies(NCI_INIT_TIMEOUT));
@@ -521,7 +517,7 @@ static int nci_open_device(struct nci_dev *ndev)
 		rc = ndev->ops->post_setup(ndev);
 
 	if (!rc) {
-		rc = __nci_request(ndev, nci_init_complete_req, 0,
+		rc = __nci_request(ndev, nci_init_complete_req, (void *)0,
 				   msecs_to_jiffies(NCI_INIT_TIMEOUT));
 	}
 
@@ -571,7 +567,7 @@ static int nci_close_device(struct nci_dev *ndev)
 	atomic_set(&ndev->cmd_cnt, 1);
 
 	set_bit(NCI_INIT, &ndev->flags);
-	__nci_request(ndev, nci_reset_req, 0,
+	__nci_request(ndev, nci_reset_req, (void *)0,
 		      msecs_to_jiffies(NCI_RESET_TIMEOUT));
 
 	/* After this point our queues are empty
@@ -637,15 +633,15 @@ int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, const __u8 *val)
 	param.len = len;
 	param.val = val;
 
-	return __nci_request(ndev, nci_set_config_req, (unsigned long)&param,
+	return __nci_request(ndev, nci_set_config_req, &param,
 			     msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT));
 }
 EXPORT_SYMBOL(nci_set_config);
 
-static void nci_nfcee_discover_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_nfcee_discover_req(struct nci_dev *ndev, const void *opt)
 {
 	struct nci_nfcee_discover_cmd cmd;
-	__u8 action = opt;
+	__u8 action = (unsigned long)opt;
 
 	cmd.discovery_action = action;
 
@@ -654,15 +650,16 @@ static void nci_nfcee_discover_req(struct nci_dev *ndev, unsigned long opt)
 
 int nci_nfcee_discover(struct nci_dev *ndev, u8 action)
 {
-	return __nci_request(ndev, nci_nfcee_discover_req, action,
+	unsigned long opt = action;
+
+	return __nci_request(ndev, nci_nfcee_discover_req, (void *)opt,
 				msecs_to_jiffies(NCI_CMD_TIMEOUT));
 }
 EXPORT_SYMBOL(nci_nfcee_discover);
 
-static void nci_nfcee_mode_set_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_nfcee_mode_set_req(struct nci_dev *ndev, const void *opt)
 {
-	const struct nci_nfcee_mode_set_cmd *cmd =
-					(struct nci_nfcee_mode_set_cmd *)opt;
+	const struct nci_nfcee_mode_set_cmd *cmd = opt;
 
 	nci_send_cmd(ndev, NCI_OP_NFCEE_MODE_SET_CMD,
 		     sizeof(struct nci_nfcee_mode_set_cmd), cmd);
@@ -675,16 +672,14 @@ int nci_nfcee_mode_set(struct nci_dev *ndev, u8 nfcee_id, u8 nfcee_mode)
 	cmd.nfcee_id = nfcee_id;
 	cmd.nfcee_mode = nfcee_mode;
 
-	return __nci_request(ndev, nci_nfcee_mode_set_req,
-			     (unsigned long)&cmd,
+	return __nci_request(ndev, nci_nfcee_mode_set_req, &cmd,
 			     msecs_to_jiffies(NCI_CMD_TIMEOUT));
 }
 EXPORT_SYMBOL(nci_nfcee_mode_set);
 
-static void nci_core_conn_create_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_core_conn_create_req(struct nci_dev *ndev, const void *opt)
 {
-	const struct core_conn_create_data *data =
-					(struct core_conn_create_data *)opt;
+	const struct core_conn_create_data *data = opt;
 
 	nci_send_cmd(ndev, NCI_OP_CORE_CONN_CREATE_CMD, data->length, data->cmd);
 }
@@ -721,24 +716,26 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type,
 	}
 	ndev->cur_dest_type = destination_type;
 
-	r = __nci_request(ndev, nci_core_conn_create_req, (unsigned long)&data,
+	r = __nci_request(ndev, nci_core_conn_create_req, &data,
 			  msecs_to_jiffies(NCI_CMD_TIMEOUT));
 	kfree(cmd);
 	return r;
 }
 EXPORT_SYMBOL(nci_core_conn_create);
 
-static void nci_core_conn_close_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_core_conn_close_req(struct nci_dev *ndev, const void *opt)
 {
-	__u8 conn_id = opt;
+	__u8 conn_id = (unsigned long)opt;
 
 	nci_send_cmd(ndev, NCI_OP_CORE_CONN_CLOSE_CMD, 1, &conn_id);
 }
 
 int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id)
 {
+	unsigned long opt = conn_id;
+
 	ndev->cur_conn_id = conn_id;
-	return __nci_request(ndev, nci_core_conn_close_req, conn_id,
+	return __nci_request(ndev, nci_core_conn_close_req, (void *)opt,
 			     msecs_to_jiffies(NCI_CMD_TIMEOUT));
 }
 EXPORT_SYMBOL(nci_core_conn_close);
@@ -758,14 +755,14 @@ static int nci_set_local_general_bytes(struct nfc_dev *nfc_dev)
 
 	param.id = NCI_PN_ATR_REQ_GEN_BYTES;
 
-	rc = nci_request(ndev, nci_set_config_req, (unsigned long)&param,
+	rc = nci_request(ndev, nci_set_config_req, &param,
 			 msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT));
 	if (rc)
 		return rc;
 
 	param.id = NCI_LN_ATR_RES_GEN_BYTES;
 
-	return nci_request(ndev, nci_set_config_req, (unsigned long)&param,
+	return nci_request(ndev, nci_set_config_req, &param,
 			   msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT));
 }
 
@@ -815,7 +812,7 @@ static int nci_start_poll(struct nfc_dev *nfc_dev,
 		pr_debug("target active or w4 select, implicitly deactivate\n");
 
 		rc = nci_request(ndev, nci_rf_deactivate_req,
-				 NCI_DEACTIVATE_TYPE_IDLE_MODE,
+				 (void *)NCI_DEACTIVATE_TYPE_IDLE_MODE,
 				 msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT));
 		if (rc)
 			return -EBUSY;
@@ -837,7 +834,7 @@ static int nci_start_poll(struct nfc_dev *nfc_dev,
 
 	param.im_protocols = im_protocols;
 	param.tm_protocols = tm_protocols;
-	rc = nci_request(ndev, nci_rf_discover_req, (unsigned long)&param,
+	rc = nci_request(ndev, nci_rf_discover_req, &param,
 			 msecs_to_jiffies(NCI_RF_DISC_TIMEOUT));
 
 	if (!rc)
@@ -856,7 +853,8 @@ static void nci_stop_poll(struct nfc_dev *nfc_dev)
 		return;
 	}
 
-	nci_request(ndev, nci_rf_deactivate_req, NCI_DEACTIVATE_TYPE_IDLE_MODE,
+	nci_request(ndev, nci_rf_deactivate_req,
+		    (void *)NCI_DEACTIVATE_TYPE_IDLE_MODE,
 		    msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT));
 }
 
@@ -915,8 +913,7 @@ static int nci_activate_target(struct nfc_dev *nfc_dev,
 		else
 			param.rf_protocol = NCI_RF_PROTOCOL_NFC_DEP;
 
-		rc = nci_request(ndev, nci_rf_discover_select_req,
-				 (unsigned long)&param,
+		rc = nci_request(ndev, nci_rf_discover_select_req, &param,
 				 msecs_to_jiffies(NCI_RF_DISC_SELECT_TIMEOUT));
 	}
 
@@ -931,7 +928,7 @@ static void nci_deactivate_target(struct nfc_dev *nfc_dev,
 				  __u8 mode)
 {
 	struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
-	u8 nci_mode = NCI_DEACTIVATE_TYPE_IDLE_MODE;
+	unsigned long nci_mode = NCI_DEACTIVATE_TYPE_IDLE_MODE;
 
 	pr_debug("entry\n");
 
@@ -949,7 +946,7 @@ static void nci_deactivate_target(struct nfc_dev *nfc_dev,
 	}
 
 	if (atomic_read(&ndev->state) == NCI_POLL_ACTIVE) {
-		nci_request(ndev, nci_rf_deactivate_req, nci_mode,
+		nci_request(ndev, nci_rf_deactivate_req, (void *)nci_mode,
 			    msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT));
 	}
 }
@@ -987,8 +984,8 @@ static int nci_dep_link_down(struct nfc_dev *nfc_dev)
 	} else {
 		if (atomic_read(&ndev->state) == NCI_LISTEN_ACTIVE ||
 		    atomic_read(&ndev->state) == NCI_DISCOVERY) {
-			nci_request(ndev, nci_rf_deactivate_req, 0,
-				msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT));
+			nci_request(ndev, nci_rf_deactivate_req, (void *)0,
+				    msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT));
 		}
 
 		rc = nfc_tm_deactivated(nfc_dev);
diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c
index a8ff794a8084..e199912ee1e5 100644
--- a/net/nfc/nci/hci.c
+++ b/net/nfc/nci/hci.c
@@ -195,9 +195,9 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe,
 	return i;
 }
 
-static void nci_hci_send_data_req(struct nci_dev *ndev, unsigned long opt)
+static void nci_hci_send_data_req(struct nci_dev *ndev, const void *opt)
 {
-	const struct nci_data *data = (struct nci_data *)opt;
+	const struct nci_data *data = opt;
 
 	nci_hci_send_data(ndev, data->pipe, data->cmd,
 			  data->data, data->data_len);
@@ -240,7 +240,7 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd,
 	data.data = param;
 	data.data_len = param_len;
 
-	r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data,
+	r = nci_request(ndev, nci_hci_send_data_req, &data,
 			msecs_to_jiffies(NCI_DATA_TIMEOUT));
 	if (r == NCI_STATUS_OK) {
 		message = (struct nci_hcp_message *)conn_info->rx_skb->data;
@@ -511,9 +511,8 @@ int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe)
 	data.data = NULL;
 	data.data_len = 0;
 
-	return nci_request(ndev, nci_hci_send_data_req,
-			(unsigned long)&data,
-			msecs_to_jiffies(NCI_DATA_TIMEOUT));
+	return nci_request(ndev, nci_hci_send_data_req, &data,
+			   msecs_to_jiffies(NCI_DATA_TIMEOUT));
 }
 EXPORT_SYMBOL(nci_hci_open_pipe);
 
@@ -587,8 +586,7 @@ int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx,
 	data.data = tmp;
 	data.data_len = param_len + 1;
 
-	r = nci_request(ndev, nci_hci_send_data_req,
-			(unsigned long)&data,
+	r = nci_request(ndev, nci_hci_send_data_req, &data,
 			msecs_to_jiffies(NCI_DATA_TIMEOUT));
 	if (r == NCI_STATUS_OK) {
 		message = (struct nci_hcp_message *)conn_info->rx_skb->data;
@@ -627,7 +625,7 @@ int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx,
 	data.data = &idx;
 	data.data_len = 1;
 
-	r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data,
+	r = nci_request(ndev, nci_hci_send_data_req, &data,
 			msecs_to_jiffies(NCI_DATA_TIMEOUT));
 
 	if (r == NCI_STATUS_OK) {
-- 
cgit v1.2.3


From 29a097b7747725da003245412dab61093d4e5976 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sat, 31 Jul 2021 17:14:32 +0300
Subject: net: dsa: remove the struct packet_type argument from
 dsa_device_ops::rcv()

No tagging driver uses this.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h          |  7 ++-----
 net/dsa/dsa.c              |  2 +-
 net/dsa/tag_ar9331.c       |  3 +--
 net/dsa/tag_brcm.c         | 14 +++++---------
 net/dsa/tag_dsa.c          |  6 ++----
 net/dsa/tag_gswip.c        |  3 +--
 net/dsa/tag_hellcreek.c    |  3 +--
 net/dsa/tag_ksz.c          |  6 ++----
 net/dsa/tag_lan9303.c      |  3 +--
 net/dsa/tag_mtk.c          |  3 +--
 net/dsa/tag_ocelot.c       |  3 +--
 net/dsa/tag_ocelot_8021q.c |  3 +--
 net/dsa/tag_qca.c          |  3 +--
 net/dsa/tag_rtl4_a.c       |  3 +--
 net/dsa/tag_sja1105.c      |  6 ++----
 net/dsa/tag_trailer.c      |  3 +--
 net/dsa/tag_xrs700x.c      |  3 +--
 17 files changed, 25 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 2af6ee2f2bfb..7cc9507282d3 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -79,13 +79,11 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_SJA1110		= DSA_TAG_PROTO_SJA1110_VALUE,
 };
 
-struct packet_type;
 struct dsa_switch;
 
 struct dsa_device_ops {
 	struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
-	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev,
-			       struct packet_type *pt);
+	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
 	void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
 			     int *offset);
 	unsigned int needed_headroom;
@@ -239,8 +237,7 @@ struct dsa_port {
 
 	/* Copies for faster access in master receive hot path */
 	struct dsa_switch_tree *dst;
-	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev,
-			       struct packet_type *pt);
+	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
 
 	enum {
 		DSA_PORT_TYPE_UNUSED = 0,
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 84cad1be9ce4..1dc45e40f961 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -238,7 +238,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!skb)
 		return 0;
 
-	nskb = cpu_dp->rcv(skb, dev, pt);
+	nskb = cpu_dp->rcv(skb, dev);
 	if (!nskb) {
 		kfree_skb(skb);
 		return 0;
diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c
index 0efae1a372b3..8a02ac44282f 100644
--- a/net/dsa/tag_ar9331.c
+++ b/net/dsa/tag_ar9331.c
@@ -44,8 +44,7 @@ static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb,
 }
 
 static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
-				      struct net_device *ndev,
-				      struct packet_type *pt)
+				      struct net_device *ndev)
 {
 	u8 ver, port;
 	u16 hdr;
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index a27f5096777a..96e93b544a0d 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -136,7 +136,6 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
  */
 static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
 				       struct net_device *dev,
-				       struct packet_type *pt,
 				       unsigned int offset)
 {
 	int source_port;
@@ -182,13 +181,12 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb,
 }
 
 
-static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
-				    struct packet_type *pt)
+static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	struct sk_buff *nskb;
 
 	/* skb->data points to the EtherType, the tag is right before it */
-	nskb = brcm_tag_rcv_ll(skb, dev, pt, 2);
+	nskb = brcm_tag_rcv_ll(skb, dev, 2);
 	if (!nskb)
 		return nskb;
 
@@ -251,8 +249,7 @@ static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb,
 }
 
 static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
-					struct net_device *dev,
-					struct packet_type *pt)
+					struct net_device *dev)
 {
 	int source_port;
 	u8 *brcm_tag;
@@ -302,11 +299,10 @@ static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb,
 }
 
 static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb,
-					    struct net_device *dev,
-					    struct packet_type *pt)
+					    struct net_device *dev)
 {
 	/* tag is prepended to the packet */
-	return brcm_tag_rcv_ll(skb, dev, pt, ETH_HLEN);
+	return brcm_tag_rcv_ll(skb, dev, ETH_HLEN);
 }
 
 static const struct dsa_device_ops brcm_prepend_netdev_ops = {
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 3607499d0697..e32f8160e895 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -332,8 +332,7 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
 	return dsa_xmit_ll(skb, dev, 0);
 }
 
-static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
-			       struct packet_type *pt)
+static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	if (unlikely(!pskb_may_pull(skb, DSA_HLEN)))
 		return NULL;
@@ -373,8 +372,7 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
 	return skb;
 }
 
-static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev,
-				struct packet_type *pt)
+static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	if (unlikely(!pskb_may_pull(skb, EDSA_HLEN)))
 		return NULL;
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
index 5985dab06ab8..df7140984da3 100644
--- a/net/dsa/tag_gswip.c
+++ b/net/dsa/tag_gswip.c
@@ -75,8 +75,7 @@ static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb,
 }
 
 static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
-				     struct net_device *dev,
-				     struct packet_type *pt)
+				     struct net_device *dev)
 {
 	int port;
 	u8 *gswip_tag;
diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c
index c41208cbd936..f64b805303cd 100644
--- a/net/dsa/tag_hellcreek.c
+++ b/net/dsa/tag_hellcreek.c
@@ -29,8 +29,7 @@ static struct sk_buff *hellcreek_xmit(struct sk_buff *skb,
 }
 
 static struct sk_buff *hellcreek_rcv(struct sk_buff *skb,
-				     struct net_device *dev,
-				     struct packet_type *pt)
+				     struct net_device *dev)
 {
 	/* Tag decoding */
 	u8 *tag = skb_tail_pointer(skb) - HELLCREEK_TAG_LEN;
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 1c2dfa80f9b0..fa1d60d13ad9 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -67,8 +67,7 @@ static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev)
 	return skb;
 }
 
-static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev,
-				  struct packet_type *pt)
+static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
 
@@ -134,8 +133,7 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
 	return skb;
 }
 
-static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev,
-				   struct packet_type *pt)
+static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	/* Tag decoding */
 	u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
index cf7cf2fa1240..58d3a0e712d2 100644
--- a/net/dsa/tag_lan9303.c
+++ b/net/dsa/tag_lan9303.c
@@ -74,8 +74,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev)
 	return skb;
 }
 
-static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev,
-				   struct packet_type *pt)
+static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	__be16 *lan9303_tag;
 	u16 lan9303_tag1;
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index 3fb80e43f3a5..bbf37c031d44 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -61,8 +61,7 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 	return skb;
 }
 
-static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
-				   struct packet_type *pt)
+static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	u16 hdr;
 	int port;
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 3252634a29b8..d37ab98e7fe1 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -55,8 +55,7 @@ static struct sk_buff *seville_xmit(struct sk_buff *skb,
 }
 
 static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
-				  struct net_device *netdev,
-				  struct packet_type *pt)
+				  struct net_device *netdev)
 {
 	u64 src_port, qos_class;
 	u64 vlan_tci, tag_type;
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index c95de71d13b0..3038a257ba05 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -38,8 +38,7 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 }
 
 static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
-				  struct net_device *netdev,
-				  struct packet_type *pt)
+				  struct net_device *netdev)
 {
 	int src_port, switch_id;
 
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 693bda013065..6e3136990491 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -48,8 +48,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
 	return skb;
 }
 
-static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
-				   struct packet_type *pt)
+static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	u8 ver;
 	u16  hdr;
diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c
index f6b63aad6551..aaddca3c0245 100644
--- a/net/dsa/tag_rtl4_a.c
+++ b/net/dsa/tag_rtl4_a.c
@@ -64,8 +64,7 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb,
 }
 
 static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
-				     struct net_device *dev,
-				     struct packet_type *pt)
+				     struct net_device *dev)
 {
 	u16 protport;
 	__be16 *p;
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 664cb802b71a..745c4560b4aa 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -391,8 +391,7 @@ static void sja1105_vlan_rcv(struct sk_buff *skb, int *source_port,
 }
 
 static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
-				   struct net_device *netdev,
-				   struct packet_type *pt)
+				   struct net_device *netdev)
 {
 	int source_port = -1, switch_id = -1;
 	struct sja1105_meta meta = {0};
@@ -546,8 +545,7 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
 }
 
 static struct sk_buff *sja1110_rcv(struct sk_buff *skb,
-				   struct net_device *netdev,
-				   struct packet_type *pt)
+				   struct net_device *netdev)
 {
 	int source_port = -1, switch_id = -1;
 	bool host_only = false;
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index ba73804340a5..5749ba85c2b8 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -24,8 +24,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
 	return skb;
 }
 
-static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
-				   struct packet_type *pt)
+static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	u8 *trailer;
 	int source_port;
diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c
index da231c16ac82..ff442b8af636 100644
--- a/net/dsa/tag_xrs700x.c
+++ b/net/dsa/tag_xrs700x.c
@@ -25,8 +25,7 @@ static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev)
 	return skb;
 }
 
-static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev,
-				   struct packet_type *pt)
+static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	int source_port;
 	u8 *trailer;
-- 
cgit v1.2.3


From 2d3e5caf96b9449af951e63476657acd759c1a30 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Sat, 31 Jul 2021 12:08:30 -0500
Subject: net/ipv4: Replace one-element array with flexible-array member
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare having
a dynamically sized set of trailing elements in a structure. Kernel code
should always use “flexible array members”[1] for these cases. The older
style of one-element or zero-length arrays should no longer be used[2].

Use an anonymous union with a couple of anonymous structs in order to
keep userspace unchanged:

$ pahole -C ip_msfilter net/ipv4/ip_sockglue.o
struct ip_msfilter {
	union {
		struct {
			__be32     imsf_multiaddr_aux;   /*     0     4 */
			__be32     imsf_interface_aux;   /*     4     4 */
			__u32      imsf_fmode_aux;       /*     8     4 */
			__u32      imsf_numsrc_aux;      /*    12     4 */
			__be32     imsf_slist[1];        /*    16     4 */
		};                                       /*     0    20 */
		struct {
			__be32     imsf_multiaddr;       /*     0     4 */
			__be32     imsf_interface;       /*     4     4 */
			__u32      imsf_fmode;           /*     8     4 */
			__u32      imsf_numsrc;          /*    12     4 */
			__be32     imsf_slist_flex[0];   /*    16     0 */
		};                                       /*     0    16 */
	};                                               /*     0    20 */

	/* size: 20, cachelines: 1, members: 1 */
	/* last cacheline: 20 bytes */
};

Also, refactor the code accordingly and make use of the struct_size()
and flex_array_size() helpers.

This helps with the ongoing efforts to globally enable -Warray-bounds
and get us closer to being able to tighten the FORTIFY_SOURCE routines
on memcpy().

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays

Link: https://github.com/KSPP/linux/issues/79
Link: https://github.com/KSPP/linux/issues/109
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/in.h | 21 ++++++++++++++++-----
 net/ipv4/igmp.c         | 12 ++++++------
 net/ipv4/ip_sockglue.c  | 15 ++++++++-------
 3 files changed, 30 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index d1b327036ae4..193b7cf1f0ac 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -188,11 +188,22 @@ struct ip_mreq_source {
 };
 
 struct ip_msfilter {
-	__be32		imsf_multiaddr;
-	__be32		imsf_interface;
-	__u32		imsf_fmode;
-	__u32		imsf_numsrc;
-	__be32		imsf_slist[1];
+	union {
+		struct {
+			__be32		imsf_multiaddr_aux;
+			__be32		imsf_interface_aux;
+			__u32		imsf_fmode_aux;
+			__u32		imsf_numsrc_aux;
+			__be32		imsf_slist[1];
+		};
+		struct {
+			__be32		imsf_multiaddr;
+			__be32		imsf_interface;
+			__u32		imsf_fmode;
+			__u32		imsf_numsrc;
+			__be32		imsf_slist_flex[];
+		};
+	};
 };
 
 #define IP_MSFILTER_SIZE(numsrc) \
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 03589a04f9aa..a5f4ecb02e97 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2475,8 +2475,8 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 			goto done;
 		}
 		newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
-		memcpy(newpsl->sl_addr, msf->imsf_slist,
-			msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
+		memcpy(newpsl->sl_addr, msf->imsf_slist_flex,
+		       flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc));
 		err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
 			msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
 		if (err) {
@@ -2551,14 +2551,14 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 		count = psl->sl_count;
 	}
 	copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
-	len = copycount * sizeof(psl->sl_addr[0]);
+	len = flex_array_size(psl, sl_addr, copycount);
 	msf->imsf_numsrc = count;
-	if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
-	    copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
+	if (put_user(struct_size(optval, imsf_slist_flex, copycount), optlen) ||
+	    copy_to_user(optval, msf, struct_size(optval, imsf_slist_flex, 0))) {
 		return -EFAULT;
 	}
 	if (len &&
-	    copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
+	    copy_to_user(&optval->imsf_slist_flex[0], psl->sl_addr, len))
 		return -EFAULT;
 	return 0;
 done:
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ec6036713e2c..bbe660b84a91 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -663,12 +663,11 @@ static int set_mcast_msfilter(struct sock *sk, int ifindex,
 			      struct sockaddr_storage *group,
 			      struct sockaddr_storage *list)
 {
-	int msize = IP_MSFILTER_SIZE(numsrc);
 	struct ip_msfilter *msf;
 	struct sockaddr_in *psin;
 	int err, i;
 
-	msf = kmalloc(msize, GFP_KERNEL);
+	msf = kmalloc(struct_size(msf, imsf_slist_flex, numsrc), GFP_KERNEL);
 	if (!msf)
 		return -ENOBUFS;
 
@@ -684,7 +683,7 @@ static int set_mcast_msfilter(struct sock *sk, int ifindex,
 
 		if (psin->sin_family != AF_INET)
 			goto Eaddrnotavail;
-		msf->imsf_slist[i] = psin->sin_addr.s_addr;
+		msf->imsf_slist_flex[i] = psin->sin_addr.s_addr;
 	}
 	err = ip_mc_msfilter(sk, msf, ifindex);
 	kfree(msf);
@@ -1229,7 +1228,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
 	{
 		struct ip_msfilter *msf;
 
-		if (optlen < IP_MSFILTER_SIZE(0))
+		if (optlen < struct_size(msf, imsf_slist_flex, 0))
 			goto e_inval;
 		if (optlen > sysctl_optmem_max) {
 			err = -ENOBUFS;
@@ -1247,7 +1246,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
 			err = -ENOBUFS;
 			break;
 		}
-		if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
+		if (struct_size(msf, imsf_slist_flex, msf->imsf_numsrc) >
+		    optlen) {
 			kfree(msf);
 			err = -EINVAL;
 			break;
@@ -1660,11 +1660,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	{
 		struct ip_msfilter msf;
 
-		if (len < IP_MSFILTER_SIZE(0)) {
+		if (len < struct_size(&msf, imsf_slist_flex, 0)) {
 			err = -EINVAL;
 			goto out;
 		}
-		if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
+		if (copy_from_user(&msf, optval,
+				   struct_size(&msf, imsf_slist_flex, 0))) {
 			err = -EFAULT;
 			goto out;
 		}
-- 
cgit v1.2.3


From 371cf74e78f3468016e8c7a159fc288a71d4dc86 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@nvidia.com>
Date: Fri, 2 Jul 2021 10:38:32 +0300
Subject: net/mlx5: Move TTC logic to fs_ttc

Now that TTC logic is not dependent on mlx5e structs, move it to
lib/fs_ttc.c so it could be used other part of the mlx5 driver.

Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   2 -
 drivers/net/ethernet/mellanox/mlx5/core/en/fs.h    |  78 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c    | 558 --------------------
 .../net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c   | 584 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h   |  77 +++
 include/linux/mlx5/fs.h                            |   2 +
 7 files changed, 665 insertions(+), 638 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index e8522ccb3519..33e550d77fa6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -15,7 +15,7 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		health.o mcg.o cq.o alloc.o port.o mr.o pd.o \
 		transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \
 		fs_counters.o fs_ft_pool.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
-		lib/devcom.o lib/pci_vsc.o lib/dm.o diag/fs_tracepoint.o \
+		lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \
 		diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o \
 		fw_reset.o qos.o
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 594b7971caf9..4f6897c1ea8d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -66,8 +66,6 @@ struct page_pool;
 #define MLX5E_METADATA_ETHER_TYPE (0x8CE4)
 #define MLX5E_METADATA_ETHER_LEN 8
 
-#define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
-
 #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
 
 #define MLX5E_HW2SW_MTU(params, hwmtu) ((hwmtu) - ((params)->hard_mtu))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
index c289f7004e10..8e7794c3d330 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
@@ -5,6 +5,7 @@
 #define __MLX5E_FLOW_STEER_H__
 
 #include "mod_hdr.h"
+#include "lib/fs_ttc.h"
 
 enum {
 	MLX5E_TC_FT_LEVEL = 0,
@@ -67,21 +68,6 @@ struct mlx5e_l2_table {
 	bool                       promisc_enabled;
 };
 
-enum mlx5_traffic_types {
-	MLX5_TT_IPV4_TCP,
-	MLX5_TT_IPV6_TCP,
-	MLX5_TT_IPV4_UDP,
-	MLX5_TT_IPV6_UDP,
-	MLX5_TT_IPV4_IPSEC_AH,
-	MLX5_TT_IPV6_IPSEC_AH,
-	MLX5_TT_IPV4_IPSEC_ESP,
-	MLX5_TT_IPV6_IPSEC_ESP,
-	MLX5_TT_IPV4,
-	MLX5_TT_IPV6,
-	MLX5_TT_ANY,
-	MLX5_NUM_TT,
-};
-
 #define MLX5E_NUM_INDIR_TIRS (MLX5_NUM_TT - 1)
 
 #define MLX5_HASH_IP		(MLX5_HASH_FIELD_SEL_SRC_IP   |\
@@ -94,32 +80,6 @@ enum mlx5_traffic_types {
 				 MLX5_HASH_FIELD_SEL_DST_IP   |\
 				 MLX5_HASH_FIELD_SEL_IPSEC_SPI)
 
-enum mlx5_tunnel_types {
-	MLX5_TT_IPV4_GRE,
-	MLX5_TT_IPV6_GRE,
-	MLX5_TT_IPV4_IPIP,
-	MLX5_TT_IPV6_IPIP,
-	MLX5_TT_IPV4_IPV6,
-	MLX5_TT_IPV6_IPV6,
-	MLX5_NUM_TUNNEL_TT,
-};
-
-bool mlx5_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev);
-
-struct mlx5_ttc_rule {
-	struct mlx5_flow_handle *rule;
-	struct mlx5_flow_destination default_dest;
-};
-
-/* L3/L4 traffic type classifier */
-struct mlx5_ttc_table {
-	int num_groups;
-	struct mlx5_flow_table *t;
-	struct mlx5_flow_group **g;
-	struct mlx5_ttc_rule rules[MLX5_NUM_TT];
-	struct mlx5_flow_handle *tunnel_rules[MLX5_NUM_TUNNEL_TT];
-};
-
 /* NIC prio FTS */
 enum {
 	MLX5E_PROMISC_FT_LEVEL,
@@ -141,22 +101,6 @@ enum {
 #endif
 };
 
-#define MLX5_TTC_NUM_GROUPS	3
-#define MLX5_TTC_GROUP1_SIZE	(BIT(3) + MLX5_NUM_TUNNEL_TT)
-#define MLX5_TTC_GROUP2_SIZE	 BIT(1)
-#define MLX5_TTC_GROUP3_SIZE	 BIT(0)
-#define MLX5_TTC_TABLE_SIZE	(MLX5_TTC_GROUP1_SIZE +\
-				 MLX5_TTC_GROUP2_SIZE +\
-				 MLX5_TTC_GROUP3_SIZE)
-
-#define MLX5_INNER_TTC_NUM_GROUPS	3
-#define MLX5_INNER_TTC_GROUP1_SIZE	BIT(3)
-#define MLX5_INNER_TTC_GROUP2_SIZE	BIT(1)
-#define MLX5_INNER_TTC_GROUP3_SIZE	BIT(0)
-#define MLX5_INNER_TTC_TABLE_SIZE	(MLX5_INNER_TTC_GROUP1_SIZE +\
-					 MLX5_INNER_TTC_GROUP2_SIZE +\
-					 MLX5_INNER_TTC_GROUP3_SIZE)
-
 struct mlx5e_priv;
 
 #ifdef CONFIG_MLX5_EN_RXNFC
@@ -238,29 +182,10 @@ struct mlx5e_flow_steering {
 	struct mlx5e_ptp_fs            *ptp_fs;
 };
 
-struct ttc_params {
-	struct mlx5_flow_namespace *ns;
-	struct mlx5_flow_table_attr ft_attr;
-	struct mlx5_flow_destination dests[MLX5_NUM_TT];
-	bool   inner_ttc;
-	struct mlx5_flow_destination tunnel_dests[MLX5_NUM_TUNNEL_TT];
-};
-
 void mlx5e_set_ttc_params(struct mlx5e_priv *priv,
 			  struct ttc_params *ttc_params, bool tunnel);
 
-int mlx5_create_ttc_table(struct mlx5_core_dev *dev, struct ttc_params *params,
-			  struct mlx5_ttc_table *ttc);
-void mlx5_destroy_ttc_table(struct mlx5_ttc_table *ttc);
-
 void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft);
-int mlx5_ttc_fwd_dest(struct mlx5_ttc_table *ttc, enum mlx5_traffic_types type,
-		      struct mlx5_flow_destination *new_dest);
-struct mlx5_flow_destination
-mlx5_ttc_get_default_dest(struct mlx5_ttc_table *ttc,
-			  enum mlx5_traffic_types type);
-int mlx5_ttc_fwd_default_dest(struct mlx5_ttc_table *ttc,
-			      enum mlx5_traffic_types type);
 
 void mlx5e_enable_cvlan_filter(struct mlx5e_priv *priv);
 void mlx5e_disable_cvlan_filter(struct mlx5e_priv *priv);
@@ -268,7 +193,6 @@ void mlx5e_disable_cvlan_filter(struct mlx5e_priv *priv);
 int mlx5e_create_flow_steering(struct mlx5e_priv *priv);
 void mlx5e_destroy_flow_steering(struct mlx5e_priv *priv);
 
-u8 mlx5_get_proto_by_tunnel_type(enum mlx5_tunnel_types tt);
 int mlx5e_add_vlan_trap(struct mlx5e_priv *priv, int  trap_id, int tir_num);
 void mlx5e_remove_vlan_trap(struct mlx5e_priv *priv);
 int mlx5e_add_mac_trap(struct mlx5e_priv *priv, int  trap_id, int tir_num);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index a03842d132f6..cbad05760551 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -854,454 +854,6 @@ void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft)
 	ft->t = NULL;
 }
 
-static void mlx5_cleanup_ttc_rules(struct mlx5_ttc_table *ttc)
-{
-	int i;
-
-	for (i = 0; i < MLX5_NUM_TT; i++) {
-		if (!IS_ERR_OR_NULL(ttc->rules[i].rule)) {
-			mlx5_del_flow_rules(ttc->rules[i].rule);
-			ttc->rules[i].rule = NULL;
-		}
-	}
-
-	for (i = 0; i < MLX5_NUM_TUNNEL_TT; i++) {
-		if (!IS_ERR_OR_NULL(ttc->tunnel_rules[i])) {
-			mlx5_del_flow_rules(ttc->tunnel_rules[i]);
-			ttc->tunnel_rules[i] = NULL;
-		}
-	}
-}
-
-struct mlx5_etype_proto {
-	u16 etype;
-	u8 proto;
-};
-
-static struct mlx5_etype_proto ttc_rules[] = {
-	[MLX5_TT_IPV4_TCP] = {
-		.etype = ETH_P_IP,
-		.proto = IPPROTO_TCP,
-	},
-	[MLX5_TT_IPV6_TCP] = {
-		.etype = ETH_P_IPV6,
-		.proto = IPPROTO_TCP,
-	},
-	[MLX5_TT_IPV4_UDP] = {
-		.etype = ETH_P_IP,
-		.proto = IPPROTO_UDP,
-	},
-	[MLX5_TT_IPV6_UDP] = {
-		.etype = ETH_P_IPV6,
-		.proto = IPPROTO_UDP,
-	},
-	[MLX5_TT_IPV4_IPSEC_AH] = {
-		.etype = ETH_P_IP,
-		.proto = IPPROTO_AH,
-	},
-	[MLX5_TT_IPV6_IPSEC_AH] = {
-		.etype = ETH_P_IPV6,
-		.proto = IPPROTO_AH,
-	},
-	[MLX5_TT_IPV4_IPSEC_ESP] = {
-		.etype = ETH_P_IP,
-		.proto = IPPROTO_ESP,
-	},
-	[MLX5_TT_IPV6_IPSEC_ESP] = {
-		.etype = ETH_P_IPV6,
-		.proto = IPPROTO_ESP,
-	},
-	[MLX5_TT_IPV4] = {
-		.etype = ETH_P_IP,
-		.proto = 0,
-	},
-	[MLX5_TT_IPV6] = {
-		.etype = ETH_P_IPV6,
-		.proto = 0,
-	},
-	[MLX5_TT_ANY] = {
-		.etype = 0,
-		.proto = 0,
-	},
-};
-
-static struct mlx5_etype_proto ttc_tunnel_rules[] = {
-	[MLX5_TT_IPV4_GRE] = {
-		.etype = ETH_P_IP,
-		.proto = IPPROTO_GRE,
-	},
-	[MLX5_TT_IPV6_GRE] = {
-		.etype = ETH_P_IPV6,
-		.proto = IPPROTO_GRE,
-	},
-	[MLX5_TT_IPV4_IPIP] = {
-		.etype = ETH_P_IP,
-		.proto = IPPROTO_IPIP,
-	},
-	[MLX5_TT_IPV6_IPIP] = {
-		.etype = ETH_P_IPV6,
-		.proto = IPPROTO_IPIP,
-	},
-	[MLX5_TT_IPV4_IPV6] = {
-		.etype = ETH_P_IP,
-		.proto = IPPROTO_IPV6,
-	},
-	[MLX5_TT_IPV6_IPV6] = {
-		.etype = ETH_P_IPV6,
-		.proto = IPPROTO_IPV6,
-	},
-
-};
-
-u8 mlx5_get_proto_by_tunnel_type(enum mlx5_tunnel_types tt)
-{
-	return ttc_tunnel_rules[tt].proto;
-}
-
-static bool mlx5_tunnel_proto_supported_rx(struct mlx5_core_dev *mdev,
-					   u8 proto_type)
-{
-	switch (proto_type) {
-	case IPPROTO_GRE:
-		return MLX5_CAP_ETH(mdev, tunnel_stateless_gre);
-	case IPPROTO_IPIP:
-	case IPPROTO_IPV6:
-		return (MLX5_CAP_ETH(mdev, tunnel_stateless_ip_over_ip) ||
-			MLX5_CAP_ETH(mdev, tunnel_stateless_ip_over_ip_rx));
-	default:
-		return false;
-	}
-}
-
-static bool mlx5_tunnel_any_rx_proto_supported(struct mlx5_core_dev *mdev)
-{
-	int tt;
-
-	for (tt = 0; tt < MLX5_NUM_TUNNEL_TT; tt++) {
-		if (mlx5_tunnel_proto_supported_rx(mdev,
-						   ttc_tunnel_rules[tt].proto))
-			return true;
-	}
-	return false;
-}
-
-bool mlx5_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
-{
-	return (mlx5_tunnel_any_rx_proto_supported(mdev) &&
-		MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-					  ft_field_support.inner_ip_version));
-}
-
-static u8 mlx5_etype_to_ipv(u16 ethertype)
-{
-	if (ethertype == ETH_P_IP)
-		return 4;
-
-	if (ethertype == ETH_P_IPV6)
-		return 6;
-
-	return 0;
-}
-
-static struct mlx5_flow_handle *
-mlx5_generate_ttc_rule(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft,
-		       struct mlx5_flow_destination *dest, u16 etype, u8 proto)
-{
-	int match_ipv_outer =
-		MLX5_CAP_FLOWTABLE_NIC_RX(dev,
-					  ft_field_support.outer_ip_version);
-	MLX5_DECLARE_FLOW_ACT(flow_act);
-	struct mlx5_flow_handle *rule;
-	struct mlx5_flow_spec *spec;
-	int err = 0;
-	u8 ipv;
-
-	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
-	if (!spec)
-		return ERR_PTR(-ENOMEM);
-
-	if (proto) {
-		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
-		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol);
-		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, proto);
-	}
-
-	ipv = mlx5_etype_to_ipv(etype);
-	if (match_ipv_outer && ipv) {
-		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
-		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version);
-		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, ipv);
-	} else if (etype) {
-		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
-		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ethertype);
-		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, etype);
-	}
-
-	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1);
-	if (IS_ERR(rule)) {
-		err = PTR_ERR(rule);
-		mlx5_core_err(dev, "%s: add rule failed\n", __func__);
-	}
-
-	kvfree(spec);
-	return err ? ERR_PTR(err) : rule;
-}
-
-static int mlx5_generate_ttc_table_rules(struct mlx5_core_dev *dev,
-					 struct ttc_params *params,
-					 struct mlx5_ttc_table *ttc)
-{
-	struct mlx5_flow_handle **trules;
-	struct mlx5_ttc_rule *rules;
-	struct mlx5_flow_table *ft;
-	int tt;
-	int err;
-
-	ft = ttc->t;
-	rules = ttc->rules;
-	for (tt = 0; tt < MLX5_NUM_TT; tt++) {
-		struct mlx5_ttc_rule *rule = &rules[tt];
-
-		rule->rule = mlx5_generate_ttc_rule(dev, ft, &params->dests[tt],
-						    ttc_rules[tt].etype,
-						    ttc_rules[tt].proto);
-		if (IS_ERR(rule->rule)) {
-			err = PTR_ERR(rule->rule);
-			rule->rule = NULL;
-			goto del_rules;
-		}
-		rule->default_dest = params->dests[tt];
-	}
-
-	if (!params->inner_ttc || !mlx5_tunnel_inner_ft_supported(dev))
-		return 0;
-
-	trules    = ttc->tunnel_rules;
-	for (tt = 0; tt < MLX5_NUM_TUNNEL_TT; tt++) {
-		if (!mlx5_tunnel_proto_supported_rx(dev,
-						    ttc_tunnel_rules[tt].proto))
-			continue;
-		trules[tt] = mlx5_generate_ttc_rule(dev, ft,
-						    &params->tunnel_dests[tt],
-						    ttc_tunnel_rules[tt].etype,
-						    ttc_tunnel_rules[tt].proto);
-		if (IS_ERR(trules[tt])) {
-			err = PTR_ERR(trules[tt]);
-			trules[tt] = NULL;
-			goto del_rules;
-		}
-	}
-
-	return 0;
-
-del_rules:
-	mlx5_cleanup_ttc_rules(ttc);
-	return err;
-}
-
-static int mlx5_create_ttc_table_groups(struct mlx5_ttc_table *ttc,
-					bool use_ipv)
-{
-	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
-	int ix = 0;
-	u32 *in;
-	int err;
-	u8 *mc;
-
-	ttc->g = kcalloc(MLX5_TTC_NUM_GROUPS, sizeof(*ttc->g), GFP_KERNEL);
-	if (!ttc->g)
-		return -ENOMEM;
-	in = kvzalloc(inlen, GFP_KERNEL);
-	if (!in) {
-		kfree(ttc->g);
-		ttc->g = NULL;
-		return -ENOMEM;
-	}
-
-	/* L4 Group */
-	mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
-	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol);
-	if (use_ipv)
-		MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_version);
-	else
-		MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype);
-	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
-	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_TTC_GROUP1_SIZE;
-	MLX5_SET_CFG(in, end_flow_index, ix - 1);
-	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
-	if (IS_ERR(ttc->g[ttc->num_groups]))
-		goto err;
-	ttc->num_groups++;
-
-	/* L3 Group */
-	MLX5_SET(fte_match_param, mc, outer_headers.ip_protocol, 0);
-	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_TTC_GROUP2_SIZE;
-	MLX5_SET_CFG(in, end_flow_index, ix - 1);
-	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
-	if (IS_ERR(ttc->g[ttc->num_groups]))
-		goto err;
-	ttc->num_groups++;
-
-	/* Any Group */
-	memset(in, 0, inlen);
-	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_TTC_GROUP3_SIZE;
-	MLX5_SET_CFG(in, end_flow_index, ix - 1);
-	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
-	if (IS_ERR(ttc->g[ttc->num_groups]))
-		goto err;
-	ttc->num_groups++;
-
-	kvfree(in);
-	return 0;
-
-err:
-	err = PTR_ERR(ttc->g[ttc->num_groups]);
-	ttc->g[ttc->num_groups] = NULL;
-	kvfree(in);
-
-	return err;
-}
-
-static struct mlx5_flow_handle *
-mlx5_generate_inner_ttc_rule(struct mlx5_core_dev *dev,
-			     struct mlx5_flow_table *ft,
-			     struct mlx5_flow_destination *dest,
-			     u16 etype, u8 proto)
-{
-	MLX5_DECLARE_FLOW_ACT(flow_act);
-	struct mlx5_flow_handle *rule;
-	struct mlx5_flow_spec *spec;
-	int err = 0;
-	u8 ipv;
-
-	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
-	if (!spec)
-		return ERR_PTR(-ENOMEM);
-
-	ipv = mlx5_etype_to_ipv(etype);
-	if (etype && ipv) {
-		spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS;
-		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, inner_headers.ip_version);
-		MLX5_SET(fte_match_param, spec->match_value, inner_headers.ip_version, ipv);
-	}
-
-	if (proto) {
-		spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS;
-		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, inner_headers.ip_protocol);
-		MLX5_SET(fte_match_param, spec->match_value, inner_headers.ip_protocol, proto);
-	}
-
-	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1);
-	if (IS_ERR(rule)) {
-		err = PTR_ERR(rule);
-		mlx5_core_err(dev, "%s: add inner TTC rule failed\n", __func__);
-	}
-
-	kvfree(spec);
-	return err ? ERR_PTR(err) : rule;
-}
-
-static int mlx5_generate_inner_ttc_table_rules(struct mlx5_core_dev *dev,
-					       struct ttc_params *params,
-					       struct mlx5_ttc_table *ttc)
-{
-	struct mlx5_ttc_rule *rules;
-	struct mlx5_flow_table *ft;
-	int err;
-	int tt;
-
-	ft = ttc->t;
-	rules = ttc->rules;
-
-	for (tt = 0; tt < MLX5_NUM_TT; tt++) {
-		struct mlx5_ttc_rule *rule = &rules[tt];
-
-		rule->rule = mlx5_generate_inner_ttc_rule(dev, ft,
-							  &params->dests[tt],
-							  ttc_rules[tt].etype,
-							  ttc_rules[tt].proto);
-		if (IS_ERR(rule->rule)) {
-			err = PTR_ERR(rule->rule);
-			rule->rule = NULL;
-			goto del_rules;
-		}
-		rule->default_dest = params->dests[tt];
-	}
-
-	return 0;
-
-del_rules:
-
-	mlx5_cleanup_ttc_rules(ttc);
-	return err;
-}
-
-static int mlx5_create_inner_ttc_table_groups(struct mlx5_ttc_table *ttc)
-{
-	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
-	int ix = 0;
-	u32 *in;
-	int err;
-	u8 *mc;
-
-	ttc->g = kcalloc(MLX5_INNER_TTC_NUM_GROUPS, sizeof(*ttc->g),
-			 GFP_KERNEL);
-	if (!ttc->g)
-		return -ENOMEM;
-	in = kvzalloc(inlen, GFP_KERNEL);
-	if (!in) {
-		kfree(ttc->g);
-		ttc->g = NULL;
-		return -ENOMEM;
-	}
-
-	/* L4 Group */
-	mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
-	MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_protocol);
-	MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_version);
-	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS);
-	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_INNER_TTC_GROUP1_SIZE;
-	MLX5_SET_CFG(in, end_flow_index, ix - 1);
-	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
-	if (IS_ERR(ttc->g[ttc->num_groups]))
-		goto err;
-	ttc->num_groups++;
-
-	/* L3 Group */
-	MLX5_SET(fte_match_param, mc, inner_headers.ip_protocol, 0);
-	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_INNER_TTC_GROUP2_SIZE;
-	MLX5_SET_CFG(in, end_flow_index, ix - 1);
-	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
-	if (IS_ERR(ttc->g[ttc->num_groups]))
-		goto err;
-	ttc->num_groups++;
-
-	/* Any Group */
-	memset(in, 0, inlen);
-	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_INNER_TTC_GROUP3_SIZE;
-	MLX5_SET_CFG(in, end_flow_index, ix - 1);
-	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
-	if (IS_ERR(ttc->g[ttc->num_groups]))
-		goto err;
-	ttc->num_groups++;
-
-	kvfree(in);
-	return 0;
-
-err:
-	err = PTR_ERR(ttc->g[ttc->num_groups]);
-	ttc->g[ttc->num_groups] = NULL;
-	kvfree(in);
-
-	return err;
-}
-
 static void mlx5e_set_inner_ttc_params(struct mlx5e_priv *priv,
 				       struct ttc_params *ttc_params)
 {
@@ -1356,116 +908,6 @@ void mlx5e_set_ttc_params(struct mlx5e_priv *priv,
 	}
 }
 
-static int mlx5_create_inner_ttc_table(struct mlx5_core_dev *dev,
-				       struct ttc_params *params,
-				       struct mlx5_ttc_table *ttc)
-{
-	int err;
-
-	WARN_ON_ONCE(params->ft_attr.max_fte);
-	params->ft_attr.max_fte = MLX5_INNER_TTC_TABLE_SIZE;
-	ttc->t = mlx5_create_flow_table(params->ns, &params->ft_attr);
-	if (IS_ERR(ttc->t)) {
-		err = PTR_ERR(ttc->t);
-		ttc->t = NULL;
-		return err;
-	}
-
-	err = mlx5_create_inner_ttc_table_groups(ttc);
-	if (err)
-		goto destroy_ttc;
-
-	err = mlx5_generate_inner_ttc_table_rules(dev, params, ttc);
-	if (err)
-		goto destroy_ttc;
-
-	return 0;
-
-destroy_ttc:
-	mlx5_destroy_ttc_table(ttc);
-	return err;
-}
-
-void mlx5_destroy_ttc_table(struct mlx5_ttc_table *ttc)
-{
-	int i;
-
-	mlx5_cleanup_ttc_rules(ttc);
-	for (i = ttc->num_groups - 1; i >= 0; i--) {
-		if (!IS_ERR_OR_NULL(ttc->g[i]))
-			mlx5_destroy_flow_group(ttc->g[i]);
-		ttc->g[i] = NULL;
-	}
-
-	ttc->num_groups = 0;
-	kfree(ttc->g);
-	mlx5_destroy_flow_table(ttc->t);
-	ttc->t = NULL;
-}
-
-static void mlx5_destroy_inner_ttc_table(struct mlx5_ttc_table *ttc)
-{
-	mlx5_destroy_ttc_table(ttc);
-}
-
-int mlx5_create_ttc_table(struct mlx5_core_dev *dev, struct ttc_params *params,
-			  struct mlx5_ttc_table *ttc)
-{
-	bool match_ipv_outer =
-		MLX5_CAP_FLOWTABLE_NIC_RX(dev,
-					  ft_field_support.outer_ip_version);
-	int err;
-
-	WARN_ON_ONCE(params->ft_attr.max_fte);
-	params->ft_attr.max_fte = MLX5_TTC_TABLE_SIZE;
-	ttc->t = mlx5_create_flow_table(params->ns, &params->ft_attr);
-	if (IS_ERR(ttc->t)) {
-		err = PTR_ERR(ttc->t);
-		ttc->t = NULL;
-		return err;
-	}
-
-	err = mlx5_create_ttc_table_groups(ttc, match_ipv_outer);
-	if (err)
-		goto destroy_ttc;
-
-	err = mlx5_generate_ttc_table_rules(dev, params, ttc);
-	if (err)
-		goto destroy_ttc;
-
-	return 0;
-destroy_ttc:
-	mlx5_destroy_ttc_table(ttc);
-	return err;
-}
-
-int mlx5_ttc_fwd_dest(struct mlx5_ttc_table *ttc, enum mlx5_traffic_types type,
-		      struct mlx5_flow_destination *new_dest)
-{
-	return mlx5_modify_rule_destination(ttc->rules[type].rule, new_dest,
-					    NULL);
-}
-
-struct mlx5_flow_destination
-mlx5_ttc_get_default_dest(struct mlx5_ttc_table *ttc,
-			  enum mlx5_traffic_types type)
-{
-	struct mlx5_flow_destination *dest = &ttc->rules[type].default_dest;
-
-	WARN_ONCE(dest->type != MLX5_FLOW_DESTINATION_TYPE_TIR,
-		  "TTC[%d] default dest is not setup yet", type);
-
-	return *dest;
-}
-
-int mlx5_ttc_fwd_default_dest(struct mlx5_ttc_table *ttc,
-			      enum mlx5_traffic_types type)
-{
-	struct mlx5_flow_destination dest = mlx5_ttc_get_default_dest(ttc, type);
-
-	return mlx5_ttc_fwd_dest(ttc, type, &dest);
-}
-
 static void mlx5e_del_l2_flow_rule(struct mlx5e_priv *priv,
 				   struct mlx5e_l2_rule *ai)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c
new file mode 100644
index 000000000000..4b54b4127d33
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c
@@ -0,0 +1,584 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES.
+
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/mlx5/fs.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include "lib/fs_ttc.h"
+
+#define MLX5_TTC_NUM_GROUPS	3
+#define MLX5_TTC_GROUP1_SIZE	(BIT(3) + MLX5_NUM_TUNNEL_TT)
+#define MLX5_TTC_GROUP2_SIZE	 BIT(1)
+#define MLX5_TTC_GROUP3_SIZE	 BIT(0)
+#define MLX5_TTC_TABLE_SIZE	(MLX5_TTC_GROUP1_SIZE +\
+				 MLX5_TTC_GROUP2_SIZE +\
+				 MLX5_TTC_GROUP3_SIZE)
+
+#define MLX5_INNER_TTC_NUM_GROUPS	3
+#define MLX5_INNER_TTC_GROUP1_SIZE	BIT(3)
+#define MLX5_INNER_TTC_GROUP2_SIZE	BIT(1)
+#define MLX5_INNER_TTC_GROUP3_SIZE	BIT(0)
+#define MLX5_INNER_TTC_TABLE_SIZE	(MLX5_INNER_TTC_GROUP1_SIZE +\
+					 MLX5_INNER_TTC_GROUP2_SIZE +\
+					 MLX5_INNER_TTC_GROUP3_SIZE)
+
+static void mlx5_cleanup_ttc_rules(struct mlx5_ttc_table *ttc)
+{
+	int i;
+
+	for (i = 0; i < MLX5_NUM_TT; i++) {
+		if (!IS_ERR_OR_NULL(ttc->rules[i].rule)) {
+			mlx5_del_flow_rules(ttc->rules[i].rule);
+			ttc->rules[i].rule = NULL;
+		}
+	}
+
+	for (i = 0; i < MLX5_NUM_TUNNEL_TT; i++) {
+		if (!IS_ERR_OR_NULL(ttc->tunnel_rules[i])) {
+			mlx5_del_flow_rules(ttc->tunnel_rules[i]);
+			ttc->tunnel_rules[i] = NULL;
+		}
+	}
+}
+
+struct mlx5_etype_proto {
+	u16 etype;
+	u8 proto;
+};
+
+static struct mlx5_etype_proto ttc_rules[] = {
+	[MLX5_TT_IPV4_TCP] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_TCP,
+	},
+	[MLX5_TT_IPV6_TCP] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_TCP,
+	},
+	[MLX5_TT_IPV4_UDP] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_UDP,
+	},
+	[MLX5_TT_IPV6_UDP] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_UDP,
+	},
+	[MLX5_TT_IPV4_IPSEC_AH] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_AH,
+	},
+	[MLX5_TT_IPV6_IPSEC_AH] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_AH,
+	},
+	[MLX5_TT_IPV4_IPSEC_ESP] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_ESP,
+	},
+	[MLX5_TT_IPV6_IPSEC_ESP] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_ESP,
+	},
+	[MLX5_TT_IPV4] = {
+		.etype = ETH_P_IP,
+		.proto = 0,
+	},
+	[MLX5_TT_IPV6] = {
+		.etype = ETH_P_IPV6,
+		.proto = 0,
+	},
+	[MLX5_TT_ANY] = {
+		.etype = 0,
+		.proto = 0,
+	},
+};
+
+static struct mlx5_etype_proto ttc_tunnel_rules[] = {
+	[MLX5_TT_IPV4_GRE] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_GRE,
+	},
+	[MLX5_TT_IPV6_GRE] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_GRE,
+	},
+	[MLX5_TT_IPV4_IPIP] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_IPIP,
+	},
+	[MLX5_TT_IPV6_IPIP] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_IPIP,
+	},
+	[MLX5_TT_IPV4_IPV6] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_IPV6,
+	},
+	[MLX5_TT_IPV6_IPV6] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_IPV6,
+	},
+
+};
+
+u8 mlx5_get_proto_by_tunnel_type(enum mlx5_tunnel_types tt)
+{
+	return ttc_tunnel_rules[tt].proto;
+}
+
+static bool mlx5_tunnel_proto_supported_rx(struct mlx5_core_dev *mdev,
+					   u8 proto_type)
+{
+	switch (proto_type) {
+	case IPPROTO_GRE:
+		return MLX5_CAP_ETH(mdev, tunnel_stateless_gre);
+	case IPPROTO_IPIP:
+	case IPPROTO_IPV6:
+		return (MLX5_CAP_ETH(mdev, tunnel_stateless_ip_over_ip) ||
+			MLX5_CAP_ETH(mdev, tunnel_stateless_ip_over_ip_rx));
+	default:
+		return false;
+	}
+}
+
+static bool mlx5_tunnel_any_rx_proto_supported(struct mlx5_core_dev *mdev)
+{
+	int tt;
+
+	for (tt = 0; tt < MLX5_NUM_TUNNEL_TT; tt++) {
+		if (mlx5_tunnel_proto_supported_rx(mdev,
+						   ttc_tunnel_rules[tt].proto))
+			return true;
+	}
+	return false;
+}
+
+bool mlx5_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
+{
+	return (mlx5_tunnel_any_rx_proto_supported(mdev) &&
+		MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+					  ft_field_support.inner_ip_version));
+}
+
+static u8 mlx5_etype_to_ipv(u16 ethertype)
+{
+	if (ethertype == ETH_P_IP)
+		return 4;
+
+	if (ethertype == ETH_P_IPV6)
+		return 6;
+
+	return 0;
+}
+
+static struct mlx5_flow_handle *
+mlx5_generate_ttc_rule(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft,
+		       struct mlx5_flow_destination *dest, u16 etype, u8 proto)
+{
+	int match_ipv_outer =
+		MLX5_CAP_FLOWTABLE_NIC_RX(dev,
+					  ft_field_support.outer_ip_version);
+	MLX5_DECLARE_FLOW_ACT(flow_act);
+	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_spec *spec;
+	int err = 0;
+	u8 ipv;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec)
+		return ERR_PTR(-ENOMEM);
+
+	if (proto) {
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, proto);
+	}
+
+	ipv = mlx5_etype_to_ipv(etype);
+	if (match_ipv_outer && ipv) {
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, ipv);
+	} else if (etype) {
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ethertype);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, etype);
+	}
+
+	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		mlx5_core_err(dev, "%s: add rule failed\n", __func__);
+	}
+
+	kvfree(spec);
+	return err ? ERR_PTR(err) : rule;
+}
+
+static int mlx5_generate_ttc_table_rules(struct mlx5_core_dev *dev,
+					 struct ttc_params *params,
+					 struct mlx5_ttc_table *ttc)
+{
+	struct mlx5_flow_handle **trules;
+	struct mlx5_ttc_rule *rules;
+	struct mlx5_flow_table *ft;
+	int tt;
+	int err;
+
+	ft = ttc->t;
+	rules = ttc->rules;
+	for (tt = 0; tt < MLX5_NUM_TT; tt++) {
+		struct mlx5_ttc_rule *rule = &rules[tt];
+
+		rule->rule = mlx5_generate_ttc_rule(dev, ft, &params->dests[tt],
+						    ttc_rules[tt].etype,
+						    ttc_rules[tt].proto);
+		if (IS_ERR(rule->rule)) {
+			err = PTR_ERR(rule->rule);
+			rule->rule = NULL;
+			goto del_rules;
+		}
+		rule->default_dest = params->dests[tt];
+	}
+
+	if (!params->inner_ttc || !mlx5_tunnel_inner_ft_supported(dev))
+		return 0;
+
+	trules    = ttc->tunnel_rules;
+	for (tt = 0; tt < MLX5_NUM_TUNNEL_TT; tt++) {
+		if (!mlx5_tunnel_proto_supported_rx(dev,
+						    ttc_tunnel_rules[tt].proto))
+			continue;
+		trules[tt] = mlx5_generate_ttc_rule(dev, ft,
+						    &params->tunnel_dests[tt],
+						    ttc_tunnel_rules[tt].etype,
+						    ttc_tunnel_rules[tt].proto);
+		if (IS_ERR(trules[tt])) {
+			err = PTR_ERR(trules[tt]);
+			trules[tt] = NULL;
+			goto del_rules;
+		}
+	}
+
+	return 0;
+
+del_rules:
+	mlx5_cleanup_ttc_rules(ttc);
+	return err;
+}
+
+static int mlx5_create_ttc_table_groups(struct mlx5_ttc_table *ttc,
+					bool use_ipv)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	int ix = 0;
+	u32 *in;
+	int err;
+	u8 *mc;
+
+	ttc->g = kcalloc(MLX5_TTC_NUM_GROUPS, sizeof(*ttc->g), GFP_KERNEL);
+	if (!ttc->g)
+		return -ENOMEM;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		kfree(ttc->g);
+		ttc->g = NULL;
+		return -ENOMEM;
+	}
+
+	/* L4 Group */
+	mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol);
+	if (use_ipv)
+		MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_version);
+	else
+		MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype);
+	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5_TTC_GROUP1_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
+	if (IS_ERR(ttc->g[ttc->num_groups]))
+		goto err;
+	ttc->num_groups++;
+
+	/* L3 Group */
+	MLX5_SET(fte_match_param, mc, outer_headers.ip_protocol, 0);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5_TTC_GROUP2_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
+	if (IS_ERR(ttc->g[ttc->num_groups]))
+		goto err;
+	ttc->num_groups++;
+
+	/* Any Group */
+	memset(in, 0, inlen);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5_TTC_GROUP3_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
+	if (IS_ERR(ttc->g[ttc->num_groups]))
+		goto err;
+	ttc->num_groups++;
+
+	kvfree(in);
+	return 0;
+
+err:
+	err = PTR_ERR(ttc->g[ttc->num_groups]);
+	ttc->g[ttc->num_groups] = NULL;
+	kvfree(in);
+
+	return err;
+}
+
+static struct mlx5_flow_handle *
+mlx5_generate_inner_ttc_rule(struct mlx5_core_dev *dev,
+			     struct mlx5_flow_table *ft,
+			     struct mlx5_flow_destination *dest,
+			     u16 etype, u8 proto)
+{
+	MLX5_DECLARE_FLOW_ACT(flow_act);
+	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_spec *spec;
+	int err = 0;
+	u8 ipv;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec)
+		return ERR_PTR(-ENOMEM);
+
+	ipv = mlx5_etype_to_ipv(etype);
+	if (etype && ipv) {
+		spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, inner_headers.ip_version);
+		MLX5_SET(fte_match_param, spec->match_value, inner_headers.ip_version, ipv);
+	}
+
+	if (proto) {
+		spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, inner_headers.ip_protocol);
+		MLX5_SET(fte_match_param, spec->match_value, inner_headers.ip_protocol, proto);
+	}
+
+	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		mlx5_core_err(dev, "%s: add inner TTC rule failed\n", __func__);
+	}
+
+	kvfree(spec);
+	return err ? ERR_PTR(err) : rule;
+}
+
+static int mlx5_generate_inner_ttc_table_rules(struct mlx5_core_dev *dev,
+					       struct ttc_params *params,
+					       struct mlx5_ttc_table *ttc)
+{
+	struct mlx5_ttc_rule *rules;
+	struct mlx5_flow_table *ft;
+	int err;
+	int tt;
+
+	ft = ttc->t;
+	rules = ttc->rules;
+
+	for (tt = 0; tt < MLX5_NUM_TT; tt++) {
+		struct mlx5_ttc_rule *rule = &rules[tt];
+
+		rule->rule = mlx5_generate_inner_ttc_rule(dev, ft,
+							  &params->dests[tt],
+							  ttc_rules[tt].etype,
+							  ttc_rules[tt].proto);
+		if (IS_ERR(rule->rule)) {
+			err = PTR_ERR(rule->rule);
+			rule->rule = NULL;
+			goto del_rules;
+		}
+		rule->default_dest = params->dests[tt];
+	}
+
+	return 0;
+
+del_rules:
+
+	mlx5_cleanup_ttc_rules(ttc);
+	return err;
+}
+
+static int mlx5_create_inner_ttc_table_groups(struct mlx5_ttc_table *ttc)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	int ix = 0;
+	u32 *in;
+	int err;
+	u8 *mc;
+
+	ttc->g = kcalloc(MLX5_INNER_TTC_NUM_GROUPS, sizeof(*ttc->g),
+			 GFP_KERNEL);
+	if (!ttc->g)
+		return -ENOMEM;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		kfree(ttc->g);
+		ttc->g = NULL;
+		return -ENOMEM;
+	}
+
+	/* L4 Group */
+	mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+	MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_protocol);
+	MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_version);
+	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5_INNER_TTC_GROUP1_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
+	if (IS_ERR(ttc->g[ttc->num_groups]))
+		goto err;
+	ttc->num_groups++;
+
+	/* L3 Group */
+	MLX5_SET(fte_match_param, mc, inner_headers.ip_protocol, 0);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5_INNER_TTC_GROUP2_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
+	if (IS_ERR(ttc->g[ttc->num_groups]))
+		goto err;
+	ttc->num_groups++;
+
+	/* Any Group */
+	memset(in, 0, inlen);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5_INNER_TTC_GROUP3_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
+	if (IS_ERR(ttc->g[ttc->num_groups]))
+		goto err;
+	ttc->num_groups++;
+
+	kvfree(in);
+	return 0;
+
+err:
+	err = PTR_ERR(ttc->g[ttc->num_groups]);
+	ttc->g[ttc->num_groups] = NULL;
+	kvfree(in);
+
+	return err;
+}
+
+int mlx5_create_inner_ttc_table(struct mlx5_core_dev *dev,
+				struct ttc_params *params,
+				struct mlx5_ttc_table *ttc)
+{
+	int err;
+
+	WARN_ON_ONCE(params->ft_attr.max_fte);
+	params->ft_attr.max_fte = MLX5_INNER_TTC_TABLE_SIZE;
+	ttc->t = mlx5_create_flow_table(params->ns, &params->ft_attr);
+	if (IS_ERR(ttc->t)) {
+		err = PTR_ERR(ttc->t);
+		ttc->t = NULL;
+		return err;
+	}
+
+	err = mlx5_create_inner_ttc_table_groups(ttc);
+	if (err)
+		goto destroy_ft;
+
+	err = mlx5_generate_inner_ttc_table_rules(dev, params, ttc);
+	if (err)
+		goto destroy_ft;
+
+	return 0;
+
+destroy_ft:
+	mlx5_destroy_ttc_table(ttc);
+	return err;
+}
+
+void mlx5_destroy_ttc_table(struct mlx5_ttc_table *ttc)
+{
+	int i;
+
+	mlx5_cleanup_ttc_rules(ttc);
+	for (i = ttc->num_groups - 1; i >= 0; i--) {
+		if (!IS_ERR_OR_NULL(ttc->g[i]))
+			mlx5_destroy_flow_group(ttc->g[i]);
+		ttc->g[i] = NULL;
+	}
+
+	ttc->num_groups = 0;
+	kfree(ttc->g);
+	mlx5_destroy_flow_table(ttc->t);
+	ttc->t = NULL;
+}
+
+void mlx5_destroy_inner_ttc_table(struct mlx5_ttc_table *ttc)
+{
+	mlx5_destroy_ttc_table(ttc);
+}
+
+int mlx5_create_ttc_table(struct mlx5_core_dev *dev, struct ttc_params *params,
+			  struct mlx5_ttc_table *ttc)
+{
+	bool match_ipv_outer =
+		MLX5_CAP_FLOWTABLE_NIC_RX(dev,
+					  ft_field_support.outer_ip_version);
+	int err;
+
+	WARN_ON_ONCE(params->ft_attr.max_fte);
+	params->ft_attr.max_fte = MLX5_TTC_TABLE_SIZE;
+	ttc->t = mlx5_create_flow_table(params->ns, &params->ft_attr);
+	if (IS_ERR(ttc->t)) {
+		err = PTR_ERR(ttc->t);
+		ttc->t = NULL;
+		return err;
+	}
+
+	err = mlx5_create_ttc_table_groups(ttc, match_ipv_outer);
+	if (err)
+		goto destroy_ft;
+
+	err = mlx5_generate_ttc_table_rules(dev, params, ttc);
+	if (err)
+		goto destroy_ft;
+
+	return 0;
+destroy_ft:
+	mlx5_destroy_ttc_table(ttc);
+	return err;
+}
+
+int mlx5_ttc_fwd_dest(struct mlx5_ttc_table *ttc, enum mlx5_traffic_types type,
+		      struct mlx5_flow_destination *new_dest)
+{
+	return mlx5_modify_rule_destination(ttc->rules[type].rule, new_dest,
+					    NULL);
+}
+
+struct mlx5_flow_destination
+mlx5_ttc_get_default_dest(struct mlx5_ttc_table *ttc,
+			  enum mlx5_traffic_types type)
+{
+	struct mlx5_flow_destination *dest = &ttc->rules[type].default_dest;
+
+	WARN_ONCE(dest->type != MLX5_FLOW_DESTINATION_TYPE_TIR,
+		  "TTC[%d] default dest is not setup yet", type);
+
+	return *dest;
+}
+
+int mlx5_ttc_fwd_default_dest(struct mlx5_ttc_table *ttc,
+			      enum mlx5_traffic_types type)
+{
+	struct mlx5_flow_destination dest = mlx5_ttc_get_default_dest(ttc, type);
+
+	return mlx5_ttc_fwd_dest(ttc, type, &dest);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h
new file mode 100644
index 000000000000..1010e00c10bd
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies. */
+
+#ifndef __ML5_FS_TTC_H__
+#define __ML5_FS_TTC_H__
+
+#include <linux/mlx5/fs.h>
+
+enum mlx5_traffic_types {
+	MLX5_TT_IPV4_TCP,
+	MLX5_TT_IPV6_TCP,
+	MLX5_TT_IPV4_UDP,
+	MLX5_TT_IPV6_UDP,
+	MLX5_TT_IPV4_IPSEC_AH,
+	MLX5_TT_IPV6_IPSEC_AH,
+	MLX5_TT_IPV4_IPSEC_ESP,
+	MLX5_TT_IPV6_IPSEC_ESP,
+	MLX5_TT_IPV4,
+	MLX5_TT_IPV6,
+	MLX5_TT_ANY,
+	MLX5_NUM_TT,
+	MLX5_NUM_INDIR_TIRS = MLX5_TT_ANY,
+};
+
+enum mlx5_tunnel_types {
+	MLX5_TT_IPV4_GRE,
+	MLX5_TT_IPV6_GRE,
+	MLX5_TT_IPV4_IPIP,
+	MLX5_TT_IPV6_IPIP,
+	MLX5_TT_IPV4_IPV6,
+	MLX5_TT_IPV6_IPV6,
+	MLX5_NUM_TUNNEL_TT,
+};
+
+struct mlx5_ttc_rule {
+	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_destination default_dest;
+};
+
+/* L3/L4 traffic type classifier */
+struct mlx5_ttc_table {
+	int num_groups;
+	struct mlx5_flow_table *t;
+	struct mlx5_flow_group **g;
+	struct mlx5_ttc_rule rules[MLX5_NUM_TT];
+	struct mlx5_flow_handle *tunnel_rules[MLX5_NUM_TUNNEL_TT];
+};
+
+struct ttc_params {
+	struct mlx5_flow_namespace *ns;
+	struct mlx5_flow_table_attr ft_attr;
+	struct mlx5_flow_destination dests[MLX5_NUM_TT];
+	bool   inner_ttc;
+	struct mlx5_flow_destination tunnel_dests[MLX5_NUM_TUNNEL_TT];
+};
+
+int mlx5_create_ttc_table(struct mlx5_core_dev *dev, struct ttc_params *params,
+			  struct mlx5_ttc_table *ttc);
+void mlx5_destroy_ttc_table(struct mlx5_ttc_table *ttc);
+
+int mlx5_create_inner_ttc_table(struct mlx5_core_dev *dev,
+				struct ttc_params *params,
+				struct mlx5_ttc_table *ttc);
+void mlx5_destroy_inner_ttc_table(struct mlx5_ttc_table *ttc);
+
+int mlx5_ttc_fwd_dest(struct mlx5_ttc_table *ttc, enum mlx5_traffic_types type,
+		      struct mlx5_flow_destination *new_dest);
+struct mlx5_flow_destination
+mlx5_ttc_get_default_dest(struct mlx5_ttc_table *ttc,
+			  enum mlx5_traffic_types type);
+int mlx5_ttc_fwd_default_dest(struct mlx5_ttc_table *ttc,
+			      enum mlx5_traffic_types type);
+
+bool mlx5_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev);
+u8 mlx5_get_proto_by_tunnel_type(enum mlx5_tunnel_types tt);
+
+#endif /* __MLX5_FS_TTC_H__ */
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 77746f7e35b8..0106c67e8ccb 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -38,6 +38,8 @@
 
 #define MLX5_FS_DEFAULT_FLOW_TAG 0x0
 
+#define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
+
 enum {
 	MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO	= 1 << 16,
 	MLX5_FLOW_CONTEXT_ACTION_ENCRYPT	= 1 << 17,
-- 
cgit v1.2.3


From f1260ff15a71b8fc122b2c9abd8a7abffb6e0168 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Mon, 2 Aug 2021 11:52:15 +0300
Subject: skbuff: introduce skb_expand_head()

Like skb_realloc_headroom(), new helper increases headroom of specified skb.
Unlike skb_realloc_headroom(), it does not allocate a new skb if possible;
copies skb->sk on new skb when as needed and frees original skb in case
of failures.

This helps to simplify ip[6]_finish_output2() and a few other similar cases.

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  1 +
 net/core/skbuff.c      | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2bcdc8cd38be..783cc2368bb1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1183,6 +1183,7 @@ static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom,
 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask);
 struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
 				     unsigned int headroom);
+struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom);
 struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom,
 				int newtailroom, gfp_t priority);
 int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fcbd977186b0..8bac7a1a81ba 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1789,6 +1789,48 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
 }
 EXPORT_SYMBOL(skb_realloc_headroom);
 
+/**
+ *	skb_expand_head - reallocate header of &sk_buff
+ *	@skb: buffer to reallocate
+ *	@headroom: needed headroom
+ *
+ *	Unlike skb_realloc_headroom, this one does not allocate a new skb
+ *	if possible; copies skb->sk to new skb as needed
+ *	and frees original skb in case of failures.
+ *
+ *	It expect increased headroom and generates warning otherwise.
+ */
+
+struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
+{
+	int delta = headroom - skb_headroom(skb);
+
+	if (WARN_ONCE(delta <= 0,
+		      "%s is expecting an increase in the headroom", __func__))
+		return skb;
+
+	/* pskb_expand_head() might crash, if skb is shared */
+	if (skb_shared(skb)) {
+		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+		if (likely(nskb)) {
+			if (skb->sk)
+				skb_set_owner_w(nskb, skb->sk);
+			consume_skb(skb);
+		} else {
+			kfree_skb(skb);
+		}
+		skb = nskb;
+	}
+	if (skb &&
+	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+		kfree_skb(skb);
+		skb = NULL;
+	}
+	return skb;
+}
+EXPORT_SYMBOL(skb_expand_head);
+
 /**
  *	skb_copy_expand	-	copy and expand sk_buff
  *	@skb: buffer to copy
-- 
cgit v1.2.3


From 3a755cd8b7c601f756cbbf908b84f7cc8c04a02b Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 2 Aug 2021 11:02:19 +0800
Subject: bonding: add new option lacp_active

Add an option lacp_active, which is similar with team's runner.active.
This option specifies whether to send LACPDU frames periodically. If set
on, the LACPDU frames are sent along with the configured lacp_rate
setting. If set off, the LACPDU frames acts as "speak when spoken to".

Note, the LACPDU state frames still will be sent when init or unbind port.

v2: remove module parameter

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/bonding.rst | 12 ++++++++++++
 drivers/net/bonding/bond_3ad.c       | 11 ++++++-----
 drivers/net/bonding/bond_main.c      |  1 +
 drivers/net/bonding/bond_netlink.c   | 16 ++++++++++++++++
 drivers/net/bonding/bond_options.c   | 27 +++++++++++++++++++++++++++
 drivers/net/bonding/bond_procfs.c    |  2 ++
 drivers/net/bonding/bond_sysfs.c     | 25 ++++++++++++++++++++-----
 include/net/bond_3ad.h               |  1 +
 include/net/bond_options.h           |  1 +
 include/net/bonding.h                |  1 +
 include/uapi/linux/if_link.h         |  1 +
 tools/include/uapi/linux/if_link.h   |  1 +
 12 files changed, 89 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
index 62f2aab8eaec..31cfd7d674a6 100644
--- a/Documentation/networking/bonding.rst
+++ b/Documentation/networking/bonding.rst
@@ -501,6 +501,18 @@ fail_over_mac
 	This option was added in bonding version 3.2.0.  The "follow"
 	policy was added in bonding version 3.3.0.
 
+lacp_active
+	Option specifying whether to send LACPDU frames periodically.
+
+	off or 0
+		LACPDU frames acts as "speak when spoken to".
+
+	on or 1
+		LACPDU frames are sent along the configured links
+		periodically. See lacp_rate for more details.
+
+	The default is on.
+
 lacp_rate
 
 	Option specifying the rate in which we'll ask our link partner
diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 6908822d9773..a4a202b9a0a2 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -96,7 +96,7 @@ static int ad_marker_send(struct port *port, struct bond_marker *marker);
 static void ad_mux_machine(struct port *port, bool *update_slave_arr);
 static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port);
 static void ad_tx_machine(struct port *port);
-static void ad_periodic_machine(struct port *port);
+static void ad_periodic_machine(struct port *port, struct bond_params bond_params);
 static void ad_port_selection_logic(struct port *port, bool *update_slave_arr);
 static void ad_agg_selection_logic(struct aggregator *aggregator,
 				   bool *update_slave_arr);
@@ -1294,10 +1294,11 @@ static void ad_tx_machine(struct port *port)
 /**
  * ad_periodic_machine - handle a port's periodic state machine
  * @port: the port we're looking at
+ * @bond_params: bond parameters we will use
  *
  * Turn ntt flag on priodically to perform periodic transmission of lacpdu's.
  */
-static void ad_periodic_machine(struct port *port)
+static void ad_periodic_machine(struct port *port, struct bond_params bond_params)
 {
 	periodic_states_t last_state;
 
@@ -1306,8 +1307,8 @@ static void ad_periodic_machine(struct port *port)
 
 	/* check if port was reinitialized */
 	if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) ||
-	    (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY))
-	   ) {
+	    (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) ||
+	    !bond_params.lacp_active) {
 		port->sm_periodic_state = AD_NO_PERIODIC;
 	}
 	/* check if state machine should change state */
@@ -2341,7 +2342,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
 		}
 
 		ad_rx_machine(NULL, port);
-		ad_periodic_machine(port);
+		ad_periodic_machine(port, bond->params);
 		ad_port_selection_logic(port, &update_slave_arr);
 		ad_mux_machine(port, &update_slave_arr);
 		ad_tx_machine(port);
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 616ebbb08ca6..3ba5f4871162 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -5478,6 +5478,7 @@ static int bond_check_params(struct bond_params *params)
 	params->downdelay = downdelay;
 	params->peer_notif_delay = 0;
 	params->use_carrier = use_carrier;
+	params->lacp_active = 1;
 	params->lacp_fast = lacp_fast;
 	params->primary[0] = 0;
 	params->primary_reselect = primary_reselect_value;
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 0561ece1ba45..5d54e11d18fa 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -100,6 +100,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = {
 	[IFLA_BOND_MIN_LINKS]		= { .type = NLA_U32 },
 	[IFLA_BOND_LP_INTERVAL]		= { .type = NLA_U32 },
 	[IFLA_BOND_PACKETS_PER_SLAVE]	= { .type = NLA_U32 },
+	[IFLA_BOND_AD_LACP_ACTIVE]	= { .type = NLA_U8 },
 	[IFLA_BOND_AD_LACP_RATE]	= { .type = NLA_U8 },
 	[IFLA_BOND_AD_SELECT]		= { .type = NLA_U8 },
 	[IFLA_BOND_AD_INFO]		= { .type = NLA_NESTED },
@@ -387,6 +388,16 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		if (err)
 			return err;
 	}
+
+	if (data[IFLA_BOND_AD_LACP_ACTIVE]) {
+		int lacp_active = nla_get_u8(data[IFLA_BOND_AD_LACP_ACTIVE]);
+
+		bond_opt_initval(&newval, lacp_active);
+		err = __bond_opt_set(bond, BOND_OPT_LACP_ACTIVE, &newval);
+		if (err)
+			return err;
+	}
+
 	if (data[IFLA_BOND_AD_LACP_RATE]) {
 		int lacp_rate =
 			nla_get_u8(data[IFLA_BOND_AD_LACP_RATE]);
@@ -490,6 +501,7 @@ static size_t bond_get_size(const struct net_device *bond_dev)
 		nla_total_size(sizeof(u32)) +	/* IFLA_BOND_MIN_LINKS */
 		nla_total_size(sizeof(u32)) +	/* IFLA_BOND_LP_INTERVAL */
 		nla_total_size(sizeof(u32)) +  /* IFLA_BOND_PACKETS_PER_SLAVE */
+		nla_total_size(sizeof(u8)) +	/* IFLA_BOND_AD_LACP_ACTIVE */
 		nla_total_size(sizeof(u8)) +	/* IFLA_BOND_AD_LACP_RATE */
 		nla_total_size(sizeof(u8)) +	/* IFLA_BOND_AD_SELECT */
 		nla_total_size(sizeof(struct nlattr)) + /* IFLA_BOND_AD_INFO */
@@ -622,6 +634,10 @@ static int bond_fill_info(struct sk_buff *skb,
 			packets_per_slave))
 		goto nla_put_failure;
 
+	if (nla_put_u8(skb, IFLA_BOND_AD_LACP_ACTIVE,
+		       bond->params.lacp_active))
+		goto nla_put_failure;
+
 	if (nla_put_u8(skb, IFLA_BOND_AD_LACP_RATE,
 		       bond->params.lacp_fast))
 		goto nla_put_failure;
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 0cf25de6f46d..a8fde3bc458f 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -58,6 +58,8 @@ static int bond_option_lp_interval_set(struct bonding *bond,
 				       const struct bond_opt_value *newval);
 static int bond_option_pps_set(struct bonding *bond,
 			       const struct bond_opt_value *newval);
+static int bond_option_lacp_active_set(struct bonding *bond,
+				       const struct bond_opt_value *newval);
 static int bond_option_lacp_rate_set(struct bonding *bond,
 				     const struct bond_opt_value *newval);
 static int bond_option_ad_select_set(struct bonding *bond,
@@ -135,6 +137,12 @@ static const struct bond_opt_value bond_intmax_tbl[] = {
 	{ NULL,      -1,      0}
 };
 
+static const struct bond_opt_value bond_lacp_active[] = {
+	{ "off", 0,  0},
+	{ "on",  1,  BOND_VALFLAG_DEFAULT},
+	{ NULL,  -1, 0}
+};
+
 static const struct bond_opt_value bond_lacp_rate_tbl[] = {
 	{ "slow", AD_LACP_SLOW, 0},
 	{ "fast", AD_LACP_FAST, 0},
@@ -283,6 +291,15 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.values = bond_intmax_tbl,
 		.set = bond_option_updelay_set
 	},
+	[BOND_OPT_LACP_ACTIVE] = {
+		.id = BOND_OPT_LACP_ACTIVE,
+		.name = "lacp_active",
+		.desc = "Send LACPDU frames with configured lacp rate or acts as speak when spoken to",
+		.flags = BOND_OPTFLAG_IFDOWN,
+		.unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)),
+		.values = bond_lacp_active,
+		.set = bond_option_lacp_active_set
+	},
 	[BOND_OPT_LACP_RATE] = {
 		.id = BOND_OPT_LACP_RATE,
 		.name = "lacp_rate",
@@ -1333,6 +1350,16 @@ static int bond_option_pps_set(struct bonding *bond,
 	return 0;
 }
 
+static int bond_option_lacp_active_set(struct bonding *bond,
+				       const struct bond_opt_value *newval)
+{
+	netdev_dbg(bond->dev, "Setting LACP active to %s (%llu)\n",
+		   newval->string, newval->value);
+	bond->params.lacp_active = newval->value;
+
+	return 0;
+}
+
 static int bond_option_lacp_rate_set(struct bonding *bond,
 				     const struct bond_opt_value *newval)
 {
diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c
index 0fb1da361bb1..f3e3bfd72556 100644
--- a/drivers/net/bonding/bond_procfs.c
+++ b/drivers/net/bonding/bond_procfs.c
@@ -133,6 +133,8 @@ static void bond_info_show_master(struct seq_file *seq)
 		struct ad_info ad_info;
 
 		seq_puts(seq, "\n802.3ad info\n");
+		seq_printf(seq, "LACP active: %s\n",
+			   (bond->params.lacp_active) ? "on" : "off");
 		seq_printf(seq, "LACP rate: %s\n",
 			   (bond->params.lacp_fast) ? "fast" : "slow");
 		seq_printf(seq, "Min links: %d\n", bond->params.min_links);
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 5f9e9a240226..b9e9842fed94 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -339,10 +339,24 @@ static ssize_t bonding_show_peer_notif_delay(struct device *d,
 static DEVICE_ATTR(peer_notif_delay, 0644,
 		   bonding_show_peer_notif_delay, bonding_sysfs_store_option);
 
-/* Show the LACP interval. */
-static ssize_t bonding_show_lacp(struct device *d,
-				 struct device_attribute *attr,
-				 char *buf)
+/* Show the LACP activity and interval. */
+static ssize_t bonding_show_lacp_active(struct device *d,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct bonding *bond = to_bond(d);
+	const struct bond_opt_value *val;
+
+	val = bond_opt_get_val(BOND_OPT_LACP_ACTIVE, bond->params.lacp_active);
+
+	return sprintf(buf, "%s %d\n", val->string, bond->params.lacp_active);
+}
+static DEVICE_ATTR(lacp_active, 0644,
+		   bonding_show_lacp_active, bonding_sysfs_store_option);
+
+static ssize_t bonding_show_lacp_rate(struct device *d,
+				      struct device_attribute *attr,
+				      char *buf)
 {
 	struct bonding *bond = to_bond(d);
 	const struct bond_opt_value *val;
@@ -352,7 +366,7 @@ static ssize_t bonding_show_lacp(struct device *d,
 	return sprintf(buf, "%s %d\n", val->string, bond->params.lacp_fast);
 }
 static DEVICE_ATTR(lacp_rate, 0644,
-		   bonding_show_lacp, bonding_sysfs_store_option);
+		   bonding_show_lacp_rate, bonding_sysfs_store_option);
 
 static ssize_t bonding_show_min_links(struct device *d,
 				      struct device_attribute *attr,
@@ -738,6 +752,7 @@ static struct attribute *per_bond_attrs[] = {
 	&dev_attr_downdelay.attr,
 	&dev_attr_updelay.attr,
 	&dev_attr_peer_notif_delay.attr,
+	&dev_attr_lacp_active.attr,
 	&dev_attr_lacp_rate.attr,
 	&dev_attr_ad_select.attr,
 	&dev_attr_xmit_hash_policy.attr,
diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h
index c8696a230b7d..38785d48baff 100644
--- a/include/net/bond_3ad.h
+++ b/include/net/bond_3ad.h
@@ -303,6 +303,7 @@ int  __bond_3ad_get_active_agg_info(struct bonding *bond,
 int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond,
 			 struct slave *slave);
 int bond_3ad_set_carrier(struct bonding *bond);
+void bond_3ad_update_lacp_active(struct bonding *bond);
 void bond_3ad_update_lacp_rate(struct bonding *bond);
 void bond_3ad_update_ad_actor_settings(struct bonding *bond);
 int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats);
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index 9d382f2f0bc5..e64833a674eb 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -64,6 +64,7 @@ enum {
 	BOND_OPT_AD_USER_PORT_KEY,
 	BOND_OPT_NUM_PEER_NOTIF_ALIAS,
 	BOND_OPT_PEER_NOTIF_DELAY,
+	BOND_OPT_LACP_ACTIVE,
 	BOND_OPT_LAST
 };
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 625d9c72dee3..46df47004803 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -129,6 +129,7 @@ struct bond_params {
 	int updelay;
 	int downdelay;
 	int peer_notif_delay;
+	int lacp_active;
 	int lacp_fast;
 	unsigned int min_links;
 	int ad_select;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 49b22afab78f..5310003523ce 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -855,6 +855,7 @@ enum {
 	IFLA_BOND_AD_ACTOR_SYSTEM,
 	IFLA_BOND_TLB_DYNAMIC_LB,
 	IFLA_BOND_PEER_NOTIF_DELAY,
+	IFLA_BOND_AD_LACP_ACTIVE,
 	__IFLA_BOND_MAX,
 };
 
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index d208b2af697f..eb15f319aa57 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -653,6 +653,7 @@ enum {
 	IFLA_BOND_AD_ACTOR_SYSTEM,
 	IFLA_BOND_TLB_DYNAMIC_LB,
 	IFLA_BOND_PEER_NOTIF_DELAY,
+	IFLA_BOND_AD_LACP_ACTIVE,
 	__IFLA_BOND_MAX,
 };
 
-- 
cgit v1.2.3


From 81dd3ee5962d767b913d4c4efec3f50e888463c1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 3 Aug 2021 13:40:40 +0200
Subject: appletalk: ltpc: remove static probing

This driver never relies on the netdev_boot_setup_check()
to get its configuration, so it can just as well do its
own probing all the time.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Space.c          | 3 ---
 drivers/net/appletalk/ltpc.c | 7 ++-----
 include/net/Space.h          | 1 -
 3 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/Space.c b/drivers/net/Space.c
index df79e7370bcc..9196a26615cc 100644
--- a/drivers/net/Space.c
+++ b/drivers/net/Space.c
@@ -142,9 +142,6 @@ static int __init net_olddevs_init(void)
 	cops_probe(1);
 	cops_probe(2);
 #endif
-#ifdef CONFIG_LTPC
-	ltpc_probe();
-#endif
 
 	return 0;
 }
diff --git a/drivers/net/appletalk/ltpc.c b/drivers/net/appletalk/ltpc.c
index 69c270885ff0..1f8925e75b3f 100644
--- a/drivers/net/appletalk/ltpc.c
+++ b/drivers/net/appletalk/ltpc.c
@@ -1015,7 +1015,7 @@ static const struct net_device_ops ltpc_netdev = {
 	.ndo_set_rx_mode	= set_multicast_list,
 };
 
-struct net_device * __init ltpc_probe(void)
+static struct net_device * __init ltpc_probe(void)
 {
 	struct net_device *dev;
 	int err = -ENOMEM;
@@ -1221,12 +1221,10 @@ static int __init ltpc_setup(char *str)
 }
 
 __setup("ltpc=", ltpc_setup);
-#endif /* MODULE */
+#endif
 
 static struct net_device *dev_ltpc;
 
-#ifdef MODULE
-
 MODULE_LICENSE("GPL");
 module_param(debug, int, 0);
 module_param_hw(io, int, ioport, 0);
@@ -1244,7 +1242,6 @@ static int __init ltpc_module_init(void)
 	return PTR_ERR_OR_ZERO(dev_ltpc);
 }
 module_init(ltpc_module_init);
-#endif
 
 static void __exit ltpc_cleanup(void)
 {
diff --git a/include/net/Space.h b/include/net/Space.h
index 9cce0d80d37a..e30e7a70ea99 100644
--- a/include/net/Space.h
+++ b/include/net/Space.h
@@ -21,7 +21,6 @@ struct net_device *mvme147lance_probe(int unit);
 struct net_device *tc515_probe(int unit);
 struct net_device *lance_probe(int unit);
 struct net_device *cops_probe(int unit);
-struct net_device *ltpc_probe(void);
 
 /* Fibre Channel adapters */
 int iph5526_probe(struct net_device *dev);
-- 
cgit v1.2.3


From e179d78ee11a70e2675bc572f9f4e33d97233b23 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 3 Aug 2021 13:40:43 +0200
Subject: m68k: remove legacy probing

There are six m68k specific drivers that use the legacy probe method
in drivers/net/Space.c. However, all of these only support a single
device, and they completely ignore the command line settings from
netdev_boot_setup_check, so there is really no point at all.

Aside from sun3_82586, these already have a module_init function that
can be used for built-in mode as well, simply by removing the #ifdef.

Note that the 82596 driver was previously used on ISA as well, but
that got dropped long ago.

Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Space.c                      | 25 +------------------------
 drivers/net/ethernet/8390/apne.c         | 11 ++---------
 drivers/net/ethernet/amd/atarilance.c    | 11 ++---------
 drivers/net/ethernet/amd/mvme147.c       | 16 ++++++----------
 drivers/net/ethernet/amd/sun3lance.c     | 19 ++++++-------------
 drivers/net/ethernet/i825xx/82596.c      | 24 ++++++------------------
 drivers/net/ethernet/i825xx/sun3_82586.c | 17 +++++++----------
 include/net/Space.h                      |  6 ------
 8 files changed, 30 insertions(+), 99 deletions(-)

(limited to 'include')

diff --git a/drivers/net/Space.c b/drivers/net/Space.c
index 9f573f7ded3c..a03559f23295 100644
--- a/drivers/net/Space.c
+++ b/drivers/net/Space.c
@@ -80,34 +80,12 @@ static struct devprobe2 isa_probes[] __initdata = {
 #ifdef CONFIG_CS89x0_ISA
 	{cs89x0_probe, 0},
 #endif
-#if defined(CONFIG_MVME16x_NET) || defined(CONFIG_BVME6000_NET)	/* Intel */
-	{i82596_probe, 0},					/* I82596 */
-#endif
 #ifdef CONFIG_NI65
 	{ni65_probe, 0},
 #endif
 	{NULL, 0},
 };
 
-static struct devprobe2 m68k_probes[] __initdata = {
-#ifdef CONFIG_ATARILANCE	/* Lance-based Atari ethernet boards */
-	{atarilance_probe, 0},
-#endif
-#ifdef CONFIG_SUN3LANCE         /* sun3 onboard Lance chip */
-	{sun3lance_probe, 0},
-#endif
-#ifdef CONFIG_SUN3_82586        /* sun3 onboard Intel 82586 chip */
-	{sun3_82586_probe, 0},
-#endif
-#ifdef CONFIG_APNE		/* A1200 PCMCIA NE2000 */
-	{apne_probe, 0},
-#endif
-#ifdef CONFIG_MVME147_NET	/* MVME147 internal Ethernet */
-	{mvme147lance_probe, 0},
-#endif
-	{NULL, 0},
-};
-
 /* Unified ethernet device probe, segmented per architecture and
  * per bus interface. This drives the legacy devices only for now.
  */
@@ -119,8 +97,7 @@ static void __init ethif_probe2(int unit)
 	if (base_addr == 1)
 		return;
 
-	(void)(probe_list2(unit, m68k_probes, base_addr == 0) &&
-		probe_list2(unit, isa_probes, base_addr == 0));
+	probe_list2(unit, isa_probes, base_addr == 0);
 }
 
 /*  Statically configured drivers -- order matters here. */
diff --git a/drivers/net/ethernet/8390/apne.c b/drivers/net/ethernet/8390/apne.c
index fe6c834c422e..da1ae37a9d73 100644
--- a/drivers/net/ethernet/8390/apne.c
+++ b/drivers/net/ethernet/8390/apne.c
@@ -75,7 +75,6 @@
 #define NESM_STOP_PG	0x80	/* Last page +1 of RX ring */
 
 
-struct net_device * __init apne_probe(int unit);
 static int apne_probe1(struct net_device *dev, int ioaddr);
 
 static void apne_reset_8390(struct net_device *dev);
@@ -120,7 +119,7 @@ static u32 apne_msg_enable;
 module_param_named(msg_enable, apne_msg_enable, uint, 0444);
 MODULE_PARM_DESC(msg_enable, "Debug message level (see linux/netdevice.h for bitmap)");
 
-struct net_device * __init apne_probe(int unit)
+static struct net_device * __init apne_probe(void)
 {
 	struct net_device *dev;
 	struct ei_device *ei_local;
@@ -150,10 +149,6 @@ struct net_device * __init apne_probe(int unit)
 	dev = alloc_ei_netdev();
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
-	if (unit >= 0) {
-		sprintf(dev->name, "eth%d", unit);
-		netdev_boot_setup_check(dev);
-	}
 	ei_local = netdev_priv(dev);
 	ei_local->msg_enable = apne_msg_enable;
 
@@ -554,12 +549,11 @@ static irqreturn_t apne_interrupt(int irq, void *dev_id)
     return IRQ_HANDLED;
 }
 
-#ifdef MODULE
 static struct net_device *apne_dev;
 
 static int __init apne_module_init(void)
 {
-	apne_dev = apne_probe(-1);
+	apne_dev = apne_probe();
 	return PTR_ERR_OR_ZERO(apne_dev);
 }
 
@@ -579,7 +573,6 @@ static void __exit apne_module_exit(void)
 }
 module_init(apne_module_init);
 module_exit(apne_module_exit);
-#endif
 
 static int init_pcmcia(void)
 {
diff --git a/drivers/net/ethernet/amd/atarilance.c b/drivers/net/ethernet/amd/atarilance.c
index 36f54d13a2eb..9d2f49fd945e 100644
--- a/drivers/net/ethernet/amd/atarilance.c
+++ b/drivers/net/ethernet/amd/atarilance.c
@@ -367,7 +367,7 @@ static void *slow_memcpy( void *dst, const void *src, size_t len )
 }
 
 
-struct net_device * __init atarilance_probe(int unit)
+struct net_device * __init atarilance_probe(void)
 {
 	int i;
 	static int found;
@@ -382,10 +382,6 @@ struct net_device * __init atarilance_probe(int unit)
 	dev = alloc_etherdev(sizeof(struct lance_private));
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
-	if (unit >= 0) {
-		sprintf(dev->name, "eth%d", unit);
-		netdev_boot_setup_check(dev);
-	}
 
 	for( i = 0; i < N_LANCE_ADDR; ++i ) {
 		if (lance_probe1( dev, &lance_addr_list[i] )) {
@@ -1137,13 +1133,11 @@ static int lance_set_mac_address( struct net_device *dev, void *addr )
 	return 0;
 }
 
-
-#ifdef MODULE
 static struct net_device *atarilance_dev;
 
 static int __init atarilance_module_init(void)
 {
-	atarilance_dev = atarilance_probe(-1);
+	atarilance_dev = atarilance_probe();
 	return PTR_ERR_OR_ZERO(atarilance_dev);
 }
 
@@ -1155,4 +1149,3 @@ static void __exit atarilance_module_exit(void)
 }
 module_init(atarilance_module_init);
 module_exit(atarilance_module_exit);
-#endif /* MODULE */
diff --git a/drivers/net/ethernet/amd/mvme147.c b/drivers/net/ethernet/amd/mvme147.c
index 3f2e4cdd0b83..da97fccea9ea 100644
--- a/drivers/net/ethernet/amd/mvme147.c
+++ b/drivers/net/ethernet/amd/mvme147.c
@@ -68,7 +68,7 @@ static const struct net_device_ops lance_netdev_ops = {
 };
 
 /* Initialise the one and only on-board 7990 */
-struct net_device * __init mvme147lance_probe(int unit)
+static struct net_device * __init mvme147lance_probe(void)
 {
 	struct net_device *dev;
 	static int called;
@@ -86,9 +86,6 @@ struct net_device * __init mvme147lance_probe(int unit)
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
-	if (unit >= 0)
-		sprintf(dev->name, "eth%d", unit);
-
 	/* Fill the dev fields */
 	dev->base_addr = (unsigned long)MVME147_LANCE_BASE;
 	dev->netdev_ops = &lance_netdev_ops;
@@ -179,22 +176,21 @@ static int m147lance_close(struct net_device *dev)
 	return 0;
 }
 
-#ifdef MODULE
 MODULE_LICENSE("GPL");
 
 static struct net_device *dev_mvme147_lance;
-int __init init_module(void)
+static int __init m147lance_init(void)
 {
-	dev_mvme147_lance = mvme147lance_probe(-1);
+	dev_mvme147_lance = mvme147lance_probe();
 	return PTR_ERR_OR_ZERO(dev_mvme147_lance);
 }
+module_init(m147lance_init);
 
-void __exit cleanup_module(void)
+static void __exit m147lance_exit(void)
 {
 	struct m147lance_private *lp = netdev_priv(dev_mvme147_lance);
 	unregister_netdev(dev_mvme147_lance);
 	free_pages(lp->ram, 3);
 	free_netdev(dev_mvme147_lance);
 }
-
-#endif /* MODULE */
+module_exit(m147lance_exit);
diff --git a/drivers/net/ethernet/amd/sun3lance.c b/drivers/net/ethernet/amd/sun3lance.c
index f8d7a9387a56..4a845bc071b2 100644
--- a/drivers/net/ethernet/amd/sun3lance.c
+++ b/drivers/net/ethernet/amd/sun3lance.c
@@ -245,7 +245,7 @@ static void set_multicast_list( struct net_device *dev );
 
 /************************* End of Prototypes **************************/
 
-struct net_device * __init sun3lance_probe(int unit)
+static struct net_device * __init sun3lance_probe(void)
 {
 	struct net_device *dev;
 	static int found;
@@ -272,10 +272,6 @@ struct net_device * __init sun3lance_probe(int unit)
 	dev = alloc_etherdev(sizeof(struct lance_private));
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
-	if (unit >= 0) {
-		sprintf(dev->name, "eth%d", unit);
-		netdev_boot_setup_check(dev);
-	}
 
 	if (!lance_probe(dev))
 		goto out;
@@ -924,17 +920,16 @@ static void set_multicast_list( struct net_device *dev )
 }
 
 
-#ifdef MODULE
-
 static struct net_device *sun3lance_dev;
 
-int __init init_module(void)
+static int __init sun3lance_init(void)
 {
-	sun3lance_dev = sun3lance_probe(-1);
+	sun3lance_dev = sun3lance_probe();
 	return PTR_ERR_OR_ZERO(sun3lance_dev);
 }
+module_init(sun3lance_init);
 
-void __exit cleanup_module(void)
+static void __exit sun3lance_cleanup(void)
 {
 	unregister_netdev(sun3lance_dev);
 #ifdef CONFIG_SUN3
@@ -942,6 +937,4 @@ void __exit cleanup_module(void)
 #endif
 	free_netdev(sun3lance_dev);
 }
-
-#endif /* MODULE */
-
+module_exit(sun3lance_cleanup);
diff --git a/drivers/net/ethernet/i825xx/82596.c b/drivers/net/ethernet/i825xx/82596.c
index fc8c7cd67471..b8a40146b895 100644
--- a/drivers/net/ethernet/i825xx/82596.c
+++ b/drivers/net/ethernet/i825xx/82596.c
@@ -1110,9 +1110,6 @@ static void print_eth(unsigned char *add, char *str)
 	       add, add + 6, add, add[12], add[13], str);
 }
 
-static int io = 0x300;
-static int irq = 10;
-
 static const struct net_device_ops i596_netdev_ops = {
 	.ndo_open 		= i596_open,
 	.ndo_stop		= i596_close,
@@ -1123,7 +1120,7 @@ static const struct net_device_ops i596_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
-struct net_device * __init i82596_probe(int unit)
+static struct net_device * __init i82596_probe(void)
 {
 	struct net_device *dev;
 	int i;
@@ -1140,14 +1137,6 @@ struct net_device * __init i82596_probe(int unit)
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
-	if (unit >= 0) {
-		sprintf(dev->name, "eth%d", unit);
-		netdev_boot_setup_check(dev);
-	} else {
-		dev->base_addr = io;
-		dev->irq = irq;
-	}
-
 #ifdef ENABLE_MVME16x_NET
 	if (MACH_IS_MVME16x) {
 		if (mvme16x_config & MVME16x_CONFIG_NO_ETHERNET) {
@@ -1515,22 +1504,22 @@ static void set_multicast_list(struct net_device *dev)
 	}
 }
 
-#ifdef MODULE
 static struct net_device *dev_82596;
 
 static int debug = -1;
 module_param(debug, int, 0);
 MODULE_PARM_DESC(debug, "i82596 debug mask");
 
-int __init init_module(void)
+static int __init i82596_init(void)
 {
 	if (debug >= 0)
 		i596_debug = debug;
-	dev_82596 = i82596_probe(-1);
+	dev_82596 = i82596_probe();
 	return PTR_ERR_OR_ZERO(dev_82596);
 }
+module_init(i82596_init);
 
-void __exit cleanup_module(void)
+static void __exit i82596_cleanup(void)
 {
 	unregister_netdev(dev_82596);
 #ifdef __mc68000__
@@ -1544,5 +1533,4 @@ void __exit cleanup_module(void)
 	free_page ((u32)(dev_82596->mem_start));
 	free_netdev(dev_82596);
 }
-
-#endif				/* MODULE */
+module_exit(i82596_cleanup);
diff --git a/drivers/net/ethernet/i825xx/sun3_82586.c b/drivers/net/ethernet/i825xx/sun3_82586.c
index 4564ee02c95f..893e0ddcb611 100644
--- a/drivers/net/ethernet/i825xx/sun3_82586.c
+++ b/drivers/net/ethernet/i825xx/sun3_82586.c
@@ -29,6 +29,7 @@ static int rfdadd = 0; /* rfdadd=1 may be better for 8K MEM cards */
 static int fifo=0x8;	/* don't change */
 
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
@@ -276,7 +277,7 @@ static void alloc586(struct net_device *dev)
 	memset((char *)p->scb,0,sizeof(struct scb_struct));
 }
 
-struct net_device * __init sun3_82586_probe(int unit)
+static int __init sun3_82586_probe(void)
 {
 	struct net_device *dev;
 	unsigned long ioaddr;
@@ -291,25 +292,20 @@ struct net_device * __init sun3_82586_probe(int unit)
 		break;
 
 	default:
-		return ERR_PTR(-ENODEV);
+		return -ENODEV;
 	}
 
 	if (found)
-		return ERR_PTR(-ENODEV);
+		return -ENODEV;
 
 	ioaddr = (unsigned long)ioremap(IE_OBIO, SUN3_82586_TOTAL_SIZE);
 	if (!ioaddr)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	found = 1;
 
 	dev = alloc_etherdev(sizeof(struct priv));
 	if (!dev)
 		goto out;
-	if (unit >= 0) {
-		sprintf(dev->name, "eth%d", unit);
-		netdev_boot_setup_check(dev);
-	}
-
 	dev->irq = IE_IRQ;
 	dev->base_addr = ioaddr;
 	err = sun3_82586_probe1(dev, ioaddr);
@@ -326,8 +322,9 @@ out1:
 	free_netdev(dev);
 out:
 	iounmap((void __iomem *)ioaddr);
-	return ERR_PTR(err);
+	return err;
 }
+module_init(sun3_82586_probe);
 
 static const struct net_device_ops sun3_82586_netdev_ops = {
 	.ndo_open		= sun3_82586_open,
diff --git a/include/net/Space.h b/include/net/Space.h
index e30e7a70ea99..93fd6caa4bad 100644
--- a/include/net/Space.h
+++ b/include/net/Space.h
@@ -8,16 +8,10 @@ struct net_device *ultra_probe(int unit);
 struct net_device *wd_probe(int unit);
 struct net_device *ne_probe(int unit);
 struct net_device *fmv18x_probe(int unit);
-struct net_device *i82596_probe(int unit);
 struct net_device *ni65_probe(int unit);
 struct net_device *sonic_probe(int unit);
 struct net_device *smc_init(int unit);
-struct net_device *atarilance_probe(int unit);
-struct net_device *sun3lance_probe(int unit);
-struct net_device *sun3_82586_probe(int unit);
-struct net_device *apne_probe(int unit);
 struct net_device *cs89x0_probe(int unit);
-struct net_device *mvme147lance_probe(int unit);
 struct net_device *tc515_probe(int unit);
 struct net_device *lance_probe(int unit);
 struct net_device *cops_probe(int unit);
-- 
cgit v1.2.3


From 375df5f8c1812c59930cfed14ff4cc15929c8f2f Mon Sep 17 00:00:00 2001
From: Michael Schmitz <schmitzmic@gmail.com>
Date: Tue, 3 Aug 2021 13:40:44 +0200
Subject: ax88796: export ax_NS8390_init() hook

The block I/O code for the new X-Surf 100 ax88796 driver needs
ax_NS8390_init() for error fixup in its block_output function.

Export this static function through the ax_NS8390_reinit()
wrapper so we can lose the lib8380.c include in the X-Surf 100
driver.

[arnd: add the declaration in the header to avoid a
 -Wmissing-prototypes warning]
Fixes: 861928f4e60e826c ("net-next: New ax88796 platform
driver for Amiga X-Surf 100 Zorro board (m68k)")
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/8390/ax88796.c | 7 +++++++
 include/net/ax88796.h               | 3 +++
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index 9595dd1f32ca..6c6bdd5913ec 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -101,6 +101,13 @@ static inline struct ax_device *to_ax_dev(struct net_device *dev)
 	return (struct ax_device *)(ei_local + 1);
 }
 
+void ax_NS8390_reinit(struct net_device *dev)
+{
+	ax_NS8390_init(dev, 1);
+}
+
+EXPORT_SYMBOL_GPL(ax_NS8390_reinit);
+
 /*
  * ax_initial_check
  *
diff --git a/include/net/ax88796.h b/include/net/ax88796.h
index aa52b2e8ff7b..2ed23a368602 100644
--- a/include/net/ax88796.h
+++ b/include/net/ax88796.h
@@ -38,4 +38,7 @@ struct ax_plat_data {
 	int (*check_irq)(struct platform_device *pdev);
 };
 
+/* exported from ax88796.c for xsurf100.c  */
+extern void ax_NS8390_reinit(struct net_device *dev);
+
 #endif /* __NET_AX88796_PLAT_H */
-- 
cgit v1.2.3


From 5ea2f5ffde39251115ef9a566262fb9e52b91cb7 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 3 Aug 2021 13:40:46 +0200
Subject: move netdev_boot_setup into Space.c

This is now only used by a handful of old ISA drivers,
and can be moved into the file they already all depend on.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Space.c       | 142 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netdevice.h |  13 -----
 net/core/dev.c            | 125 ----------------------------------------
 net/ethernet/eth.c        |   2 -
 4 files changed, 142 insertions(+), 140 deletions(-)

(limited to 'include')

diff --git a/drivers/net/Space.c b/drivers/net/Space.c
index a03559f23295..f40f2e38682f 100644
--- a/drivers/net/Space.c
+++ b/drivers/net/Space.c
@@ -30,6 +30,148 @@
 #include <linux/netlink.h>
 #include <net/Space.h>
 
+/*
+ * This structure holds boot-time configured netdevice settings. They
+ * are then used in the device probing.
+ */
+struct netdev_boot_setup {
+	char name[IFNAMSIZ];
+	struct ifmap map;
+};
+#define NETDEV_BOOT_SETUP_MAX 8
+
+
+/******************************************************************************
+ *
+ *		      Device Boot-time Settings Routines
+ *
+ ******************************************************************************/
+
+/* Boot time configuration table */
+static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
+
+/**
+ *	netdev_boot_setup_add	- add new setup entry
+ *	@name: name of the device
+ *	@map: configured settings for the device
+ *
+ *	Adds new setup entry to the dev_boot_setup list.  The function
+ *	returns 0 on error and 1 on success.  This is a generic routine to
+ *	all netdevices.
+ */
+static int netdev_boot_setup_add(char *name, struct ifmap *map)
+{
+	struct netdev_boot_setup *s;
+	int i;
+
+	s = dev_boot_setup;
+	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
+		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
+			memset(s[i].name, 0, sizeof(s[i].name));
+			strlcpy(s[i].name, name, IFNAMSIZ);
+			memcpy(&s[i].map, map, sizeof(s[i].map));
+			break;
+		}
+	}
+
+	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
+}
+
+/**
+ * netdev_boot_setup_check	- check boot time settings
+ * @dev: the netdevice
+ *
+ * Check boot time settings for the device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found, 1 if they are.
+ */
+int netdev_boot_setup_check(struct net_device *dev)
+{
+	struct netdev_boot_setup *s = dev_boot_setup;
+	int i;
+
+	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
+		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
+		    !strcmp(dev->name, s[i].name)) {
+			dev->irq = s[i].map.irq;
+			dev->base_addr = s[i].map.base_addr;
+			dev->mem_start = s[i].map.mem_start;
+			dev->mem_end = s[i].map.mem_end;
+			return 1;
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL(netdev_boot_setup_check);
+
+/**
+ * netdev_boot_base	- get address from boot time settings
+ * @prefix: prefix for network device
+ * @unit: id for network device
+ *
+ * Check boot time settings for the base address of device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found.
+ */
+static unsigned long netdev_boot_base(const char *prefix, int unit)
+{
+	const struct netdev_boot_setup *s = dev_boot_setup;
+	char name[IFNAMSIZ];
+	int i;
+
+	sprintf(name, "%s%d", prefix, unit);
+
+	/*
+	 * If device already registered then return base of 1
+	 * to indicate not to probe for this interface
+	 */
+	if (__dev_get_by_name(&init_net, name))
+		return 1;
+
+	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
+		if (!strcmp(name, s[i].name))
+			return s[i].map.base_addr;
+	return 0;
+}
+
+/*
+ * Saves at boot time configured settings for any netdevice.
+ */
+static int __init netdev_boot_setup(char *str)
+{
+	int ints[5];
+	struct ifmap map;
+
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+	if (!str || !*str)
+		return 0;
+
+	/* Save settings */
+	memset(&map, 0, sizeof(map));
+	if (ints[0] > 0)
+		map.irq = ints[1];
+	if (ints[0] > 1)
+		map.base_addr = ints[2];
+	if (ints[0] > 2)
+		map.mem_start = ints[3];
+	if (ints[0] > 3)
+		map.mem_end = ints[4];
+
+	/* Add new entry to the list */
+	return netdev_boot_setup_add(str, &map);
+}
+
+__setup("netdev=", netdev_boot_setup);
+
+static int __init ether_boot_setup(char *str)
+{
+	return netdev_boot_setup(str);
+}
+__setup("ether=", ether_boot_setup);
+
+
 /* A unified ethernet device probe.  This is the easiest way to have every
  * ethernet adaptor have the name "eth[0123...]".
  */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d63a94ecbf3b..cd136499ec59 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -295,18 +295,6 @@ enum netdev_state_t {
 };
 
 
-/*
- * This structure holds boot-time configured netdevice settings. They
- * are then used in the device probing.
- */
-struct netdev_boot_setup {
-	char name[IFNAMSIZ];
-	struct ifmap map;
-};
-#define NETDEV_BOOT_SETUP_MAX 8
-
-int __init netdev_boot_setup(char *str);
-
 struct gro_list {
 	struct list_head	list;
 	int			count;
@@ -2939,7 +2927,6 @@ static inline struct net_device *first_net_device_rcu(struct net *net)
 }
 
 int netdev_boot_setup_check(struct net_device *dev);
-unsigned long netdev_boot_base(const char *prefix, int unit);
 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 				       const char *hwaddr);
 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type);
diff --git a/net/core/dev.c b/net/core/dev.c
index 64e1a5f63f93..4a1401008db9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -676,131 +676,6 @@ void dev_remove_offload(struct packet_offload *po)
 }
 EXPORT_SYMBOL(dev_remove_offload);
 
-/******************************************************************************
- *
- *		      Device Boot-time Settings Routines
- *
- ******************************************************************************/
-
-/* Boot time configuration table */
-static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
-
-/**
- *	netdev_boot_setup_add	- add new setup entry
- *	@name: name of the device
- *	@map: configured settings for the device
- *
- *	Adds new setup entry to the dev_boot_setup list.  The function
- *	returns 0 on error and 1 on success.  This is a generic routine to
- *	all netdevices.
- */
-static int netdev_boot_setup_add(char *name, struct ifmap *map)
-{
-	struct netdev_boot_setup *s;
-	int i;
-
-	s = dev_boot_setup;
-	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
-		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
-			memset(s[i].name, 0, sizeof(s[i].name));
-			strlcpy(s[i].name, name, IFNAMSIZ);
-			memcpy(&s[i].map, map, sizeof(s[i].map));
-			break;
-		}
-	}
-
-	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
-}
-
-/**
- * netdev_boot_setup_check	- check boot time settings
- * @dev: the netdevice
- *
- * Check boot time settings for the device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found, 1 if they are.
- */
-int netdev_boot_setup_check(struct net_device *dev)
-{
-	struct netdev_boot_setup *s = dev_boot_setup;
-	int i;
-
-	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
-		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
-		    !strcmp(dev->name, s[i].name)) {
-			dev->irq = s[i].map.irq;
-			dev->base_addr = s[i].map.base_addr;
-			dev->mem_start = s[i].map.mem_start;
-			dev->mem_end = s[i].map.mem_end;
-			return 1;
-		}
-	}
-	return 0;
-}
-EXPORT_SYMBOL(netdev_boot_setup_check);
-
-
-/**
- * netdev_boot_base	- get address from boot time settings
- * @prefix: prefix for network device
- * @unit: id for network device
- *
- * Check boot time settings for the base address of device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found.
- */
-unsigned long netdev_boot_base(const char *prefix, int unit)
-{
-	const struct netdev_boot_setup *s = dev_boot_setup;
-	char name[IFNAMSIZ];
-	int i;
-
-	sprintf(name, "%s%d", prefix, unit);
-
-	/*
-	 * If device already registered then return base of 1
-	 * to indicate not to probe for this interface
-	 */
-	if (__dev_get_by_name(&init_net, name))
-		return 1;
-
-	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
-		if (!strcmp(name, s[i].name))
-			return s[i].map.base_addr;
-	return 0;
-}
-
-/*
- * Saves at boot time configured settings for any netdevice.
- */
-int __init netdev_boot_setup(char *str)
-{
-	int ints[5];
-	struct ifmap map;
-
-	str = get_options(str, ARRAY_SIZE(ints), ints);
-	if (!str || !*str)
-		return 0;
-
-	/* Save settings */
-	memset(&map, 0, sizeof(map));
-	if (ints[0] > 0)
-		map.irq = ints[1];
-	if (ints[0] > 1)
-		map.base_addr = ints[2];
-	if (ints[0] > 2)
-		map.mem_start = ints[3];
-	if (ints[0] > 3)
-		map.mem_end = ints[4];
-
-	/* Add new entry to the list */
-	return netdev_boot_setup_add(str, &map);
-}
-
-__setup("netdev=", netdev_boot_setup);
-
 /*******************************************************************************
  *
  *			    Device Interface Subroutines
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 171ba75b74c9..73fce9467467 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -62,8 +62,6 @@
 #include <linux/uaccess.h>
 #include <net/pkt_sched.h>
 
-__setup("ether=", netdev_boot_setup);
-
 /**
  * eth_header - create the Ethernet header
  * @skb:	buffer to alter
-- 
cgit v1.2.3


From 72bcad5393a7079706fcfe02d84ed1599716d6a2 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 3 Aug 2021 13:40:49 +0200
Subject: wan: remove sbni/granch driver

The driver was merged in 1999 and has only ever seen treewide cleanups
since then, with no indication whatsoever that anyone has actually
had access to hardware for testing the patches.

>From the information in the link below, it appears that the hardware
is for some leased line system in Russia that has since been
discontinued, and useless without any remote end to connect to.

As the driver still feels like a Linux-2.2 era artifact today, it
appears that the best way forward is to just delete it.

Link: https://www.tms.ru/%D0%90%D0%B4%D0%B0%D0%BF%D1%82%D0%B5%D1%80_%D0%B4%D0%BB%D1%8F_%D0%B2%D1%8B%D0%B4%D0%B5%D0%BB%D0%B5%D0%BD%D0%BD%D1%8B%D1%85_%D0%BB%D0%B8%D0%BD%D0%B8%D0%B9_Granch_SBNI12-10
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/admin-guide/kernel-parameters.txt |    2 -
 drivers/net/Space.c                             |    4 -
 drivers/net/wan/Kconfig                         |   27 -
 drivers/net/wan/Makefile                        |    1 -
 drivers/net/wan/sbni.c                          | 1639 -----------------------
 drivers/net/wan/sbni.h                          |  147 --
 include/net/Space.h                             |    3 -
 7 files changed, 1823 deletions(-)
 delete mode 100644 drivers/net/wan/sbni.c
 delete mode 100644 drivers/net/wan/sbni.h

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bdb22006f713..ee0569a040c6 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4945,8 +4945,6 @@
 	sa1100ir	[NET]
 			See drivers/net/irda/sa1100_ir.c.
 
-	sbni=		[NET] Granch SBNI12 leased line adapter
-
 	sched_verbose	[KNL] Enables verbose scheduler debug messages.
 
 	schedstats=	[KNL,X86] Enable or disable scheduled statistics.
diff --git a/drivers/net/Space.c b/drivers/net/Space.c
index f40f2e38682f..49e67c9fb5a4 100644
--- a/drivers/net/Space.c
+++ b/drivers/net/Space.c
@@ -247,10 +247,6 @@ static int __init net_olddevs_init(void)
 {
 	int num;
 
-#ifdef CONFIG_SBNI
-	for (num = 0; num < 8; ++num)
-		sbni_probe(num);
-#endif
 	for (num = 0; num < 8; ++num)
 		ethif_probe2(num);
 
diff --git a/drivers/net/wan/Kconfig b/drivers/net/wan/Kconfig
index d31791535ccf..592a8389fc5a 100644
--- a/drivers/net/wan/Kconfig
+++ b/drivers/net/wan/Kconfig
@@ -313,33 +313,6 @@ config LAPBETHER
 	  To compile this driver as a module, choose M here: the
 	  module will be called lapbether.
 
-	  If unsure, say N.
-
-config SBNI
-	tristate "Granch SBNI12 Leased Line adapter support"
-	depends on X86
-	help
-	  Driver for ISA SBNI12-xx cards which are low cost alternatives to
-	  leased line modems.
-
-	  You can find more information and last versions of drivers and
-	  utilities at <http://www.granch.ru/>. If you have any question you
-	  can send email to <sbni@granch.ru>.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called sbni.
-
-	  If unsure, say N.
-
-config SBNI_MULTILINE
-	bool "Multiple line feature support"
-	depends on SBNI
-	help
-	  Schedule traffic for some parallel lines, via SBNI12 adapters.
-
-	  If you have two computers connected with two parallel lines it's
-	  possible to increase transfer rate nearly twice. You should have
-	  a program named 'sbniconfig' to configure adapters.
 
 	  If unsure, say N.
 
diff --git a/drivers/net/wan/Makefile b/drivers/net/wan/Makefile
index 081666c36ca2..f6b92efffc94 100644
--- a/drivers/net/wan/Makefile
+++ b/drivers/net/wan/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_FARSYNC)		+= farsync.o
 obj-$(CONFIG_LANMEDIA)		+= lmc/
 
 obj-$(CONFIG_LAPBETHER)		+= lapbether.o
-obj-$(CONFIG_SBNI)		+= sbni.o
 obj-$(CONFIG_N2)		+= n2.o
 obj-$(CONFIG_C101)		+= c101.o
 obj-$(CONFIG_WANXL)		+= wanxl.o
diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c
deleted file mode 100644
index 469fe979d664..000000000000
--- a/drivers/net/wan/sbni.c
+++ /dev/null
@@ -1,1639 +0,0 @@
-/* sbni.c:  Granch SBNI12 leased line adapters driver for linux
- *
- *	Written 2001 by Denis I.Timofeev (timofeev@granch.ru)
- *
- *	Previous versions were written by Yaroslav Polyakov,
- *	Alexey Zverev and Max Khon.
- *
- *	Driver supports SBNI12-02,-04,-05,-10,-11 cards, single and
- *	double-channel, PCI and ISA modifications.
- *	More info and useful utilities to work with SBNI12 cards you can find
- *	at http://www.granch.com (English) or http://www.granch.ru (Russian)
- *
- *	This software may be used and distributed according to the terms
- *	of the GNU General Public License.
- *
- *
- *  5.0.1	Jun 22 2001
- *	  - Fixed bug in probe
- *  5.0.0	Jun 06 2001
- *	  - Driver was completely redesigned by Denis I.Timofeev,
- *	  - now PCI/Dual, ISA/Dual (with single interrupt line) models are
- *	  - supported
- *  3.3.0	Thu Feb 24 21:30:28 NOVT 2000 
- *        - PCI cards support
- *  3.2.0	Mon Dec 13 22:26:53 NOVT 1999
- * 	  - Completely rebuilt all the packet storage system
- * 	  -    to work in Ethernet-like style.
- *  3.1.1	just fixed some bugs (5 aug 1999)
- *  3.1.0	added balancing feature	(26 apr 1999)
- *  3.0.1	just fixed some bugs (14 apr 1999).
- *  3.0.0	Initial Revision, Yaroslav Polyakov (24 Feb 1999)
- *        - added pre-calculation for CRC, fixed bug with "len-2" frames, 
- *        - removed outbound fragmentation (MTU=1000), written CRC-calculation 
- *        - on asm, added work with hard_headers and now we have our own cache 
- *        - for them, optionally supported word-interchange on some chipsets,
- * 
- *	Known problem: this driver wasn't tested on multiprocessor machine.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/ptrace.h>
-#include <linux/fcntl.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/pci.h>
-#include <linux/skbuff.h>
-#include <linux/timer.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-
-#include <net/net_namespace.h>
-#include <net/arp.h>
-#include <net/Space.h>
-
-#include <asm/io.h>
-#include <asm/types.h>
-#include <asm/byteorder.h>
-#include <asm/irq.h>
-#include <linux/uaccess.h>
-
-#include "sbni.h"
-
-/* device private data */
-
-struct net_local {
-	struct timer_list	watchdog;
-	struct net_device	*watchdog_dev;
-
-	spinlock_t	lock;
-	struct sk_buff  *rx_buf_p;		/* receive buffer ptr */
-	struct sk_buff  *tx_buf_p;		/* transmit buffer ptr */
-	
-	unsigned int	framelen;		/* current frame length */
-	unsigned int	maxframe;		/* maximum valid frame length */
-	unsigned int	state;
-	unsigned int	inppos, outpos;		/* positions in rx/tx buffers */
-
-	/* transmitting frame number - from frames qty to 1 */
-	unsigned int	tx_frameno;
-
-	/* expected number of next receiving frame */
-	unsigned int	wait_frameno;
-
-	/* count of failed attempts to frame send - 32 attempts do before
-	   error - while receiver tunes on opposite side of wire */
-	unsigned int	trans_errors;
-
-	/* idle time; send pong when limit exceeded */
-	unsigned int	timer_ticks;
-
-	/* fields used for receive level autoselection */
-	int	delta_rxl;
-	unsigned int	cur_rxl_index, timeout_rxl;
-	unsigned long	cur_rxl_rcvd, prev_rxl_rcvd;
-
-	struct sbni_csr1	csr1;		/* current value of CSR1 */
-	struct sbni_in_stats	in_stats; 	/* internal statistics */ 
-
-	struct net_device		*second;	/* for ISA/dual cards */
-
-#ifdef CONFIG_SBNI_MULTILINE
-	struct net_device		*master;
-	struct net_device		*link;
-#endif
-};
-
-
-static int  sbni_card_probe( unsigned long );
-static int  sbni_pci_probe( struct net_device  * );
-static struct net_device  *sbni_probe1(struct net_device *, unsigned long, int);
-static int  sbni_open( struct net_device * );
-static int  sbni_close( struct net_device * );
-static netdev_tx_t sbni_start_xmit(struct sk_buff *,
-					 struct net_device * );
-static int  sbni_siocdevprivate(struct net_device *, struct ifreq *,
-				void __user *, int);
-static void  set_multicast_list( struct net_device * );
-
-static irqreturn_t sbni_interrupt( int, void * );
-static void  handle_channel( struct net_device * );
-static int   recv_frame( struct net_device * );
-static void  send_frame( struct net_device * );
-static int   upload_data( struct net_device *,
-			  unsigned, unsigned, unsigned, u32 );
-static void  download_data( struct net_device *, u32 * );
-static void  sbni_watchdog(struct timer_list *);
-static void  interpret_ack( struct net_device *, unsigned );
-static int   append_frame_to_pkt( struct net_device *, unsigned, u32 );
-static void  indicate_pkt( struct net_device * );
-static void  card_start( struct net_device * );
-static void  prepare_to_send( struct sk_buff *, struct net_device * );
-static void  drop_xmit_queue( struct net_device * );
-static void  send_frame_header( struct net_device *, u32 * );
-static int   skip_tail( unsigned int, unsigned int, u32 );
-static int   check_fhdr( u32, u32 *, u32 *, u32 *, u32 *, u32 * );
-static void  change_level( struct net_device * );
-static void  timeout_change_level( struct net_device * );
-static u32   calc_crc32( u32, u8 *, u32 );
-static struct sk_buff *  get_rx_buf( struct net_device * );
-static int  sbni_init( struct net_device * );
-
-#ifdef CONFIG_SBNI_MULTILINE
-static int  enslave( struct net_device *, struct net_device * );
-static int  emancipate( struct net_device * );
-#endif
-
-static const char  version[] =
-	"Granch SBNI12 driver ver 5.0.1  Jun 22 2001  Denis I.Timofeev.\n";
-
-static bool skip_pci_probe	__initdata = false;
-static int  scandone	__initdata = 0;
-static int  num		__initdata = 0;
-
-static unsigned char  rxl_tab[];
-static u32  crc32tab[];
-
-/* A list of all installed devices, for removing the driver module. */
-static struct net_device  *sbni_cards[ SBNI_MAX_NUM_CARDS ];
-
-/* Lists of device's parameters */
-static u32	io[   SBNI_MAX_NUM_CARDS ] __initdata =
-	{ [0 ... SBNI_MAX_NUM_CARDS-1] = -1 };
-static u32	irq[  SBNI_MAX_NUM_CARDS ] __initdata;
-static u32	baud[ SBNI_MAX_NUM_CARDS ] __initdata;
-static u32	rxl[  SBNI_MAX_NUM_CARDS ] __initdata =
-	{ [0 ... SBNI_MAX_NUM_CARDS-1] = -1 };
-static u32	mac[  SBNI_MAX_NUM_CARDS ] __initdata;
-
-#ifndef MODULE
-typedef u32  iarr[];
-static iarr *dest[5] __initdata = { &io, &irq, &baud, &rxl, &mac };
-#endif
-
-/* A zero-terminated list of I/O addresses to be probed on ISA bus */
-static unsigned int  netcard_portlist[ ] __initdata = { 
-	0x210, 0x214, 0x220, 0x224, 0x230, 0x234, 0x240, 0x244, 0x250, 0x254,
-	0x260, 0x264, 0x270, 0x274, 0x280, 0x284, 0x290, 0x294, 0x2a0, 0x2a4,
-	0x2b0, 0x2b4, 0x2c0, 0x2c4, 0x2d0, 0x2d4, 0x2e0, 0x2e4, 0x2f0, 0x2f4,
-	0 };
-
-#define NET_LOCAL_LOCK(dev) (((struct net_local *)netdev_priv(dev))->lock)
-
-/*
- * Look for SBNI card which addr stored in dev->base_addr, if nonzero.
- * Otherwise, look through PCI bus. If none PCI-card was found, scan ISA.
- */
-
-static inline int __init
-sbni_isa_probe( struct net_device  *dev )
-{
-	if( dev->base_addr > 0x1ff &&
-	    request_region( dev->base_addr, SBNI_IO_EXTENT, dev->name ) &&
-	    sbni_probe1( dev, dev->base_addr, dev->irq ) )
-
-		return  0;
-	else {
-		pr_err("base address 0x%lx is busy, or adapter is malfunctional!\n",
-		       dev->base_addr);
-		return  -ENODEV;
-	}
-}
-
-static const struct net_device_ops sbni_netdev_ops = {
-	.ndo_open		= sbni_open,
-	.ndo_stop		= sbni_close,
-	.ndo_start_xmit		= sbni_start_xmit,
-	.ndo_set_rx_mode	= set_multicast_list,
-	.ndo_siocdevprivate	= sbni_siocdevprivate,
-	.ndo_set_mac_address 	= eth_mac_addr,
-	.ndo_validate_addr	= eth_validate_addr,
-};
-
-static void __init sbni_devsetup(struct net_device *dev)
-{
-	ether_setup( dev );
-	dev->netdev_ops = &sbni_netdev_ops;
-}
-
-int __init sbni_probe(int unit)
-{
-	struct net_device *dev;
-	int err;
-
-	dev = alloc_netdev(sizeof(struct net_local), "sbni",
-			   NET_NAME_UNKNOWN, sbni_devsetup);
-	if (!dev)
-		return -ENOMEM;
-
-	dev->netdev_ops = &sbni_netdev_ops;
-
-	sprintf(dev->name, "sbni%d", unit);
-	netdev_boot_setup_check(dev);
-
-	err = sbni_init(dev);
-	if (err) {
-		free_netdev(dev);
-		return err;
-	}
-
-	err = register_netdev(dev);
-	if (err) {
-		release_region( dev->base_addr, SBNI_IO_EXTENT );
-		free_netdev(dev);
-		return err;
-	}
-	pr_info_once("%s", version);
-	return 0;
-}
-
-static int __init sbni_init(struct net_device *dev)
-{
-	int  i;
-	if( dev->base_addr )
-		return  sbni_isa_probe( dev );
-	/* otherwise we have to perform search our adapter */
-
-	if( io[ num ] != -1 ) {
-		dev->base_addr	= io[ num ];
-		dev->irq	= irq[ num ];
-	} else if( scandone  ||  io[ 0 ] != -1 ) {
-		return  -ENODEV;
-	}
-
-	/* if io[ num ] contains non-zero address, then that is on ISA bus */
-	if( dev->base_addr )
-		return  sbni_isa_probe( dev );
-
-	/* ...otherwise - scan PCI first */
-	if( !skip_pci_probe  &&  !sbni_pci_probe( dev ) )
-		return  0;
-
-	if( io[ num ] == -1 ) {
-		/* Auto-scan will be stopped when first ISA card were found */
-		scandone = 1;
-		if( num > 0 )
-			return  -ENODEV;
-	}
-
-	for( i = 0;  netcard_portlist[ i ];  ++i ) {
-		int  ioaddr = netcard_portlist[ i ];
-		if( request_region( ioaddr, SBNI_IO_EXTENT, dev->name ) &&
-		    sbni_probe1( dev, ioaddr, 0 ))
-			return 0;
-	}
-
-	return  -ENODEV;
-}
-
-
-static int __init
-sbni_pci_probe( struct net_device  *dev )
-{
-	struct pci_dev  *pdev = NULL;
-
-	while( (pdev = pci_get_class( PCI_CLASS_NETWORK_OTHER << 8, pdev ))
-	       != NULL ) {
-		int  pci_irq_line;
-		unsigned long  pci_ioaddr;
-
-		if( pdev->vendor != SBNI_PCI_VENDOR &&
-		    pdev->device != SBNI_PCI_DEVICE )
-			continue;
-
-		pci_ioaddr = pci_resource_start( pdev, 0 );
-		pci_irq_line = pdev->irq;
-
-		/* Avoid already found cards from previous calls */
-		if( !request_region( pci_ioaddr, SBNI_IO_EXTENT, dev->name ) ) {
-			if (pdev->subsystem_device != 2)
-				continue;
-
-			/* Dual adapter is present */
-			if (!request_region(pci_ioaddr += 4, SBNI_IO_EXTENT,
-							dev->name ) )
-				continue;
-		}
-
-		if (pci_irq_line <= 0 || pci_irq_line >= nr_irqs)
-			pr_warn(
-"WARNING: The PCI BIOS assigned this PCI card to IRQ %d, which is unlikely to work!.\n"
-"You should use the PCI BIOS setup to assign a valid IRQ line.\n",
-				pci_irq_line );
-
-		/* avoiding re-enable dual adapters */
-		if( (pci_ioaddr & 7) == 0  &&  pci_enable_device( pdev ) ) {
-			release_region( pci_ioaddr, SBNI_IO_EXTENT );
-			pci_dev_put( pdev );
-			return  -EIO;
-		}
-		if( sbni_probe1( dev, pci_ioaddr, pci_irq_line ) ) {
-			SET_NETDEV_DEV(dev, &pdev->dev);
-			/* not the best thing to do, but this is all messed up 
-			   for hotplug systems anyway... */
-			pci_dev_put( pdev );
-			return  0;
-		}
-	}
-	return  -ENODEV;
-}
-
-
-static struct net_device * __init
-sbni_probe1( struct net_device  *dev,  unsigned long  ioaddr,  int  irq )
-{
-	struct net_local  *nl;
-
-	if( sbni_card_probe( ioaddr ) ) {
-		release_region( ioaddr, SBNI_IO_EXTENT );
-		return NULL;
-	}
-
-	outb( 0, ioaddr + CSR0 );
-
-	if( irq < 2 ) {
-		unsigned long irq_mask;
-
-		irq_mask = probe_irq_on();
-		outb( EN_INT | TR_REQ, ioaddr + CSR0 );
-		outb( PR_RES, ioaddr + CSR1 );
-		mdelay(50);
-		irq = probe_irq_off(irq_mask);
-		outb( 0, ioaddr + CSR0 );
-
-		if( !irq ) {
-			pr_err("%s: can't detect device irq!\n", dev->name);
-			release_region( ioaddr, SBNI_IO_EXTENT );
-			return NULL;
-		}
-	} else if( irq == 2 )
-		irq = 9;
-
-	dev->irq = irq;
-	dev->base_addr = ioaddr;
-
-	/* Fill in sbni-specific dev fields. */
-	nl = netdev_priv(dev);
-	if( !nl ) {
-		pr_err("%s: unable to get memory!\n", dev->name);
-		release_region( ioaddr, SBNI_IO_EXTENT );
-		return NULL;
-	}
-
-	memset( nl, 0, sizeof(struct net_local) );
-	spin_lock_init( &nl->lock );
-
-	/* store MAC address (generate if that isn't known) */
-	*(__be16 *)dev->dev_addr = htons( 0x00ff );
-	*(__be32 *)(dev->dev_addr + 2) = htonl( 0x01000000 |
-		((mac[num] ?
-		mac[num] :
-		(u32)((long)netdev_priv(dev))) & 0x00ffffff));
-
-	/* store link settings (speed, receive level ) */
-	nl->maxframe  = DEFAULT_FRAME_LEN;
-	nl->csr1.rate = baud[ num ];
-
-	if( (nl->cur_rxl_index = rxl[ num ]) == -1 ) {
-		/* autotune rxl */
-		nl->cur_rxl_index = DEF_RXL;
-		nl->delta_rxl = DEF_RXL_DELTA;
-	} else {
-		nl->delta_rxl = 0;
-	}
-	nl->csr1.rxl  = rxl_tab[ nl->cur_rxl_index ];
-	if( inb( ioaddr + CSR0 ) & 0x01 )
-		nl->state |= FL_SLOW_MODE;
-
-	pr_notice("%s: ioaddr %#lx, irq %d, MAC: 00:ff:01:%02x:%02x:%02x\n",
-		  dev->name, dev->base_addr, dev->irq,
-		  ((u8 *)dev->dev_addr)[3],
-		  ((u8 *)dev->dev_addr)[4],
-		  ((u8 *)dev->dev_addr)[5]);
-
-	pr_notice("%s: speed %d",
-		  dev->name,
-		  ((nl->state & FL_SLOW_MODE) ? 500000 : 2000000)
-		  / (1 << nl->csr1.rate));
-
-	if( nl->delta_rxl == 0 )
-		pr_cont(", receive level 0x%x (fixed)\n", nl->cur_rxl_index);
-	else
-		pr_cont(", receive level (auto)\n");
-
-#ifdef CONFIG_SBNI_MULTILINE
-	nl->master = dev;
-	nl->link   = NULL;
-#endif
-   
-	sbni_cards[ num++ ] = dev;
-	return  dev;
-}
-
-/* -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_SBNI_MULTILINE
-
-static netdev_tx_t
-sbni_start_xmit( struct sk_buff  *skb,  struct net_device  *dev )
-{
-	struct net_device  *p;
-
-	netif_stop_queue( dev );
-
-	/* Looking for idle device in the list */
-	for( p = dev;  p; ) {
-		struct net_local  *nl = netdev_priv(p);
-		spin_lock( &nl->lock );
-		if( nl->tx_buf_p  ||  (nl->state & FL_LINE_DOWN) ) {
-			p = nl->link;
-			spin_unlock( &nl->lock );
-		} else {
-			/* Idle dev is found */
-			prepare_to_send( skb, p );
-			spin_unlock( &nl->lock );
-			netif_start_queue( dev );
-			return NETDEV_TX_OK;
-		}
-	}
-
-	return NETDEV_TX_BUSY;
-}
-
-#else	/* CONFIG_SBNI_MULTILINE */
-
-static netdev_tx_t
-sbni_start_xmit( struct sk_buff  *skb,  struct net_device  *dev )
-{
-	struct net_local  *nl  = netdev_priv(dev);
-
-	netif_stop_queue( dev );
-	spin_lock( &nl->lock );
-
-	prepare_to_send( skb, dev );
-
-	spin_unlock( &nl->lock );
-	return NETDEV_TX_OK;
-}
-
-#endif	/* CONFIG_SBNI_MULTILINE */
-
-/* -------------------------------------------------------------------------- */
-
-/* interrupt handler */
-
-/*
- * 	SBNI12D-10, -11/ISA boards within "common interrupt" mode could not
- * be looked as two independent single-channel devices. Every channel seems
- * as Ethernet interface but interrupt handler must be common. Really, first
- * channel ("master") driver only registers the handler. In its struct net_local
- * it has got pointer to "slave" channel's struct net_local and handles that's
- * interrupts too.
- *	dev of successfully attached ISA SBNI boards is linked to list.
- * While next board driver is initialized, it scans this list. If one
- * has found dev with same irq and ioaddr different by 4 then it assumes
- * this board to be "master".
- */ 
-
-static irqreturn_t
-sbni_interrupt( int  irq,  void  *dev_id )
-{
-	struct net_device	  *dev = dev_id;
-	struct net_local  *nl  = netdev_priv(dev);
-	int	repeat;
-
-	spin_lock( &nl->lock );
-	if( nl->second )
-		spin_lock(&NET_LOCAL_LOCK(nl->second));
-
-	do {
-		repeat = 0;
-		if( inb( dev->base_addr + CSR0 ) & (RC_RDY | TR_RDY) ) {
-			handle_channel( dev );
-			repeat = 1;
-		}
-		if( nl->second  && 	/* second channel present */
-		    (inb( nl->second->base_addr+CSR0 ) & (RC_RDY | TR_RDY)) ) {
-			handle_channel( nl->second );
-			repeat = 1;
-		}
-	} while( repeat );
-
-	if( nl->second )
-		spin_unlock(&NET_LOCAL_LOCK(nl->second));
-	spin_unlock( &nl->lock );
-	return IRQ_HANDLED;
-}
-
-
-static void
-handle_channel( struct net_device  *dev )
-{
-	struct net_local	*nl    = netdev_priv(dev);
-	unsigned long		ioaddr = dev->base_addr;
-
-	int  req_ans;
-	unsigned char  csr0;
-
-#ifdef CONFIG_SBNI_MULTILINE
-	/* Lock the master device because we going to change its local data */
-	if( nl->state & FL_SLAVE )
-		spin_lock(&NET_LOCAL_LOCK(nl->master));
-#endif
-
-	outb( (inb( ioaddr + CSR0 ) & ~EN_INT) | TR_REQ, ioaddr + CSR0 );
-
-	nl->timer_ticks = CHANGE_LEVEL_START_TICKS;
-	for(;;) {
-		csr0 = inb( ioaddr + CSR0 );
-		if( ( csr0 & (RC_RDY | TR_RDY) ) == 0 )
-			break;
-
-		req_ans = !(nl->state & FL_PREV_OK);
-
-		if( csr0 & RC_RDY )
-			req_ans = recv_frame( dev );
-
-		/*
-		 * TR_RDY always equals 1 here because we have owned the marker,
-		 * and we set TR_REQ when disabled interrupts
-		 */
-		csr0 = inb( ioaddr + CSR0 );
-		if( !(csr0 & TR_RDY)  ||  (csr0 & RC_RDY) )
-			netdev_err(dev, "internal error!\n");
-
-		/* if state & FL_NEED_RESEND != 0 then tx_frameno != 0 */
-		if( req_ans  ||  nl->tx_frameno != 0 )
-			send_frame( dev );
-		else
-			/* send marker without any data */
-			outb( inb( ioaddr + CSR0 ) & ~TR_REQ, ioaddr + CSR0 );
-	}
-
-	outb( inb( ioaddr + CSR0 ) | EN_INT, ioaddr + CSR0 );
-
-#ifdef CONFIG_SBNI_MULTILINE
-	if( nl->state & FL_SLAVE )
-		spin_unlock(&NET_LOCAL_LOCK(nl->master));
-#endif
-}
-
-
-/*
- * Routine returns 1 if it needs to acknowledge received frame.
- * Empty frame received without errors won't be acknowledged.
- */
-
-static int
-recv_frame( struct net_device  *dev )
-{
-	struct net_local  *nl   = netdev_priv(dev);
-	unsigned long  ioaddr	= dev->base_addr;
-
-	u32  crc = CRC32_INITIAL;
-
-	unsigned  framelen = 0, frameno, ack;
-	unsigned  is_first, frame_ok = 0;
-
-	if( check_fhdr( ioaddr, &framelen, &frameno, &ack, &is_first, &crc ) ) {
-		frame_ok = framelen > 4
-			?  upload_data( dev, framelen, frameno, is_first, crc )
-			:  skip_tail( ioaddr, framelen, crc );
-		if( frame_ok )
-			interpret_ack( dev, ack );
-	}
-
-	outb( inb( ioaddr + CSR0 ) ^ CT_ZER, ioaddr + CSR0 );
-	if( frame_ok ) {
-		nl->state |= FL_PREV_OK;
-		if( framelen > 4 )
-			nl->in_stats.all_rx_number++;
-	} else {
-		nl->state &= ~FL_PREV_OK;
-		change_level( dev );
-		nl->in_stats.all_rx_number++;
-		nl->in_stats.bad_rx_number++;
-	}
-
-	return  !frame_ok  ||  framelen > 4;
-}
-
-
-static void
-send_frame( struct net_device  *dev )
-{
-	struct net_local  *nl    = netdev_priv(dev);
-
-	u32  crc = CRC32_INITIAL;
-
-	if( nl->state & FL_NEED_RESEND ) {
-
-		/* if frame was sended but not ACK'ed - resend it */
-		if( nl->trans_errors ) {
-			--nl->trans_errors;
-			if( nl->framelen != 0 )
-				nl->in_stats.resend_tx_number++;
-		} else {
-			/* cannot xmit with many attempts */
-#ifdef CONFIG_SBNI_MULTILINE
-			if( (nl->state & FL_SLAVE)  ||  nl->link )
-#endif
-			nl->state |= FL_LINE_DOWN;
-			drop_xmit_queue( dev );
-			goto  do_send;
-		}
-	} else
-		nl->trans_errors = TR_ERROR_COUNT;
-
-	send_frame_header( dev, &crc );
-	nl->state |= FL_NEED_RESEND;
-	/*
-	 * FL_NEED_RESEND will be cleared after ACK, but if empty
-	 * frame sended then in prepare_to_send next frame
-	 */
-
-
-	if( nl->framelen ) {
-		download_data( dev, &crc );
-		nl->in_stats.all_tx_number++;
-		nl->state |= FL_WAIT_ACK;
-	}
-
-	outsb( dev->base_addr + DAT, (u8 *)&crc, sizeof crc );
-
-do_send:
-	outb( inb( dev->base_addr + CSR0 ) & ~TR_REQ, dev->base_addr + CSR0 );
-
-	if( nl->tx_frameno )
-		/* next frame exists - we request card to send it */
-		outb( inb( dev->base_addr + CSR0 ) | TR_REQ,
-		      dev->base_addr + CSR0 );
-}
-
-
-/*
- * Write the frame data into adapter's buffer memory, and calculate CRC.
- * Do padding if necessary.
- */
-
-static void
-download_data( struct net_device  *dev,  u32  *crc_p )
-{
-	struct net_local  *nl    = netdev_priv(dev);
-	struct sk_buff    *skb	 = nl->tx_buf_p;
-
-	unsigned  len = min_t(unsigned int, skb->len - nl->outpos, nl->framelen);
-
-	outsb( dev->base_addr + DAT, skb->data + nl->outpos, len );
-	*crc_p = calc_crc32( *crc_p, skb->data + nl->outpos, len );
-
-	/* if packet too short we should write some more bytes to pad */
-	for( len = nl->framelen - len;  len--; ) {
-		outb( 0, dev->base_addr + DAT );
-		*crc_p = CRC32( 0, *crc_p );
-	}
-}
-
-
-static int
-upload_data( struct net_device  *dev,  unsigned  framelen,  unsigned  frameno,
-	     unsigned  is_first,  u32  crc )
-{
-	struct net_local  *nl = netdev_priv(dev);
-
-	int  frame_ok;
-
-	if( is_first ) {
-		nl->wait_frameno = frameno;
-		nl->inppos = 0;
-	}
-
-	if( nl->wait_frameno == frameno ) {
-
-		if( nl->inppos + framelen  <=  ETHER_MAX_LEN )
-			frame_ok = append_frame_to_pkt( dev, framelen, crc );
-
-		/*
-		 * if CRC is right but framelen incorrect then transmitter
-		 * error was occurred... drop entire packet
-		 */
-		else if( (frame_ok = skip_tail( dev->base_addr, framelen, crc ))
-			 != 0 ) {
-			nl->wait_frameno = 0;
-			nl->inppos = 0;
-#ifdef CONFIG_SBNI_MULTILINE
-			nl->master->stats.rx_errors++;
-			nl->master->stats.rx_missed_errors++;
-#else
-		        dev->stats.rx_errors++;
-			dev->stats.rx_missed_errors++;
-#endif
-		}
-			/* now skip all frames until is_first != 0 */
-	} else
-		frame_ok = skip_tail( dev->base_addr, framelen, crc );
-
-	if( is_first  &&  !frame_ok ) {
-		/*
-		 * Frame has been broken, but we had already stored
-		 * is_first... Drop entire packet.
-		 */
-		nl->wait_frameno = 0;
-#ifdef CONFIG_SBNI_MULTILINE
-		nl->master->stats.rx_errors++;
-		nl->master->stats.rx_crc_errors++;
-#else
-		dev->stats.rx_errors++;
-		dev->stats.rx_crc_errors++;
-#endif
-	}
-
-	return  frame_ok;
-}
-
-
-static inline void
-send_complete( struct net_device *dev )
-{
-	struct net_local  *nl = netdev_priv(dev);
-
-#ifdef CONFIG_SBNI_MULTILINE
-	nl->master->stats.tx_packets++;
-	nl->master->stats.tx_bytes += nl->tx_buf_p->len;
-#else
-	dev->stats.tx_packets++;
-	dev->stats.tx_bytes += nl->tx_buf_p->len;
-#endif
-	dev_consume_skb_irq(nl->tx_buf_p);
-
-	nl->tx_buf_p = NULL;
-
-	nl->outpos = 0;
-	nl->state &= ~(FL_WAIT_ACK | FL_NEED_RESEND);
-	nl->framelen   = 0;
-}
-
-
-static void
-interpret_ack( struct net_device  *dev,  unsigned  ack )
-{
-	struct net_local  *nl = netdev_priv(dev);
-
-	if( ack == FRAME_SENT_OK ) {
-		nl->state &= ~FL_NEED_RESEND;
-
-		if( nl->state & FL_WAIT_ACK ) {
-			nl->outpos += nl->framelen;
-
-			if( --nl->tx_frameno ) {
-				nl->framelen = min_t(unsigned int,
-						   nl->maxframe,
-						   nl->tx_buf_p->len - nl->outpos);
-			} else {
-				send_complete( dev );
-#ifdef CONFIG_SBNI_MULTILINE
-				netif_wake_queue( nl->master );
-#else
-				netif_wake_queue( dev );
-#endif
-			}
-		}
-	}
-
-	nl->state &= ~FL_WAIT_ACK;
-}
-
-
-/*
- * Glue received frame with previous fragments of packet.
- * Indicate packet when last frame would be accepted.
- */
-
-static int
-append_frame_to_pkt( struct net_device  *dev,  unsigned  framelen,  u32  crc )
-{
-	struct net_local  *nl = netdev_priv(dev);
-
-	u8  *p;
-
-	if( nl->inppos + framelen  >  ETHER_MAX_LEN )
-		return  0;
-
-	if( !nl->rx_buf_p  &&  !(nl->rx_buf_p = get_rx_buf( dev )) )
-		return  0;
-
-	p = nl->rx_buf_p->data + nl->inppos;
-	insb( dev->base_addr + DAT, p, framelen );
-	if( calc_crc32( crc, p, framelen ) != CRC32_REMAINDER )
-		return  0;
-
-	nl->inppos += framelen - 4;
-	if( --nl->wait_frameno == 0 )		/* last frame received */
-		indicate_pkt( dev );
-
-	return  1;
-}
-
-
-/*
- * Prepare to start output on adapter.
- * Transmitter will be actually activated when marker is accepted.
- */
-
-static void
-prepare_to_send( struct sk_buff  *skb,  struct net_device  *dev )
-{
-	struct net_local  *nl = netdev_priv(dev);
-
-	unsigned int  len;
-
-	/* nl->tx_buf_p == NULL here! */
-	if( nl->tx_buf_p )
-		netdev_err(dev, "memory leak!\n");
-
-	nl->outpos = 0;
-	nl->state &= ~(FL_WAIT_ACK | FL_NEED_RESEND);
-
-	len = skb->len;
-	if( len < SBNI_MIN_LEN )
-		len = SBNI_MIN_LEN;
-
-	nl->tx_buf_p	= skb;
-	nl->tx_frameno	= DIV_ROUND_UP(len, nl->maxframe);
-	nl->framelen	= len < nl->maxframe  ?  len  :  nl->maxframe;
-
-	outb( inb( dev->base_addr + CSR0 ) | TR_REQ,  dev->base_addr + CSR0 );
-#ifdef CONFIG_SBNI_MULTILINE
-	netif_trans_update(nl->master);
-#else
-	netif_trans_update(dev);
-#endif
-}
-
-
-static void
-drop_xmit_queue( struct net_device  *dev )
-{
-	struct net_local  *nl = netdev_priv(dev);
-
-	if( nl->tx_buf_p ) {
-		dev_kfree_skb_any( nl->tx_buf_p );
-		nl->tx_buf_p = NULL;
-#ifdef CONFIG_SBNI_MULTILINE
-		nl->master->stats.tx_errors++;
-		nl->master->stats.tx_carrier_errors++;
-#else
-		dev->stats.tx_errors++;
-		dev->stats.tx_carrier_errors++;
-#endif
-	}
-
-	nl->tx_frameno	= 0;
-	nl->framelen	= 0;
-	nl->outpos	= 0;
-	nl->state &= ~(FL_WAIT_ACK | FL_NEED_RESEND);
-#ifdef CONFIG_SBNI_MULTILINE
-	netif_start_queue( nl->master );
-	netif_trans_update(nl->master);
-#else
-	netif_start_queue( dev );
-	netif_trans_update(dev);
-#endif
-}
-
-
-static void
-send_frame_header( struct net_device  *dev,  u32  *crc_p )
-{
-	struct net_local  *nl  = netdev_priv(dev);
-
-	u32  crc = *crc_p;
-	u32  len_field = nl->framelen + 6;	/* CRC + frameno + reserved */
-	u8   value;
-
-	if( nl->state & FL_NEED_RESEND )
-		len_field |= FRAME_RETRY;	/* non-first attempt... */
-
-	if( nl->outpos == 0 )
-		len_field |= FRAME_FIRST;
-
-	len_field |= (nl->state & FL_PREV_OK) ? FRAME_SENT_OK : FRAME_SENT_BAD;
-	outb( SBNI_SIG, dev->base_addr + DAT );
-
-	value = (u8) len_field;
-	outb( value, dev->base_addr + DAT );
-	crc = CRC32( value, crc );
-	value = (u8) (len_field >> 8);
-	outb( value, dev->base_addr + DAT );
-	crc = CRC32( value, crc );
-
-	outb( nl->tx_frameno, dev->base_addr + DAT );
-	crc = CRC32( nl->tx_frameno, crc );
-	outb( 0, dev->base_addr + DAT );
-	crc = CRC32( 0, crc );
-	*crc_p = crc;
-}
-
-
-/*
- * if frame tail not needed (incorrect number or received twice),
- * it won't store, but CRC will be calculated
- */
-
-static int
-skip_tail( unsigned int  ioaddr,  unsigned int  tail_len,  u32 crc )
-{
-	while( tail_len-- )
-		crc = CRC32( inb( ioaddr + DAT ), crc );
-
-	return  crc == CRC32_REMAINDER;
-}
-
-
-/*
- * Preliminary checks if frame header is correct, calculates its CRC
- * and split it to simple fields
- */
-
-static int
-check_fhdr( u32  ioaddr,  u32  *framelen,  u32  *frameno,  u32  *ack,
-	    u32  *is_first,  u32  *crc_p )
-{
-	u32  crc = *crc_p;
-	u8   value;
-
-	if( inb( ioaddr + DAT ) != SBNI_SIG )
-		return  0;
-
-	value = inb( ioaddr + DAT );
-	*framelen = (u32)value;
-	crc = CRC32( value, crc );
-	value = inb( ioaddr + DAT );
-	*framelen |= ((u32)value) << 8;
-	crc = CRC32( value, crc );
-
-	*ack = *framelen & FRAME_ACK_MASK;
-	*is_first = (*framelen & FRAME_FIRST) != 0;
-
-	if( (*framelen &= FRAME_LEN_MASK) < 6 ||
-	    *framelen > SBNI_MAX_FRAME - 3 )
-		return  0;
-
-	value = inb( ioaddr + DAT );
-	*frameno = (u32)value;
-	crc = CRC32( value, crc );
-
-	crc = CRC32( inb( ioaddr + DAT ), crc );	/* reserved byte */
-	*framelen -= 2;
-
-	*crc_p = crc;
-	return  1;
-}
-
-
-static struct sk_buff *
-get_rx_buf( struct net_device  *dev )
-{
-	/* +2 is to compensate for the alignment fixup below */
-	struct sk_buff  *skb = dev_alloc_skb( ETHER_MAX_LEN + 2 );
-	if( !skb )
-		return  NULL;
-
-	skb_reserve( skb, 2 );		/* Align IP on longword boundaries */
-	return  skb;
-}
-
-
-static void
-indicate_pkt( struct net_device  *dev )
-{
-	struct net_local  *nl  = netdev_priv(dev);
-	struct sk_buff    *skb = nl->rx_buf_p;
-
-	skb_put( skb, nl->inppos );
-
-#ifdef CONFIG_SBNI_MULTILINE
-	skb->protocol = eth_type_trans( skb, nl->master );
-	netif_rx( skb );
-	++nl->master->stats.rx_packets;
-	nl->master->stats.rx_bytes += nl->inppos;
-#else
-	skb->protocol = eth_type_trans( skb, dev );
-	netif_rx( skb );
-	++dev->stats.rx_packets;
-	dev->stats.rx_bytes += nl->inppos;
-#endif
-	nl->rx_buf_p = NULL;	/* protocol driver will clear this sk_buff */
-}
-
-
-/* -------------------------------------------------------------------------- */
-
-/*
- * Routine checks periodically wire activity and regenerates marker if
- * connect was inactive for a long time.
- */
-
-static void
-sbni_watchdog(struct timer_list *t)
-{
-	struct net_local   *nl  = from_timer(nl, t, watchdog);
-	struct net_device  *dev = nl->watchdog_dev;
-	unsigned long	   flags;
-	unsigned char	   csr0;
-
-	spin_lock_irqsave( &nl->lock, flags );
-
-	csr0 = inb( dev->base_addr + CSR0 );
-	if( csr0 & RC_CHK ) {
-
-		if( nl->timer_ticks ) {
-			if( csr0 & (RC_RDY | BU_EMP) )
-				/* receiving not active */
-				nl->timer_ticks--;
-		} else {
-			nl->in_stats.timeout_number++;
-			if( nl->delta_rxl )
-				timeout_change_level( dev );
-
-			outb( *(u_char *)&nl->csr1 | PR_RES,
-			      dev->base_addr + CSR1 );
-			csr0 = inb( dev->base_addr + CSR0 );
-		}
-	} else
-		nl->state &= ~FL_LINE_DOWN;
-
-	outb( csr0 | RC_CHK, dev->base_addr + CSR0 ); 
-
-	mod_timer(t, jiffies + SBNI_TIMEOUT);
-
-	spin_unlock_irqrestore( &nl->lock, flags );
-}
-
-
-static unsigned char  rxl_tab[] = {
-	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08,
-	0x0a, 0x0c, 0x0f, 0x16, 0x18, 0x1a, 0x1c, 0x1f
-};
-
-#define SIZE_OF_TIMEOUT_RXL_TAB 4
-static unsigned char  timeout_rxl_tab[] = {
-	0x03, 0x05, 0x08, 0x0b
-};
-
-/* -------------------------------------------------------------------------- */
-
-static void
-card_start( struct net_device  *dev )
-{
-	struct net_local  *nl = netdev_priv(dev);
-
-	nl->timer_ticks = CHANGE_LEVEL_START_TICKS;
-	nl->state &= ~(FL_WAIT_ACK | FL_NEED_RESEND);
-	nl->state |= FL_PREV_OK;
-
-	nl->inppos = nl->outpos = 0;
-	nl->wait_frameno = 0;
-	nl->tx_frameno	 = 0;
-	nl->framelen	 = 0;
-
-	outb( *(u_char *)&nl->csr1 | PR_RES, dev->base_addr + CSR1 );
-	outb( EN_INT, dev->base_addr + CSR0 );
-}
-
-/* -------------------------------------------------------------------------- */
-
-/* Receive level auto-selection */
-
-static void
-change_level( struct net_device  *dev )
-{
-	struct net_local  *nl = netdev_priv(dev);
-
-	if( nl->delta_rxl == 0 )	/* do not auto-negotiate RxL */
-		return;
-
-	if( nl->cur_rxl_index == 0 )
-		nl->delta_rxl = 1;
-	else if( nl->cur_rxl_index == 15 )
-		nl->delta_rxl = -1;
-	else if( nl->cur_rxl_rcvd < nl->prev_rxl_rcvd )
-		nl->delta_rxl = -nl->delta_rxl;
-
-	nl->csr1.rxl = rxl_tab[ nl->cur_rxl_index += nl->delta_rxl ];
-	inb( dev->base_addr + CSR0 );	/* needs for PCI cards */
-	outb( *(u8 *)&nl->csr1, dev->base_addr + CSR1 );
-
-	nl->prev_rxl_rcvd = nl->cur_rxl_rcvd;
-	nl->cur_rxl_rcvd  = 0;
-}
-
-
-static void
-timeout_change_level( struct net_device  *dev )
-{
-	struct net_local  *nl = netdev_priv(dev);
-
-	nl->cur_rxl_index = timeout_rxl_tab[ nl->timeout_rxl ];
-	if( ++nl->timeout_rxl >= 4 )
-		nl->timeout_rxl = 0;
-
-	nl->csr1.rxl = rxl_tab[ nl->cur_rxl_index ];
-	inb( dev->base_addr + CSR0 );
-	outb( *(unsigned char *)&nl->csr1, dev->base_addr + CSR1 );
-
-	nl->prev_rxl_rcvd = nl->cur_rxl_rcvd;
-	nl->cur_rxl_rcvd  = 0;
-}
-
-/* -------------------------------------------------------------------------- */
-
-/*
- *	Open/initialize the board. 
- */
-
-static int
-sbni_open( struct net_device  *dev )
-{
-	struct net_local	*nl = netdev_priv(dev);
-	struct timer_list	*w  = &nl->watchdog;
-
-	/*
-	 * For double ISA adapters within "common irq" mode, we have to
-	 * determine whether primary or secondary channel is initialized,
-	 * and set the irq handler only in first case.
-	 */
-	if( dev->base_addr < 0x400 ) {		/* ISA only */
-		struct net_device  **p = sbni_cards;
-		for( ;  *p  &&  p < sbni_cards + SBNI_MAX_NUM_CARDS;  ++p )
-			if( (*p)->irq == dev->irq &&
-			    ((*p)->base_addr == dev->base_addr + 4 ||
-			     (*p)->base_addr == dev->base_addr - 4) &&
-			    (*p)->flags & IFF_UP ) {
-
-				((struct net_local *) (netdev_priv(*p)))
-					->second = dev;
-				netdev_notice(dev, "using shared irq with %s\n",
-					      (*p)->name);
-				nl->state |= FL_SECONDARY;
-				goto  handler_attached;
-			}
-	}
-
-	if( request_irq(dev->irq, sbni_interrupt, IRQF_SHARED, dev->name, dev) ) {
-		netdev_err(dev, "unable to get IRQ %d\n", dev->irq);
-		return  -EAGAIN;
-	}
-
-handler_attached:
-
-	spin_lock( &nl->lock );
-	memset( &dev->stats, 0, sizeof(struct net_device_stats) );
-	memset( &nl->in_stats, 0, sizeof(struct sbni_in_stats) );
-
-	card_start( dev );
-
-	netif_start_queue( dev );
-
-	/* set timer watchdog */
-	nl->watchdog_dev = dev;
-	timer_setup(w, sbni_watchdog, 0);
-	w->expires	= jiffies + SBNI_TIMEOUT;
-	add_timer( w );
-   
-	spin_unlock( &nl->lock );
-	return 0;
-}
-
-
-static int
-sbni_close( struct net_device  *dev )
-{
-	struct net_local  *nl = netdev_priv(dev);
-
-	if( nl->second  &&  nl->second->flags & IFF_UP ) {
-		netdev_notice(dev, "Secondary channel (%s) is active!\n",
-			      nl->second->name);
-		return  -EBUSY;
-	}
-
-#ifdef CONFIG_SBNI_MULTILINE
-	if( nl->state & FL_SLAVE )
-		emancipate( dev );
-	else
-		while( nl->link )	/* it's master device! */
-			emancipate( nl->link );
-#endif
-
-	spin_lock( &nl->lock );
-
-	nl->second = NULL;
-	drop_xmit_queue( dev );	
-	netif_stop_queue( dev );
-   
-	del_timer( &nl->watchdog );
-
-	outb( 0, dev->base_addr + CSR0 );
-
-	if( !(nl->state & FL_SECONDARY) )
-		free_irq( dev->irq, dev );
-	nl->state &= FL_SECONDARY;
-
-	spin_unlock( &nl->lock );
-	return 0;
-}
-
-
-/*
-	Valid combinations in CSR0 (for probing):
-
-	VALID_DECODER	0000,0011,1011,1010
-
-				    	; 0   ; -
-				TR_REQ	; 1   ; +
-			TR_RDY	    	; 2   ; -
-			TR_RDY	TR_REQ	; 3   ; +
-		BU_EMP		    	; 4   ; +
-		BU_EMP	     	TR_REQ	; 5   ; +
-		BU_EMP	TR_RDY	    	; 6   ; -
-		BU_EMP	TR_RDY	TR_REQ	; 7   ; +
-	RC_RDY 		     		; 8   ; +
-	RC_RDY			TR_REQ	; 9   ; +
-	RC_RDY		TR_RDY		; 10  ; -
-	RC_RDY		TR_RDY	TR_REQ	; 11  ; -
-	RC_RDY	BU_EMP			; 12  ; -
-	RC_RDY	BU_EMP		TR_REQ	; 13  ; -
-	RC_RDY	BU_EMP	TR_RDY		; 14  ; -
-	RC_RDY	BU_EMP	TR_RDY	TR_REQ	; 15  ; -
-*/
-
-#define VALID_DECODER (2 + 8 + 0x10 + 0x20 + 0x80 + 0x100 + 0x200)
-
-
-static int
-sbni_card_probe( unsigned long  ioaddr )
-{
-	unsigned char  csr0;
-
-	csr0 = inb( ioaddr + CSR0 );
-	if( csr0 != 0xff  &&  csr0 != 0x00 ) {
-		csr0 &= ~EN_INT;
-		if( csr0 & BU_EMP )
-			csr0 |= EN_INT;
-      
-		if( VALID_DECODER & (1 << (csr0 >> 4)) )
-			return  0;
-	}
-   
-	return  -ENODEV;
-}
-
-/* -------------------------------------------------------------------------- */
-
-static int
-sbni_siocdevprivate(struct net_device  *dev,  struct ifreq  *ifr, void __user *data, int  cmd)
-{
-	struct net_local  *nl = netdev_priv(dev);
-	struct sbni_flags  flags;
-	int  error = 0;
-
-#ifdef CONFIG_SBNI_MULTILINE
-	struct net_device  *slave_dev;
-	char  slave_name[ 8 ];
-#endif
-  
-	switch( cmd ) {
-	case  SIOCDEVGETINSTATS :
-		if (copy_to_user(data, &nl->in_stats,
-				 sizeof(struct sbni_in_stats)))
-			error = -EFAULT;
-		break;
-
-	case  SIOCDEVRESINSTATS :
-		if (!capable(CAP_NET_ADMIN))
-			return  -EPERM;
-		memset( &nl->in_stats, 0, sizeof(struct sbni_in_stats) );
-		break;
-
-	case  SIOCDEVGHWSTATE :
-		flags.mac_addr	= *(u32 *)(dev->dev_addr + 3);
-		flags.rate	= nl->csr1.rate;
-		flags.slow_mode	= (nl->state & FL_SLOW_MODE) != 0;
-		flags.rxl	= nl->cur_rxl_index;
-		flags.fixed_rxl	= nl->delta_rxl == 0;
-
-		if (copy_to_user(data, &flags, sizeof(flags)))
-			error = -EFAULT;
-		break;
-
-	case  SIOCDEVSHWSTATE :
-		if (!capable(CAP_NET_ADMIN))
-			return  -EPERM;
-
-		spin_lock( &nl->lock );
-		flags = *(struct sbni_flags*) &ifr->ifr_ifru;
-		if( flags.fixed_rxl ) {
-			nl->delta_rxl = 0;
-			nl->cur_rxl_index = flags.rxl;
-		} else {
-			nl->delta_rxl = DEF_RXL_DELTA;
-			nl->cur_rxl_index = DEF_RXL;
-		}
-
-		nl->csr1.rxl = rxl_tab[ nl->cur_rxl_index ];
-		nl->csr1.rate = flags.rate;
-		outb( *(u8 *)&nl->csr1 | PR_RES, dev->base_addr + CSR1 );
-		spin_unlock( &nl->lock );
-		break;
-
-#ifdef CONFIG_SBNI_MULTILINE
-
-	case  SIOCDEVENSLAVE :
-		if (!capable(CAP_NET_ADMIN))
-			return  -EPERM;
-
-		if (copy_from_user(slave_name, data, sizeof(slave_name)))
-			return -EFAULT;
-		slave_dev = dev_get_by_name(&init_net, slave_name );
-		if( !slave_dev  ||  !(slave_dev->flags & IFF_UP) ) {
-			netdev_err(dev, "trying to enslave non-active device %s\n",
-				   slave_name);
-			if (slave_dev)
-				dev_put(slave_dev);
-			return  -EPERM;
-		}
-
-		return  enslave( dev, slave_dev );
-
-	case  SIOCDEVEMANSIPATE :
-		if (!capable(CAP_NET_ADMIN))
-			return  -EPERM;
-
-		return  emancipate( dev );
-
-#endif	/* CONFIG_SBNI_MULTILINE */
-
-	default :
-		return  -EOPNOTSUPP;
-	}
-
-	return  error;
-}
-
-
-#ifdef CONFIG_SBNI_MULTILINE
-
-static int
-enslave( struct net_device  *dev,  struct net_device  *slave_dev )
-{
-	struct net_local  *nl  = netdev_priv(dev);
-	struct net_local  *snl = netdev_priv(slave_dev);
-
-	if( nl->state & FL_SLAVE )	/* This isn't master or free device */
-		return  -EBUSY;
-
-	if( snl->state & FL_SLAVE )	/* That was already enslaved */
-		return  -EBUSY;
-
-	spin_lock( &nl->lock );
-	spin_lock( &snl->lock );
-
-	/* append to list */
-	snl->link = nl->link;
-	nl->link  = slave_dev;
-	snl->master = dev;
-	snl->state |= FL_SLAVE;
-
-	/* Summary statistics of MultiLine operation will be stored
-	   in master's counters */
-	memset( &slave_dev->stats, 0, sizeof(struct net_device_stats) );
-	netif_stop_queue( slave_dev );
-	netif_wake_queue( dev );	/* Now we are able to transmit */
-
-	spin_unlock( &snl->lock );
-	spin_unlock( &nl->lock );
-	netdev_notice(dev, "slave device (%s) attached\n", slave_dev->name);
-	return  0;
-}
-
-
-static int
-emancipate( struct net_device  *dev )
-{
-	struct net_local   *snl = netdev_priv(dev);
-	struct net_device  *p   = snl->master;
-	struct net_local   *nl  = netdev_priv(p);
-
-	if( !(snl->state & FL_SLAVE) )
-		return  -EINVAL;
-
-	spin_lock( &nl->lock );
-	spin_lock( &snl->lock );
-	drop_xmit_queue( dev );
-
-	/* exclude from list */
-	for(;;) {	/* must be in list */
-		struct net_local  *t = netdev_priv(p);
-		if( t->link == dev ) {
-			t->link = snl->link;
-			break;
-		}
-		p = t->link;
-	}
-
-	snl->link = NULL;
-	snl->master = dev;
-	snl->state &= ~FL_SLAVE;
-
-	netif_start_queue( dev );
-
-	spin_unlock( &snl->lock );
-	spin_unlock( &nl->lock );
-
-	dev_put( dev );
-	return  0;
-}
-
-#endif
-
-static void
-set_multicast_list( struct net_device  *dev )
-{
-	return;		/* sbni always operate in promiscuos mode */
-}
-
-
-#ifdef MODULE
-module_param_hw_array(io, int, ioport, NULL, 0);
-module_param_hw_array(irq, int, irq, NULL, 0);
-module_param_array(baud, int, NULL, 0);
-module_param_array(rxl, int, NULL, 0);
-module_param_array(mac, int, NULL, 0);
-module_param(skip_pci_probe, bool, 0);
-
-MODULE_LICENSE("GPL");
-
-
-int __init init_module( void )
-{
-	struct net_device  *dev;
-	int err;
-
-	while( num < SBNI_MAX_NUM_CARDS ) {
-		dev = alloc_netdev(sizeof(struct net_local), "sbni%d",
-				   NET_NAME_UNKNOWN, sbni_devsetup);
-		if( !dev)
-			break;
-
-		sprintf( dev->name, "sbni%d", num );
-
-		err = sbni_init(dev);
-		if (err) {
-			free_netdev(dev);
-			break;
-		}
-
-		if( register_netdev( dev ) ) {
-			release_region( dev->base_addr, SBNI_IO_EXTENT );
-			free_netdev( dev );
-			break;
-		}
-	}
-
-	return  *sbni_cards  ?  0  :  -ENODEV;
-}
-
-void
-cleanup_module(void)
-{
-	int i;
-
-	for (i = 0;  i < SBNI_MAX_NUM_CARDS;  ++i) {
-		struct net_device *dev = sbni_cards[i];
-		if (dev != NULL) {
-			unregister_netdev(dev);
-			release_region(dev->base_addr, SBNI_IO_EXTENT);
-			free_netdev(dev);
-		}
-	}
-}
-
-#else	/* MODULE */
-
-static int __init
-sbni_setup( char  *p )
-{
-	int  n, parm;
-
-	if( *p++ != '(' )
-		goto  bad_param;
-
-	for( n = 0, parm = 0;  *p  &&  n < 8; ) {
-		(*dest[ parm ])[ n ] = simple_strtoul( p, &p, 0 );
-		if( !*p  ||  *p == ')' )
-			return 1;
-		if( *p == ';' ) {
-			++p;
-			++n;
-			parm = 0;
-		} else if( *p++ != ',' ) {
-			break;
-		} else {
-			if( ++parm >= 5 )
-				break;
-		}
-	}
-bad_param:
-	pr_err("Error in sbni kernel parameter!\n");
-	return 0;
-}
-
-__setup( "sbni=", sbni_setup );
-
-#endif	/* MODULE */
-
-/* -------------------------------------------------------------------------- */
-
-static u32
-calc_crc32( u32  crc,  u8  *p,  u32  len )
-{
-	while( len-- )
-		crc = CRC32( *p++, crc );
-
-	return  crc;
-}
-
-static u32  crc32tab[] __attribute__ ((aligned(8))) = {
-	0xD202EF8D,  0xA505DF1B,  0x3C0C8EA1,  0x4B0BBE37,
-	0xD56F2B94,  0xA2681B02,  0x3B614AB8,  0x4C667A2E,
-	0xDCD967BF,  0xABDE5729,  0x32D70693,  0x45D03605,
-	0xDBB4A3A6,  0xACB39330,  0x35BAC28A,  0x42BDF21C,
-	0xCFB5FFE9,  0xB8B2CF7F,  0x21BB9EC5,  0x56BCAE53,
-	0xC8D83BF0,  0xBFDF0B66,  0x26D65ADC,  0x51D16A4A,
-	0xC16E77DB,  0xB669474D,  0x2F6016F7,  0x58672661,
-	0xC603B3C2,  0xB1048354,  0x280DD2EE,  0x5F0AE278,
-	0xE96CCF45,  0x9E6BFFD3,  0x0762AE69,  0x70659EFF,
-	0xEE010B5C,  0x99063BCA,  0x000F6A70,  0x77085AE6,
-	0xE7B74777,  0x90B077E1,  0x09B9265B,  0x7EBE16CD,
-	0xE0DA836E,  0x97DDB3F8,  0x0ED4E242,  0x79D3D2D4,
-	0xF4DBDF21,  0x83DCEFB7,  0x1AD5BE0D,  0x6DD28E9B,
-	0xF3B61B38,  0x84B12BAE,  0x1DB87A14,  0x6ABF4A82,
-	0xFA005713,  0x8D076785,  0x140E363F,  0x630906A9,
-	0xFD6D930A,  0x8A6AA39C,  0x1363F226,  0x6464C2B0,
-	0xA4DEAE1D,  0xD3D99E8B,  0x4AD0CF31,  0x3DD7FFA7,
-	0xA3B36A04,  0xD4B45A92,  0x4DBD0B28,  0x3ABA3BBE,
-	0xAA05262F,  0xDD0216B9,  0x440B4703,  0x330C7795,
-	0xAD68E236,  0xDA6FD2A0,  0x4366831A,  0x3461B38C,
-	0xB969BE79,  0xCE6E8EEF,  0x5767DF55,  0x2060EFC3,
-	0xBE047A60,  0xC9034AF6,  0x500A1B4C,  0x270D2BDA,
-	0xB7B2364B,  0xC0B506DD,  0x59BC5767,  0x2EBB67F1,
-	0xB0DFF252,  0xC7D8C2C4,  0x5ED1937E,  0x29D6A3E8,
-	0x9FB08ED5,  0xE8B7BE43,  0x71BEEFF9,  0x06B9DF6F,
-	0x98DD4ACC,  0xEFDA7A5A,  0x76D32BE0,  0x01D41B76,
-	0x916B06E7,  0xE66C3671,  0x7F6567CB,  0x0862575D,
-	0x9606C2FE,  0xE101F268,  0x7808A3D2,  0x0F0F9344,
-	0x82079EB1,  0xF500AE27,  0x6C09FF9D,  0x1B0ECF0B,
-	0x856A5AA8,  0xF26D6A3E,  0x6B643B84,  0x1C630B12,
-	0x8CDC1683,  0xFBDB2615,  0x62D277AF,  0x15D54739,
-	0x8BB1D29A,  0xFCB6E20C,  0x65BFB3B6,  0x12B88320,
-	0x3FBA6CAD,  0x48BD5C3B,  0xD1B40D81,  0xA6B33D17,
-	0x38D7A8B4,  0x4FD09822,  0xD6D9C998,  0xA1DEF90E,
-	0x3161E49F,  0x4666D409,  0xDF6F85B3,  0xA868B525,
-	0x360C2086,  0x410B1010,  0xD80241AA,  0xAF05713C,
-	0x220D7CC9,  0x550A4C5F,  0xCC031DE5,  0xBB042D73,
-	0x2560B8D0,  0x52678846,  0xCB6ED9FC,  0xBC69E96A,
-	0x2CD6F4FB,  0x5BD1C46D,  0xC2D895D7,  0xB5DFA541,
-	0x2BBB30E2,  0x5CBC0074,  0xC5B551CE,  0xB2B26158,
-	0x04D44C65,  0x73D37CF3,  0xEADA2D49,  0x9DDD1DDF,
-	0x03B9887C,  0x74BEB8EA,  0xEDB7E950,  0x9AB0D9C6,
-	0x0A0FC457,  0x7D08F4C1,  0xE401A57B,  0x930695ED,
-	0x0D62004E,  0x7A6530D8,  0xE36C6162,  0x946B51F4,
-	0x19635C01,  0x6E646C97,  0xF76D3D2D,  0x806A0DBB,
-	0x1E0E9818,  0x6909A88E,  0xF000F934,  0x8707C9A2,
-	0x17B8D433,  0x60BFE4A5,  0xF9B6B51F,  0x8EB18589,
-	0x10D5102A,  0x67D220BC,  0xFEDB7106,  0x89DC4190,
-	0x49662D3D,  0x3E611DAB,  0xA7684C11,  0xD06F7C87,
-	0x4E0BE924,  0x390CD9B2,  0xA0058808,  0xD702B89E,
-	0x47BDA50F,  0x30BA9599,  0xA9B3C423,  0xDEB4F4B5,
-	0x40D06116,  0x37D75180,  0xAEDE003A,  0xD9D930AC,
-	0x54D13D59,  0x23D60DCF,  0xBADF5C75,  0xCDD86CE3,
-	0x53BCF940,  0x24BBC9D6,  0xBDB2986C,  0xCAB5A8FA,
-	0x5A0AB56B,  0x2D0D85FD,  0xB404D447,  0xC303E4D1,
-	0x5D677172,  0x2A6041E4,  0xB369105E,  0xC46E20C8,
-	0x72080DF5,  0x050F3D63,  0x9C066CD9,  0xEB015C4F,
-	0x7565C9EC,  0x0262F97A,  0x9B6BA8C0,  0xEC6C9856,
-	0x7CD385C7,  0x0BD4B551,  0x92DDE4EB,  0xE5DAD47D,
-	0x7BBE41DE,  0x0CB97148,  0x95B020F2,  0xE2B71064,
-	0x6FBF1D91,  0x18B82D07,  0x81B17CBD,  0xF6B64C2B,
-	0x68D2D988,  0x1FD5E91E,  0x86DCB8A4,  0xF1DB8832,
-	0x616495A3,  0x1663A535,  0x8F6AF48F,  0xF86DC419,
-	0x660951BA,  0x110E612C,  0x88073096,  0xFF000000
-};
-
diff --git a/drivers/net/wan/sbni.h b/drivers/net/wan/sbni.h
deleted file mode 100644
index 84264510a8ed..000000000000
--- a/drivers/net/wan/sbni.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/* sbni.h:  definitions for a Granch SBNI12 driver, version 5.0.0
- * Written 2001 Denis I.Timofeev (timofeev@granch.ru)
- * This file is distributed under the GNU GPL
- */
-
-#ifndef SBNI_H
-#define SBNI_H
-
-#ifdef SBNI_DEBUG
-#define DP( A ) A
-#else
-#define DP( A )
-#endif
-
-
-/* We don't have official vendor id yet... */
-#define SBNI_PCI_VENDOR 	0x55 
-#define SBNI_PCI_DEVICE 	0x9f
-
-#define ISA_MODE 0x00
-#define PCI_MODE 0x01
-
-#define	SBNI_IO_EXTENT	4
-
-enum sbni_reg {
-	CSR0 = 0,
-	CSR1 = 1,
-	DAT  = 2
-};
-
-/* CSR0 mapping */
-enum {
-	BU_EMP = 0x02,
-	RC_CHK = 0x04,
-	CT_ZER = 0x08,
-	TR_REQ = 0x10,
-	TR_RDY = 0x20,
-	EN_INT = 0x40,
-	RC_RDY = 0x80
-};
-
-
-/* CSR1 mapping */
-#define PR_RES 0x80
-
-struct sbni_csr1 {
-#ifdef __LITTLE_ENDIAN_BITFIELD
-	u8 rxl	: 5;
-	u8 rate	: 2;
-	u8 	: 1;
-#else
-	u8 	: 1;
-	u8 rate	: 2;
-	u8 rxl	: 5;
-#endif
-};
-
-/* fields in frame header */
-#define FRAME_ACK_MASK  (unsigned short)0x7000
-#define FRAME_LEN_MASK  (unsigned short)0x03FF
-#define FRAME_FIRST     (unsigned short)0x8000
-#define FRAME_RETRY     (unsigned short)0x0800
-
-#define FRAME_SENT_BAD  (unsigned short)0x4000
-#define FRAME_SENT_OK   (unsigned short)0x3000
-
-
-/* state flags */
-enum {
-	FL_WAIT_ACK    = 0x01,
-	FL_NEED_RESEND = 0x02,
-	FL_PREV_OK     = 0x04,
-	FL_SLOW_MODE   = 0x08,
-	FL_SECONDARY   = 0x10,
-#ifdef CONFIG_SBNI_MULTILINE
-	FL_SLAVE       = 0x20,
-#endif
-	FL_LINE_DOWN   = 0x40
-};
-
-
-enum {
-	DEFAULT_IOBASEADDR = 0x210,
-	DEFAULT_INTERRUPTNUMBER = 5,
-	DEFAULT_RATE = 0,
-	DEFAULT_FRAME_LEN = 1012
-};
-
-#define DEF_RXL_DELTA	-1
-#define DEF_RXL		0xf
-
-#define SBNI_SIG 0x5a
-
-#define	SBNI_MIN_LEN	60	/* Shortest Ethernet frame without FCS */
-#define SBNI_MAX_FRAME	1023
-#define ETHER_MAX_LEN	1518
-
-#define SBNI_TIMEOUT	(HZ/10)
-
-#define TR_ERROR_COUNT	32
-#define CHANGE_LEVEL_START_TICKS 4
-
-#define SBNI_MAX_NUM_CARDS	16
-
-/* internal SBNI-specific statistics */
-struct sbni_in_stats {
-	u32	all_rx_number;
-	u32	bad_rx_number;
-	u32	timeout_number;
-	u32	all_tx_number;
-	u32	resend_tx_number;
-};
-
-/* SBNI ioctl params */
-#define SIOCDEVGETINSTATS 	SIOCDEVPRIVATE
-#define SIOCDEVRESINSTATS 	SIOCDEVPRIVATE+1
-#define SIOCDEVGHWSTATE   	SIOCDEVPRIVATE+2
-#define SIOCDEVSHWSTATE   	SIOCDEVPRIVATE+3
-#define SIOCDEVENSLAVE  	SIOCDEVPRIVATE+4
-#define SIOCDEVEMANSIPATE  	SIOCDEVPRIVATE+5
-
-
-/* data packet for SIOCDEVGHWSTATE/SIOCDEVSHWSTATE ioctl requests */
-struct sbni_flags {
-	u32	rxl		: 4;
-	u32	rate		: 2;
-	u32	fixed_rxl	: 1;
-	u32	slow_mode	: 1;
-	u32	mac_addr	: 24;
-};
-
-/*
- * CRC-32 stuff
- */
-#define CRC32(c,crc) (crc32tab[((size_t)(crc) ^ (c)) & 0xff] ^ (((crc) >> 8) & 0x00FFFFFF))
-      /* CRC generator 0xEDB88320 */
-      /* CRC remainder 0x2144DF1C */
-      /* CRC initial value 0x00000000 */
-#define CRC32_REMAINDER 0x2144DF1C
-#define CRC32_INITIAL 0x00000000
-
-#ifndef __initdata
-#define __initdata
-#endif
-
-#endif
-
diff --git a/include/net/Space.h b/include/net/Space.h
index 93fd6caa4bad..08ca9cef0213 100644
--- a/include/net/Space.h
+++ b/include/net/Space.h
@@ -18,6 +18,3 @@ struct net_device *cops_probe(int unit);
 
 /* Fibre Channel adapters */
 int iph5526_probe(struct net_device *dev);
-
-/* SBNI adapters */
-int sbni_probe(int unit);
-- 
cgit v1.2.3


From 102793136ce9dacae77c2d1f8dd6878b480f4b95 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 2 Aug 2021 16:56:18 -0700
Subject: Bluetooth: HCI: Add proper tracking for enable status of adv
 instances

This adds a field to track if advertising instances are enabled or not
and only clear HCI_LE_ADV flag if there is no instance left advertising.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h |  1 +
 net/bluetooth/hci_event.c        | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 4abe3c494002..b79b31359bf8 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -221,6 +221,7 @@ struct oob_data {
 
 struct adv_info {
 	struct list_head list;
+	bool enabled;
 	bool pending;
 	__u8	instance;
 	__u32	flags;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index ea7fc09478be..35c5cc9f91b0 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1277,7 +1277,9 @@ static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev,
 					 struct sk_buff *skb)
 {
 	struct hci_cp_le_set_ext_adv_enable *cp;
+	struct hci_cp_ext_adv_set *set;
 	__u8 status = *((__u8 *) skb->data);
+	struct adv_info *adv = NULL, *n;
 
 	BT_DBG("%s status 0x%2.2x", hdev->name, status);
 
@@ -1288,22 +1290,48 @@ static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev,
 	if (!cp)
 		return;
 
+	set = (void *)cp->data;
+
 	hci_dev_lock(hdev);
 
+	if (cp->num_of_sets)
+		adv = hci_find_adv_instance(hdev, set->handle);
+
 	if (cp->enable) {
 		struct hci_conn *conn;
 
 		hci_dev_set_flag(hdev, HCI_LE_ADV);
 
+		if (adv)
+			adv->enabled = true;
+
 		conn = hci_lookup_le_connect(hdev);
 		if (conn)
 			queue_delayed_work(hdev->workqueue,
 					   &conn->le_conn_timeout,
 					   conn->conn_timeout);
 	} else {
+		if (adv) {
+			adv->enabled = false;
+			/* If just one instance was disabled check if there are
+			 * any other instance enabled before clearing HCI_LE_ADV
+			 */
+			list_for_each_entry_safe(adv, n, &hdev->adv_instances,
+						 list) {
+				if (adv->enabled)
+					goto unlock;
+			}
+		} else {
+			/* All instances shall be considered disabled */
+			list_for_each_entry_safe(adv, n, &hdev->adv_instances,
+						 list)
+				adv->enabled = false;
+		}
+
 		hci_dev_clear_flag(hdev, HCI_LE_ADV);
 	}
 
+unlock:
 	hci_dev_unlock(hdev);
 }
 
-- 
cgit v1.2.3


From c45074d68a9b1e893d86520af71fab37693c3d7e Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 2 Aug 2021 16:56:19 -0700
Subject: Bluetooth: Fix not generating RPA when required

Code was checking if random_addr and hdev->rpa match without first
checking if the RPA has not been set (BDADDR_ANY), furthermore it was
clearing HCI_RPA_EXPIRED before the command completes and the RPA is
actually programmed which in case of failure would leave the expired
RPA still set.

Since advertising instance have a similar problem the clearing of
HCI_RPA_EXPIRED has been moved to hci_event.c after checking the random
address is in fact the hdev->rap and then proceed to set the expire
timeout.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h |  4 ++
 net/bluetooth/hci_event.c        | 32 +++++++++++-----
 net/bluetooth/hci_request.c      | 81 +++++++++++++++++-----------------------
 3 files changed, 61 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index b79b31359bf8..b011eeea28c3 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1413,6 +1413,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 				!hci_dev_test_flag(dev, HCI_AUTO_OFF))
 #define bredr_sc_enabled(dev)  (lmp_sc_capable(dev) && \
 				hci_dev_test_flag(dev, HCI_SC_ENABLED))
+#define rpa_valid(dev)         (bacmp(&dev->rpa, BDADDR_ANY) && \
+				!hci_dev_test_flag(dev, HCI_RPA_EXPIRED))
+#define adv_rpa_valid(adv)     (bacmp(&adv->random_addr, BDADDR_ANY) && \
+				!adv->rpa_expired)
 
 #define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \
 		      ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M))
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 35c5cc9f91b0..38decf474f31 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -40,6 +40,8 @@
 #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \
 		 "\x00\x00\x00\x00\x00\x00\x00\x00"
 
+#define secs_to_jiffies(_secs) msecs_to_jiffies((_secs) * 1000)
+
 /* Handle HCI Event packets */
 
 static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb,
@@ -1171,6 +1173,12 @@ static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb)
 
 	bacpy(&hdev->random_addr, sent);
 
+	if (!bacmp(&hdev->rpa, sent)) {
+		hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED);
+		queue_delayed_work(hdev->workqueue, &hdev->rpa_expired,
+				   secs_to_jiffies(hdev->rpa_timeout));
+	}
+
 	hci_dev_unlock(hdev);
 }
 
@@ -1201,24 +1209,30 @@ static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev,
 {
 	__u8 status = *((__u8 *) skb->data);
 	struct hci_cp_le_set_adv_set_rand_addr *cp;
-	struct adv_info *adv_instance;
+	struct adv_info *adv;
 
 	if (status)
 		return;
 
 	cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR);
-	if (!cp)
+	/* Update only in case the adv instance since handle 0x00 shall be using
+	 * HCI_OP_LE_SET_RANDOM_ADDR since that allows both extended and
+	 * non-extended adverting.
+	 */
+	if (!cp || !cp->handle)
 		return;
 
 	hci_dev_lock(hdev);
 
-	if (!cp->handle) {
-		/* Store in hdev for instance 0 (Set adv and Directed advs) */
-		bacpy(&hdev->random_addr, &cp->bdaddr);
-	} else {
-		adv_instance = hci_find_adv_instance(hdev, cp->handle);
-		if (adv_instance)
-			bacpy(&adv_instance->random_addr, &cp->bdaddr);
+	adv = hci_find_adv_instance(hdev, cp->handle);
+	if (adv) {
+		bacpy(&adv->random_addr, &cp->bdaddr);
+		if (!bacmp(&hdev->rpa, &cp->bdaddr)) {
+			adv->rpa_expired = false;
+			queue_delayed_work(hdev->workqueue,
+					   &adv->rpa_expired_cb,
+					   secs_to_jiffies(hdev->rpa_timeout));
+		}
 	}
 
 	hci_dev_unlock(hdev);
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 1d14adc023e9..f15626607b2d 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -2072,8 +2072,6 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
 	 * current RPA has expired then generate a new one.
 	 */
 	if (use_rpa) {
-		int to;
-
 		/* If Controller supports LL Privacy use own address type is
 		 * 0x03
 		 */
@@ -2084,14 +2082,10 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
 			*own_addr_type = ADDR_LE_DEV_RANDOM;
 
 		if (adv_instance) {
-			if (!adv_instance->rpa_expired &&
-			    !bacmp(&adv_instance->random_addr, &hdev->rpa))
+			if (adv_rpa_valid(adv_instance))
 				return 0;
-
-			adv_instance->rpa_expired = false;
 		} else {
-			if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) &&
-			    !bacmp(&hdev->random_addr, &hdev->rpa))
+			if (rpa_valid(hdev))
 				return 0;
 		}
 
@@ -2103,14 +2097,6 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
 
 		bacpy(rand_addr, &hdev->rpa);
 
-		to = msecs_to_jiffies(hdev->rpa_timeout * 1000);
-		if (adv_instance)
-			queue_delayed_work(hdev->workqueue,
-					   &adv_instance->rpa_expired_cb, to);
-		else
-			queue_delayed_work(hdev->workqueue,
-					   &hdev->rpa_expired, to);
-
 		return 0;
 	}
 
@@ -2153,6 +2139,30 @@ void __hci_req_clear_ext_adv_sets(struct hci_request *req)
 	hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL);
 }
 
+static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
+{
+	struct hci_dev *hdev = req->hdev;
+
+	/* If we're advertising or initiating an LE connection we can't
+	 * go ahead and change the random address at this time. This is
+	 * because the eventual initiator address used for the
+	 * subsequently created connection will be undefined (some
+	 * controllers use the new address and others the one we had
+	 * when the operation started).
+	 *
+	 * In this kind of scenario skip the update and let the random
+	 * address be updated at the next cycle.
+	 */
+	if (hci_dev_test_flag(hdev, HCI_LE_ADV) ||
+	    hci_lookup_le_connect(hdev)) {
+		bt_dev_dbg(hdev, "Deferring random address update");
+		hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+		return;
+	}
+
+	hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa);
+}
+
 int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
 {
 	struct hci_cp_le_set_ext_adv_params cp;
@@ -2255,6 +2265,13 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
 		} else {
 			if (!bacmp(&random_addr, &hdev->random_addr))
 				return 0;
+			/* Instance 0x00 doesn't have an adv_info, instead it
+			 * uses hdev->random_addr to track its address so
+			 * whenever it needs to be updated this also set the
+			 * random address since hdev->random_addr is shared with
+			 * scan state machine.
+			 */
+			set_random_addr(req, &random_addr);
 		}
 
 		memset(&cp, 0, sizeof(cp));
@@ -2512,30 +2529,6 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
 						false);
 }
 
-static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
-{
-	struct hci_dev *hdev = req->hdev;
-
-	/* If we're advertising or initiating an LE connection we can't
-	 * go ahead and change the random address at this time. This is
-	 * because the eventual initiator address used for the
-	 * subsequently created connection will be undefined (some
-	 * controllers use the new address and others the one we had
-	 * when the operation started).
-	 *
-	 * In this kind of scenario skip the update and let the random
-	 * address be updated at the next cycle.
-	 */
-	if (hci_dev_test_flag(hdev, HCI_LE_ADV) ||
-	    hci_lookup_le_connect(hdev)) {
-		bt_dev_dbg(hdev, "Deferring random address update");
-		hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
-		return;
-	}
-
-	hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa);
-}
-
 int hci_update_random_address(struct hci_request *req, bool require_privacy,
 			      bool use_rpa, u8 *own_addr_type)
 {
@@ -2547,8 +2540,6 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy,
 	 * the current RPA in use, then generate a new one.
 	 */
 	if (use_rpa) {
-		int to;
-
 		/* If Controller supports LL Privacy use own address type is
 		 * 0x03
 		 */
@@ -2558,8 +2549,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy,
 		else
 			*own_addr_type = ADDR_LE_DEV_RANDOM;
 
-		if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) &&
-		    !bacmp(&hdev->random_addr, &hdev->rpa))
+		if (rpa_valid(hdev))
 			return 0;
 
 		err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
@@ -2570,9 +2560,6 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy,
 
 		set_random_addr(req, &hdev->rpa);
 
-		to = msecs_to_jiffies(hdev->rpa_timeout * 1000);
-		queue_delayed_work(hdev->workqueue, &hdev->rpa_expired, to);
-
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 27cfdadd687deca58146b415f60b23d185cb3532 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Tue, 3 Aug 2021 19:57:42 +0300
Subject: bus: fsl-mc: extend fsl_mc_get_endpoint() to pass interface ID

In case of a switch DPAA2 object, the interface ID is also needed when
querying for the object endpoint. Extend fsl_mc_get_endpoint() so that
users can also pass the interface ID that are interested in.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/bus/fsl-mc/fsl-mc-bus.c                  | 4 +++-
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 2 +-
 include/linux/fsl/mc.h                           | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c
index 09c8ab5e0959..b3691de8ac06 100644
--- a/drivers/bus/fsl-mc/fsl-mc-bus.c
+++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
@@ -914,7 +914,8 @@ void fsl_mc_device_remove(struct fsl_mc_device *mc_dev)
 }
 EXPORT_SYMBOL_GPL(fsl_mc_device_remove);
 
-struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev)
+struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev,
+					  u16 if_id)
 {
 	struct fsl_mc_device *mc_bus_dev, *endpoint;
 	struct fsl_mc_obj_desc endpoint_desc = {{ 0 }};
@@ -925,6 +926,7 @@ struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev)
 	mc_bus_dev = to_fsl_mc_device(mc_dev->dev.parent);
 	strcpy(endpoint1.type, mc_dev->obj_desc.type);
 	endpoint1.id = mc_dev->obj_desc.id;
+	endpoint1.if_id = if_id;
 
 	err = dprc_get_connection(mc_bus_dev->mc_io, 0,
 				  mc_bus_dev->mc_handle,
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index f664021c3ad1..7065c71ed7b8 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -4138,7 +4138,7 @@ static int dpaa2_eth_connect_mac(struct dpaa2_eth_priv *priv)
 	int err;
 
 	dpni_dev = to_fsl_mc_device(priv->net_dev->dev.parent);
-	dpmac_dev = fsl_mc_get_endpoint(dpni_dev);
+	dpmac_dev = fsl_mc_get_endpoint(dpni_dev, 0);
 
 	if (PTR_ERR(dpmac_dev) == -EPROBE_DEFER)
 		return PTR_ERR(dpmac_dev);
diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h
index 63b56aba925a..30ece3ae6df7 100644
--- a/include/linux/fsl/mc.h
+++ b/include/linux/fsl/mc.h
@@ -423,7 +423,8 @@ int __must_check fsl_mc_allocate_irqs(struct fsl_mc_device *mc_dev);
 
 void fsl_mc_free_irqs(struct fsl_mc_device *mc_dev);
 
-struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev);
+struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev,
+					  u16 if_id);
 
 extern struct bus_type fsl_mc_bus_type;
 
-- 
cgit v1.2.3


From 314001f0bf927015e459c9d387d62a231fe93af3 Mon Sep 17 00:00:00 2001
From: Rao Shoaib <rao.shoaib@oracle.com>
Date: Sun, 1 Aug 2021 00:57:07 -0700
Subject: af_unix: Add OOB support

This patch adds OOB support for AF_UNIX sockets.
The semantics is same as TCP.

The last byte of a message with the OOB flag is
treated as the OOB byte. The byte is separated into
a skb and a pointer to the skb is stored in unix_sock.
The pointer is used to enforce OOB semantics.

Signed-off-by: Rao Shoaib <rao.shoaib@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h                              |   3 +
 net/unix/Kconfig                                   |   5 +
 net/unix/af_unix.c                                 | 153 +++++++-
 tools/testing/selftests/Makefile                   |   1 +
 tools/testing/selftests/net/af_unix/Makefile       |   5 +
 .../testing/selftests/net/af_unix/test_unix_oob.c  | 437 +++++++++++++++++++++
 6 files changed, 602 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/net/af_unix/Makefile
 create mode 100644 tools/testing/selftests/net/af_unix/test_unix_oob.c

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 435a2c3d5a6f..4757d7f53f13 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -70,6 +70,9 @@ struct unix_sock {
 	struct socket_wq	peer_wq;
 	wait_queue_entry_t	peer_wake;
 	struct scm_stat		scm_stat;
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+	struct sk_buff		*oob_skb;
+#endif
 };
 
 static inline struct unix_sock *unix_sk(const struct sock *sk)
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
index b6c4282899ec..b7f811216820 100644
--- a/net/unix/Kconfig
+++ b/net/unix/Kconfig
@@ -25,6 +25,11 @@ config UNIX_SCM
 	depends on UNIX
 	default y
 
+config	AF_UNIX_OOB
+	bool
+	depends on UNIX
+	default y
+
 config UNIX_DIAG
 	tristate "UNIX: socket monitoring interface"
 	depends on UNIX
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 256c4e31132e..ec02e70a549b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -503,6 +503,12 @@ static void unix_sock_destructor(struct sock *sk)
 
 	skb_queue_purge(&sk->sk_receive_queue);
 
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+	if (u->oob_skb) {
+		kfree_skb(u->oob_skb);
+		u->oob_skb = NULL;
+	}
+#endif
 	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
 	WARN_ON(!sk_unhashed(sk));
 	WARN_ON(sk->sk_socket);
@@ -1889,6 +1895,46 @@ out:
  */
 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
 
+#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
+static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
+{
+	struct unix_sock *ousk = unix_sk(other);
+	struct sk_buff *skb;
+	int err = 0;
+
+	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
+
+	if (!skb)
+		return err;
+
+	skb_put(skb, 1);
+	skb->len = 1;
+	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
+
+	if (err) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	unix_state_lock(other);
+	maybe_add_creds(skb, sock, other);
+	skb_get(skb);
+
+	if (ousk->oob_skb)
+		kfree_skb(ousk->oob_skb);
+
+	ousk->oob_skb = skb;
+
+	scm_stat_add(other, skb);
+	skb_queue_tail(&other->sk_receive_queue, skb);
+	sk_send_sigurg(other);
+	unix_state_unlock(other);
+	other->sk_data_ready(other);
+
+	return err;
+}
+#endif
+
 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 			       size_t len)
 {
@@ -1907,8 +1953,14 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 		return err;
 
 	err = -EOPNOTSUPP;
-	if (msg->msg_flags&MSG_OOB)
-		goto out_err;
+	if (msg->msg_flags & MSG_OOB) {
+#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
+		if (len)
+			len--;
+		else
+#endif
+			goto out_err;
+	}
 
 	if (msg->msg_namelen) {
 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
@@ -1973,6 +2025,15 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 		sent += size;
 	}
 
+#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
+	if (msg->msg_flags & MSG_OOB) {
+		err = queue_oob(sock, msg, other);
+		if (err)
+			goto out_err;
+		sent++;
+	}
+#endif
+
 	scm_destroy(&scm);
 
 	return sent;
@@ -2358,6 +2419,59 @@ struct unix_stream_read_state {
 	unsigned int splice_flags;
 };
 
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+static int unix_stream_recv_urg(struct unix_stream_read_state *state)
+{
+	struct socket *sock = state->socket;
+	struct sock *sk = sock->sk;
+	struct unix_sock *u = unix_sk(sk);
+	int chunk = 1;
+
+	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb)
+		return -EINVAL;
+
+	chunk = state->recv_actor(u->oob_skb, 0, chunk, state);
+	if (chunk < 0)
+		return -EFAULT;
+
+	if (!(state->flags & MSG_PEEK)) {
+		UNIXCB(u->oob_skb).consumed += 1;
+		kfree_skb(u->oob_skb);
+		u->oob_skb = NULL;
+	}
+	state->msg->msg_flags |= MSG_OOB;
+	return 1;
+}
+
+static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
+				  int flags, int copied)
+{
+	struct unix_sock *u = unix_sk(sk);
+
+	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
+		skb_unlink(skb, &sk->sk_receive_queue);
+		consume_skb(skb);
+		skb = NULL;
+	} else {
+		if (skb == u->oob_skb) {
+			if (copied) {
+				skb = NULL;
+			} else if (sock_flag(sk, SOCK_URGINLINE)) {
+				if (!(flags & MSG_PEEK)) {
+					u->oob_skb = NULL;
+					consume_skb(skb);
+				}
+			} else if (!(flags & MSG_PEEK)) {
+				skb_unlink(skb, &sk->sk_receive_queue);
+				consume_skb(skb);
+				skb = skb_peek(&sk->sk_receive_queue);
+			}
+		}
+	}
+	return skb;
+}
+#endif
+
 static int unix_stream_read_generic(struct unix_stream_read_state *state,
 				    bool freezable)
 {
@@ -2383,6 +2497,15 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 
 	if (unlikely(flags & MSG_OOB)) {
 		err = -EOPNOTSUPP;
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+		mutex_lock(&u->iolock);
+		unix_state_lock(sk);
+
+		err = unix_stream_recv_urg(state);
+
+		unix_state_unlock(sk);
+		mutex_unlock(&u->iolock);
+#endif
 		goto out;
 	}
 
@@ -2411,6 +2534,18 @@ redo:
 		}
 		last = skb = skb_peek(&sk->sk_receive_queue);
 		last_len = last ? last->len : 0;
+
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+		if (skb) {
+			skb = manage_oob(skb, sk, flags, copied);
+			if (!skb) {
+				unix_state_unlock(sk);
+				if (copied)
+					break;
+				goto redo;
+			}
+		}
+#endif
 again:
 		if (skb == NULL) {
 			if (copied >= target)
@@ -2746,6 +2881,20 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case SIOCUNIXFILE:
 		err = unix_open_file(sk);
 		break;
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+	case SIOCATMARK:
+		{
+			struct sk_buff *skb;
+			struct unix_sock *u = unix_sk(sk);
+			int answ = 0;
+
+			skb = skb_peek(&sk->sk_receive_queue);
+			if (skb && skb == u->oob_skb)
+				answ = 1;
+			err = put_user(answ, (int __user *)arg);
+		}
+		break;
+#endif
 	default:
 		err = -ENOIOCTLCMD;
 		break;
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index fb010a35d61a..da9e8b699e42 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -38,6 +38,7 @@ TARGETS += mount_setattr
 TARGETS += mqueue
 TARGETS += nci
 TARGETS += net
+TARGETS += net/af_unix
 TARGETS += net/forwarding
 TARGETS += net/mptcp
 TARGETS += netfilter
diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile
new file mode 100644
index 000000000000..cfc7f4f97fd1
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/Makefile
@@ -0,0 +1,5 @@
+##TEST_GEN_FILES := test_unix_oob
+TEST_PROGS := test_unix_oob
+include ../../lib.mk
+
+all: $(TEST_PROGS)
diff --git a/tools/testing/selftests/net/af_unix/test_unix_oob.c b/tools/testing/selftests/net/af_unix/test_unix_oob.c
new file mode 100644
index 000000000000..0f3e3763f4f8
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/test_unix_oob.c
@@ -0,0 +1,437 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <netinet/tcp.h>
+#include <sys/un.h>
+#include <sys/signal.h>
+#include <sys/poll.h>
+
+static int pipefd[2];
+static int signal_recvd;
+static pid_t producer_id;
+static char sock_name[32];
+
+static void sig_hand(int sn, siginfo_t *si, void *p)
+{
+	signal_recvd = sn;
+}
+
+static int set_sig_handler(int signal)
+{
+	struct sigaction sa;
+
+	sa.sa_sigaction = sig_hand;
+	sigemptyset(&sa.sa_mask);
+	sa.sa_flags = SA_SIGINFO | SA_RESTART;
+
+	return sigaction(signal, &sa, NULL);
+}
+
+static void set_filemode(int fd, int set)
+{
+	int flags = fcntl(fd, F_GETFL, 0);
+
+	if (set)
+		flags &= ~O_NONBLOCK;
+	else
+		flags |= O_NONBLOCK;
+	fcntl(fd, F_SETFL, flags);
+}
+
+static void signal_producer(int fd)
+{
+	char cmd;
+
+	cmd = 'S';
+	write(fd, &cmd, sizeof(cmd));
+}
+
+static void wait_for_signal(int fd)
+{
+	char buf[5];
+
+	read(fd, buf, 5);
+}
+
+static void die(int status)
+{
+	fflush(NULL);
+	unlink(sock_name);
+	kill(producer_id, SIGTERM);
+	exit(status);
+}
+
+int is_sioctatmark(int fd)
+{
+	int ans = -1;
+
+	if (ioctl(fd, SIOCATMARK, &ans, sizeof(ans)) < 0) {
+#ifdef DEBUG
+		perror("SIOCATMARK Failed");
+#endif
+	}
+	return ans;
+}
+
+void read_oob(int fd, char *c)
+{
+
+	*c = ' ';
+	if (recv(fd, c, sizeof(*c), MSG_OOB) < 0) {
+#ifdef DEBUG
+		perror("Reading MSG_OOB Failed");
+#endif
+	}
+}
+
+int read_data(int pfd, char *buf, int size)
+{
+	int len = 0;
+
+	memset(buf, size, '0');
+	len = read(pfd, buf, size);
+#ifdef DEBUG
+	if (len < 0)
+		perror("read failed");
+#endif
+	return len;
+}
+
+static void wait_for_data(int pfd, int event)
+{
+	struct pollfd pfds[1];
+
+	pfds[0].fd = pfd;
+	pfds[0].events = event;
+	poll(pfds, 1, -1);
+}
+
+void producer(struct sockaddr_un *consumer_addr)
+{
+	int cfd;
+	char buf[64];
+	int i;
+
+	memset(buf, 'x', sizeof(buf));
+	cfd = socket(AF_UNIX, SOCK_STREAM, 0);
+
+	wait_for_signal(pipefd[0]);
+	if (connect(cfd, (struct sockaddr *)consumer_addr,
+		     sizeof(struct sockaddr)) != 0) {
+		perror("Connect failed");
+		kill(0, SIGTERM);
+		exit(1);
+	}
+
+	for (i = 0; i < 2; i++) {
+		/* Test 1: Test for SIGURG and OOB */
+		wait_for_signal(pipefd[0]);
+		memset(buf, 'x', sizeof(buf));
+		buf[63] = '@';
+		send(cfd, buf, sizeof(buf), MSG_OOB);
+
+		wait_for_signal(pipefd[0]);
+
+		/* Test 2: Test for OOB being overwitten */
+		memset(buf, 'x', sizeof(buf));
+		buf[63] = '%';
+		send(cfd, buf, sizeof(buf), MSG_OOB);
+
+		memset(buf, 'x', sizeof(buf));
+		buf[63] = '#';
+		send(cfd, buf, sizeof(buf), MSG_OOB);
+
+		wait_for_signal(pipefd[0]);
+
+		/* Test 3: Test for SIOCATMARK */
+		memset(buf, 'x', sizeof(buf));
+		buf[63] = '@';
+		send(cfd, buf, sizeof(buf), MSG_OOB);
+
+		memset(buf, 'x', sizeof(buf));
+		buf[63] = '%';
+		send(cfd, buf, sizeof(buf), MSG_OOB);
+
+		memset(buf, 'x', sizeof(buf));
+		send(cfd, buf, sizeof(buf), 0);
+
+		wait_for_signal(pipefd[0]);
+
+		/* Test 4: Test for 1byte OOB msg */
+		memset(buf, 'x', sizeof(buf));
+		buf[0] = '@';
+		send(cfd, buf, 1, MSG_OOB);
+	}
+}
+
+int
+main(int argc, char **argv)
+{
+	int lfd, pfd;
+	struct sockaddr_un consumer_addr, paddr;
+	socklen_t len = sizeof(consumer_addr);
+	char buf[1024];
+	int on = 0;
+	char oob;
+	int flags;
+	int atmark;
+	char *tmp_file;
+
+	lfd = socket(AF_UNIX, SOCK_STREAM, 0);
+	memset(&consumer_addr, 0, sizeof(consumer_addr));
+	consumer_addr.sun_family = AF_UNIX;
+	sprintf(sock_name, "unix_oob_%d", getpid());
+	unlink(sock_name);
+	strcpy(consumer_addr.sun_path, sock_name);
+
+	if ((bind(lfd, (struct sockaddr *)&consumer_addr,
+		  sizeof(consumer_addr))) != 0) {
+		perror("socket bind failed");
+		exit(1);
+	}
+
+	pipe(pipefd);
+
+	listen(lfd, 1);
+
+	producer_id = fork();
+	if (producer_id == 0) {
+		producer(&consumer_addr);
+		exit(0);
+	}
+
+	set_sig_handler(SIGURG);
+	signal_producer(pipefd[1]);
+
+	pfd = accept(lfd, (struct sockaddr *) &paddr, &len);
+	fcntl(pfd, F_SETOWN, getpid());
+
+	signal_recvd = 0;
+	signal_producer(pipefd[1]);
+
+	/* Test 1:
+	 * veriyf that SIGURG is
+	 * delivered and 63 bytes are
+	 * read and oob is '@'
+	 */
+	wait_for_data(pfd, POLLIN | POLLPRI);
+	read_oob(pfd, &oob);
+	len = read_data(pfd, buf, 1024);
+	if (!signal_recvd || len != 63 || oob != '@') {
+		fprintf(stderr, "Test 1 failed sigurg %d len %d %c\n",
+			 signal_recvd, len, oob);
+			die(1);
+	}
+
+	signal_recvd = 0;
+	signal_producer(pipefd[1]);
+
+	/* Test 2:
+	 * Verify that the first OOB is over written by
+	 * the 2nd one and the first OOB is returned as
+	 * part of the read, and sigurg is received.
+	 */
+	wait_for_data(pfd, POLLIN | POLLPRI);
+	len = 0;
+	while (len < 70)
+		len = recv(pfd, buf, 1024, MSG_PEEK);
+	len = read_data(pfd, buf, 1024);
+	read_oob(pfd, &oob);
+	if (!signal_recvd || len != 127 || oob != '#') {
+		fprintf(stderr, "Test 2 failed, sigurg %d len %d OOB %c\n",
+		signal_recvd, len, oob);
+		die(1);
+	}
+
+	signal_recvd = 0;
+	signal_producer(pipefd[1]);
+
+	/* Test 3:
+	 * verify that 2nd oob over writes
+	 * the first one and read breaks at
+	 * oob boundary returning 127 bytes
+	 * and sigurg is received and atmark
+	 * is set.
+	 * oob is '%' and second read returns
+	 * 64 bytes.
+	 */
+	len = 0;
+	wait_for_data(pfd, POLLIN | POLLPRI);
+	while (len < 150)
+		len = recv(pfd, buf, 1024, MSG_PEEK);
+	len = read_data(pfd, buf, 1024);
+	atmark = is_sioctatmark(pfd);
+	read_oob(pfd, &oob);
+
+	if (!signal_recvd || len != 127 || oob != '%' || atmark != 1) {
+		fprintf(stderr, "Test 3 failed, sigurg %d len %d OOB %c ",
+		"atmark %d\n", signal_recvd, len, oob, atmark);
+		die(1);
+	}
+
+	signal_recvd = 0;
+
+	len = read_data(pfd, buf, 1024);
+	if (len != 64) {
+		fprintf(stderr, "Test 3.1 failed, sigurg %d len %d OOB %c\n",
+			signal_recvd, len, oob);
+		die(1);
+	}
+
+	signal_recvd = 0;
+	signal_producer(pipefd[1]);
+
+	/* Test 4:
+	 * verify that a single byte
+	 * oob message is delivered.
+	 * set non blocking mode and
+	 * check proper error is
+	 * returned and sigurg is
+	 * received and correct
+	 * oob is read.
+	 */
+
+	set_filemode(pfd, 0);
+
+	wait_for_data(pfd, POLLIN | POLLPRI);
+	len = read_data(pfd, buf, 1024);
+	if ((len == -1) && (errno == 11))
+		len = 0;
+
+	read_oob(pfd, &oob);
+
+	if (!signal_recvd || len != 0 || oob != '@') {
+		fprintf(stderr, "Test 4 failed, sigurg %d len %d OOB %c\n",
+			 signal_recvd, len, oob);
+		die(1);
+	}
+
+	set_filemode(pfd, 1);
+
+	/* Inline Testing */
+
+	on = 1;
+	if (setsockopt(pfd, SOL_SOCKET, SO_OOBINLINE, &on, sizeof(on))) {
+		perror("SO_OOBINLINE");
+		die(1);
+	}
+
+	signal_recvd = 0;
+	signal_producer(pipefd[1]);
+
+	/* Test 1 -- Inline:
+	 * Check that SIGURG is
+	 * delivered and 63 bytes are
+	 * read and oob is '@'
+	 */
+
+	wait_for_data(pfd, POLLIN | POLLPRI);
+	len = read_data(pfd, buf, 1024);
+
+	if (!signal_recvd || len != 63) {
+		fprintf(stderr, "Test 1 Inline failed, sigurg %d len %d\n",
+			signal_recvd, len);
+		die(1);
+	}
+
+	len = read_data(pfd, buf, 1024);
+
+	if (len != 1) {
+		fprintf(stderr,
+			 "Test 1.1 Inline failed, sigurg %d len %d oob %c\n",
+			 signal_recvd, len, oob);
+		die(1);
+	}
+
+	signal_recvd = 0;
+	signal_producer(pipefd[1]);
+
+	/* Test 2 -- Inline:
+	 * Verify that the first OOB is over written by
+	 * the 2nd one and read breaks correctly on
+	 * 2nd OOB boundary with the first OOB returned as
+	 * part of the read, and sigurg is delivered and
+	 * siocatmark returns true.
+	 * next read returns one byte, the oob byte
+	 * and siocatmark returns false.
+	 */
+	len = 0;
+	wait_for_data(pfd, POLLIN | POLLPRI);
+	while (len < 70)
+		len = recv(pfd, buf, 1024, MSG_PEEK);
+	len = read_data(pfd, buf, 1024);
+	atmark = is_sioctatmark(pfd);
+	if (len != 127 || atmark != 1 || !signal_recvd) {
+		fprintf(stderr, "Test 2 Inline failed, len %d atmark %d\n",
+			 len, atmark);
+		die(1);
+	}
+
+	len = read_data(pfd, buf, 1024);
+	atmark = is_sioctatmark(pfd);
+	if (len != 1 || buf[0] != '#' || atmark == 1) {
+		fprintf(stderr, "Test 2.1 Inline failed, len %d data %c atmark %d\n",
+			len, buf[0], atmark);
+		die(1);
+	}
+
+	signal_recvd = 0;
+	signal_producer(pipefd[1]);
+
+	/* Test 3 -- Inline:
+	 * verify that 2nd oob over writes
+	 * the first one and read breaks at
+	 * oob boundary returning 127 bytes
+	 * and sigurg is received and siocatmark
+	 * is true after the read.
+	 * subsequent read returns 65 bytes
+	 * because of oob which should be '%'.
+	 */
+	len = 0;
+	wait_for_data(pfd, POLLIN | POLLPRI);
+	while (len < 126)
+		len = recv(pfd, buf, 1024, MSG_PEEK);
+	len = read_data(pfd, buf, 1024);
+	atmark = is_sioctatmark(pfd);
+	if (!signal_recvd || len != 127 || !atmark) {
+		fprintf(stderr,
+			 "Test 3 Inline failed, sigurg %d len %d data %c\n",
+			 signal_recvd, len, buf[0]);
+		die(1);
+	}
+
+	len = read_data(pfd, buf, 1024);
+	atmark = is_sioctatmark(pfd);
+	if (len != 65 || buf[0] != '%' || atmark != 0) {
+		fprintf(stderr,
+			 "Test 3.1 Inline failed, len %d oob %c atmark %d\n",
+			 len, buf[0], atmark);
+		die(1);
+	}
+
+	signal_recvd = 0;
+	signal_producer(pipefd[1]);
+
+	/* Test 4 -- Inline:
+	 * verify that a single
+	 * byte oob message is delivered
+	 * and read returns one byte, the oob
+	 * byte and sigurg is received
+	 */
+	wait_for_data(pfd, POLLIN | POLLPRI);
+	len = read_data(pfd, buf, 1024);
+	if (!signal_recvd || len != 1 || buf[0] != '@') {
+		fprintf(stderr,
+			"Test 4 Inline failed, signal %d len %d data %c\n",
+		signal_recvd, len, buf[0]);
+		die(1);
+	}
+	die(0);
+}
-- 
cgit v1.2.3


From 8679c31e0284aa3aaba038035e443180b5bacb99 Mon Sep 17 00:00:00 2001
From: Rocco Yue <rocco.yue@mediatek.com>
Date: Tue, 3 Aug 2021 20:02:50 +0800
Subject: net: add extack arg for link ops

Pass extack arg to validate_linkmsg and validate_link_af callbacks.
If a netlink attribute has a reject_message, use the extended ack
mechanism to carry the message back to user space.

Signed-off-by: Rocco Yue <rocco.yue@mediatek.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h | 3 ++-
 net/core/rtnetlink.c    | 9 +++++----
 net/ipv4/devinet.c      | 5 +++--
 net/ipv6/addrconf.c     | 5 +++--
 4 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 384e800665f2..9f48733bfd21 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -153,7 +153,8 @@ struct rtnl_af_ops {
 						    u32 ext_filter_mask);
 
 	int			(*validate_link_af)(const struct net_device *dev,
-						    const struct nlattr *attr);
+						    const struct nlattr *attr,
+						    struct netlink_ext_ack *extack);
 	int			(*set_link_af)(struct net_device *dev,
 					       const struct nlattr *attr,
 					       struct netlink_ext_ack *extack);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e79aaf1f7139..7c9d32cfe607 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2257,7 +2257,8 @@ invalid_attr:
 	return -EINVAL;
 }
 
-static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
+			    struct netlink_ext_ack *extack)
 {
 	if (dev) {
 		if (tb[IFLA_ADDRESS] &&
@@ -2284,7 +2285,7 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 				return -EOPNOTSUPP;
 
 			if (af_ops->validate_link_af) {
-				err = af_ops->validate_link_af(dev, af);
+				err = af_ops->validate_link_af(dev, af, extack);
 				if (err < 0)
 					return err;
 			}
@@ -2592,7 +2593,7 @@ static int do_setlink(const struct sk_buff *skb,
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int err;
 
-	err = validate_linkmsg(dev, tb);
+	err = validate_linkmsg(dev, tb, extack);
 	if (err < 0)
 		return err;
 
@@ -3290,7 +3291,7 @@ replay:
 			m_ops = master_dev->rtnl_link_ops;
 	}
 
-	err = validate_linkmsg(dev, tb);
+	err = validate_linkmsg(dev, tb, extack);
 	if (err < 0)
 		return err;
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c82aded8da7d..f4468980b675 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1950,7 +1950,8 @@ static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
 };
 
 static int inet_validate_link_af(const struct net_device *dev,
-				 const struct nlattr *nla)
+				 const struct nlattr *nla,
+				 struct netlink_ext_ack *extack)
 {
 	struct nlattr *a, *tb[IFLA_INET_MAX+1];
 	int err, rem;
@@ -1959,7 +1960,7 @@ static int inet_validate_link_af(const struct net_device *dev,
 		return -EAFNOSUPPORT;
 
 	err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla,
-					  inet_af_policy, NULL);
+					  inet_af_policy, extack);
 	if (err < 0)
 		return err;
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index db0a89810f28..0b786fc7b7d4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5792,7 +5792,8 @@ static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
 }
 
 static int inet6_validate_link_af(const struct net_device *dev,
-				  const struct nlattr *nla)
+				  const struct nlattr *nla,
+				  struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[IFLA_INET6_MAX + 1];
 	struct inet6_dev *idev = NULL;
@@ -5805,7 +5806,7 @@ static int inet6_validate_link_af(const struct net_device *dev,
 	}
 
 	err = nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla,
-					  inet6_af_policy, NULL);
+					  inet6_af_policy, extack);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 271e5b7d00aeff7c61fb6c5415d14dbedb783b68 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 3 Aug 2021 06:05:26 -0700
Subject: net: add netif_set_real_num_queues() for device reconfig

netif_set_real_num_rx_queues() and netif_set_real_num_tx_queues()
can fail which breaks drivers trying to implement reconfiguration
in a way that can't leave the device half-broken. In other words
those functions are incompatible with prepare/commit approach.

Luckily setting real number of queues can fail only if the number
is increased, meaning that if we order operations correctly we
can guarantee ending up with either new config (success), or
the old one (on error).

Provide a helper implementing such logic so that drivers don't
have to duplicate it.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cd136499ec59..1b4d4509d04b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3916,6 +3916,8 @@ static inline int netif_set_real_num_rx_queues(struct net_device *dev,
 	return 0;
 }
 #endif
+int netif_set_real_num_queues(struct net_device *dev,
+			      unsigned int txq, unsigned int rxq);
 
 static inline struct netdev_rx_queue *
 __netif_get_rx_queue(struct net_device *dev, unsigned int rxq)
diff --git a/net/core/dev.c b/net/core/dev.c
index 4a1401008db9..360cb2f1b1e9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2973,6 +2973,50 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 #endif
 
+/**
+ *	netif_set_real_num_queues - set actual number of RX and TX queues used
+ *	@dev: Network device
+ *	@txq: Actual number of TX queues
+ *	@rxq: Actual number of RX queues
+ *
+ *	Set the real number of both TX and RX queues.
+ *	Does nothing if the number of queues is already correct.
+ */
+int netif_set_real_num_queues(struct net_device *dev,
+			      unsigned int txq, unsigned int rxq)
+{
+	unsigned int old_rxq = dev->real_num_rx_queues;
+	int err;
+
+	if (txq < 1 || txq > dev->num_tx_queues ||
+	    rxq < 1 || rxq > dev->num_rx_queues)
+		return -EINVAL;
+
+	/* Start from increases, so the error path only does decreases -
+	 * decreases can't fail.
+	 */
+	if (rxq > dev->real_num_rx_queues) {
+		err = netif_set_real_num_rx_queues(dev, rxq);
+		if (err)
+			return err;
+	}
+	if (txq > dev->real_num_tx_queues) {
+		err = netif_set_real_num_tx_queues(dev, txq);
+		if (err)
+			goto undo_rx;
+	}
+	if (rxq < dev->real_num_rx_queues)
+		WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
+	if (txq < dev->real_num_tx_queues)
+		WARN_ON(netif_set_real_num_tx_queues(dev, txq));
+
+	return 0;
+undo_rx:
+	WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
+	return err;
+}
+EXPORT_SYMBOL(netif_set_real_num_queues);
+
 /**
  * netif_get_num_default_rss_queues - default number of RSS queues
  *
-- 
cgit v1.2.3


From 5b9272e93f2efe3f6cda60cc2c26817b2ce49386 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Wed, 7 Jul 2021 11:48:54 +0200
Subject: can: j1939: extend UAPI to notify about RX status

To be able to create applications with user friendly feedback, we need be
able to provide receive status information.

Typical ETP transfer may take seconds or even hours. To give user some
clue or show a progress bar, the stack should push status updates.
Same as for the TX information, the socket error queue will be used with
following new signals:
- J1939_EE_INFO_RX_RTS   - received and accepted request to send signal.
- J1939_EE_INFO_RX_DPO   - received data package offset signal
- J1939_EE_INFO_RX_ABORT - RX session was aborted

Instead of completion signal, user will get data package.
To activate this signals, application should set
SOF_TIMESTAMPING_RX_SOFTWARE to the SO_TIMESTAMPING socket option. This
will avoid unpredictable application behavior for the old software.

Link: https://lore.kernel.org/r/20210707094854.30781-3-o.rempel@pengutronix.de
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can/j1939.h |   9 +++
 net/can/j1939/j1939-priv.h     |   4 ++
 net/can/j1939/socket.c         | 135 ++++++++++++++++++++++++++++++++---------
 net/can/j1939/transport.c      |  22 +++++--
 4 files changed, 136 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/can/j1939.h b/include/uapi/linux/can/j1939.h
index df6e821075c1..38936460f668 100644
--- a/include/uapi/linux/can/j1939.h
+++ b/include/uapi/linux/can/j1939.h
@@ -78,11 +78,20 @@ enum {
 enum {
 	J1939_NLA_PAD,
 	J1939_NLA_BYTES_ACKED,
+	J1939_NLA_TOTAL_SIZE,
+	J1939_NLA_PGN,
+	J1939_NLA_SRC_NAME,
+	J1939_NLA_DEST_NAME,
+	J1939_NLA_SRC_ADDR,
+	J1939_NLA_DEST_ADDR,
 };
 
 enum {
 	J1939_EE_INFO_NONE,
 	J1939_EE_INFO_TX_ABORT,
+	J1939_EE_INFO_RX_RTS,
+	J1939_EE_INFO_RX_DPO,
+	J1939_EE_INFO_RX_ABORT,
 };
 
 struct j1939_filter {
diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h
index 93b8ad7f7d04..f6df20808f5e 100644
--- a/net/can/j1939/j1939-priv.h
+++ b/net/can/j1939/j1939-priv.h
@@ -23,6 +23,9 @@ enum j1939_sk_errqueue_type {
 	J1939_ERRQUEUE_TX_ACK,
 	J1939_ERRQUEUE_TX_SCHED,
 	J1939_ERRQUEUE_TX_ABORT,
+	J1939_ERRQUEUE_RX_RTS,
+	J1939_ERRQUEUE_RX_DPO,
+	J1939_ERRQUEUE_RX_ABORT,
 };
 
 /* j1939 devices */
@@ -87,6 +90,7 @@ struct j1939_priv {
 	struct list_head j1939_socks;
 
 	struct kref rx_kref;
+	u32 rx_tskey;
 };
 
 void j1939_ecu_put(struct j1939_ecu *ecu);
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index 6f3b10472f7f..6dff4510687a 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -905,20 +905,33 @@ failure:
 	return NULL;
 }
 
-static size_t j1939_sk_opt_stats_get_size(void)
+static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type)
 {
-	return
-		nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */
-		0;
+	switch (type) {
+	case J1939_ERRQUEUE_RX_RTS:
+		return
+			nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */
+			nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */
+			nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */
+			nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */
+			nla_total_size(sizeof(u8)) +  /* J1939_NLA_SRC_ADDR */
+			nla_total_size(sizeof(u8)) +  /* J1939_NLA_DEST_ADDR */
+			0;
+	default:
+		return
+			nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */
+			0;
+	}
 }
 
 static struct sk_buff *
-j1939_sk_get_timestamping_opt_stats(struct j1939_session *session)
+j1939_sk_get_timestamping_opt_stats(struct j1939_session *session,
+				    enum j1939_sk_errqueue_type type)
 {
 	struct sk_buff *stats;
 	u32 size;
 
-	stats = alloc_skb(j1939_sk_opt_stats_get_size(), GFP_ATOMIC);
+	stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC);
 	if (!stats)
 		return NULL;
 
@@ -928,32 +941,67 @@ j1939_sk_get_timestamping_opt_stats(struct j1939_session *session)
 		size = min(session->pkt.tx_acked * 7,
 			   session->total_message_size);
 
-	nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size);
+	switch (type) {
+	case J1939_ERRQUEUE_RX_RTS:
+		nla_put_u32(stats, J1939_NLA_TOTAL_SIZE,
+			    session->total_message_size);
+		nla_put_u32(stats, J1939_NLA_PGN,
+			    session->skcb.addr.pgn);
+		nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME,
+				  session->skcb.addr.src_name, J1939_NLA_PAD);
+		nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME,
+				  session->skcb.addr.dst_name, J1939_NLA_PAD);
+		nla_put_u8(stats, J1939_NLA_SRC_ADDR,
+			   session->skcb.addr.sa);
+		nla_put_u8(stats, J1939_NLA_DEST_ADDR,
+			   session->skcb.addr.da);
+		break;
+	default:
+		nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size);
+	}
 
 	return stats;
 }
 
-void j1939_sk_errqueue(struct j1939_session *session,
-		       enum j1939_sk_errqueue_type type)
+static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk,
+				enum j1939_sk_errqueue_type type)
 {
 	struct j1939_priv *priv = session->priv;
-	struct sock *sk = session->sk;
 	struct j1939_sock *jsk;
 	struct sock_exterr_skb *serr;
 	struct sk_buff *skb;
 	char *state = "UNK";
 	int err;
 
-	/* currently we have no sk for the RX session */
-	if (!sk)
-		return;
-
 	jsk = j1939_sk(sk);
 
 	if (!(jsk->state & J1939_SOCK_ERRQUEUE))
 		return;
 
-	skb = j1939_sk_get_timestamping_opt_stats(session);
+	switch (type) {
+	case J1939_ERRQUEUE_TX_ACK:
+		if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK))
+			return;
+		break;
+	case J1939_ERRQUEUE_TX_SCHED:
+		if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED))
+			return;
+		break;
+	case J1939_ERRQUEUE_TX_ABORT:
+		break;
+	case J1939_ERRQUEUE_RX_RTS:
+		fallthrough;
+	case J1939_ERRQUEUE_RX_DPO:
+		fallthrough;
+	case J1939_ERRQUEUE_RX_ABORT:
+		if (!(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE))
+			return;
+		break;
+	default:
+		netdev_err(priv->ndev, "Unknown errqueue type %i\n", type);
+	}
+
+	skb = j1939_sk_get_timestamping_opt_stats(session, type);
 	if (!skb)
 		return;
 
@@ -965,35 +1013,41 @@ void j1939_sk_errqueue(struct j1939_session *session,
 	memset(serr, 0, sizeof(*serr));
 	switch (type) {
 	case J1939_ERRQUEUE_TX_ACK:
-		if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) {
-			kfree_skb(skb);
-			return;
-		}
-
 		serr->ee.ee_errno = ENOMSG;
 		serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
 		serr->ee.ee_info = SCM_TSTAMP_ACK;
-		state = "ACK";
+		state = "TX ACK";
 		break;
 	case J1939_ERRQUEUE_TX_SCHED:
-		if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) {
-			kfree_skb(skb);
-			return;
-		}
-
 		serr->ee.ee_errno = ENOMSG;
 		serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
 		serr->ee.ee_info = SCM_TSTAMP_SCHED;
-		state = "SCH";
+		state = "TX SCH";
 		break;
 	case J1939_ERRQUEUE_TX_ABORT:
 		serr->ee.ee_errno = session->err;
 		serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
 		serr->ee.ee_info = J1939_EE_INFO_TX_ABORT;
-		state = "ABT";
+		state = "TX ABT";
+		break;
+	case J1939_ERRQUEUE_RX_RTS:
+		serr->ee.ee_errno = ENOMSG;
+		serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+		serr->ee.ee_info = J1939_EE_INFO_RX_RTS;
+		state = "RX RTS";
+		break;
+	case J1939_ERRQUEUE_RX_DPO:
+		serr->ee.ee_errno = ENOMSG;
+		serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+		serr->ee.ee_info = J1939_EE_INFO_RX_DPO;
+		state = "RX DPO";
+		break;
+	case J1939_ERRQUEUE_RX_ABORT:
+		serr->ee.ee_errno = session->err;
+		serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+		serr->ee.ee_info = J1939_EE_INFO_RX_ABORT;
+		state = "RX ABT";
 		break;
-	default:
-		netdev_err(priv->ndev, "Unknown errqueue type %i\n", type);
 	}
 
 	serr->opt_stats = true;
@@ -1008,6 +1062,27 @@ void j1939_sk_errqueue(struct j1939_session *session,
 		kfree_skb(skb);
 };
 
+void j1939_sk_errqueue(struct j1939_session *session,
+		       enum j1939_sk_errqueue_type type)
+{
+	struct j1939_priv *priv = session->priv;
+	struct j1939_sock *jsk;
+
+	if (session->sk) {
+		/* send TX notifications to the socket of origin  */
+		__j1939_sk_errqueue(session, session->sk, type);
+		return;
+	}
+
+	/* spread RX notifications to all sockets subscribed to this session */
+	spin_lock_bh(&priv->j1939_socks_lock);
+	list_for_each_entry(jsk, &priv->j1939_socks, list) {
+		if (j1939_sk_recv_match_one(jsk, &session->skcb))
+			__j1939_sk_errqueue(session, &jsk->sk, type);
+	}
+	spin_unlock_bh(&priv->j1939_socks_lock);
+};
+
 void j1939_sk_send_loop_abort(struct sock *sk, int err)
 {
 	sk->sk_err = err;
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
index 801e700eaba6..bb5c4b8979be 100644
--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -260,10 +260,14 @@ static void __j1939_session_drop(struct j1939_session *session)
 
 static void j1939_session_destroy(struct j1939_session *session)
 {
-	if (session->err)
-		j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ABORT);
-	else
-		j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ACK);
+	if (session->transmission) {
+		if (session->err)
+			j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ABORT);
+		else
+			j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ACK);
+	} else if (session->err) {
+			j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT);
+	}
 
 	netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
 
@@ -1116,6 +1120,8 @@ static void __j1939_session_cancel(struct j1939_session *session,
 
 	if (session->sk)
 		j1939_sk_send_loop_abort(session->sk, session->err);
+	else
+		j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT);
 }
 
 static void j1939_session_cancel(struct j1939_session *session,
@@ -1330,6 +1336,8 @@ static void j1939_xtp_rx_abort_one(struct j1939_priv *priv, struct sk_buff *skb,
 	session->err = j1939_xtp_abort_to_errno(priv, abort);
 	if (session->sk)
 		j1939_sk_send_loop_abort(session->sk, session->err);
+	else
+		j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT);
 	j1939_session_deactivate_activate_next(session);
 
 abort_put:
@@ -1630,6 +1638,9 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv,
 	session->pkt.rx = 0;
 	session->pkt.tx = 0;
 
+	session->tskey = priv->rx_tskey++;
+	j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_RTS);
+
 	WARN_ON_ONCE(j1939_session_activate(session));
 
 	return session;
@@ -1752,6 +1763,9 @@ static void j1939_xtp_rx_dpo_one(struct j1939_session *session,
 	session->pkt.dpo = j1939_etp_ctl_to_packet(skb->data);
 	session->last_cmd = dat[0];
 	j1939_tp_set_rxtimeout(session, 750);
+
+	if (!session->transmission)
+		j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_DPO);
 }
 
 static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb,
-- 
cgit v1.2.3


From 957e2235e5264c97cd6be8e2e17f2e11b41f2239 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 3 Aug 2021 23:34:08 +0300
Subject: net: make switchdev_bridge_port_{,unoffload} loosely coupled with the
 bridge

With the introduction of explicit offloading API in switchdev in commit
2f5dc00f7a3e ("net: bridge: switchdev: let drivers inform which bridge
ports are offloaded"), we started having Ethernet switch drivers calling
directly into a function exported by net/bridge/br_switchdev.c, which is
a function exported by the bridge driver.

This means that drivers that did not have an explicit dependency on the
bridge before, like cpsw and am65-cpsw, now do - otherwise it is not
possible to call a symbol exported by a driver that can be built as
module unless you are a module too.

There was an attempt to solve the dependency issue in the form of commit
b0e81817629a ("net: build all switchdev drivers as modules when the
bridge is a module"). Grygorii Strashko, however, says about it:

| In my opinion, the problem is a bit bigger here than just fixing the
| build :(
|
| In case, of ^cpsw the switchdev mode is kinda optional and in many
| cases (especially for testing purposes, NFS) the multi-mac mode is
| still preferable mode.
|
| There were no such tight dependency between switchdev drivers and
| bridge core before and switchdev serviced as independent, notification
| based layer between them, so ^cpsw still can be "Y" and bridge can be
| "M". Now for mostly every kernel build configuration the CONFIG_BRIDGE
| will need to be set as "Y", or we will have to update drivers to
| support build with BRIDGE=n and maintain separate builds for
| networking vs non-networking testing.  But is this enough?  Wouldn't
| it cause 'chain reaction' required to add more and more "Y" options
| (like CONFIG_VLAN_8021Q)?
|
| PS. Just to be sure we on the same page - ARM builds will be forced
| (with this patch) to have CONFIG_TI_CPSW_SWITCHDEV=m and so all our
| automation testing will just fail with omap2plus_defconfig.

In the light of this, it would be desirable for some configurations to
avoid dependencies between switchdev drivers and the bridge, and have
the switchdev mode as completely optional within the driver.

Arnd Bergmann also tried to write a patch which better expressed the
build time dependency for Ethernet switch drivers where the switchdev
support is optional, like cpsw/am65-cpsw, and this made the drivers
follow the bridge (compile as module if the bridge is a module) only if
the optional switchdev support in the driver was enabled in the first
place:
https://patchwork.kernel.org/project/netdevbpf/patch/20210802144813.1152762-1-arnd@kernel.org/

but this still did not solve the fact that cpsw and am65-cpsw now must
be built as modules when the bridge is a module - it just expressed
correctly that optional dependency. But the new behavior is an apparent
regression from Grygorii's perspective.

So to support the use case where the Ethernet driver is built-in,
NET_SWITCHDEV (a bool option) is enabled, and the bridge is a module, we
need a framework that can handle the possible absence of the bridge from
the running system, i.e. runtime bloatware as opposed to build-time
bloatware.

Luckily we already have this framework, since switchdev has been using
it extensively. Events from the bridge side are transmitted to the
driver side using notifier chains - this was originally done so that
unrelated drivers could snoop for events emitted by the bridge towards
ports that are implemented by other drivers (think of a switch driver
with LAG offload that listens for switchdev events on a bonding/team
interface that it offloads).

There are also events which are transmitted from the driver side to the
bridge side, which again are modeled using notifiers.
SWITCHDEV_FDB_ADD_TO_BRIDGE is an example of this, and deals with
notifying the bridge that a MAC address has been dynamically learned.
So there is a precedent we can use for modeling the new framework.

The difference compared to SWITCHDEV_FDB_ADD_TO_BRIDGE is that the work
that the bridge needs to do when a port becomes offloaded is blocking in
its nature: replay VLANs, MDBs etc. The calling context is indeed
blocking (we are under rtnl_mutex), but the existing switchdev
notification chain that the bridge is subscribed to is only the atomic
one. So we need to subscribe the bridge to the blocking switchdev
notification chain too.

This patch:
- keeps the driver-side perception of the switchdev_bridge_port_{,un}offload
  unchanged
- moves the implementation of switchdev_bridge_port_{,un}offload from
  the bridge module into the switchdev module.
- makes everybody that is subscribed to the switchdev blocking notifier
  chain "hear" offload & unoffload events
- makes the bridge driver subscribe and handle those events
- moves the bridge driver's handling of those events into 2 new
  functions called br_switchdev_port_{,un}offload. These functions
  contain in fact the core of the logic that was previously in
  switchdev_bridge_port_{,un}offload, just that now we go through an
  extra indirection layer to reach them.

Unlike all the other switchdev notification structures, the structure
used to carry the bridge port information, struct
switchdev_notifier_brport_info, does not contain a "bool handled".
This is because in the current usage pattern, we always know that a
switchdev bridge port offloading event will be handled by the bridge,
because the switchdev_bridge_port_offload() call was initiated by a
NETDEV_CHANGEUPPER event in the first place, where info->upper_dev is a
bridge. So if the bridge wasn't loaded, then the CHANGEUPPER event
couldn't have happened.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/am65-cpsw-nuss.c |  2 +-
 drivers/net/ethernet/ti/cpsw_new.c       |  2 +-
 include/linux/if_bridge.h                | 35 ----------------------
 include/net/switchdev.h                  | 46 ++++++++++++++++++++++++++++
 net/bridge/br.c                          | 51 +++++++++++++++++++++++++++++++-
 net/bridge/br_private.h                  | 29 ++++++++++++++++++
 net/bridge/br_switchdev.c                | 36 ++++++----------------
 net/switchdev/switchdev.c                | 48 ++++++++++++++++++++++++++++++
 8 files changed, 184 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index 4f67d1a98c0d..fb5d2ac3f0d2 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -7,7 +7,6 @@
 
 #include <linux/clk.h>
 #include <linux/etherdevice.h>
-#include <linux/if_bridge.h>
 #include <linux/if_vlan.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
@@ -28,6 +27,7 @@
 #include <linux/sys_soc.h>
 #include <linux/dma/ti-cppi5.h>
 #include <linux/dma/k3-udma-glue.h>
+#include <net/switchdev.h>
 
 #include "cpsw_ale.h"
 #include "cpsw_sl.h"
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index b4f55ff4e84f..ae167223e87f 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -11,7 +11,6 @@
 #include <linux/module.h>
 #include <linux/irqreturn.h>
 #include <linux/interrupt.h>
-#include <linux/if_bridge.h>
 #include <linux/if_ether.h>
 #include <linux/etherdevice.h>
 #include <linux/net_tstamp.h>
@@ -29,6 +28,7 @@
 #include <linux/kmemleak.h>
 #include <linux/sys_soc.h>
 
+#include <net/switchdev.h>
 #include <net/page_pool.h>
 #include <net/pkt_cls.h>
 #include <net/devlink.h>
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 21daed10322e..509e18c7e740 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -190,39 +190,4 @@ static inline clock_t br_get_ageing_time(const struct net_device *br_dev)
 }
 #endif
 
-#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_NET_SWITCHDEV)
-
-int switchdev_bridge_port_offload(struct net_device *brport_dev,
-				  struct net_device *dev, const void *ctx,
-				  struct notifier_block *atomic_nb,
-				  struct notifier_block *blocking_nb,
-				  bool tx_fwd_offload,
-				  struct netlink_ext_ack *extack);
-void switchdev_bridge_port_unoffload(struct net_device *brport_dev,
-				     const void *ctx,
-				     struct notifier_block *atomic_nb,
-				     struct notifier_block *blocking_nb);
-
-#else
-
-static inline int
-switchdev_bridge_port_offload(struct net_device *brport_dev,
-			      struct net_device *dev, const void *ctx,
-			      struct notifier_block *atomic_nb,
-			      struct notifier_block *blocking_nb,
-			      bool tx_fwd_offload,
-			      struct netlink_ext_ack *extack)
-{
-	return -EINVAL;
-}
-
-static inline void
-switchdev_bridge_port_unoffload(struct net_device *brport_dev,
-				const void *ctx,
-				struct notifier_block *atomic_nb,
-				struct notifier_block *blocking_nb)
-{
-}
-#endif
-
 #endif
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 66468ff8cc0a..60d806b6a5ae 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -180,6 +180,14 @@ struct switchdev_obj_in_state_mrp {
 
 typedef int switchdev_obj_dump_cb_t(struct switchdev_obj *obj);
 
+struct switchdev_brport {
+	struct net_device *dev;
+	const void *ctx;
+	struct notifier_block *atomic_nb;
+	struct notifier_block *blocking_nb;
+	bool tx_fwd_offload;
+};
+
 enum switchdev_notifier_type {
 	SWITCHDEV_FDB_ADD_TO_BRIDGE = 1,
 	SWITCHDEV_FDB_DEL_TO_BRIDGE,
@@ -197,6 +205,9 @@ enum switchdev_notifier_type {
 	SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
 	SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE,
 	SWITCHDEV_VXLAN_FDB_OFFLOADED,
+
+	SWITCHDEV_BRPORT_OFFLOADED,
+	SWITCHDEV_BRPORT_UNOFFLOADED,
 };
 
 struct switchdev_notifier_info {
@@ -226,6 +237,11 @@ struct switchdev_notifier_port_attr_info {
 	bool handled;
 };
 
+struct switchdev_notifier_brport_info {
+	struct switchdev_notifier_info info; /* must be first */
+	const struct switchdev_brport brport;
+};
+
 static inline struct net_device *
 switchdev_notifier_info_to_dev(const struct switchdev_notifier_info *info)
 {
@@ -246,6 +262,17 @@ switchdev_fdb_is_dynamically_learned(const struct switchdev_notifier_fdb_info *f
 
 #ifdef CONFIG_NET_SWITCHDEV
 
+int switchdev_bridge_port_offload(struct net_device *brport_dev,
+				  struct net_device *dev, const void *ctx,
+				  struct notifier_block *atomic_nb,
+				  struct notifier_block *blocking_nb,
+				  bool tx_fwd_offload,
+				  struct netlink_ext_ack *extack);
+void switchdev_bridge_port_unoffload(struct net_device *brport_dev,
+				     const void *ctx,
+				     struct notifier_block *atomic_nb,
+				     struct notifier_block *blocking_nb);
+
 void switchdev_deferred_process(void);
 int switchdev_port_attr_set(struct net_device *dev,
 			    const struct switchdev_attr *attr,
@@ -316,6 +343,25 @@ int switchdev_handle_port_attr_set(struct net_device *dev,
 				      struct netlink_ext_ack *extack));
 #else
 
+static inline int
+switchdev_bridge_port_offload(struct net_device *brport_dev,
+			      struct net_device *dev, const void *ctx,
+			      struct notifier_block *atomic_nb,
+			      struct notifier_block *blocking_nb,
+			      bool tx_fwd_offload,
+			      struct netlink_ext_ack *extack)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void
+switchdev_bridge_port_unoffload(struct net_device *brport_dev,
+				const void *ctx,
+				struct notifier_block *atomic_nb,
+				struct notifier_block *blocking_nb)
+{
+}
+
 static inline void switchdev_deferred_process(void)
 {
 }
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 8fb5dca5f8e0..d3a32c6813e0 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -201,6 +201,48 @@ static struct notifier_block br_switchdev_notifier = {
 	.notifier_call = br_switchdev_event,
 };
 
+/* called under rtnl_mutex */
+static int br_switchdev_blocking_event(struct notifier_block *nb,
+				       unsigned long event, void *ptr)
+{
+	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
+	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+	struct switchdev_notifier_brport_info *brport_info;
+	const struct switchdev_brport *b;
+	struct net_bridge_port *p;
+	int err = NOTIFY_DONE;
+
+	p = br_port_get_rtnl(dev);
+	if (!p)
+		goto out;
+
+	switch (event) {
+	case SWITCHDEV_BRPORT_OFFLOADED:
+		brport_info = ptr;
+		b = &brport_info->brport;
+
+		err = br_switchdev_port_offload(p, b->dev, b->ctx,
+						b->atomic_nb, b->blocking_nb,
+						b->tx_fwd_offload, extack);
+		err = notifier_from_errno(err);
+		break;
+	case SWITCHDEV_BRPORT_UNOFFLOADED:
+		brport_info = ptr;
+		b = &brport_info->brport;
+
+		br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb,
+					    b->blocking_nb);
+		break;
+	}
+
+out:
+	return err;
+}
+
+static struct notifier_block br_switchdev_blocking_notifier = {
+	.notifier_call = br_switchdev_blocking_event,
+};
+
 /* br_boolopt_toggle - change user-controlled boolean option
  *
  * @br: bridge device
@@ -355,10 +397,14 @@ static int __init br_init(void)
 	if (err)
 		goto err_out4;
 
-	err = br_netlink_init();
+	err = register_switchdev_blocking_notifier(&br_switchdev_blocking_notifier);
 	if (err)
 		goto err_out5;
 
+	err = br_netlink_init();
+	if (err)
+		goto err_out6;
+
 	brioctl_set(br_ioctl_stub);
 
 #if IS_ENABLED(CONFIG_ATM_LANE)
@@ -373,6 +419,8 @@ static int __init br_init(void)
 
 	return 0;
 
+err_out6:
+	unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier);
 err_out5:
 	unregister_switchdev_notifier(&br_switchdev_notifier);
 err_out4:
@@ -392,6 +440,7 @@ static void __exit br_deinit(void)
 {
 	stp_proto_unregister(&br_stp_proto);
 	br_netlink_fini();
+	unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier);
 	unregister_switchdev_notifier(&br_switchdev_notifier);
 	unregister_netdevice_notifier(&br_device_notifier);
 	brioctl_set(NULL);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index c939631428b9..10d43bf4bb80 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1880,6 +1880,17 @@ static inline void br_sysfs_delbr(struct net_device *dev) { return; }
 
 /* br_switchdev.c */
 #ifdef CONFIG_NET_SWITCHDEV
+int br_switchdev_port_offload(struct net_bridge_port *p,
+			      struct net_device *dev, const void *ctx,
+			      struct notifier_block *atomic_nb,
+			      struct notifier_block *blocking_nb,
+			      bool tx_fwd_offload,
+			      struct netlink_ext_ack *extack);
+
+void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx,
+				 struct notifier_block *atomic_nb,
+				 struct notifier_block *blocking_nb);
+
 bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb);
 
 void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb);
@@ -1908,6 +1919,24 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
 	skb->offload_fwd_mark = 0;
 }
 #else
+static inline int
+br_switchdev_port_offload(struct net_bridge_port *p,
+			  struct net_device *dev, const void *ctx,
+			  struct notifier_block *atomic_nb,
+			  struct notifier_block *blocking_nb,
+			  bool tx_fwd_offload,
+			  struct netlink_ext_ack *extack)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void
+br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx,
+			    struct notifier_block *atomic_nb,
+			    struct notifier_block *blocking_nb)
+{
+}
+
 static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb)
 {
 	return false;
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 36d75fd4a80c..6bf518d78f02 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -312,23 +312,16 @@ static void nbp_switchdev_unsync_objs(struct net_bridge_port *p,
 /* Let the bridge know that this port is offloaded, so that it can assign a
  * switchdev hardware domain to it.
  */
-int switchdev_bridge_port_offload(struct net_device *brport_dev,
-				  struct net_device *dev, const void *ctx,
-				  struct notifier_block *atomic_nb,
-				  struct notifier_block *blocking_nb,
-				  bool tx_fwd_offload,
-				  struct netlink_ext_ack *extack)
+int br_switchdev_port_offload(struct net_bridge_port *p,
+			      struct net_device *dev, const void *ctx,
+			      struct notifier_block *atomic_nb,
+			      struct notifier_block *blocking_nb,
+			      bool tx_fwd_offload,
+			      struct netlink_ext_ack *extack)
 {
 	struct netdev_phys_item_id ppid;
-	struct net_bridge_port *p;
 	int err;
 
-	ASSERT_RTNL();
-
-	p = br_port_get_rtnl(brport_dev);
-	if (!p)
-		return -ENODEV;
-
 	err = dev_get_port_parent_id(dev, &ppid, false);
 	if (err)
 		return err;
@@ -348,23 +341,12 @@ out_switchdev_del:
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(switchdev_bridge_port_offload);
 
-void switchdev_bridge_port_unoffload(struct net_device *brport_dev,
-				     const void *ctx,
-				     struct notifier_block *atomic_nb,
-				     struct notifier_block *blocking_nb)
+void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx,
+				 struct notifier_block *atomic_nb,
+				 struct notifier_block *blocking_nb)
 {
-	struct net_bridge_port *p;
-
-	ASSERT_RTNL();
-
-	p = br_port_get_rtnl(brport_dev);
-	if (!p)
-		return;
-
 	nbp_switchdev_unsync_objs(p, ctx, atomic_nb, blocking_nb);
 
 	nbp_switchdev_del(p);
 }
-EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload);
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 0ae3478561f4..0b2c18efc079 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -809,3 +809,51 @@ int switchdev_handle_port_attr_set(struct net_device *dev,
 	return err;
 }
 EXPORT_SYMBOL_GPL(switchdev_handle_port_attr_set);
+
+int switchdev_bridge_port_offload(struct net_device *brport_dev,
+				  struct net_device *dev, const void *ctx,
+				  struct notifier_block *atomic_nb,
+				  struct notifier_block *blocking_nb,
+				  bool tx_fwd_offload,
+				  struct netlink_ext_ack *extack)
+{
+	struct switchdev_notifier_brport_info brport_info = {
+		.brport = {
+			.dev = dev,
+			.ctx = ctx,
+			.atomic_nb = atomic_nb,
+			.blocking_nb = blocking_nb,
+			.tx_fwd_offload = tx_fwd_offload,
+		},
+	};
+	int err;
+
+	ASSERT_RTNL();
+
+	err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_OFFLOADED,
+						brport_dev, &brport_info.info,
+						extack);
+	return notifier_to_errno(err);
+}
+EXPORT_SYMBOL_GPL(switchdev_bridge_port_offload);
+
+void switchdev_bridge_port_unoffload(struct net_device *brport_dev,
+				     const void *ctx,
+				     struct notifier_block *atomic_nb,
+				     struct notifier_block *blocking_nb)
+{
+	struct switchdev_notifier_brport_info brport_info = {
+		.brport = {
+			.ctx = ctx,
+			.atomic_nb = atomic_nb,
+			.blocking_nb = blocking_nb,
+		},
+	};
+
+	ASSERT_RTNL();
+
+	call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_UNOFFLOADED,
+					  brport_dev, &brport_info.info,
+					  NULL);
+}
+EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload);
-- 
cgit v1.2.3


From 04190bf8944deb7e3ac165a1a494db23aa0160a9 Mon Sep 17 00:00:00 2001
From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Date: Wed, 4 Aug 2021 10:55:56 +0300
Subject: sock: allow reading and changing sk_userlocks with setsockopt

SOCK_SNDBUF_LOCK and SOCK_RCVBUF_LOCK flags disable automatic socket
buffers adjustment done by kernel (see tcp_fixup_rcvbuf() and
tcp_sndbuf_expand()). If we've just created a new socket this adjustment
is enabled on it, but if one changes the socket buffer size by
setsockopt(SO_{SND,RCV}BUF*) it becomes disabled.

CRIU needs to call setsockopt(SO_{SND,RCV}BUF*) on each socket on
restore as it first needs to increase buffer sizes for packet queues
restore and second it needs to restore back original buffer sizes. So
after CRIU restore all sockets become non-auto-adjustable, which can
decrease network performance of restored applications significantly.

CRIU need to be able to restore sockets with enabled/disabled adjustment
to the same state it was before dump, so let's add special setsockopt
for it.

Let's also export SOCK_SNDBUF_LOCK and SOCK_RCVBUF_LOCK flags to uAPI so
that using these interface one can reenable automatic socket buffer
adjustment on their sockets.

Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h  |  2 ++
 arch/mips/include/uapi/asm/socket.h   |  2 ++
 arch/parisc/include/uapi/asm/socket.h |  2 ++
 arch/sparc/include/uapi/asm/socket.h  |  2 ++
 include/net/sock.h                    |  3 +--
 include/uapi/asm-generic/socket.h     |  2 ++
 include/uapi/linux/socket.h           |  5 +++++
 net/core/sock.c                       | 13 +++++++++++++
 8 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 6b3daba60987..1dd9baf4a6c2 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -129,6 +129,8 @@
 
 #define SO_NETNS_COOKIE		71
 
+#define SO_BUF_LOCK		72
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index cdf404a831b2..1eaf6a1ca561 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -140,6 +140,8 @@
 
 #define SO_NETNS_COOKIE		71
 
+#define SO_BUF_LOCK		72
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 5b5351cdcb33..8baaad52d799 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -121,6 +121,8 @@
 
 #define SO_NETNS_COOKIE		0x4045
 
+#define SO_BUF_LOCK		0x4046
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 92675dc380fa..e80ee8641ac3 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -122,6 +122,8 @@
 
 #define SO_NETNS_COOKIE          0x0050
 
+#define SO_BUF_LOCK              0x0051
+
 #if !defined(__KERNEL__)
 
 
diff --git a/include/net/sock.h b/include/net/sock.h
index ff1be7e7e90b..6e761451c927 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -68,6 +68,7 @@
 #include <net/tcp_states.h>
 #include <linux/net_tstamp.h>
 #include <net/l3mdev.h>
+#include <uapi/linux/socket.h>
 
 /*
  * This structure really needs to be cleaned up.
@@ -1438,8 +1439,6 @@ static inline int __sk_prot_rehash(struct sock *sk)
 #define RCV_SHUTDOWN	1
 #define SEND_SHUTDOWN	2
 
-#define SOCK_SNDBUF_LOCK	1
-#define SOCK_RCVBUF_LOCK	2
 #define SOCK_BINDADDR_LOCK	4
 #define SOCK_BINDPORT_LOCK	8
 
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index d588c244ec2f..1f0a2b4864e4 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -124,6 +124,8 @@
 
 #define SO_NETNS_COOKIE		71
 
+#define SO_BUF_LOCK		72
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h
index c3409c8ec0dd..eb0a9a5b6e71 100644
--- a/include/uapi/linux/socket.h
+++ b/include/uapi/linux/socket.h
@@ -26,4 +26,9 @@ struct __kernel_sockaddr_storage {
 	};
 };
 
+#define SOCK_SNDBUF_LOCK	1
+#define SOCK_RCVBUF_LOCK	2
+
+#define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK)
+
 #endif /* _UAPI_LINUX_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 9671c32e6ef5..aada649e07e8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1358,6 +1358,15 @@ set_sndbuf:
 		ret = sock_bindtoindex_locked(sk, val);
 		break;
 
+	case SO_BUF_LOCK:
+		if (val & ~SOCK_BUF_LOCK_MASK) {
+			ret = -EINVAL;
+			break;
+		}
+		sk->sk_userlocks = val | (sk->sk_userlocks &
+					  ~SOCK_BUF_LOCK_MASK);
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -1720,6 +1729,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val64 = sock_net(sk)->net_cookie;
 		break;
 
+	case SO_BUF_LOCK:
+		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
+		break;
+
 	default:
 		/* We implement the SO_SNDLOWAT etc to not be settable
 		 * (1003.1g 7).
-- 
cgit v1.2.3


From db243b796439c0caba47865564d8acd18a301d18 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Wed, 4 Aug 2021 15:45:36 -0500
Subject: net/ipv4/ipv6: Replace one-element arraya with flexible-array members
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare having
a dynamically sized set of trailing elements in a structure. Kernel code
should always use “flexible array members”[1] for these cases. The older
style of one-element or zero-length arrays should no longer be used[2].

Use an anonymous union with a couple of anonymous structs in order to
keep userspace unchanged and refactor the related code accordingly:

$ pahole -C group_filter net/ipv4/ip_sockglue.o
struct group_filter {
	union {
		struct {
			__u32      gf_interface_aux;     /*     0     4 */

			/* XXX 4 bytes hole, try to pack */

			struct __kernel_sockaddr_storage gf_group_aux; /*     8   128 */
			/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
			__u32      gf_fmode_aux;         /*   136     4 */
			__u32      gf_numsrc_aux;        /*   140     4 */
			struct __kernel_sockaddr_storage gf_slist[1]; /*   144   128 */
		};                                       /*     0   272 */
		struct {
			__u32      gf_interface;         /*     0     4 */

			/* XXX 4 bytes hole, try to pack */

			struct __kernel_sockaddr_storage gf_group; /*     8   128 */
			/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
			__u32      gf_fmode;             /*   136     4 */
			__u32      gf_numsrc;            /*   140     4 */
			struct __kernel_sockaddr_storage gf_slist_flex[0]; /*   144     0 */
		};                                       /*     0   144 */
	};                                               /*     0   272 */

	/* size: 272, cachelines: 5, members: 1 */
	/* last cacheline: 16 bytes */
};

$ pahole -C compat_group_filter net/ipv4/ip_sockglue.o
struct compat_group_filter {
	union {
		struct {
			__u32      gf_interface_aux;     /*     0     4 */
			struct __kernel_sockaddr_storage gf_group_aux __attribute__((__aligned__(4))); /*     4   128 */
			/* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */
			__u32      gf_fmode_aux;         /*   132     4 */
			__u32      gf_numsrc_aux;        /*   136     4 */
			struct __kernel_sockaddr_storage gf_slist[1] __attribute__((__aligned__(4))); /*   140   128 */
		} __attribute__((__packed__)) __attribute__((__aligned__(4)));                     /*     0   268 */
		struct {
			__u32      gf_interface;         /*     0     4 */
			struct __kernel_sockaddr_storage gf_group __attribute__((__aligned__(4))); /*     4   128 */
			/* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */
			__u32      gf_fmode;             /*   132     4 */
			__u32      gf_numsrc;            /*   136     4 */
			struct __kernel_sockaddr_storage gf_slist_flex[0] __attribute__((__aligned__(4))); /*   140     0 */
		} __attribute__((__packed__)) __attribute__((__aligned__(4)));                     /*     0   140 */
	} __attribute__((__aligned__(1)));               /*     0   268 */

	/* size: 268, cachelines: 5, members: 1 */
	/* forced alignments: 1 */
	/* last cacheline: 12 bytes */
} __attribute__((__packed__));

This helps with the ongoing efforts to globally enable -Warray-bounds
and get us closer to being able to tighten the FORTIFY_SOURCE routines
on memcpy().

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays

Link: https://github.com/KSPP/linux/issues/79
Link: https://github.com/KSPP/linux/issues/109
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/compat.h     | 27 ++++++++++++++++++++-------
 include/uapi/linux/in.h  | 21 ++++++++++++++++-----
 net/ipv4/ip_sockglue.c   | 19 ++++++++++---------
 net/ipv6/ipv6_sockglue.c | 18 +++++++++---------
 4 files changed, 55 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/net/compat.h b/include/net/compat.h
index 84805bdc4435..595fee069b82 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -71,13 +71,26 @@ struct compat_group_source_req {
 } __packed;
 
 struct compat_group_filter {
-	__u32				 gf_interface;
-	struct __kernel_sockaddr_storage gf_group
-		__aligned(4);
-	__u32				 gf_fmode;
-	__u32				 gf_numsrc;
-	struct __kernel_sockaddr_storage gf_slist[1]
-		__aligned(4);
+	union {
+		struct {
+			__u32				 gf_interface_aux;
+			struct __kernel_sockaddr_storage gf_group_aux
+				__aligned(4);
+			__u32				 gf_fmode_aux;
+			__u32				 gf_numsrc_aux;
+			struct __kernel_sockaddr_storage gf_slist[1]
+				__aligned(4);
+		} __packed;
+		struct {
+			__u32				 gf_interface;
+			struct __kernel_sockaddr_storage gf_group
+				__aligned(4);
+			__u32				 gf_fmode;
+			__u32				 gf_numsrc;
+			struct __kernel_sockaddr_storage gf_slist_flex[]
+				__aligned(4);
+		} __packed;
+	};
 } __packed;
 
 #endif /* NET_COMPAT_H */
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index 193b7cf1f0ac..14168225cecd 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -222,11 +222,22 @@ struct group_source_req {
 };
 
 struct group_filter {
-	__u32				 gf_interface;	/* interface index */
-	struct __kernel_sockaddr_storage gf_group;	/* multicast address */
-	__u32				 gf_fmode;	/* filter mode */
-	__u32				 gf_numsrc;	/* number of sources */
-	struct __kernel_sockaddr_storage gf_slist[1];	/* interface index */
+	union {
+		struct {
+			__u32				 gf_interface_aux; /* interface index */
+			struct __kernel_sockaddr_storage gf_group_aux;	   /* multicast address */
+			__u32				 gf_fmode_aux;	   /* filter mode */
+			__u32				 gf_numsrc_aux;	   /* number of sources */
+			struct __kernel_sockaddr_storage gf_slist[1];	   /* interface index */
+		};
+		struct {
+			__u32				 gf_interface;	  /* interface index */
+			struct __kernel_sockaddr_storage gf_group;	  /* multicast address */
+			__u32				 gf_fmode;	  /* filter mode */
+			__u32				 gf_numsrc;	  /* number of sources */
+			struct __kernel_sockaddr_storage gf_slist_flex[]; /* interface index */
+		};
+	};
 };
 
 #define GROUP_FILTER_SIZE(numsrc) \
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 468969c75708..b297bb28556e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -790,7 +790,8 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen)
 		goto out_free_gsf;
 
 	err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc,
-				 gsf->gf_fmode, &gsf->gf_group, gsf->gf_slist);
+				 gsf->gf_fmode, &gsf->gf_group,
+				 gsf->gf_slist_flex);
 out_free_gsf:
 	kfree(gsf);
 	return err;
@@ -799,7 +800,7 @@ out_free_gsf:
 static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
 		int optlen)
 {
-	const int size0 = offsetof(struct compat_group_filter, gf_slist);
+	const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
 	struct compat_group_filter *gf32;
 	unsigned int n;
 	void *p;
@@ -813,7 +814,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
 	p = kmalloc(optlen + 4, GFP_KERNEL);
 	if (!p)
 		return -ENOMEM;
-	gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */
+	gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */
 
 	err = -EFAULT;
 	if (copy_from_sockptr(gf32, optval, optlen))
@@ -826,7 +827,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
 		goto out_free_gsf;
 
 	err = -EINVAL;
-	if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen)
+	if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen)
 		goto out_free_gsf;
 
 	/* numsrc >= (4G-140)/128 overflow in 32 bits */
@@ -834,7 +835,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
 	if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf)
 		goto out_free_gsf;
 	err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode,
-				 &gf32->gf_group, gf32->gf_slist);
+				 &gf32->gf_group, gf32->gf_slist_flex);
 out_free_gsf:
 	kfree(p);
 	return err;
@@ -1455,7 +1456,7 @@ static bool getsockopt_needs_rtnl(int optname)
 static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
 		int __user *optlen, int len)
 {
-	const int size0 = offsetof(struct group_filter, gf_slist);
+	const int size0 = offsetof(struct group_filter, gf_slist_flex);
 	struct group_filter __user *p = optval;
 	struct group_filter gsf;
 	int num;
@@ -1467,7 +1468,7 @@ static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
 		return -EFAULT;
 
 	num = gsf.gf_numsrc;
-	err = ip_mc_gsfget(sk, &gsf, p->gf_slist);
+	err = ip_mc_gsfget(sk, &gsf, p->gf_slist_flex);
 	if (err)
 		return err;
 	if (gsf.gf_numsrc < num)
@@ -1481,7 +1482,7 @@ static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
 static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
 		int __user *optlen, int len)
 {
-	const int size0 = offsetof(struct compat_group_filter, gf_slist);
+	const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
 	struct compat_group_filter __user *p = optval;
 	struct compat_group_filter gf32;
 	struct group_filter gf;
@@ -1498,7 +1499,7 @@ static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
 	num = gf.gf_numsrc = gf32.gf_numsrc;
 	gf.gf_group = gf32.gf_group;
 
-	err = ip_mc_gsfget(sk, &gf, p->gf_slist);
+	err = ip_mc_gsfget(sk, &gf, p->gf_slist_flex);
 	if (err)
 		return err;
 	if (gf.gf_numsrc < num)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a6804a7e34c1..e4bdb09c5586 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -225,7 +225,7 @@ static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
 	if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen)
 		goto out_free_gsf;
 
-	ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist);
+	ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex);
 out_free_gsf:
 	kfree(gsf);
 	return ret;
@@ -234,7 +234,7 @@ out_free_gsf:
 static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
 		int optlen)
 {
-	const int size0 = offsetof(struct compat_group_filter, gf_slist);
+	const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
 	struct compat_group_filter *gf32;
 	void *p;
 	int ret;
@@ -249,7 +249,7 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
 	if (!p)
 		return -ENOMEM;
 
-	gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */
+	gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */
 	ret = -EFAULT;
 	if (copy_from_sockptr(gf32, optval, optlen))
 		goto out_free_p;
@@ -261,14 +261,14 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
 		goto out_free_p;
 
 	ret = -EINVAL;
-	if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen)
+	if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen)
 		goto out_free_p;
 
 	ret = ip6_mc_msfilter(sk, &(struct group_filter){
 			.gf_interface = gf32->gf_interface,
 			.gf_group = gf32->gf_group,
 			.gf_fmode = gf32->gf_fmode,
-			.gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist);
+			.gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex);
 
 out_free_p:
 	kfree(p);
@@ -1048,7 +1048,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
 static int ipv6_get_msfilter(struct sock *sk, void __user *optval,
 		int __user *optlen, int len)
 {
-	const int size0 = offsetof(struct group_filter, gf_slist);
+	const int size0 = offsetof(struct group_filter, gf_slist_flex);
 	struct group_filter __user *p = optval;
 	struct group_filter gsf;
 	int num;
@@ -1062,7 +1062,7 @@ static int ipv6_get_msfilter(struct sock *sk, void __user *optval,
 		return -EADDRNOTAVAIL;
 	num = gsf.gf_numsrc;
 	lock_sock(sk);
-	err = ip6_mc_msfget(sk, &gsf, p->gf_slist);
+	err = ip6_mc_msfget(sk, &gsf, p->gf_slist_flex);
 	if (!err) {
 		if (num > gsf.gf_numsrc)
 			num = gsf.gf_numsrc;
@@ -1077,7 +1077,7 @@ static int ipv6_get_msfilter(struct sock *sk, void __user *optval,
 static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval,
 		int __user *optlen)
 {
-	const int size0 = offsetof(struct compat_group_filter, gf_slist);
+	const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
 	struct compat_group_filter __user *p = optval;
 	struct compat_group_filter gf32;
 	struct group_filter gf;
@@ -1100,7 +1100,7 @@ static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval,
 		return -EADDRNOTAVAIL;
 
 	lock_sock(sk);
-	err = ip6_mc_msfget(sk, &gf, p->gf_slist);
+	err = ip6_mc_msfget(sk, &gf, p->gf_slist_flex);
 	release_sock(sk);
 	if (err)
 		return err;
-- 
cgit v1.2.3


From e6a1f7e0b0fe5997b896b793c70d12fc5ed06cdd Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Wed, 4 Aug 2021 16:18:50 -0500
Subject: net/ipv4/igmp: Use struct_size() helper

Replace IP_SFLSIZE() with struct_size() helper in order to avoid any
potential type mistakes or integer overflows that, in the worst
scenario, could lead to heap overflows.

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h |  3 ---
 net/ipv4/igmp.c      | 20 +++++++++++++-------
 2 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 64ce8cd1cfaf..93c262ecbdc9 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -41,9 +41,6 @@ struct ip_sf_socklist {
 	__be32			sl_addr[];
 };
 
-#define IP_SFLSIZE(count)	(sizeof(struct ip_sf_socklist) + \
-	(count) * sizeof(__be32))
-
 #define IP_SFBLOCK	10	/* allocate this many at once */
 
 /* ip_mc_socklist is real list now. Speed is not argument;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c2d477eb6825..7e5072722f05 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2233,7 +2233,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
 			iml->sfmode, psf->sl_count, psf->sl_addr, 0);
 	RCU_INIT_POINTER(iml->sflist, NULL);
 	/* decrease mem now to avoid the memleak warning */
-	atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
+	atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc);
 	kfree_rcu(psf, rcu);
 	return err;
 }
@@ -2382,7 +2382,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 
 		if (psl)
 			count += psl->sl_max;
-		newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL);
+		newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
+				      GFP_KERNEL);
 		if (!newpsl) {
 			err = -ENOBUFS;
 			goto done;
@@ -2393,7 +2394,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 			for (i = 0; i < psl->sl_count; i++)
 				newpsl->sl_addr[i] = psl->sl_addr[i];
 			/* decrease mem now to avoid the memleak warning */
-			atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+			atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+				   &sk->sk_omem_alloc);
 			kfree_rcu(psl, rcu);
 		}
 		rcu_assign_pointer(pmc->sflist, newpsl);
@@ -2468,8 +2470,9 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 		goto done;
 	}
 	if (msf->imsf_numsrc) {
-		newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc),
-							   GFP_KERNEL);
+		newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
+						      msf->imsf_numsrc),
+				      GFP_KERNEL);
 		if (!newpsl) {
 			err = -ENOBUFS;
 			goto done;
@@ -2480,7 +2483,9 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 		err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
 			msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
 		if (err) {
-			sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
+			sock_kfree_s(sk, newpsl,
+				     struct_size(newpsl, sl_addr,
+						 newpsl->sl_max));
 			goto done;
 		}
 	} else {
@@ -2493,7 +2498,8 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 		(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
 			psl->sl_count, psl->sl_addr, 0);
 		/* decrease mem now to avoid the memleak warning */
-		atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+		atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+			   &sk->sk_omem_alloc);
 		kfree_rcu(psl, rcu);
 	} else
 		(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
-- 
cgit v1.2.3


From e11c0e258c1a87e478a7a44e7c3d9e8ea4b8438c Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Wed, 4 Aug 2021 16:43:52 -0500
Subject: net/ipv6/mcast: Use struct_size() helper

Replace IP6_SFLSIZE() with struct_size() helper in order to avoid any
potential type mistakes or integer overflows that, in the worst
scenario, could lead to heap overflows.

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/if_inet6.h |  3 ---
 net/ipv6/mcast.c       | 20 +++++++++++++-------
 2 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 71bb4cc4d05d..42235c178b06 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -82,9 +82,6 @@ struct ip6_sf_socklist {
 	struct in6_addr		sl_addr[];
 };
 
-#define IP6_SFLSIZE(count)	(sizeof(struct ip6_sf_socklist) + \
-	(count) * sizeof(struct in6_addr))
-
 #define IP6_SFBLOCK	10	/* allocate this many at once */
 
 struct ipv6_mc_socklist {
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 54ec163fbafa..cd951faa2fac 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -447,7 +447,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 
 		if (psl)
 			count += psl->sl_max;
-		newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_KERNEL);
+		newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
+				      GFP_KERNEL);
 		if (!newpsl) {
 			err = -ENOBUFS;
 			goto done;
@@ -457,7 +458,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 		if (psl) {
 			for (i = 0; i < psl->sl_count; i++)
 				newpsl->sl_addr[i] = psl->sl_addr[i];
-			atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+			atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+				   &sk->sk_omem_alloc);
 			kfree_rcu(psl, rcu);
 		}
 		psl = newpsl;
@@ -525,8 +527,9 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
 		goto done;
 	}
 	if (gsf->gf_numsrc) {
-		newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc),
-							  GFP_KERNEL);
+		newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
+						      gsf->gf_numsrc),
+				      GFP_KERNEL);
 		if (!newpsl) {
 			err = -ENOBUFS;
 			goto done;
@@ -543,7 +546,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
 				     newpsl->sl_count, newpsl->sl_addr, 0);
 		if (err) {
 			mutex_unlock(&idev->mc_lock);
-			sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max));
+			sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr,
+							     newpsl->sl_max));
 			goto done;
 		}
 		mutex_unlock(&idev->mc_lock);
@@ -559,7 +563,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
 	if (psl) {
 		ip6_mc_del_src(idev, group, pmc->sfmode,
 			       psl->sl_count, psl->sl_addr, 0);
-		atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+		atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+			   &sk->sk_omem_alloc);
 		kfree_rcu(psl, rcu);
 	} else {
 		ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
@@ -2607,7 +2612,8 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
 		err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
 				     psl->sl_count, psl->sl_addr, 0);
 		RCU_INIT_POINTER(iml->sflist, NULL);
-		atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+		atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+			   &sk->sk_omem_alloc);
 		kfree_rcu(psl, rcu);
 	}
 
-- 
cgit v1.2.3


From 9344988d2979ce9eefe136a69efcf692615ebba8 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 30 Jul 2021 15:14:22 +0200
Subject: netfilter: ctnetlink: allow to filter dump by status bits

If CTA_STATUS is present, but CTA_STATUS_MASK is not, then the
mask is automatically set to 'status', so that kernel returns those
entries that have all of the requested bits set.

This makes more sense than using a all-one mask since we'd hardly
ever find a match.

There are no other checks for status bits, so if e.g. userspace
sets impossible combinations it will get an empty dump.

If kernel would reject unknown status bits, then a program that works on
a future kernel that has IPS_FOO bit fails on old kernels.

Same for 'impossible' combinations:

Kernel never sets ASSURED without first having set SEEN_REPLY, but its
possible that a future kernel could do so.

Therefore no sanity tests other than a 0-mask.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_conntrack.h |  1 +
 net/netfilter/nf_conntrack_netlink.c               | 34 +++++++++++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index d8484be72fdc..c6e6d7d7d538 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -56,6 +56,7 @@ enum ctattr_type {
 	CTA_LABELS_MASK,
 	CTA_SYNPROXY,
 	CTA_FILTER,
+	CTA_STATUS_MASK,
 	__CTA_MAX
 };
 #define CTA_MAX (__CTA_MAX - 1)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index e8368e66b0f5..eb35c6151fb0 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -868,6 +868,7 @@ struct ctnetlink_filter {
 	struct nf_conntrack_zone zone;
 
 	struct ctnetlink_filter_u32 mark;
+	struct ctnetlink_filter_u32 status;
 };
 
 static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = {
@@ -927,6 +928,28 @@ static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark,
 	return 0;
 }
 
+static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status,
+					 const struct nlattr * const cda[])
+{
+	if (cda[CTA_STATUS]) {
+		status->val = ntohl(nla_get_be32(cda[CTA_STATUS]));
+		if (cda[CTA_STATUS_MASK])
+			status->mask = ntohl(nla_get_be32(cda[CTA_STATUS_MASK]));
+		else
+			status->mask = status->val;
+
+		/* status->val == 0? always true, else always false. */
+		if (status->mask == 0)
+			return -EINVAL;
+	} else if (cda[CTA_STATUS_MASK]) {
+		return -EINVAL;
+	}
+
+	/* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */
+	BUILD_BUG_ON(__IPS_MAX_BIT >= 32);
+	return 0;
+}
+
 static struct ctnetlink_filter *
 ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
 {
@@ -948,6 +971,10 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
 	if (err)
 		goto err_filter;
 
+	err = ctnetlink_filter_parse_status(&filter->status, cda);
+	if (err)
+		goto err_filter;
+
 	if (!cda[CTA_FILTER])
 		return filter;
 
@@ -1001,7 +1028,7 @@ err_filter:
 
 static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda)
 {
-	return family || cda[CTA_MARK] || cda[CTA_FILTER];
+	return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS];
 }
 
 static int ctnetlink_start(struct netlink_callback *cb)
@@ -1094,6 +1121,7 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
 {
 	struct ctnetlink_filter *filter = data;
 	struct nf_conntrack_tuple *tuple;
+	u32 status;
 
 	if (filter == NULL)
 		goto out;
@@ -1125,6 +1153,9 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
 	if ((ct->mark & filter->mark.mask) != filter->mark.val)
 		goto ignore_entry;
 #endif
+	status = (u32)READ_ONCE(ct->status);
+	if ((status & filter->status.mask) != filter->status.val)
+		goto ignore_entry;
 
 out:
 	return 1;
@@ -1507,6 +1538,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
 	[CTA_LABELS_MASK]	= { .type = NLA_BINARY,
 				    .len = NF_CT_LABELS_MAX_SIZE },
 	[CTA_FILTER]		= { .type = NLA_NESTED },
+	[CTA_STATUS_MASK]	= { .type = NLA_U32 },
 };
 
 static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
-- 
cgit v1.2.3


From b37a466837393af72fe8bcb8f1436410f3f173f3 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Thu, 5 Aug 2021 19:54:34 +0800
Subject: netdevice: add the case if dev is NULL

Add the case if dev is NULL in dev_{put, hold}, so the caller doesn't
need to care whether dev is NULL or not.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1b4d4509d04b..135c943699d0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4143,11 +4143,13 @@ void netdev_run_todo(void);
  */
 static inline void dev_put(struct net_device *dev)
 {
+	if (dev) {
 #ifdef CONFIG_PCPU_DEV_REFCNT
-	this_cpu_dec(*dev->pcpu_refcnt);
+		this_cpu_dec(*dev->pcpu_refcnt);
 #else
-	refcount_dec(&dev->dev_refcnt);
+		refcount_dec(&dev->dev_refcnt);
 #endif
+	}
 }
 
 /**
@@ -4158,11 +4160,13 @@ static inline void dev_put(struct net_device *dev)
  */
 static inline void dev_hold(struct net_device *dev)
 {
+	if (dev) {
 #ifdef CONFIG_PCPU_DEV_REFCNT
-	this_cpu_inc(*dev->pcpu_refcnt);
+		this_cpu_inc(*dev->pcpu_refcnt);
 #else
-	refcount_inc(&dev->dev_refcnt);
+		refcount_inc(&dev->dev_refcnt);
 #endif
+	}
 }
 
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
-- 
cgit v1.2.3


From 6ec566131de0e330fffd33753b325fb4d6b2d00e Mon Sep 17 00:00:00 2001
From: Tedd Ho-Jeong An <tedd.an@intel.com>
Date: Wed, 4 Aug 2021 17:32:08 -0700
Subject: Bluetooth: Add support hdev to allocate private data

This patch adds support hdev to allocate extra size for private data.
The size of private data is specified in the hdev_alloc_size(priv_size)
and the allocated buffer can be accessed with hci_get_priv(hdev).

Signed-off-by: Tedd Ho-Jeong An <tedd.an@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h | 13 ++++++++++++-
 net/bluetooth/hci_core.c         | 13 ++++++++++---
 2 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index b011eeea28c3..a7d06d7da602 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1224,10 +1224,21 @@ static inline void hci_set_drvdata(struct hci_dev *hdev, void *data)
 	dev_set_drvdata(&hdev->dev, data);
 }
 
+static inline void *hci_get_priv(struct hci_dev *hdev)
+{
+	return (char *)hdev + sizeof(*hdev);
+}
+
 struct hci_dev *hci_dev_get(int index);
 struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, u8 src_type);
 
-struct hci_dev *hci_alloc_dev(void);
+struct hci_dev *hci_alloc_dev_priv(int sizeof_priv);
+
+static inline struct hci_dev *hci_alloc_dev(void)
+{
+	return hci_alloc_dev_priv(0);
+}
+
 void hci_free_dev(struct hci_dev *hdev);
 int hci_register_dev(struct hci_dev *hdev);
 void hci_unregister_dev(struct hci_dev *hdev);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 2b78e1336c53..cb2e9e513907 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3751,11 +3751,18 @@ done:
 }
 
 /* Alloc HCI device */
-struct hci_dev *hci_alloc_dev(void)
+struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
 {
 	struct hci_dev *hdev;
+	unsigned int alloc_size;
 
-	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
+	alloc_size = sizeof(*hdev);
+	if (sizeof_priv) {
+		/* Fixme: May need ALIGN-ment? */
+		alloc_size += sizeof_priv;
+	}
+
+	hdev = kzalloc(alloc_size, GFP_KERNEL);
 	if (!hdev)
 		return NULL;
 
@@ -3869,7 +3876,7 @@ struct hci_dev *hci_alloc_dev(void)
 
 	return hdev;
 }
-EXPORT_SYMBOL(hci_alloc_dev);
+EXPORT_SYMBOL(hci_alloc_dev_priv);
 
 /* Free HCI device */
 void hci_free_dev(struct hci_dev *hdev)
-- 
cgit v1.2.3


From 97a8a8c1f985baf13a3d0d252b787850330d2ea7 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Tue, 3 Aug 2021 16:19:46 -0700
Subject: net/mlx5: Return mdev from eswitch

Export a function so users can retrieve the mellanox device that manages
the eswitch from the eswitch device.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 12 ++++++++++++
 include/linux/mlx5/eswitch.h                      |  6 ++++++
 2 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 97e6cb6f13c1..b65a472067d2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -2384,3 +2384,15 @@ u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev)
 	return mlx5_esw_allowed(esw) ? esw->total_vports : 0;
 }
 EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports);
+
+/**
+ * mlx5_eswitch_get_core_dev - Get the mdev device
+ * @esw : eswitch device.
+ *
+ * Return the mellanox core device which manages the eswitch.
+ */
+struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw)
+{
+	return mlx5_esw_allowed(esw) ? esw->dev : NULL;
+}
+EXPORT_SYMBOL(mlx5_eswitch_get_core_dev);
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index bc7db2e059eb..c2a34ff85188 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -128,6 +128,7 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw,
 
 u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev);
 u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev);
+struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw);
 
 #else  /* CONFIG_MLX5_ESWITCH */
 
@@ -171,6 +172,11 @@ static inline u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev)
 	return 0;
 }
 
+static inline struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw)
+{
+	return NULL;
+}
+
 #endif /* CONFIG_MLX5_ESWITCH */
 
 static inline bool is_mdev_switchdev_mode(struct mlx5_core_dev *dev)
-- 
cgit v1.2.3


From af8c0e25f249abf8829f0cfa074b08d7398e3e38 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Tue, 3 Aug 2021 16:19:47 -0700
Subject: net/mlx5: Lag, add initial logic for shared FDB

As shared FDB requires changes in two subsystems first expose the needed
core functions so the RDMA side can be changed.

mlx5_lag_is_master(): return true if a given mlx5 device is the lag master.
mlx5_lag_is_shared_fdb(): Returns true if the lag mode is shared FDB.
mlx5_lag_get_peer_mdev(): Return the peer mdev in lag.

The mentioned functions will be used by downstream patches in order
to add support for shared FDB for the RDMA side.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/lag.c | 49 +++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/lag.h |  1 +
 include/linux/mlx5/driver.h                   |  3 ++
 3 files changed, 53 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
index 5c043c5cc403..3049de648256 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@@ -746,6 +746,21 @@ bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
 }
 EXPORT_SYMBOL(mlx5_lag_is_active);
 
+bool mlx5_lag_is_master(struct mlx5_core_dev *dev)
+{
+	struct mlx5_lag *ldev;
+	bool res;
+
+	spin_lock(&lag_lock);
+	ldev = mlx5_lag_dev(dev);
+	res = ldev && __mlx5_lag_is_active(ldev) &&
+		dev == ldev->pf[MLX5_LAG_P1].dev;
+	spin_unlock(&lag_lock);
+
+	return res;
+}
+EXPORT_SYMBOL(mlx5_lag_is_master);
+
 bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
 {
 	struct mlx5_lag *ldev;
@@ -760,6 +775,20 @@ bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
 }
 EXPORT_SYMBOL(mlx5_lag_is_sriov);
 
+bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
+{
+	struct mlx5_lag *ldev;
+	bool res;
+
+	spin_lock(&lag_lock);
+	ldev = mlx5_lag_dev(dev);
+	res = ldev && __mlx5_lag_is_sriov(ldev) && ldev->shared_fdb;
+	spin_unlock(&lag_lock);
+
+	return res;
+}
+EXPORT_SYMBOL(mlx5_lag_is_shared_fdb);
+
 void mlx5_lag_update(struct mlx5_core_dev *dev)
 {
 	struct mlx5_lag *ldev;
@@ -827,6 +856,26 @@ unlock:
 }
 EXPORT_SYMBOL(mlx5_lag_get_slave_port);
 
+struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_dev *peer_dev = NULL;
+	struct mlx5_lag *ldev;
+
+	spin_lock(&lag_lock);
+	ldev = mlx5_lag_dev(dev);
+	if (!ldev)
+		goto unlock;
+
+	peer_dev = ldev->pf[MLX5_LAG_P1].dev == dev ?
+			   ldev->pf[MLX5_LAG_P2].dev :
+			   ldev->pf[MLX5_LAG_P1].dev;
+
+unlock:
+	spin_unlock(&lag_lock);
+	return peer_dev;
+}
+EXPORT_SYMBOL(mlx5_lag_get_peer_mdev);
+
 int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
 				 u64 *values,
 				 int num_counters,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag.h
index 191392c37558..70b244b1a09e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.h
@@ -39,6 +39,7 @@ struct lag_tracker {
  */
 struct mlx5_lag {
 	u8                        flags;
+	bool			  shared_fdb;
 	u8                        v2p_map[MLX5_MAX_PORTS];
 	struct kref               ref;
 	struct lag_func           pf[MLX5_MAX_PORTS];
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1efe37466969..af4dd6e9f97f 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1138,6 +1138,8 @@ bool mlx5_lag_is_roce(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_active(struct mlx5_core_dev *dev);
+bool mlx5_lag_is_master(struct mlx5_core_dev *dev);
+bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev);
 struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev);
 u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
 			   struct net_device *slave);
@@ -1145,6 +1147,7 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
 				 u64 *values,
 				 int num_counters,
 				 size_t *offsets);
+struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev);
 struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev);
 void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up);
 int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
-- 
cgit v1.2.3


From 979bf468fc543444eb750c8f8817407f509bd504 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Tue, 3 Aug 2021 16:19:49 -0700
Subject: {net, RDMA}/mlx5: Extend send to vport rules

In shared FDB there is only one eswitch which is active and it receives
traffic from all representors and all vports in the HCA.

While the Ethernet representor will always reside on its native PF
the IB representor will not. Extend send to vport rule creation to
support such flows. Need to account for source vport that sends the
traffic (on which the representors resides) and the target eswitch
the traffic which reach.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.c                        | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c           | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 5 +++--
 include/linux/mlx5/eswitch.h                               | 1 +
 4 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index b25e0b33a11a..bf5a6e4d1c03 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -123,7 +123,7 @@ struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
 
 	rep = dev->port[port - 1].rep;
 
-	return mlx5_eswitch_add_send_to_vport_rule(esw, rep, sq->base.mqp.qpn);
+	return mlx5_eswitch_add_send_to_vport_rule(esw, esw, rep, sq->base.mqp.qpn);
 }
 
 static int mlx5r_rep_probe(struct auxiliary_device *adev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index bf94bcb6fa5d..1d016cc64015 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -337,7 +337,7 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
 		}
 
 		/* Add re-inject rule to the PF/representor sqs */
-		flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw, rep,
+		flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw, esw, rep,
 								sqns_array[i]);
 		if (IS_ERR(flow_rule)) {
 			err = PTR_ERR(flow_rule);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 7579f3402776..12567002997f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -925,6 +925,7 @@ out:
 
 struct mlx5_flow_handle *
 mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw,
+				    struct mlx5_eswitch *from_esw,
 				    struct mlx5_eswitch_rep *rep,
 				    u32 sqn)
 {
@@ -943,10 +944,10 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw,
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
 	MLX5_SET(fte_match_set_misc, misc, source_sqn, sqn);
 	/* source vport is the esw manager */
-	MLX5_SET(fte_match_set_misc, misc, source_port, rep->esw->manager_vport);
+	MLX5_SET(fte_match_set_misc, misc, source_port, from_esw->manager_vport);
 	if (MLX5_CAP_ESW(on_esw->dev, merged_eswitch))
 		MLX5_SET(fte_match_set_misc, misc, source_eswitch_owner_vhca_id,
-			 MLX5_CAP_GEN(rep->esw->dev, vhca_id));
+			 MLX5_CAP_GEN(from_esw->dev, vhca_id));
 
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
 	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_sqn);
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index c2a34ff85188..0bfcf7b8ecf9 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -63,6 +63,7 @@ struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw,
 void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type);
 struct mlx5_flow_handle *
 mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw,
+				    struct mlx5_eswitch *from_esw,
 				    struct mlx5_eswitch_rep *rep, u32 sqn);
 
 #ifdef CONFIG_MLX5_ESWITCH
-- 
cgit v1.2.3


From c8e6a9e6d6bb29db08e0b69ae97f1e46ccc5691c Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Tue, 3 Aug 2021 16:19:54 -0700
Subject: net/mlx5: E-Switch, Add event callback for representors

This callback will allow to notify representors about relevant events
when in OFFLOADS mode. In downstream patches, this will be used to notify
about PAIR/UNPAIR devcom events.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 50 ++++++++++++++++++++--
 include/linux/mlx5/eswitch.h                       |  9 ++++
 2 files changed, 56 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index dd5eadd6047b..b57a5c188832 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2316,11 +2316,22 @@ void esw_offloads_unload_rep(struct mlx5_eswitch *esw, u16 vport_num)
 #define ESW_OFFLOADS_DEVCOM_PAIR	(0)
 #define ESW_OFFLOADS_DEVCOM_UNPAIR	(1)
 
-static int mlx5_esw_offloads_pair(struct mlx5_eswitch *esw,
-				  struct mlx5_eswitch *peer_esw)
+static void mlx5_esw_offloads_rep_event_unpair(struct mlx5_eswitch *esw)
 {
+	const struct mlx5_eswitch_rep_ops *ops;
+	struct mlx5_eswitch_rep *rep;
+	unsigned long i;
+	u8 rep_type;
 
-	return esw_add_fdb_peer_miss_rules(esw, peer_esw->dev);
+	mlx5_esw_for_each_rep(esw, i, rep) {
+		rep_type = NUM_REP_TYPES;
+		while (rep_type--) {
+			ops = esw->offloads.rep_ops[rep_type];
+			if (atomic_read(&rep->rep_data[rep_type].state) == REP_LOADED &&
+			    ops->event)
+				ops->event(esw, rep, MLX5_SWITCHDEV_EVENT_UNPAIR, NULL);
+		}
+	}
 }
 
 static void mlx5_esw_offloads_unpair(struct mlx5_eswitch *esw)
@@ -2328,9 +2339,42 @@ static void mlx5_esw_offloads_unpair(struct mlx5_eswitch *esw)
 #if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
 	mlx5e_tc_clean_fdb_peer_flows(esw);
 #endif
+	mlx5_esw_offloads_rep_event_unpair(esw);
 	esw_del_fdb_peer_miss_rules(esw);
 }
 
+static int mlx5_esw_offloads_pair(struct mlx5_eswitch *esw,
+				  struct mlx5_eswitch *peer_esw)
+{
+	const struct mlx5_eswitch_rep_ops *ops;
+	struct mlx5_eswitch_rep *rep;
+	unsigned long i;
+	u8 rep_type;
+	int err;
+
+	err = esw_add_fdb_peer_miss_rules(esw, peer_esw->dev);
+	if (err)
+		return err;
+
+	mlx5_esw_for_each_rep(esw, i, rep) {
+		for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) {
+			ops = esw->offloads.rep_ops[rep_type];
+			if (atomic_read(&rep->rep_data[rep_type].state) == REP_LOADED &&
+			    ops->event) {
+				err = ops->event(esw, rep, MLX5_SWITCHDEV_EVENT_PAIR, peer_esw);
+				if (err)
+					goto err_out;
+			}
+		}
+	}
+
+	return 0;
+
+err_out:
+	mlx5_esw_offloads_unpair(esw);
+	return err;
+}
+
 static int mlx5_esw_offloads_set_ns_peer(struct mlx5_eswitch *esw,
 					 struct mlx5_eswitch *peer_esw,
 					 bool pair)
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 0bfcf7b8ecf9..4ab5c1fc1270 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -29,11 +29,20 @@ enum {
 	REP_LOADED,
 };
 
+enum mlx5_switchdev_event {
+	MLX5_SWITCHDEV_EVENT_PAIR,
+	MLX5_SWITCHDEV_EVENT_UNPAIR,
+};
+
 struct mlx5_eswitch_rep;
 struct mlx5_eswitch_rep_ops {
 	int (*load)(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep);
 	void (*unload)(struct mlx5_eswitch_rep *rep);
 	void *(*get_proto_dev)(struct mlx5_eswitch_rep *rep);
+	int (*event)(struct mlx5_eswitch *esw,
+		     struct mlx5_eswitch_rep *rep,
+		     enum mlx5_switchdev_event event,
+		     void *data);
 };
 
 struct mlx5_eswitch_rep_data {
-- 
cgit v1.2.3


From c73c57081b3d59aa99093fbedced32ea02620cd3 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 6 Aug 2021 03:20:08 +0300
Subject: net: dsa: don't disable multicast flooding to the CPU even without an
 IGMP querier

Commit 08cc83cc7fd8 ("net: dsa: add support for BRIDGE_MROUTER
attribute") added an option for users to turn off multicast flooding
towards the CPU if they turn off the IGMP querier on a bridge which
already has enslaved ports (echo 0 > /sys/class/net/br0/bridge/multicast_router).

And commit a8b659e7ff75 ("net: dsa: act as passthrough for bridge port flags")
simply papered over that issue, because it moved the decision to flood
the CPU with multicast (or not) from the DSA core down to individual drivers,
instead of taking a more radical position then.

The truth is that disabling multicast flooding to the CPU is simply
something we are not prepared to do now, if at all. Some reasons:

- ICMP6 neighbor solicitation messages are unregistered multicast
  packets as far as the bridge is concerned. So if we stop flooding
  multicast, the outside world cannot ping the bridge device's IPv6
  link-local address.

- There might be foreign interfaces bridged with our DSA switch ports
  (sending a packet towards the host does not necessarily equal
  termination, but maybe software forwarding). So if there is no one
  interested in that multicast traffic in the local network stack, that
  doesn't mean nobody is.

- PTP over L4 (IPv4, IPv6) is multicast, but is unregistered as far as
  the bridge is concerned. This should reach the CPU port.

- The switch driver might not do FDB partitioning. And since we don't
  even bother to do more fine-grained flood disabling (such as "disable
  flooding _from_port_N_ towards the CPU port" as opposed to "disable
  flooding _from_any_port_ towards the CPU port"), this breaks standalone
  ports, or even multiple bridges where one has an IGMP querier and one
  doesn't.

Reverting the logic makes all of the above work.

Fixes: a8b659e7ff75 ("net: dsa: act as passthrough for bridge port flags")
Fixes: 08cc83cc7fd8 ("net: dsa: add support for BRIDGE_MROUTER attribute")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c | 10 ----------
 drivers/net/dsa/b53/b53_priv.h   |  2 --
 drivers/net/dsa/bcm_sf2.c        |  1 -
 drivers/net/dsa/mv88e6xxx/chip.c | 18 ------------------
 include/net/dsa.h                |  2 --
 net/dsa/dsa_priv.h               |  2 --
 net/dsa/port.c                   | 11 -----------
 net/dsa/slave.c                  |  6 ------
 8 files changed, 52 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index b23e3488695b..bd1417a66cbf 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -2016,15 +2016,6 @@ int b53_br_flags(struct dsa_switch *ds, int port,
 }
 EXPORT_SYMBOL(b53_br_flags);
 
-int b53_set_mrouter(struct dsa_switch *ds, int port, bool mrouter,
-		    struct netlink_ext_ack *extack)
-{
-	b53_port_set_mcast_flood(ds->priv, port, mrouter);
-
-	return 0;
-}
-EXPORT_SYMBOL(b53_set_mrouter);
-
 static bool b53_possible_cpu_port(struct dsa_switch *ds, int port)
 {
 	/* Broadcom switches will accept enabling Broadcom tags on the
@@ -2268,7 +2259,6 @@ static const struct dsa_switch_ops b53_switch_ops = {
 	.port_bridge_leave	= b53_br_leave,
 	.port_pre_bridge_flags	= b53_br_flags_pre,
 	.port_bridge_flags	= b53_br_flags,
-	.port_set_mrouter	= b53_set_mrouter,
 	.port_stp_state_set	= b53_br_set_stp_state,
 	.port_fast_age		= b53_br_fast_age,
 	.port_vlan_filtering	= b53_vlan_filtering,
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 82700a5714c1..9bf8319342b0 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -328,8 +328,6 @@ int b53_br_flags_pre(struct dsa_switch *ds, int port,
 int b53_br_flags(struct dsa_switch *ds, int port,
 		 struct switchdev_brport_flags flags,
 		 struct netlink_ext_ack *extack);
-int b53_set_mrouter(struct dsa_switch *ds, int port, bool mrouter,
-		    struct netlink_ext_ack *extack);
 int b53_setup_devlink_resources(struct dsa_switch *ds);
 void b53_port_event(struct dsa_switch *ds, int port);
 void b53_phylink_validate(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 3b018fcf4412..6ce9ec1283e0 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -1199,7 +1199,6 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
 	.port_pre_bridge_flags	= b53_br_flags_pre,
 	.port_bridge_flags	= b53_br_flags,
 	.port_stp_state_set	= b53_br_set_stp_state,
-	.port_set_mrouter	= b53_set_mrouter,
 	.port_fast_age		= b53_br_fast_age,
 	.port_vlan_filtering	= b53_vlan_filtering,
 	.port_vlan_add		= b53_vlan_add,
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index ddb51dd132ef..c2c5f1573fe5 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -5849,23 +5849,6 @@ out:
 	return err;
 }
 
-static int mv88e6xxx_port_set_mrouter(struct dsa_switch *ds, int port,
-				      bool mrouter,
-				      struct netlink_ext_ack *extack)
-{
-	struct mv88e6xxx_chip *chip = ds->priv;
-	int err;
-
-	if (!chip->info->ops->port_set_mcast_flood)
-		return -EOPNOTSUPP;
-
-	mv88e6xxx_reg_lock(chip);
-	err = chip->info->ops->port_set_mcast_flood(chip, port, mrouter);
-	mv88e6xxx_reg_unlock(chip);
-
-	return err;
-}
-
 static bool mv88e6xxx_lag_can_offload(struct dsa_switch *ds,
 				      struct net_device *lag,
 				      struct netdev_lag_upper_info *info)
@@ -6167,7 +6150,6 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = {
 	.port_bridge_leave	= mv88e6xxx_port_bridge_leave,
 	.port_pre_bridge_flags	= mv88e6xxx_port_pre_bridge_flags,
 	.port_bridge_flags	= mv88e6xxx_port_bridge_flags,
-	.port_set_mrouter	= mv88e6xxx_port_set_mrouter,
 	.port_stp_state_set	= mv88e6xxx_port_stp_state_set,
 	.port_fast_age		= mv88e6xxx_port_fast_age,
 	.port_vlan_filtering	= mv88e6xxx_port_vlan_filtering,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 7cc9507282d3..d7dc26d316ea 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -714,8 +714,6 @@ struct dsa_switch_ops {
 	int	(*port_bridge_flags)(struct dsa_switch *ds, int port,
 				     struct switchdev_brport_flags flags,
 				     struct netlink_ext_ack *extack);
-	int	(*port_set_mrouter)(struct dsa_switch *ds, int port, bool mrouter,
-				    struct netlink_ext_ack *extack);
 
 	/*
 	 * VLAN support
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index e43c5dc04282..7841b3957516 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -244,8 +244,6 @@ int dsa_port_pre_bridge_flags(const struct dsa_port *dp,
 int dsa_port_bridge_flags(const struct dsa_port *dp,
 			  struct switchdev_brport_flags flags,
 			  struct netlink_ext_ack *extack);
-int dsa_port_mrouter(struct dsa_port *dp, bool mrouter,
-		     struct netlink_ext_ack *extack);
 int dsa_port_vlan_add(struct dsa_port *dp,
 		      const struct switchdev_obj_port_vlan *vlan,
 		      struct netlink_ext_ack *extack);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index c18077a3c779..797a3269a964 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -641,17 +641,6 @@ int dsa_port_bridge_flags(const struct dsa_port *dp,
 	return ds->ops->port_bridge_flags(ds, dp->index, flags, extack);
 }
 
-int dsa_port_mrouter(struct dsa_port *dp, bool mrouter,
-		     struct netlink_ext_ack *extack)
-{
-	struct dsa_switch *ds = dp->ds;
-
-	if (!ds->ops->port_set_mrouter)
-		return -EOPNOTSUPP;
-
-	return ds->ops->port_set_mrouter(ds, dp->index, mrouter, extack);
-}
-
 int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu,
 			bool targeted_match)
 {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 6e1135d3ee33..022174635bc1 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -314,12 +314,6 @@ static int dsa_slave_port_attr_set(struct net_device *dev, const void *ctx,
 
 		ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack);
 		break;
-	case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER:
-		if (!dsa_port_offloads_bridge(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter, extack);
-		break;
 	default:
 		ret = -EOPNOTSUPP;
 		break;
-- 
cgit v1.2.3


From 82564f6c706a37e5f7dec962375581cc9f8fca5d Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sun, 8 Aug 2021 14:41:21 +0300
Subject: devlink: Simplify devlink port API calls

Devlink port already has pointer to the devlink instance and all API
calls that forward these devlink ports to the drivers perform same
"devlink_port->devlink" assignment before actual call.

This patch removes useless parameter and allows us in the future
to create specific devlink_port_ops to manage user space access with
reliable ops assignment.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 10 +--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  6 +-
 .../net/ethernet/mellanox/mlx5/core/sf/devlink.c   |  8 +-
 drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h    |  4 +-
 include/net/devlink.h                              | 12 ++-
 net/core/devlink.c                                 | 95 +++++++++++-----------
 6 files changed, 64 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 97e6cb6f13c1..2b90388ef209 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1889,8 +1889,7 @@ is_port_function_supported(struct mlx5_eswitch *esw, u16 vport_num)
 	       mlx5_esw_is_sf_vport(esw, vport_num);
 }
 
-int mlx5_devlink_port_function_hw_addr_get(struct devlink *devlink,
-					   struct devlink_port *port,
+int mlx5_devlink_port_function_hw_addr_get(struct devlink_port *port,
 					   u8 *hw_addr, int *hw_addr_len,
 					   struct netlink_ext_ack *extack)
 {
@@ -1899,7 +1898,7 @@ int mlx5_devlink_port_function_hw_addr_get(struct devlink *devlink,
 	int err = -EOPNOTSUPP;
 	u16 vport_num;
 
-	esw = mlx5_devlink_eswitch_get(devlink);
+	esw = mlx5_devlink_eswitch_get(port->devlink);
 	if (IS_ERR(esw))
 		return PTR_ERR(esw);
 
@@ -1923,8 +1922,7 @@ int mlx5_devlink_port_function_hw_addr_get(struct devlink *devlink,
 	return err;
 }
 
-int mlx5_devlink_port_function_hw_addr_set(struct devlink *devlink,
-					   struct devlink_port *port,
+int mlx5_devlink_port_function_hw_addr_set(struct devlink_port *port,
 					   const u8 *hw_addr, int hw_addr_len,
 					   struct netlink_ext_ack *extack)
 {
@@ -1933,7 +1931,7 @@ int mlx5_devlink_port_function_hw_addr_set(struct devlink *devlink,
 	int err = -EOPNOTSUPP;
 	u16 vport_num;
 
-	esw = mlx5_devlink_eswitch_get(devlink);
+	esw = mlx5_devlink_eswitch_get(port->devlink);
 	if (IS_ERR(esw)) {
 		NL_SET_ERR_MSG_MOD(extack, "Eswitch doesn't support set hw_addr");
 		return PTR_ERR(esw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index d562edf5b0bc..41eff9dd1bf6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -475,12 +475,10 @@ int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink,
 					struct netlink_ext_ack *extack);
 int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink,
 					enum devlink_eswitch_encap_mode *encap);
-int mlx5_devlink_port_function_hw_addr_get(struct devlink *devlink,
-					   struct devlink_port *port,
+int mlx5_devlink_port_function_hw_addr_get(struct devlink_port *port,
 					   u8 *hw_addr, int *hw_addr_len,
 					   struct netlink_ext_ack *extack);
-int mlx5_devlink_port_function_hw_addr_set(struct devlink *devlink,
-					   struct devlink_port *port,
+int mlx5_devlink_port_function_hw_addr_set(struct devlink_port *port,
 					   const u8 *hw_addr, int hw_addr_len,
 					   struct netlink_ext_ack *extack);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
index 1be048769309..720195c4be7c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
@@ -164,12 +164,12 @@ static bool mlx5_sf_is_active(const struct mlx5_sf *sf)
 	return sf->hw_state == MLX5_VHCA_STATE_ACTIVE || sf->hw_state == MLX5_VHCA_STATE_IN_USE;
 }
 
-int mlx5_devlink_sf_port_fn_state_get(struct devlink *devlink, struct devlink_port *dl_port,
+int mlx5_devlink_sf_port_fn_state_get(struct devlink_port *dl_port,
 				      enum devlink_port_fn_state *state,
 				      enum devlink_port_fn_opstate *opstate,
 				      struct netlink_ext_ack *extack)
 {
-	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_core_dev *dev = devlink_priv(dl_port->devlink);
 	struct mlx5_sf_table *table;
 	struct mlx5_sf *sf;
 	int err = 0;
@@ -248,11 +248,11 @@ out:
 	return err;
 }
 
-int mlx5_devlink_sf_port_fn_state_set(struct devlink *devlink, struct devlink_port *dl_port,
+int mlx5_devlink_sf_port_fn_state_set(struct devlink_port *dl_port,
 				      enum devlink_port_fn_state state,
 				      struct netlink_ext_ack *extack)
 {
-	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_core_dev *dev = devlink_priv(dl_port->devlink);
 	struct mlx5_sf_table *table;
 	struct mlx5_sf *sf;
 	int err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
index 81ce13b19ee8..3a480e06ecc0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
@@ -24,11 +24,11 @@ int mlx5_devlink_sf_port_new(struct devlink *devlink,
 			     unsigned int *new_port_index);
 int mlx5_devlink_sf_port_del(struct devlink *devlink, unsigned int port_index,
 			     struct netlink_ext_ack *extack);
-int mlx5_devlink_sf_port_fn_state_get(struct devlink *devlink, struct devlink_port *dl_port,
+int mlx5_devlink_sf_port_fn_state_get(struct devlink_port *dl_port,
 				      enum devlink_port_fn_state *state,
 				      enum devlink_port_fn_opstate *opstate,
 				      struct netlink_ext_ack *extack);
-int mlx5_devlink_sf_port_fn_state_set(struct devlink *devlink, struct devlink_port *dl_port,
+int mlx5_devlink_sf_port_fn_state_set(struct devlink_port *dl_port,
 				      enum devlink_port_fn_state state,
 				      struct netlink_ext_ack *extack);
 #else
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 08f4c6191e72..ccbfb3a844aa 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1396,8 +1396,8 @@ struct devlink_ops {
 	 *
 	 * Note: @extack can be NULL when port notifier queries the port function.
 	 */
-	int (*port_function_hw_addr_get)(struct devlink *devlink, struct devlink_port *port,
-					 u8 *hw_addr, int *hw_addr_len,
+	int (*port_function_hw_addr_get)(struct devlink_port *port, u8 *hw_addr,
+					 int *hw_addr_len,
 					 struct netlink_ext_ack *extack);
 	/**
 	 * @port_function_hw_addr_set: Port function's hardware address set function.
@@ -1406,7 +1406,7 @@ struct devlink_ops {
 	 * by the devlink port. Driver should return -EOPNOTSUPP if it doesn't support port
 	 * function handling for a particular port.
 	 */
-	int (*port_function_hw_addr_set)(struct devlink *devlink, struct devlink_port *port,
+	int (*port_function_hw_addr_set)(struct devlink_port *port,
 					 const u8 *hw_addr, int hw_addr_len,
 					 struct netlink_ext_ack *extack);
 	/**
@@ -1462,8 +1462,7 @@ struct devlink_ops {
 	 *
 	 * Return: 0 on success, negative value otherwise.
 	 */
-	int (*port_fn_state_get)(struct devlink *devlink,
-				 struct devlink_port *port,
+	int (*port_fn_state_get)(struct devlink_port *port,
 				 enum devlink_port_fn_state *state,
 				 enum devlink_port_fn_opstate *opstate,
 				 struct netlink_ext_ack *extack);
@@ -1478,8 +1477,7 @@ struct devlink_ops {
 	 *
 	 * Return: 0 on success, negative value otherwise.
 	 */
-	int (*port_fn_state_set)(struct devlink *devlink,
-				 struct devlink_port *port,
+	int (*port_fn_state_set)(struct devlink_port *port,
 				 enum devlink_port_fn_state state,
 				 struct netlink_ext_ack *extack);
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 8fa015319af6..ee95eee8d0ed 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -804,10 +804,11 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
 	return 0;
 }
 
-static int
-devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *ops,
-			     struct devlink_port *port, struct sk_buff *msg,
-			     struct netlink_ext_ack *extack, bool *msg_updated)
+static int devlink_port_fn_hw_addr_fill(const struct devlink_ops *ops,
+					struct devlink_port *port,
+					struct sk_buff *msg,
+					struct netlink_ext_ack *extack,
+					bool *msg_updated)
 {
 	u8 hw_addr[MAX_ADDR_LEN];
 	int hw_addr_len;
@@ -816,7 +817,8 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *
 	if (!ops->port_function_hw_addr_get)
 		return 0;
 
-	err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack);
+	err = ops->port_function_hw_addr_get(port, hw_addr, &hw_addr_len,
+					     extack);
 	if (err) {
 		if (err == -EOPNOTSUPP)
 			return 0;
@@ -893,12 +895,11 @@ devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate)
 	       opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED;
 }
 
-static int
-devlink_port_fn_state_fill(struct devlink *devlink,
-			   const struct devlink_ops *ops,
-			   struct devlink_port *port, struct sk_buff *msg,
-			   struct netlink_ext_ack *extack,
-			   bool *msg_updated)
+static int devlink_port_fn_state_fill(const struct devlink_ops *ops,
+				      struct devlink_port *port,
+				      struct sk_buff *msg,
+				      struct netlink_ext_ack *extack,
+				      bool *msg_updated)
 {
 	enum devlink_port_fn_opstate opstate;
 	enum devlink_port_fn_state state;
@@ -907,7 +908,7 @@ devlink_port_fn_state_fill(struct devlink *devlink,
 	if (!ops->port_fn_state_get)
 		return 0;
 
-	err = ops->port_fn_state_get(devlink, port, &state, &opstate, extack);
+	err = ops->port_fn_state_get(port, &state, &opstate, extack);
 	if (err) {
 		if (err == -EOPNOTSUPP)
 			return 0;
@@ -935,7 +936,6 @@ static int
 devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
 				   struct netlink_ext_ack *extack)
 {
-	struct devlink *devlink = port->devlink;
 	const struct devlink_ops *ops;
 	struct nlattr *function_attr;
 	bool msg_updated = false;
@@ -945,13 +945,12 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por
 	if (!function_attr)
 		return -EMSGSIZE;
 
-	ops = devlink->ops;
-	err = devlink_port_fn_hw_addr_fill(devlink, ops, port, msg,
-					   extack, &msg_updated);
+	ops = port->devlink->ops;
+	err = devlink_port_fn_hw_addr_fill(ops, port, msg, extack,
+					   &msg_updated);
 	if (err)
 		goto out;
-	err = devlink_port_fn_state_fill(devlink, ops, port, msg, extack,
-					 &msg_updated);
+	err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated);
 out:
 	if (err || !msg_updated)
 		nla_nest_cancel(msg, function_attr);
@@ -1269,31 +1268,33 @@ out:
 	return msg->len;
 }
 
-static int devlink_port_type_set(struct devlink *devlink,
-				 struct devlink_port *devlink_port,
+static int devlink_port_type_set(struct devlink_port *devlink_port,
 				 enum devlink_port_type port_type)
 
 {
 	int err;
 
-	if (devlink->ops->port_type_set) {
-		if (port_type == devlink_port->type)
-			return 0;
-		err = devlink->ops->port_type_set(devlink_port, port_type);
-		if (err)
-			return err;
-		devlink_port->desired_type = port_type;
-		devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+	if (devlink_port->devlink->ops->port_type_set)
+		return -EOPNOTSUPP;
+
+	if (port_type == devlink_port->type)
 		return 0;
-	}
-	return -EOPNOTSUPP;
+
+	err = devlink_port->devlink->ops->port_type_set(devlink_port,
+							port_type);
+	if (err)
+		return err;
+
+	devlink_port->desired_type = port_type;
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+	return 0;
 }
 
-static int
-devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port *port,
-				  const struct nlattr *attr, struct netlink_ext_ack *extack)
+static int devlink_port_function_hw_addr_set(struct devlink_port *port,
+					     const struct nlattr *attr,
+					     struct netlink_ext_ack *extack)
 {
-	const struct devlink_ops *ops;
+	const struct devlink_ops *ops = port->devlink->ops;
 	const u8 *hw_addr;
 	int hw_addr_len;
 
@@ -1314,17 +1315,16 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port *
 		}
 	}
 
-	ops = devlink->ops;
 	if (!ops->port_function_hw_addr_set) {
 		NL_SET_ERR_MSG_MOD(extack, "Port doesn't support function attributes");
 		return -EOPNOTSUPP;
 	}
 
-	return ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack);
+	return ops->port_function_hw_addr_set(port, hw_addr, hw_addr_len,
+					      extack);
 }
 
-static int devlink_port_fn_state_set(struct devlink *devlink,
-				     struct devlink_port *port,
+static int devlink_port_fn_state_set(struct devlink_port *port,
 				     const struct nlattr *attr,
 				     struct netlink_ext_ack *extack)
 {
@@ -1332,18 +1332,18 @@ static int devlink_port_fn_state_set(struct devlink *devlink,
 	const struct devlink_ops *ops;
 
 	state = nla_get_u8(attr);
-	ops = devlink->ops;
+	ops = port->devlink->ops;
 	if (!ops->port_fn_state_set) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Function does not support state setting");
 		return -EOPNOTSUPP;
 	}
-	return ops->port_fn_state_set(devlink, port, state, extack);
+	return ops->port_fn_state_set(port, state, extack);
 }
 
-static int
-devlink_port_function_set(struct devlink *devlink, struct devlink_port *port,
-			  const struct nlattr *attr, struct netlink_ext_ack *extack)
+static int devlink_port_function_set(struct devlink_port *port,
+				     const struct nlattr *attr,
+				     struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1];
 	int err;
@@ -1357,7 +1357,7 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port,
 
 	attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR];
 	if (attr) {
-		err = devlink_port_function_hw_addr_set(devlink, port, attr, extack);
+		err = devlink_port_function_hw_addr_set(port, attr, extack);
 		if (err)
 			return err;
 	}
@@ -1367,7 +1367,7 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port,
 	 */
 	attr = tb[DEVLINK_PORT_FN_ATTR_STATE];
 	if (attr)
-		err = devlink_port_fn_state_set(devlink, port, attr, extack);
+		err = devlink_port_fn_state_set(port, attr, extack);
 
 	if (!err)
 		devlink_port_notify(port, DEVLINK_CMD_PORT_NEW);
@@ -1378,14 +1378,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
 					struct genl_info *info)
 {
 	struct devlink_port *devlink_port = info->user_ptr[1];
-	struct devlink *devlink = devlink_port->devlink;
 	int err;
 
 	if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
 		enum devlink_port_type port_type;
 
 		port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
-		err = devlink_port_type_set(devlink, devlink_port, port_type);
+		err = devlink_port_type_set(devlink_port, port_type);
 		if (err)
 			return err;
 	}
@@ -1394,7 +1393,7 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
 		struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION];
 		struct netlink_ext_ack *extack = info->extack;
 
-		err = devlink_port_function_set(devlink, devlink_port, attr, extack);
+		err = devlink_port_function_set(devlink_port, attr, extack);
 		if (err)
 			return err;
 	}
-- 
cgit v1.2.3


From 045c45d1f598c65806f885b59f6fbc4cebb62b15 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 8 Aug 2021 17:35:23 +0300
Subject: net: dsa: centralize fast ageing when address learning is turned off

Currently DSA leaves it down to device drivers to fast age the FDB on a
port when address learning is disabled on it. There are 2 reasons for
doing that in the first place:

- when address learning is disabled by user space, through
  IFLA_BRPORT_LEARNING or the brport_attr_learning sysfs, what user
  space typically wants to achieve is to operate in a mode with no
  dynamic FDB entry on that port. But if the port is already up, some
  addresses might have been already learned on it, and it seems silly to
  wait for 5 minutes for them to expire until something useful can be
  done.

- when a port leaves a bridge and becomes standalone, DSA turns off
  address learning on it. This also has the nice side effect of flushing
  the dynamically learned bridge FDB entries on it, which is a good idea
  because standalone ports should not have bridge FDB entries on them.

We let drivers manage fast ageing under this condition because if DSA
were to do it, it would need to track each port's learning state, and
act upon the transition, which it currently doesn't.

But there are 2 reasons why doing it is better after all:

- drivers might get it wrong and not do it (see b53_port_set_learning)

- we would like to flush the dynamic entries from the software bridge
  too, and letting drivers do that would be another pain point

So track the port learning state and trigger a fast age process
automatically within DSA.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c |  7 -------
 include/net/dsa.h                |  1 +
 net/dsa/dsa_priv.h               |  2 +-
 net/dsa/port.c                   | 35 +++++++++++++++++++++++++++++++----
 4 files changed, 33 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index c2c5f1573fe5..c45ca2473743 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -5797,7 +5797,6 @@ static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port,
 				       struct netlink_ext_ack *extack)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
-	bool do_fast_age = false;
 	int err = -EOPNOTSUPP;
 
 	mv88e6xxx_reg_lock(chip);
@@ -5809,9 +5808,6 @@ static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port,
 		err = mv88e6xxx_port_set_assoc_vector(chip, port, pav);
 		if (err)
 			goto out;
-
-		if (!learning)
-			do_fast_age = true;
 	}
 
 	if (flags.mask & BR_FLOOD) {
@@ -5843,9 +5839,6 @@ static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port,
 out:
 	mv88e6xxx_reg_unlock(chip);
 
-	if (do_fast_age)
-		mv88e6xxx_port_fast_age(ds, port);
-
 	return err;
 }
 
diff --git a/include/net/dsa.h b/include/net/dsa.h
index d7dc26d316ea..995e9d3f9cfc 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -254,6 +254,7 @@ struct dsa_port {
 	struct device_node	*dn;
 	unsigned int		ageing_time;
 	bool			vlan_filtering;
+	bool			learning;
 	u8			stp_state;
 	struct net_device	*bridge_dev;
 	int			bridge_num;
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 8dad40b2cf5c..9575cabd3ec3 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -241,7 +241,7 @@ int dsa_port_host_mdb_del(const struct dsa_port *dp,
 int dsa_port_pre_bridge_flags(const struct dsa_port *dp,
 			      struct switchdev_brport_flags flags,
 			      struct netlink_ext_ack *extack);
-int dsa_port_bridge_flags(const struct dsa_port *dp,
+int dsa_port_bridge_flags(struct dsa_port *dp,
 			  struct switchdev_brport_flags flags,
 			  struct netlink_ext_ack *extack);
 int dsa_port_vlan_add(struct dsa_port *dp,
diff --git a/net/dsa/port.c b/net/dsa/port.c
index ef5e08b09bb7..d6a35a03acd6 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -30,6 +30,16 @@ static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v)
 	return dsa_tree_notify(dp->ds->dst, e, v);
 }
 
+static void dsa_port_fast_age(const struct dsa_port *dp)
+{
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->port_fast_age)
+		return;
+
+	ds->ops->port_fast_age(ds, dp->index);
+}
+
 int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age)
 {
 	struct dsa_switch *ds = dp->ds;
@@ -40,7 +50,7 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age)
 
 	ds->ops->port_stp_state_set(ds, port, state);
 
-	if (do_fast_age && ds->ops->port_fast_age) {
+	if (do_fast_age) {
 		/* Fast age FDB entries or flush appropriate forwarding database
 		 * for the given port, if we are moving it from Learning or
 		 * Forwarding state, to Disabled or Blocking or Listening state.
@@ -54,7 +64,7 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age)
 		    (state == BR_STATE_DISABLED ||
 		     state == BR_STATE_BLOCKING ||
 		     state == BR_STATE_LISTENING))
-			ds->ops->port_fast_age(ds, port);
+			dsa_port_fast_age(dp);
 	}
 
 	dp->stp_state = state;
@@ -633,16 +643,33 @@ int dsa_port_pre_bridge_flags(const struct dsa_port *dp,
 	return ds->ops->port_pre_bridge_flags(ds, dp->index, flags, extack);
 }
 
-int dsa_port_bridge_flags(const struct dsa_port *dp,
+int dsa_port_bridge_flags(struct dsa_port *dp,
 			  struct switchdev_brport_flags flags,
 			  struct netlink_ext_ack *extack)
 {
 	struct dsa_switch *ds = dp->ds;
+	int err;
 
 	if (!ds->ops->port_bridge_flags)
 		return -EOPNOTSUPP;
 
-	return ds->ops->port_bridge_flags(ds, dp->index, flags, extack);
+	err = ds->ops->port_bridge_flags(ds, dp->index, flags, extack);
+	if (err)
+		return err;
+
+	if (flags.mask & BR_LEARNING) {
+		bool learning = flags.val & BR_LEARNING;
+
+		if (learning == dp->learning)
+			return 0;
+
+		if (dp->learning && !learning)
+			dsa_port_fast_age(dp);
+
+		dp->learning = learning;
+	}
+
+	return 0;
 }
 
 int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu,
-- 
cgit v1.2.3


From 5313a37b881e57767bc37185bef2873862be8d47 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 8 Aug 2021 17:35:26 +0300
Subject: net: dsa: sja1105: rely on DSA core tracking of port learning state

Now that DSA keeps track of the port learning state, it becomes
superfluous to keep an additional variable with this information in the
sja1105 driver. Remove it.

The DSA core's learning state is present in struct dsa_port *dp.
To avoid the antipattern where we iterate through a DSA switch's
ports and then call dsa_to_port to obtain the "dp" reference (which is
bad because dsa_to_port iterates through the DSA switch tree once
again), just iterate through the dst->ports and operate on those
directly.

The sja1105 had an extra use of priv->learn_ena on non-user ports. DSA
does not touch the learning state of those ports - drivers are free to
do what they wish on them. Mark that information with a comment in
struct dsa_port and let sja1105 set dp->learning for cascade ports.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105.h      |  1 -
 drivers/net/dsa/sja1105/sja1105_main.c | 32 +++++++++++++-------------------
 include/net/dsa.h                      |  1 +
 3 files changed, 14 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index 9cd7dbdd7db9..2e899c9f036d 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -233,7 +233,6 @@ struct sja1105_private {
 	phy_interface_t phy_mode[SJA1105_MAX_NUM_PORTS];
 	bool fixed_link[SJA1105_MAX_NUM_PORTS];
 	bool vlan_aware;
-	unsigned long learn_ena;
 	unsigned long ucast_egress_floods;
 	unsigned long bcast_egress_floods;
 	const struct sja1105_info *info;
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 241fd25b0b86..87e279be89c9 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -176,7 +176,7 @@ static int sja1105_init_mac_settings(struct sja1105_private *priv)
 	struct sja1105_mac_config_entry *mac;
 	struct dsa_switch *ds = priv->ds;
 	struct sja1105_table *table;
-	int i;
+	struct dsa_port *dp;
 
 	table = &priv->static_config.tables[BLK_IDX_MAC_CONFIG];
 
@@ -195,8 +195,11 @@ static int sja1105_init_mac_settings(struct sja1105_private *priv)
 
 	mac = table->entries;
 
-	for (i = 0; i < ds->num_ports; i++) {
-		mac[i] = default_mac;
+	list_for_each_entry(dp, &ds->dst->ports, list) {
+		if (dp->ds != ds)
+			continue;
+
+		mac[dp->index] = default_mac;
 
 		/* Let sja1105_bridge_stp_state_set() keep address learning
 		 * enabled for the DSA ports. CPU ports use software-assisted
@@ -205,8 +208,8 @@ static int sja1105_init_mac_settings(struct sja1105_private *priv)
 		 * CPU ports in a cross-chip topology if multiple CPU ports
 		 * exist.
 		 */
-		if (dsa_is_dsa_port(ds, i))
-			priv->learn_ena |= BIT(i);
+		if (dsa_port_is_dsa(dp))
+			dp->learning = true;
 	}
 
 	return 0;
@@ -1899,6 +1902,7 @@ static int sja1105_bridge_member(struct dsa_switch *ds, int port,
 static void sja1105_bridge_stp_state_set(struct dsa_switch *ds, int port,
 					 u8 state)
 {
+	struct dsa_port *dp = dsa_to_port(ds, port);
 	struct sja1105_private *priv = ds->priv;
 	struct sja1105_mac_config_entry *mac;
 
@@ -1924,12 +1928,12 @@ static void sja1105_bridge_stp_state_set(struct dsa_switch *ds, int port,
 	case BR_STATE_LEARNING:
 		mac[port].ingress   = true;
 		mac[port].egress    = false;
-		mac[port].dyn_learn = !!(priv->learn_ena & BIT(port));
+		mac[port].dyn_learn = dp->learning;
 		break;
 	case BR_STATE_FORWARDING:
 		mac[port].ingress   = true;
 		mac[port].egress    = true;
-		mac[port].dyn_learn = !!(priv->learn_ena & BIT(port));
+		mac[port].dyn_learn = dp->learning;
 		break;
 	default:
 		dev_err(ds->dev, "invalid STP state: %d\n", state);
@@ -2891,23 +2895,13 @@ static int sja1105_port_set_learning(struct sja1105_private *priv, int port,
 				     bool enabled)
 {
 	struct sja1105_mac_config_entry *mac;
-	int rc;
 
 	mac = priv->static_config.tables[BLK_IDX_MAC_CONFIG].entries;
 
 	mac[port].dyn_learn = enabled;
 
-	rc = sja1105_dynamic_config_write(priv, BLK_IDX_MAC_CONFIG, port,
-					  &mac[port], true);
-	if (rc)
-		return rc;
-
-	if (enabled)
-		priv->learn_ena |= BIT(port);
-	else
-		priv->learn_ena &= ~BIT(port);
-
-	return 0;
+	return sja1105_dynamic_config_write(priv, BLK_IDX_MAC_CONFIG, port,
+					    &mac[port], true);
 }
 
 static int sja1105_port_ucast_bcast_flood(struct sja1105_private *priv, int to,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 995e9d3f9cfc..0c2cba45fa79 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -254,6 +254,7 @@ struct dsa_port {
 	struct device_node	*dn;
 	unsigned int		ageing_time;
 	bool			vlan_filtering;
+	/* Managed by DSA on user ports and by drivers on CPU and DSA ports */
 	bool			learning;
 	u8			stp_state;
 	struct net_device	*bridge_dev;
-- 
cgit v1.2.3


From fdacd57c79b79a03c7ca88f706ad9fb7b46831c1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 3 Aug 2021 16:47:19 +0200
Subject: netfilter: x_tables: never register tables by default

For historical reasons x_tables still register tables by default in the
initial namespace.
Only newly created net namespaces add the hook on demand.

This means that the init_net always pays hook cost, even if no filtering
rules are added (e.g. only used inside a single netns).

Note that the hooks are added even when 'iptables -L' is called.
This is because there is no way to tell 'iptables -A' and 'iptables -L'
apart at kernel level.

The only solution would be to register the table, but delay hook
registration until the first rule gets added (or policy gets changed).

That however means that counters are not hooked either, so 'iptables -L'
would always show 0-counters even when traffic is flowing which might be
unexpected.

This keeps table and hook registration consistent with what is already done
in non-init netns: first iptables(-save) invocation registers both table
and hooks.

This applies the same solution adopted for ebtables.
All tables register a template that contains the l3 family, the name
and a constructor function that is called when the initial table has to
be added.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h     |  6 +--
 net/ipv4/netfilter/arptable_filter.c   | 23 ++++----
 net/ipv4/netfilter/iptable_filter.c    | 24 ++++++---
 net/ipv4/netfilter/iptable_mangle.c    | 17 +++---
 net/ipv4/netfilter/iptable_nat.c       | 20 +++----
 net/ipv4/netfilter/iptable_raw.c       | 21 ++++----
 net/ipv4/netfilter/iptable_security.c  | 23 ++++----
 net/ipv6/netfilter/ip6table_filter.c   | 23 +++++---
 net/ipv6/netfilter/ip6table_mangle.c   | 22 ++++----
 net/ipv6/netfilter/ip6table_nat.c      | 16 +++---
 net/ipv6/netfilter/ip6table_raw.c      | 24 ++++-----
 net/ipv6/netfilter/ip6table_security.c | 22 ++++----
 net/netfilter/x_tables.c               | 98 +++++++++++++++++++++++++++-------
 13 files changed, 204 insertions(+), 135 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 28d7027cd460..5897f3dbaf7c 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -238,9 +238,6 @@ struct xt_table {
 	u_int8_t af;		/* address/protocol family */
 	int priority;		/* hook order */
 
-	/* called when table is needed in the given netns */
-	int (*table_init)(struct net *net);
-
 	/* A unique name... */
 	const char name[XT_TABLE_MAXNAMELEN];
 };
@@ -452,6 +449,9 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
 
 struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *);
 
+int xt_register_template(const struct xt_table *t, int(*table_init)(struct net *net));
+void xt_unregister_template(const struct xt_table *t);
+
 #ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 #include <net/compat.h>
 
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 6922612df456..3de78416ec76 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -18,15 +18,12 @@ MODULE_DESCRIPTION("arptables filter table");
 #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
 			   (1 << NF_ARP_FORWARD))
 
-static int __net_init arptable_filter_table_init(struct net *net);
-
 static const struct xt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_ARP,
 	.priority	= NF_IP_PRI_FILTER,
-	.table_init	= arptable_filter_table_init,
 };
 
 /* The work comes in here from netfilter.c */
@@ -39,7 +36,7 @@ arptable_filter_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *arpfilter_ops __read_mostly;
 
-static int __net_init arptable_filter_table_init(struct net *net)
+static int arptable_filter_table_init(struct net *net)
 {
 	struct arpt_replace *repl;
 	int err;
@@ -69,30 +66,32 @@ static struct pernet_operations arptable_filter_net_ops = {
 
 static int __init arptable_filter_init(void)
 {
-	int ret;
+	int ret = xt_register_template(&packet_filter,
+				       arptable_filter_table_init);
+
+	if (ret < 0)
+		return ret;
 
 	arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook);
-	if (IS_ERR(arpfilter_ops))
+	if (IS_ERR(arpfilter_ops)) {
+		xt_unregister_template(&packet_filter);
 		return PTR_ERR(arpfilter_ops);
+	}
 
 	ret = register_pernet_subsys(&arptable_filter_net_ops);
 	if (ret < 0) {
+		xt_unregister_template(&packet_filter);
 		kfree(arpfilter_ops);
 		return ret;
 	}
 
-	ret = arptable_filter_table_init(&init_net);
-	if (ret) {
-		unregister_pernet_subsys(&arptable_filter_net_ops);
-		kfree(arpfilter_ops);
-	}
-
 	return ret;
 }
 
 static void __exit arptable_filter_fini(void)
 {
 	unregister_pernet_subsys(&arptable_filter_net_ops);
+	xt_unregister_template(&packet_filter);
 	kfree(arpfilter_ops);
 }
 
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 8272df7c6ad5..0eb0e2ab9bfc 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -19,7 +19,6 @@ MODULE_DESCRIPTION("iptables filter table");
 #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
 			    (1 << NF_INET_FORWARD) | \
 			    (1 << NF_INET_LOCAL_OUT))
-static int __net_init iptable_filter_table_init(struct net *net);
 
 static const struct xt_table packet_filter = {
 	.name		= "filter",
@@ -27,7 +26,6 @@ static const struct xt_table packet_filter = {
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
 	.priority	= NF_IP_PRI_FILTER,
-	.table_init	= iptable_filter_table_init,
 };
 
 static unsigned int
@@ -43,7 +41,7 @@ static struct nf_hook_ops *filter_ops __read_mostly;
 static bool forward __read_mostly = true;
 module_param(forward, bool, 0000);
 
-static int __net_init iptable_filter_table_init(struct net *net)
+static int iptable_filter_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int err;
@@ -62,7 +60,7 @@ static int __net_init iptable_filter_table_init(struct net *net)
 
 static int __net_init iptable_filter_net_init(struct net *net)
 {
-	if (net == &init_net || !forward)
+	if (!forward)
 		return iptable_filter_table_init(net);
 
 	return 0;
@@ -86,22 +84,32 @@ static struct pernet_operations iptable_filter_net_ops = {
 
 static int __init iptable_filter_init(void)
 {
-	int ret;
+	int ret = xt_register_template(&packet_filter,
+				       iptable_filter_table_init);
+
+	if (ret < 0)
+		return ret;
 
 	filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
-	if (IS_ERR(filter_ops))
+	if (IS_ERR(filter_ops)) {
+		xt_unregister_template(&packet_filter);
 		return PTR_ERR(filter_ops);
+	}
 
 	ret = register_pernet_subsys(&iptable_filter_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		xt_unregister_template(&packet_filter);
 		kfree(filter_ops);
+		return ret;
+	}
 
-	return ret;
+	return 0;
 }
 
 static void __exit iptable_filter_fini(void)
 {
 	unregister_pernet_subsys(&iptable_filter_net_ops);
+	xt_unregister_template(&packet_filter);
 	kfree(filter_ops);
 }
 
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 2abc3836f391..b52a4c8a14fc 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -25,15 +25,12 @@ MODULE_DESCRIPTION("iptables mangle table");
 			    (1 << NF_INET_LOCAL_OUT) | \
 			    (1 << NF_INET_POST_ROUTING))
 
-static int __net_init iptable_mangle_table_init(struct net *net);
-
 static const struct xt_table packet_mangler = {
 	.name		= "mangle",
 	.valid_hooks	= MANGLE_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
 	.priority	= NF_IP_PRI_MANGLE,
-	.table_init	= iptable_mangle_table_init,
 };
 
 static unsigned int
@@ -83,7 +80,7 @@ iptable_mangle_hook(void *priv,
 }
 
 static struct nf_hook_ops *mangle_ops __read_mostly;
-static int __net_init iptable_mangle_table_init(struct net *net)
+static int iptable_mangle_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int ret;
@@ -113,32 +110,30 @@ static struct pernet_operations iptable_mangle_net_ops = {
 
 static int __init iptable_mangle_init(void)
 {
-	int ret;
+	int ret = xt_register_template(&packet_mangler,
+				       iptable_mangle_table_init);
 
 	mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
 	if (IS_ERR(mangle_ops)) {
+		xt_unregister_template(&packet_mangler);
 		ret = PTR_ERR(mangle_ops);
 		return ret;
 	}
 
 	ret = register_pernet_subsys(&iptable_mangle_net_ops);
 	if (ret < 0) {
+		xt_unregister_template(&packet_mangler);
 		kfree(mangle_ops);
 		return ret;
 	}
 
-	ret = iptable_mangle_table_init(&init_net);
-	if (ret) {
-		unregister_pernet_subsys(&iptable_mangle_net_ops);
-		kfree(mangle_ops);
-	}
-
 	return ret;
 }
 
 static void __exit iptable_mangle_fini(void)
 {
 	unregister_pernet_subsys(&iptable_mangle_net_ops);
+	xt_unregister_template(&packet_mangler);
 	kfree(mangle_ops);
 }
 
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a9913842ef18..45d7e072e6a5 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -17,8 +17,6 @@ struct iptable_nat_pernet {
 	struct nf_hook_ops *nf_nat_ops;
 };
 
-static int __net_init iptable_nat_table_init(struct net *net);
-
 static unsigned int iptable_nat_net_id __read_mostly;
 
 static const struct xt_table nf_nat_ipv4_table = {
@@ -29,7 +27,6 @@ static const struct xt_table nf_nat_ipv4_table = {
 			  (1 << NF_INET_LOCAL_IN),
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
-	.table_init	= iptable_nat_table_init,
 };
 
 static unsigned int iptable_nat_do_chain(void *priv,
@@ -113,7 +110,7 @@ static void ipt_nat_unregister_lookups(struct net *net)
 	kfree(ops);
 }
 
-static int __net_init iptable_nat_table_init(struct net *net)
+static int iptable_nat_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int ret;
@@ -155,20 +152,25 @@ static struct pernet_operations iptable_nat_net_ops = {
 
 static int __init iptable_nat_init(void)
 {
-	int ret = register_pernet_subsys(&iptable_nat_net_ops);
+	int ret = xt_register_template(&nf_nat_ipv4_table,
+				       iptable_nat_table_init);
+
+	if (ret < 0)
+		return ret;
 
-	if (ret)
+	ret = register_pernet_subsys(&iptable_nat_net_ops);
+	if (ret < 0) {
+		xt_unregister_template(&nf_nat_ipv4_table);
 		return ret;
+	}
 
-	ret = iptable_nat_table_init(&init_net);
-	if (ret)
-		unregister_pernet_subsys(&iptable_nat_net_ops);
 	return ret;
 }
 
 static void __exit iptable_nat_exit(void)
 {
 	unregister_pernet_subsys(&iptable_nat_net_ops);
+	xt_unregister_template(&nf_nat_ipv4_table);
 }
 
 module_init(iptable_nat_init);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index ceef397c1f5f..b88e0f36cd05 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -12,8 +12,6 @@
 
 #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
 
-static int __net_init iptable_raw_table_init(struct net *net);
-
 static bool raw_before_defrag __read_mostly;
 MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
 module_param(raw_before_defrag, bool, 0000);
@@ -24,7 +22,6 @@ static const struct xt_table packet_raw = {
 	.me = THIS_MODULE,
 	.af = NFPROTO_IPV4,
 	.priority = NF_IP_PRI_RAW,
-	.table_init = iptable_raw_table_init,
 };
 
 static const struct xt_table packet_raw_before_defrag = {
@@ -33,7 +30,6 @@ static const struct xt_table packet_raw_before_defrag = {
 	.me = THIS_MODULE,
 	.af = NFPROTO_IPV4,
 	.priority = NF_IP_PRI_RAW_BEFORE_DEFRAG,
-	.table_init = iptable_raw_table_init,
 };
 
 /* The work comes in here from netfilter.c. */
@@ -89,22 +85,24 @@ static int __init iptable_raw_init(void)
 		pr_info("Enabling raw table before defrag\n");
 	}
 
+	ret = xt_register_template(table,
+				   iptable_raw_table_init);
+	if (ret < 0)
+		return ret;
+
 	rawtable_ops = xt_hook_ops_alloc(table, iptable_raw_hook);
-	if (IS_ERR(rawtable_ops))
+	if (IS_ERR(rawtable_ops)) {
+		xt_unregister_template(table);
 		return PTR_ERR(rawtable_ops);
+	}
 
 	ret = register_pernet_subsys(&iptable_raw_net_ops);
 	if (ret < 0) {
+		xt_unregister_template(table);
 		kfree(rawtable_ops);
 		return ret;
 	}
 
-	ret = iptable_raw_table_init(&init_net);
-	if (ret) {
-		unregister_pernet_subsys(&iptable_raw_net_ops);
-		kfree(rawtable_ops);
-	}
-
 	return ret;
 }
 
@@ -112,6 +110,7 @@ static void __exit iptable_raw_fini(void)
 {
 	unregister_pernet_subsys(&iptable_raw_net_ops);
 	kfree(rawtable_ops);
+	xt_unregister_template(&packet_raw);
 }
 
 module_init(iptable_raw_init);
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 77973f5fd8f6..f519162a2fa5 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -25,15 +25,12 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
 				(1 << NF_INET_FORWARD) | \
 				(1 << NF_INET_LOCAL_OUT)
 
-static int __net_init iptable_security_table_init(struct net *net);
-
 static const struct xt_table security_table = {
 	.name		= "security",
 	.valid_hooks	= SECURITY_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
 	.priority	= NF_IP_PRI_SECURITY,
-	.table_init	= iptable_security_table_init,
 };
 
 static unsigned int
@@ -45,7 +42,7 @@ iptable_security_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
 
-static int __net_init iptable_security_table_init(struct net *net)
+static int iptable_security_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int ret;
@@ -75,24 +72,25 @@ static struct pernet_operations iptable_security_net_ops = {
 
 static int __init iptable_security_init(void)
 {
-	int ret;
+	int ret = xt_register_template(&security_table,
+				       iptable_security_table_init);
+
+	if (ret < 0)
+		return ret;
 
 	sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook);
-	if (IS_ERR(sectbl_ops))
+	if (IS_ERR(sectbl_ops)) {
+		xt_unregister_template(&security_table);
 		return PTR_ERR(sectbl_ops);
+	}
 
 	ret = register_pernet_subsys(&iptable_security_net_ops);
 	if (ret < 0) {
+		xt_unregister_template(&security_table);
 		kfree(sectbl_ops);
 		return ret;
 	}
 
-	ret = iptable_security_table_init(&init_net);
-	if (ret) {
-		unregister_pernet_subsys(&iptable_security_net_ops);
-		kfree(sectbl_ops);
-	}
-
 	return ret;
 }
 
@@ -100,6 +98,7 @@ static void __exit iptable_security_fini(void)
 {
 	unregister_pernet_subsys(&iptable_security_net_ops);
 	kfree(sectbl_ops);
+	xt_unregister_template(&security_table);
 }
 
 module_init(iptable_security_init);
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index bb784ea7bbd3..727ee8097012 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -19,15 +19,12 @@ MODULE_DESCRIPTION("ip6tables filter table");
 			    (1 << NF_INET_FORWARD) | \
 			    (1 << NF_INET_LOCAL_OUT))
 
-static int __net_init ip6table_filter_table_init(struct net *net);
-
 static const struct xt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
 	.priority	= NF_IP6_PRI_FILTER,
-	.table_init	= ip6table_filter_table_init,
 };
 
 /* The work comes in here from netfilter.c. */
@@ -44,7 +41,7 @@ static struct nf_hook_ops *filter_ops __read_mostly;
 static bool forward = true;
 module_param(forward, bool, 0000);
 
-static int __net_init ip6table_filter_table_init(struct net *net)
+static int ip6table_filter_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int err;
@@ -63,7 +60,7 @@ static int __net_init ip6table_filter_table_init(struct net *net)
 
 static int __net_init ip6table_filter_net_init(struct net *net)
 {
-	if (net == &init_net || !forward)
+	if (!forward)
 		return ip6table_filter_table_init(net);
 
 	return 0;
@@ -87,15 +84,24 @@ static struct pernet_operations ip6table_filter_net_ops = {
 
 static int __init ip6table_filter_init(void)
 {
-	int ret;
+	int ret = xt_register_template(&packet_filter,
+					ip6table_filter_table_init);
+
+	if (ret < 0)
+		return ret;
 
 	filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook);
-	if (IS_ERR(filter_ops))
+	if (IS_ERR(filter_ops)) {
+		xt_unregister_template(&packet_filter);
 		return PTR_ERR(filter_ops);
+	}
 
 	ret = register_pernet_subsys(&ip6table_filter_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		xt_unregister_template(&packet_filter);
 		kfree(filter_ops);
+		return ret;
+	}
 
 	return ret;
 }
@@ -103,6 +109,7 @@ static int __init ip6table_filter_init(void)
 static void __exit ip6table_filter_fini(void)
 {
 	unregister_pernet_subsys(&ip6table_filter_net_ops);
+	xt_unregister_template(&packet_filter);
 	kfree(filter_ops);
 }
 
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index c76cffd63041..9b518ce37d6a 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -20,15 +20,12 @@ MODULE_DESCRIPTION("ip6tables mangle table");
 			    (1 << NF_INET_LOCAL_OUT) | \
 			    (1 << NF_INET_POST_ROUTING))
 
-static int __net_init ip6table_mangle_table_init(struct net *net);
-
 static const struct xt_table packet_mangler = {
 	.name		= "mangle",
 	.valid_hooks	= MANGLE_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
 	.priority	= NF_IP6_PRI_MANGLE,
-	.table_init	= ip6table_mangle_table_init,
 };
 
 static unsigned int
@@ -76,7 +73,7 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb,
 }
 
 static struct nf_hook_ops *mangle_ops __read_mostly;
-static int __net_init ip6table_mangle_table_init(struct net *net)
+static int ip6table_mangle_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int ret;
@@ -106,29 +103,32 @@ static struct pernet_operations ip6table_mangle_net_ops = {
 
 static int __init ip6table_mangle_init(void)
 {
-	int ret;
+	int ret = xt_register_template(&packet_mangler,
+				       ip6table_mangle_table_init);
+
+	if (ret < 0)
+		return ret;
 
 	mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook);
-	if (IS_ERR(mangle_ops))
+	if (IS_ERR(mangle_ops)) {
+		xt_unregister_template(&packet_mangler);
 		return PTR_ERR(mangle_ops);
+	}
 
 	ret = register_pernet_subsys(&ip6table_mangle_net_ops);
 	if (ret < 0) {
+		xt_unregister_template(&packet_mangler);
 		kfree(mangle_ops);
 		return ret;
 	}
 
-	ret = ip6table_mangle_table_init(&init_net);
-	if (ret) {
-		unregister_pernet_subsys(&ip6table_mangle_net_ops);
-		kfree(mangle_ops);
-	}
 	return ret;
 }
 
 static void __exit ip6table_mangle_fini(void)
 {
 	unregister_pernet_subsys(&ip6table_mangle_net_ops);
+	xt_unregister_template(&packet_mangler);
 	kfree(mangle_ops);
 }
 
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index b0292251e655..921c1723a01e 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -19,8 +19,6 @@ struct ip6table_nat_pernet {
 	struct nf_hook_ops *nf_nat_ops;
 };
 
-static int __net_init ip6table_nat_table_init(struct net *net);
-
 static unsigned int ip6table_nat_net_id __read_mostly;
 
 static const struct xt_table nf_nat_ipv6_table = {
@@ -31,7 +29,6 @@ static const struct xt_table nf_nat_ipv6_table = {
 			  (1 << NF_INET_LOCAL_IN),
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
-	.table_init	= ip6table_nat_table_init,
 };
 
 static unsigned int ip6table_nat_do_chain(void *priv,
@@ -115,7 +112,7 @@ static void ip6t_nat_unregister_lookups(struct net *net)
 	kfree(ops);
 }
 
-static int __net_init ip6table_nat_table_init(struct net *net)
+static int ip6table_nat_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int ret;
@@ -157,20 +154,23 @@ static struct pernet_operations ip6table_nat_net_ops = {
 
 static int __init ip6table_nat_init(void)
 {
-	int ret = register_pernet_subsys(&ip6table_nat_net_ops);
+	int ret = xt_register_template(&nf_nat_ipv6_table,
+				       ip6table_nat_table_init);
 
-	if (ret)
+	if (ret < 0)
 		return ret;
 
-	ret = ip6table_nat_table_init(&init_net);
+	ret = register_pernet_subsys(&ip6table_nat_net_ops);
 	if (ret)
-		unregister_pernet_subsys(&ip6table_nat_net_ops);
+		xt_unregister_template(&nf_nat_ipv6_table);
+
 	return ret;
 }
 
 static void __exit ip6table_nat_exit(void)
 {
 	unregister_pernet_subsys(&ip6table_nat_net_ops);
+	xt_unregister_template(&nf_nat_ipv6_table);
 }
 
 module_init(ip6table_nat_init);
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index f63c106c521e..4f2a04af71d3 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -11,8 +11,6 @@
 
 #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
 
-static int __net_init ip6table_raw_table_init(struct net *net);
-
 static bool raw_before_defrag __read_mostly;
 MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
 module_param(raw_before_defrag, bool, 0000);
@@ -23,7 +21,6 @@ static const struct xt_table packet_raw = {
 	.me = THIS_MODULE,
 	.af = NFPROTO_IPV6,
 	.priority = NF_IP6_PRI_RAW,
-	.table_init = ip6table_raw_table_init,
 };
 
 static const struct xt_table packet_raw_before_defrag = {
@@ -32,7 +29,6 @@ static const struct xt_table packet_raw_before_defrag = {
 	.me = THIS_MODULE,
 	.af = NFPROTO_IPV6,
 	.priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG,
-	.table_init = ip6table_raw_table_init,
 };
 
 /* The work comes in here from netfilter.c. */
@@ -45,7 +41,7 @@ ip6table_raw_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
 
-static int __net_init ip6table_raw_table_init(struct net *net)
+static int ip6table_raw_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	const struct xt_table *table = &packet_raw;
@@ -79,37 +75,39 @@ static struct pernet_operations ip6table_raw_net_ops = {
 
 static int __init ip6table_raw_init(void)
 {
-	int ret;
 	const struct xt_table *table = &packet_raw;
+	int ret;
 
 	if (raw_before_defrag) {
 		table = &packet_raw_before_defrag;
-
 		pr_info("Enabling raw table before defrag\n");
 	}
 
+	ret = xt_register_template(table, ip6table_raw_table_init);
+	if (ret < 0)
+		return ret;
+
 	/* Register hooks */
 	rawtable_ops = xt_hook_ops_alloc(table, ip6table_raw_hook);
-	if (IS_ERR(rawtable_ops))
+	if (IS_ERR(rawtable_ops)) {
+		xt_unregister_template(table);
 		return PTR_ERR(rawtable_ops);
+	}
 
 	ret = register_pernet_subsys(&ip6table_raw_net_ops);
 	if (ret < 0) {
 		kfree(rawtable_ops);
+		xt_unregister_template(table);
 		return ret;
 	}
 
-	ret = ip6table_raw_table_init(&init_net);
-	if (ret) {
-		unregister_pernet_subsys(&ip6table_raw_net_ops);
-		kfree(rawtable_ops);
-	}
 	return ret;
 }
 
 static void __exit ip6table_raw_fini(void)
 {
 	unregister_pernet_subsys(&ip6table_raw_net_ops);
+	xt_unregister_template(&packet_raw);
 	kfree(rawtable_ops);
 }
 
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index 8dc335cf450b..931674034d8b 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -24,15 +24,12 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules");
 				(1 << NF_INET_FORWARD) | \
 				(1 << NF_INET_LOCAL_OUT)
 
-static int __net_init ip6table_security_table_init(struct net *net);
-
 static const struct xt_table security_table = {
 	.name		= "security",
 	.valid_hooks	= SECURITY_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
 	.priority	= NF_IP6_PRI_SECURITY,
-	.table_init     = ip6table_security_table_init,
 };
 
 static unsigned int
@@ -44,7 +41,7 @@ ip6table_security_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
 
-static int __net_init ip6table_security_table_init(struct net *net)
+static int ip6table_security_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int ret;
@@ -74,29 +71,32 @@ static struct pernet_operations ip6table_security_net_ops = {
 
 static int __init ip6table_security_init(void)
 {
-	int ret;
+	int ret = xt_register_template(&security_table,
+				       ip6table_security_table_init);
+
+	if (ret < 0)
+		return ret;
 
 	sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook);
-	if (IS_ERR(sectbl_ops))
+	if (IS_ERR(sectbl_ops)) {
+		xt_unregister_template(&security_table);
 		return PTR_ERR(sectbl_ops);
+	}
 
 	ret = register_pernet_subsys(&ip6table_security_net_ops);
 	if (ret < 0) {
 		kfree(sectbl_ops);
+		xt_unregister_template(&security_table);
 		return ret;
 	}
 
-	ret = ip6table_security_table_init(&init_net);
-	if (ret) {
-		unregister_pernet_subsys(&ip6table_security_net_ops);
-		kfree(sectbl_ops);
-	}
 	return ret;
 }
 
 static void __exit ip6table_security_fini(void)
 {
 	unregister_pernet_subsys(&ip6table_security_net_ops);
+	xt_unregister_template(&security_table);
 	kfree(sectbl_ops);
 }
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 84e58ee501a4..25524e393349 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -39,6 +39,20 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
 #define XT_PCPU_BLOCK_SIZE 4096
 #define XT_MAX_TABLE_SIZE	(512 * 1024 * 1024)
 
+struct xt_template {
+	struct list_head list;
+
+	/* called when table is needed in the given netns */
+	int (*table_init)(struct net *net);
+
+	struct module *me;
+
+	/* A unique name... */
+	char name[XT_TABLE_MAXNAMELEN];
+};
+
+static struct list_head xt_templates[NFPROTO_NUMPROTO];
+
 struct xt_pernet {
 	struct list_head tables[NFPROTO_NUMPROTO];
 };
@@ -1221,48 +1235,43 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
 {
 	struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
-	struct xt_table *t, *found = NULL;
+	struct module *owner = NULL;
+	struct xt_template *tmpl;
+	struct xt_table *t;
 
 	mutex_lock(&xt[af].mutex);
 	list_for_each_entry(t, &xt_net->tables[af], list)
 		if (strcmp(t->name, name) == 0 && try_module_get(t->me))
 			return t;
 
-	if (net == &init_net)
-		goto out;
-
-	/* Table doesn't exist in this netns, re-try init */
-	xt_net = net_generic(&init_net, xt_pernet_id);
-	list_for_each_entry(t, &xt_net->tables[af], list) {
+	/* Table doesn't exist in this netns, check larval list */
+	list_for_each_entry(tmpl, &xt_templates[af], list) {
 		int err;
 
-		if (strcmp(t->name, name))
+		if (strcmp(tmpl->name, name))
 			continue;
-		if (!try_module_get(t->me))
+		if (!try_module_get(tmpl->me))
 			goto out;
+
+		owner = tmpl->me;
+
 		mutex_unlock(&xt[af].mutex);
-		err = t->table_init(net);
+		err = tmpl->table_init(net);
 		if (err < 0) {
-			module_put(t->me);
+			module_put(owner);
 			return ERR_PTR(err);
 		}
 
-		found = t;
-
 		mutex_lock(&xt[af].mutex);
 		break;
 	}
 
-	if (!found)
-		goto out;
-
-	xt_net = net_generic(net, xt_pernet_id);
 	/* and once again: */
 	list_for_each_entry(t, &xt_net->tables[af], list)
 		if (strcmp(t->name, name) == 0)
 			return t;
 
-	module_put(found->me);
+	module_put(owner);
  out:
 	mutex_unlock(&xt[af].mutex);
 	return ERR_PTR(-ENOENT);
@@ -1749,6 +1758,58 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
 }
 EXPORT_SYMBOL_GPL(xt_hook_ops_alloc);
 
+int xt_register_template(const struct xt_table *table,
+			 int (*table_init)(struct net *net))
+{
+	int ret = -EEXIST, af = table->af;
+	struct xt_template *t;
+
+	mutex_lock(&xt[af].mutex);
+
+	list_for_each_entry(t, &xt_templates[af], list) {
+		if (WARN_ON_ONCE(strcmp(table->name, t->name) == 0))
+			goto out_unlock;
+	}
+
+	ret = -ENOMEM;
+	t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		goto out_unlock;
+
+	BUILD_BUG_ON(sizeof(t->name) != sizeof(table->name));
+
+	strscpy(t->name, table->name, sizeof(t->name));
+	t->table_init = table_init;
+	t->me = table->me;
+	list_add(&t->list, &xt_templates[af]);
+	ret = 0;
+out_unlock:
+	mutex_unlock(&xt[af].mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xt_register_template);
+
+void xt_unregister_template(const struct xt_table *table)
+{
+	struct xt_template *t;
+	int af = table->af;
+
+	mutex_lock(&xt[af].mutex);
+	list_for_each_entry(t, &xt_templates[af], list) {
+		if (strcmp(table->name, t->name))
+			continue;
+
+		list_del(&t->list);
+		mutex_unlock(&xt[af].mutex);
+		kfree(t);
+		return;
+	}
+
+	mutex_unlock(&xt[af].mutex);
+	WARN_ON_ONCE(1);
+}
+EXPORT_SYMBOL_GPL(xt_unregister_template);
+
 int xt_proto_init(struct net *net, u_int8_t af)
 {
 #ifdef CONFIG_PROC_FS
@@ -1937,6 +1998,7 @@ static int __init xt_init(void)
 #endif
 		INIT_LIST_HEAD(&xt[i].target);
 		INIT_LIST_HEAD(&xt[i].match);
+		INIT_LIST_HEAD(&xt_templates[i]);
 	}
 	rv = register_pernet_subsys(&xt_net_ops);
 	if (rv < 0)
-- 
cgit v1.2.3


From 919d13a7e455c2e7676042d7a5f94c164e859d8a Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sun, 8 Aug 2021 21:57:43 +0300
Subject: devlink: Set device as early as possible

All kernel devlink implementations call to devlink_alloc() during
initialization routine for specific device which is used later as
a parent device for devlink_register().

Such late device assignment causes to the situation which requires us to
call to device_register() before setting other parameters, but that call
opens devlink to the world and makes accessible for the netlink users.

Any attempt to move devlink_register() to be the last call generates the
following error due to access to the devlink->dev pointer.

[    8.758862]  devlink_nl_param_fill+0x2e8/0xe50
[    8.760305]  devlink_param_notify+0x6d/0x180
[    8.760435]  __devlink_params_register+0x2f1/0x670
[    8.760558]  devlink_params_register+0x1e/0x20

The simple change of API to set devlink device in the devlink_alloc()
instead of devlink_register() fixes all this above and ensures that
prior to call to devlink_register() everything already set.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c  |  9 +++++---
 drivers/net/ethernet/cavium/liquidio/lio_main.c    |  5 +++--
 .../ethernet/freescale/dpaa2/dpaa2-eth-devlink.c   |  5 +++--
 .../ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c |  4 ++--
 .../hisilicon/hns3/hns3vf/hclgevf_devlink.c        |  7 +++---
 drivers/net/ethernet/huawei/hinic/hinic_devlink.c  |  8 +++----
 drivers/net/ethernet/huawei/hinic/hinic_devlink.h  |  4 ++--
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c   |  2 +-
 drivers/net/ethernet/huawei/hinic/hinic_main.c     |  2 +-
 drivers/net/ethernet/intel/ice/ice_devlink.c       |  4 ++--
 .../ethernet/marvell/octeontx2/af/rvu_devlink.c    |  5 +++--
 .../ethernet/marvell/prestera/prestera_devlink.c   |  7 +++---
 .../ethernet/marvell/prestera/prestera_devlink.h   |  2 +-
 .../net/ethernet/marvell/prestera/prestera_main.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx4/main.c          |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/devlink.c  |  9 ++++----
 drivers/net/ethernet/mellanox/mlx5/core/devlink.h  |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  4 ++--
 .../ethernet/mellanox/mlx5/core/sf/dev/driver.c    |  2 +-
 drivers/net/ethernet/mellanox/mlxsw/core.c         |  5 +++--
 drivers/net/ethernet/mscc/ocelot_vsc7514.c         |  5 +++--
 drivers/net/ethernet/netronome/nfp/nfp_main.c      |  2 +-
 drivers/net/ethernet/netronome/nfp/nfp_net_main.c  |  2 +-
 .../net/ethernet/pensando/ionic/ionic_devlink.c    |  4 ++--
 drivers/net/ethernet/qlogic/qed/qed_devlink.c      |  5 +++--
 drivers/net/ethernet/ti/am65-cpsw-nuss.c           |  4 ++--
 drivers/net/ethernet/ti/cpsw_new.c                 |  4 ++--
 drivers/net/netdevsim/dev.c                        |  4 ++--
 drivers/ptp/ptp_ocp.c                              | 26 ++++------------------
 drivers/staging/qlge/qlge_main.c                   |  5 +++--
 include/net/devlink.h                              | 10 +++++----
 net/core/devlink.c                                 | 15 ++++++-------
 net/dsa/dsa2.c                                     |  5 +++--
 33 files changed, 91 insertions(+), 94 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 64381be935a8..2cd8bb37e641 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -743,14 +743,17 @@ static void bnxt_dl_params_unregister(struct bnxt *bp)
 
 int bnxt_dl_register(struct bnxt *bp)
 {
+	const struct devlink_ops *devlink_ops;
 	struct devlink_port_attrs attrs = {};
 	struct devlink *dl;
 	int rc;
 
 	if (BNXT_PF(bp))
-		dl = devlink_alloc(&bnxt_dl_ops, sizeof(struct bnxt_dl));
+		devlink_ops = &bnxt_dl_ops;
 	else
-		dl = devlink_alloc(&bnxt_vf_dl_ops, sizeof(struct bnxt_dl));
+		devlink_ops = &bnxt_vf_dl_ops;
+
+	dl = devlink_alloc(devlink_ops, sizeof(struct bnxt_dl), &bp->pdev->dev);
 	if (!dl) {
 		netdev_warn(bp->dev, "devlink_alloc failed\n");
 		return -ENOMEM;
@@ -763,7 +766,7 @@ int bnxt_dl_register(struct bnxt *bp)
 	    bp->hwrm_spec_code > 0x10803)
 		bp->eswitch_mode = DEVLINK_ESWITCH_MODE_LEGACY;
 
-	rc = devlink_register(dl, &bp->pdev->dev);
+	rc = devlink_register(dl);
 	if (rc) {
 		netdev_warn(bp->dev, "devlink_register failed. rc=%d\n", rc);
 		goto err_dl_free;
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index af116ef83bad..2907e13b9df6 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -3750,7 +3750,8 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 	}
 
 	devlink = devlink_alloc(&liquidio_devlink_ops,
-				sizeof(struct lio_devlink_priv));
+				sizeof(struct lio_devlink_priv),
+				&octeon_dev->pci_dev->dev);
 	if (!devlink) {
 		dev_err(&octeon_dev->pci_dev->dev, "devlink alloc failed\n");
 		goto setup_nic_dev_free;
@@ -3759,7 +3760,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 	lio_devlink = devlink_priv(devlink);
 	lio_devlink->oct = octeon_dev;
 
-	if (devlink_register(devlink, &octeon_dev->pci_dev->dev)) {
+	if (devlink_register(devlink)) {
 		devlink_free(devlink);
 		dev_err(&octeon_dev->pci_dev->dev,
 			"devlink registration failed\n");
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c
index 8e09f65ea295..605a39f892b9 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c
@@ -196,7 +196,8 @@ int dpaa2_eth_dl_register(struct dpaa2_eth_priv *priv)
 	struct dpaa2_eth_devlink_priv *dl_priv;
 	int err;
 
-	priv->devlink = devlink_alloc(&dpaa2_eth_devlink_ops, sizeof(*dl_priv));
+	priv->devlink =
+		devlink_alloc(&dpaa2_eth_devlink_ops, sizeof(*dl_priv), dev);
 	if (!priv->devlink) {
 		dev_err(dev, "devlink_alloc failed\n");
 		return -ENOMEM;
@@ -204,7 +205,7 @@ int dpaa2_eth_dl_register(struct dpaa2_eth_priv *priv)
 	dl_priv = devlink_priv(priv->devlink);
 	dl_priv->dpaa2_priv = priv;
 
-	err = devlink_register(priv->devlink, dev);
+	err = devlink_register(priv->devlink);
 	if (err) {
 		dev_err(dev, "devlink_register() = %d\n", err);
 		goto devlink_free;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c
index 06d29945d4e1..448f29aa4e6b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c
@@ -112,14 +112,14 @@ int hclge_devlink_init(struct hclge_dev *hdev)
 	int ret;
 
 	devlink = devlink_alloc(&hclge_devlink_ops,
-				sizeof(struct hclge_devlink_priv));
+				sizeof(struct hclge_devlink_priv), &pdev->dev);
 	if (!devlink)
 		return -ENOMEM;
 
 	priv = devlink_priv(devlink);
 	priv->hdev = hdev;
 
-	ret = devlink_register(devlink, &pdev->dev);
+	ret = devlink_register(devlink);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to register devlink, ret = %d\n",
 			ret);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c
index 21a45279fd99..1e6061fb8ed4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c
@@ -112,15 +112,16 @@ int hclgevf_devlink_init(struct hclgevf_dev *hdev)
 	struct devlink *devlink;
 	int ret;
 
-	devlink = devlink_alloc(&hclgevf_devlink_ops,
-				sizeof(struct hclgevf_devlink_priv));
+	devlink =
+		devlink_alloc(&hclgevf_devlink_ops,
+			      sizeof(struct hclgevf_devlink_priv), &pdev->dev);
 	if (!devlink)
 		return -ENOMEM;
 
 	priv = devlink_priv(devlink);
 	priv->hdev = hdev;
 
-	ret = devlink_register(devlink, &pdev->dev);
+	ret = devlink_register(devlink);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to register devlink, ret = %d\n",
 			ret);
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c
index 58d5646444b0..6e11ee339f12 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c
@@ -293,9 +293,9 @@ static const struct devlink_ops hinic_devlink_ops = {
 	.flash_update = hinic_devlink_flash_update,
 };
 
-struct devlink *hinic_devlink_alloc(void)
+struct devlink *hinic_devlink_alloc(struct device *dev)
 {
-	return devlink_alloc(&hinic_devlink_ops, sizeof(struct hinic_dev));
+	return devlink_alloc(&hinic_devlink_ops, sizeof(struct hinic_dev), dev);
 }
 
 void hinic_devlink_free(struct devlink *devlink)
@@ -303,11 +303,11 @@ void hinic_devlink_free(struct devlink *devlink)
 	devlink_free(devlink);
 }
 
-int hinic_devlink_register(struct hinic_devlink_priv *priv, struct device *dev)
+int hinic_devlink_register(struct hinic_devlink_priv *priv)
 {
 	struct devlink *devlink = priv_to_devlink(priv);
 
-	return devlink_register(devlink, dev);
+	return devlink_register(devlink);
 }
 
 void hinic_devlink_unregister(struct hinic_devlink_priv *priv)
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_devlink.h b/drivers/net/ethernet/huawei/hinic/hinic_devlink.h
index a090ebcfaabb..9e315011015c 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_devlink.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_devlink.h
@@ -108,9 +108,9 @@ struct host_image_st {
 	u32 device_id;
 };
 
-struct devlink *hinic_devlink_alloc(void);
+struct devlink *hinic_devlink_alloc(struct device *dev);
 void hinic_devlink_free(struct devlink *devlink);
-int hinic_devlink_register(struct hinic_devlink_priv *priv, struct device *dev);
+int hinic_devlink_register(struct hinic_devlink_priv *priv);
 void hinic_devlink_unregister(struct hinic_devlink_priv *priv);
 
 int hinic_health_reporters_create(struct hinic_devlink_priv *priv);
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
index 428108eb10d2..56b6b04e209b 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
@@ -754,7 +754,7 @@ static int init_pfhwdev(struct hinic_pfhwdev *pfhwdev)
 		return err;
 	}
 
-	err = hinic_devlink_register(hwdev->devlink_dev, &pdev->dev);
+	err = hinic_devlink_register(hwdev->devlink_dev);
 	if (err) {
 		dev_err(&hwif->pdev->dev, "Failed to register devlink\n");
 		hinic_pf_to_mgmt_free(&pfhwdev->pf_to_mgmt);
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c
index 405ee4d2d2b1..881d0b247561 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_main.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c
@@ -1183,7 +1183,7 @@ static int nic_dev_init(struct pci_dev *pdev)
 	struct devlink *devlink;
 	int err, num_qps;
 
-	devlink = hinic_devlink_alloc();
+	devlink = hinic_devlink_alloc(&pdev->dev);
 	if (!devlink) {
 		dev_err(&pdev->dev, "Hinic devlink alloc failed\n");
 		return -ENOMEM;
diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c
index 91b545ab8b8f..8c863d64930b 100644
--- a/drivers/net/ethernet/intel/ice/ice_devlink.c
+++ b/drivers/net/ethernet/intel/ice/ice_devlink.c
@@ -475,7 +475,7 @@ struct ice_pf *ice_allocate_pf(struct device *dev)
 {
 	struct devlink *devlink;
 
-	devlink = devlink_alloc(&ice_devlink_ops, sizeof(struct ice_pf));
+	devlink = devlink_alloc(&ice_devlink_ops, sizeof(struct ice_pf), dev);
 	if (!devlink)
 		return NULL;
 
@@ -502,7 +502,7 @@ int ice_devlink_register(struct ice_pf *pf)
 	struct device *dev = ice_pf_to_dev(pf);
 	int err;
 
-	err = devlink_register(devlink, dev);
+	err = devlink_register(devlink);
 	if (err) {
 		dev_err(dev, "devlink registration failed: %d\n", err);
 		return err;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
index 6f963b2f54a7..a55b46ad162d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
@@ -1503,13 +1503,14 @@ int rvu_register_dl(struct rvu *rvu)
 	struct devlink *dl;
 	int err;
 
-	dl = devlink_alloc(&rvu_devlink_ops, sizeof(struct rvu_devlink));
+	dl = devlink_alloc(&rvu_devlink_ops, sizeof(struct rvu_devlink),
+			   rvu->dev);
 	if (!dl) {
 		dev_warn(rvu->dev, "devlink_alloc failed\n");
 		return -ENOMEM;
 	}
 
-	err = devlink_register(dl, rvu->dev);
+	err = devlink_register(dl);
 	if (err) {
 		dev_err(rvu->dev, "devlink register failed with error %d\n", err);
 		devlink_free(dl);
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c
index fa7a0682ad1e..68b442eb6d69 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c
@@ -390,11 +390,12 @@ static const struct devlink_ops prestera_dl_ops = {
 	.trap_drop_counter_get = prestera_drop_counter_get,
 };
 
-struct prestera_switch *prestera_devlink_alloc(void)
+struct prestera_switch *prestera_devlink_alloc(struct prestera_device *dev)
 {
 	struct devlink *dl;
 
-	dl = devlink_alloc(&prestera_dl_ops, sizeof(struct prestera_switch));
+	dl = devlink_alloc(&prestera_dl_ops, sizeof(struct prestera_switch),
+			   dev->dev);
 
 	return devlink_priv(dl);
 }
@@ -411,7 +412,7 @@ int prestera_devlink_register(struct prestera_switch *sw)
 	struct devlink *dl = priv_to_devlink(sw);
 	int err;
 
-	err = devlink_register(dl, sw->dev->dev);
+	err = devlink_register(dl);
 	if (err) {
 		dev_err(prestera_dev(sw), "devlink_register failed: %d\n", err);
 		return err;
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_devlink.h b/drivers/net/ethernet/marvell/prestera/prestera_devlink.h
index 5d73aa9db897..cc34c3db13a2 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_devlink.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera_devlink.h
@@ -6,7 +6,7 @@
 
 #include "prestera.h"
 
-struct prestera_switch *prestera_devlink_alloc(void);
+struct prestera_switch *prestera_devlink_alloc(struct prestera_device *dev);
 void prestera_devlink_free(struct prestera_switch *sw);
 
 int prestera_devlink_register(struct prestera_switch *sw);
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c
index 7c569c1abefc..44c670807fb3 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_main.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c
@@ -905,7 +905,7 @@ int prestera_device_register(struct prestera_device *dev)
 	struct prestera_switch *sw;
 	int err;
 
-	sw = prestera_devlink_alloc();
+	sw = prestera_devlink_alloc(dev);
 	if (!sw)
 		return -ENOMEM;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 28ac4693da3c..7267c6c6d2e2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -4005,7 +4005,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	printk_once(KERN_INFO "%s", mlx4_version);
 
-	devlink = devlink_alloc(&mlx4_devlink_ops, sizeof(*priv));
+	devlink = devlink_alloc(&mlx4_devlink_ops, sizeof(*priv), &pdev->dev);
 	if (!devlink)
 		return -ENOMEM;
 	priv = devlink_priv(devlink);
@@ -4024,7 +4024,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	mutex_init(&dev->persist->interface_state_mutex);
 	mutex_init(&dev->persist->pci_status_mutex);
 
-	ret = devlink_register(devlink, &pdev->dev);
+	ret = devlink_register(devlink);
 	if (ret)
 		goto err_persist_free;
 	ret = devlink_params_register(devlink, mlx4_devlink_params,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index d791d351b489..f38553ff538b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -359,9 +359,10 @@ int mlx5_devlink_traps_get_action(struct mlx5_core_dev *dev, int trap_id,
 	return 0;
 }
 
-struct devlink *mlx5_devlink_alloc(void)
+struct devlink *mlx5_devlink_alloc(struct device *dev)
 {
-	return devlink_alloc(&mlx5_devlink_ops, sizeof(struct mlx5_core_dev));
+	return devlink_alloc(&mlx5_devlink_ops, sizeof(struct mlx5_core_dev),
+			     dev);
 }
 
 void mlx5_devlink_free(struct devlink *devlink)
@@ -638,11 +639,11 @@ static void mlx5_devlink_traps_unregister(struct devlink *devlink)
 				       ARRAY_SIZE(mlx5_trap_groups_arr));
 }
 
-int mlx5_devlink_register(struct devlink *devlink, struct device *dev)
+int mlx5_devlink_register(struct devlink *devlink)
 {
 	int err;
 
-	err = devlink_register(devlink, dev);
+	err = devlink_register(devlink);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index 7318d44b774b..30bf4882779b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -31,9 +31,9 @@ int mlx5_devlink_trap_get_num_active(struct mlx5_core_dev *dev);
 int mlx5_devlink_traps_get_action(struct mlx5_core_dev *dev, int trap_id,
 				  enum devlink_trap_action *action);
 
-struct devlink *mlx5_devlink_alloc(void);
+struct devlink *mlx5_devlink_alloc(struct device *dev);
 void mlx5_devlink_free(struct devlink *devlink);
-int mlx5_devlink_register(struct devlink *devlink, struct device *dev);
+int mlx5_devlink_register(struct devlink *devlink);
 void mlx5_devlink_unregister(struct devlink *devlink);
 
 #endif /* __MLX5_DEVLINK_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index eb1b316560a8..a8efd9f1af4c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1271,7 +1271,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
 
 	set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
 
-	err = mlx5_devlink_register(priv_to_devlink(dev), dev->device);
+	err = mlx5_devlink_register(priv_to_devlink(dev));
 	if (err)
 		goto err_devlink_reg;
 
@@ -1452,7 +1452,7 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	struct devlink *devlink;
 	int err;
 
-	devlink = mlx5_devlink_alloc();
+	devlink = mlx5_devlink_alloc(&pdev->dev);
 	if (!devlink) {
 		dev_err(&pdev->dev, "devlink alloc failed\n");
 		return -ENOMEM;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
index 42c8ee03fe3e..052f48068dc1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
@@ -14,7 +14,7 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia
 	struct devlink *devlink;
 	int err;
 
-	devlink = mlx5_devlink_alloc();
+	devlink = mlx5_devlink_alloc(&adev->dev);
 	if (!devlink)
 		return -ENOMEM;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index e775f08fb464..f080fab3de2b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1927,7 +1927,8 @@ __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 
 	if (!reload) {
 		alloc_size = sizeof(*mlxsw_core) + mlxsw_driver->priv_size;
-		devlink = devlink_alloc(&mlxsw_devlink_ops, alloc_size);
+		devlink = devlink_alloc(&mlxsw_devlink_ops, alloc_size,
+					mlxsw_bus_info->dev);
 		if (!devlink) {
 			err = -ENOMEM;
 			goto err_devlink_alloc;
@@ -1974,7 +1975,7 @@ __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 		goto err_emad_init;
 
 	if (!reload) {
-		err = devlink_register(devlink, mlxsw_bus_info->dev);
+		err = devlink_register(devlink);
 		if (err)
 			goto err_devlink_register;
 	}
diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
index 4bd7e9d9ec61..aa41c9cde643 100644
--- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
+++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
@@ -1103,7 +1103,8 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
 	if (!np && !pdev->dev.platform_data)
 		return -ENODEV;
 
-	devlink = devlink_alloc(&ocelot_devlink_ops, sizeof(*ocelot));
+	devlink =
+		devlink_alloc(&ocelot_devlink_ops, sizeof(*ocelot), &pdev->dev);
 	if (!devlink)
 		return -ENOMEM;
 
@@ -1187,7 +1188,7 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
 	if (err)
 		goto out_put_ports;
 
-	err = devlink_register(devlink, ocelot->dev);
+	err = devlink_register(devlink);
 	if (err)
 		goto out_ocelot_deinit;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index 742a420152b3..bb3b8a7f6c5d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -692,7 +692,7 @@ static int nfp_pci_probe(struct pci_dev *pdev,
 		goto err_pci_disable;
 	}
 
-	devlink = devlink_alloc(&nfp_devlink_ops, sizeof(*pf));
+	devlink = devlink_alloc(&nfp_devlink_ops, sizeof(*pf), &pdev->dev);
 	if (!devlink) {
 		err = -ENOMEM;
 		goto err_rel_regions;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
index 921db40047d7..d10a93801344 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
@@ -701,7 +701,7 @@ int nfp_net_pci_probe(struct nfp_pf *pf)
 	if (err)
 		goto err_unmap;
 
-	err = devlink_register(devlink, &pf->pdev->dev);
+	err = devlink_register(devlink);
 	if (err)
 		goto err_app_clean;
 
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
index cd520e4c5522..c7d0e195d176 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
@@ -64,7 +64,7 @@ struct ionic *ionic_devlink_alloc(struct device *dev)
 {
 	struct devlink *dl;
 
-	dl = devlink_alloc(&ionic_dl_ops, sizeof(struct ionic));
+	dl = devlink_alloc(&ionic_dl_ops, sizeof(struct ionic), dev);
 
 	return devlink_priv(dl);
 }
@@ -82,7 +82,7 @@ int ionic_devlink_register(struct ionic *ionic)
 	struct devlink_port_attrs attrs = {};
 	int err;
 
-	err = devlink_register(dl, ionic->dev);
+	err = devlink_register(dl);
 	if (err) {
 		dev_warn(ionic->dev, "devlink_register failed: %d\n", err);
 		return err;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_devlink.c b/drivers/net/ethernet/qlogic/qed/qed_devlink.c
index cf7f4da68e69..4c7501b9c284 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_devlink.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_devlink.c
@@ -207,14 +207,15 @@ struct devlink *qed_devlink_register(struct qed_dev *cdev)
 	struct devlink *dl;
 	int rc;
 
-	dl = devlink_alloc(&qed_dl_ops, sizeof(struct qed_devlink));
+	dl = devlink_alloc(&qed_dl_ops, sizeof(struct qed_devlink),
+			   &cdev->pdev->dev);
 	if (!dl)
 		return ERR_PTR(-ENOMEM);
 
 	qdevlink = devlink_priv(dl);
 	qdevlink->cdev = cdev;
 
-	rc = devlink_register(dl, &cdev->pdev->dev);
+	rc = devlink_register(dl);
 	if (rc)
 		goto err_free;
 
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index 588e7df0b1cc..130346f74ee8 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -2422,14 +2422,14 @@ static int am65_cpsw_nuss_register_devlink(struct am65_cpsw_common *common)
 	int i;
 
 	common->devlink =
-		devlink_alloc(&am65_cpsw_devlink_ops, sizeof(*dl_priv));
+		devlink_alloc(&am65_cpsw_devlink_ops, sizeof(*dl_priv), dev);
 	if (!common->devlink)
 		return -ENOMEM;
 
 	dl_priv = devlink_priv(common->devlink);
 	dl_priv->common = common;
 
-	ret = devlink_register(common->devlink, dev);
+	ret = devlink_register(common->devlink);
 	if (ret) {
 		dev_err(dev, "devlink reg fail ret:%d\n", ret);
 		goto dl_free;
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index ae167223e87f..192394fe4c1c 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -1800,14 +1800,14 @@ static int cpsw_register_devlink(struct cpsw_common *cpsw)
 	struct cpsw_devlink *dl_priv;
 	int ret = 0;
 
-	cpsw->devlink = devlink_alloc(&cpsw_devlink_ops, sizeof(*dl_priv));
+	cpsw->devlink = devlink_alloc(&cpsw_devlink_ops, sizeof(*dl_priv), dev);
 	if (!cpsw->devlink)
 		return -ENOMEM;
 
 	dl_priv = devlink_priv(cpsw->devlink);
 	dl_priv->cpsw = cpsw;
 
-	ret = devlink_register(cpsw->devlink, dev);
+	ret = devlink_register(cpsw->devlink);
 	if (ret) {
 		dev_err(dev, "DL reg fail ret:%d\n", ret);
 		goto dl_free;
diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 53068e184c90..54313bd57797 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -1449,7 +1449,7 @@ int nsim_dev_probe(struct nsim_bus_dev *nsim_bus_dev)
 	int err;
 
 	devlink = devlink_alloc_ns(&nsim_dev_devlink_ops, sizeof(*nsim_dev),
-				   nsim_bus_dev->initial_net);
+				 nsim_bus_dev->initial_net, &nsim_bus_dev->dev);
 	if (!devlink)
 		return -ENOMEM;
 	nsim_dev = devlink_priv(devlink);
@@ -1470,7 +1470,7 @@ int nsim_dev_probe(struct nsim_bus_dev *nsim_bus_dev)
 	if (err)
 		goto err_devlink_free;
 
-	err = devlink_register(devlink, &nsim_bus_dev->dev);
+	err = devlink_register(devlink);
 	if (err)
 		goto err_resources_unregister;
 
diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c
index 6b9c14586987..92edf772feed 100644
--- a/drivers/ptp/ptp_ocp.c
+++ b/drivers/ptp/ptp_ocp.c
@@ -735,24 +735,6 @@ ptp_ocp_info(struct ptp_ocp *bp)
 	ptp_ocp_tod_info(bp);
 }
 
-static int
-ptp_ocp_devlink_register(struct devlink *devlink, struct device *dev)
-{
-	int err;
-
-	err = devlink_register(devlink, dev);
-	if (err)
-		return err;
-
-	return 0;
-}
-
-static void
-ptp_ocp_devlink_unregister(struct devlink *devlink)
-{
-	devlink_unregister(devlink);
-}
-
 static struct device *
 ptp_ocp_find_flash(struct ptp_ocp *bp)
 {
@@ -1437,13 +1419,13 @@ ptp_ocp_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	struct ptp_ocp *bp;
 	int err;
 
-	devlink = devlink_alloc(&ptp_ocp_devlink_ops, sizeof(*bp));
+	devlink = devlink_alloc(&ptp_ocp_devlink_ops, sizeof(*bp), &pdev->dev);
 	if (!devlink) {
 		dev_err(&pdev->dev, "devlink_alloc failed\n");
 		return -ENOMEM;
 	}
 
-	err = ptp_ocp_devlink_register(devlink, &pdev->dev);
+	err = devlink_register(devlink);
 	if (err)
 		goto out_free;
 
@@ -1497,7 +1479,7 @@ out:
 	pci_disable_device(pdev);
 	pci_set_drvdata(pdev, NULL);
 out_unregister:
-	ptp_ocp_devlink_unregister(devlink);
+	devlink_unregister(devlink);
 out_free:
 	devlink_free(devlink);
 
@@ -1514,7 +1496,7 @@ ptp_ocp_remove(struct pci_dev *pdev)
 	pci_disable_device(pdev);
 	pci_set_drvdata(pdev, NULL);
 
-	ptp_ocp_devlink_unregister(devlink);
+	devlink_unregister(devlink);
 	devlink_free(devlink);
 }
 
diff --git a/drivers/staging/qlge/qlge_main.c b/drivers/staging/qlge/qlge_main.c
index 19a02e958865..8fcdf89da8aa 100644
--- a/drivers/staging/qlge/qlge_main.c
+++ b/drivers/staging/qlge/qlge_main.c
@@ -4547,7 +4547,8 @@ static int qlge_probe(struct pci_dev *pdev,
 	static int cards_found;
 	int err;
 
-	devlink = devlink_alloc(&qlge_devlink_ops, sizeof(struct qlge_adapter));
+	devlink = devlink_alloc(&qlge_devlink_ops, sizeof(struct qlge_adapter),
+				&pdev->dev);
 	if (!devlink)
 		return -ENOMEM;
 
@@ -4613,7 +4614,7 @@ static int qlge_probe(struct pci_dev *pdev,
 		goto netdev_free;
 	}
 
-	err = devlink_register(devlink, &pdev->dev);
+	err = devlink_register(devlink);
 	if (err)
 		goto netdev_free;
 
diff --git a/include/net/devlink.h b/include/net/devlink.h
index ccbfb3a844aa..0236c77f2fd0 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1544,13 +1544,15 @@ struct net *devlink_net(const struct devlink *devlink);
  * Drivers that operate on real HW must use devlink_alloc() instead.
  */
 struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
-				 size_t priv_size, struct net *net);
+				 size_t priv_size, struct net *net,
+				 struct device *dev);
 static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
-					    size_t priv_size)
+					    size_t priv_size,
+					    struct device *dev)
 {
-	return devlink_alloc_ns(ops, priv_size, &init_net);
+	return devlink_alloc_ns(ops, priv_size, &init_net, dev);
 }
-int devlink_register(struct devlink *devlink, struct device *dev);
+int devlink_register(struct devlink *devlink);
 void devlink_unregister(struct devlink *devlink);
 void devlink_reload_enable(struct devlink *devlink);
 void devlink_reload_disable(struct devlink *devlink);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index ee95eee8d0ed..d3b16dd9f64e 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -8768,24 +8768,26 @@ static bool devlink_reload_actions_valid(const struct devlink_ops *ops)
  *	@ops: ops
  *	@priv_size: size of user private data
  *	@net: net namespace
+ *	@dev: parent device
  *
  *	Allocate new devlink instance resources, including devlink index
  *	and name.
  */
 struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
-				 size_t priv_size, struct net *net)
+				 size_t priv_size, struct net *net,
+				 struct device *dev)
 {
 	struct devlink *devlink;
 
-	if (WARN_ON(!ops))
-		return NULL;
-
+	WARN_ON(!ops || !dev);
 	if (!devlink_reload_actions_valid(ops))
 		return NULL;
 
 	devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
 	if (!devlink)
 		return NULL;
+
+	devlink->dev = dev;
 	devlink->ops = ops;
 	xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
 	write_pnet(&devlink->_net, net);
@@ -8810,12 +8812,9 @@ EXPORT_SYMBOL_GPL(devlink_alloc_ns);
  *	devlink_register - Register devlink instance
  *
  *	@devlink: devlink
- *	@dev: parent device
  */
-int devlink_register(struct devlink *devlink, struct device *dev)
+int devlink_register(struct devlink *devlink)
 {
-	WARN_ON(devlink->dev);
-	devlink->dev = dev;
 	mutex_lock(&devlink_mutex);
 	list_add_tail(&devlink->list, &devlink_list);
 	devlink_notify(devlink, DEVLINK_CMD_NEW);
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index a4c525f1cb17..8150e16aaa55 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -746,13 +746,14 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 	/* Add the switch to devlink before calling setup, so that setup can
 	 * add dpipe tables
 	 */
-	ds->devlink = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv));
+	ds->devlink =
+		devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv), ds->dev);
 	if (!ds->devlink)
 		return -ENOMEM;
 	dl_priv = devlink_priv(ds->devlink);
 	dl_priv->ds = ds;
 
-	err = devlink_register(ds->devlink, ds->dev);
+	err = devlink_register(ds->devlink);
 	if (err)
 		goto free_devlink;
 
-- 
cgit v1.2.3


From 879af96ffd72706c6e3278ea6b45b0b0e37ec5d7 Mon Sep 17 00:00:00 2001
From: Jussi Maki <joamaki@gmail.com>
Date: Sat, 31 Jul 2021 05:57:33 +0000
Subject: net, core: Add support for XDP redirection to slave device

This adds the ndo_xdp_get_xmit_slave hook for transforming XDP_TX
into XDP_REDIRECT after BPF program run when the ingress device
is a bond slave.

The dev_xdp_prog_count is exposed so that slave devices can be checked
for loaded XDP programs in order to avoid the situation where both
bond master and slave have programs loaded according to xdp_state.

Signed-off-by: Jussi Maki <joamaki@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Link: https://lore.kernel.org/bpf/20210731055738.16820-3-joamaki@gmail.com
---
 include/linux/filter.h    | 13 ++++++++++++-
 include/linux/netdevice.h |  6 ++++++
 net/core/dev.c            | 13 ++++++++++++-
 net/core/filter.c         | 25 +++++++++++++++++++++++++
 4 files changed, 55 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ff698c9d1c94..1797e8506929 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -776,6 +776,10 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 
 DECLARE_BPF_DISPATCHER(xdp)
 
+DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
+
+u32 xdp_master_redirect(struct xdp_buff *xdp);
+
 static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 					    struct xdp_buff *xdp)
 {
@@ -783,7 +787,14 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 	 * under local_bh_disable(), which provides the needed RCU protection
 	 * for accessing map entries.
 	 */
-	return __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp));
+	u32 act = __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp));
+
+	if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) {
+		if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev))
+			act = xdp_master_redirect(xdp);
+	}
+
+	return act;
 }
 
 void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d63a94ecbf3b..02c6e8e10c86 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1330,6 +1330,9 @@ struct netdev_net_notifier {
  *	that got dropped are freed/returned via xdp_return_frame().
  *	Returns negative number, means general error invoking ndo, meaning
  *	no frames were xmit'ed and core-caller will free all frames.
+ * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
+ *					        struct xdp_buff *xdp);
+ *      Get the xmit slave of master device based on the xdp_buff.
  * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
  *      This function is used to wake up the softirq, ksoftirqd or kthread
  *	responsible for sending and/or receiving packets on a specific
@@ -1557,6 +1560,8 @@ struct net_device_ops {
 	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
 						struct xdp_frame **xdp,
 						u32 flags);
+	struct net_device *	(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
+							  struct xdp_buff *xdp);
 	int			(*ndo_xsk_wakeup)(struct net_device *dev,
 						  u32 queue_id, u32 flags);
 	struct devlink_port *	(*ndo_get_devlink_port)(struct net_device *dev);
@@ -4087,6 +4092,7 @@ typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, int expected_fd, u32 flags);
 int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+u8 dev_xdp_prog_count(struct net_device *dev);
 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 64e1a5f63f93..9eb6dc9e02b3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9380,7 +9380,7 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 	return dev->xdp_state[mode].prog;
 }
 
-static u8 dev_xdp_prog_count(struct net_device *dev)
+u8 dev_xdp_prog_count(struct net_device *dev)
 {
 	u8 count = 0;
 	int i;
@@ -9390,6 +9390,7 @@ static u8 dev_xdp_prog_count(struct net_device *dev)
 			count++;
 	return count;
 }
+EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
 
 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 {
@@ -9483,6 +9484,8 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 {
 	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
 	struct bpf_prog *cur_prog;
+	struct net_device *upper;
+	struct list_head *iter;
 	enum bpf_xdp_mode mode;
 	bpf_op_t bpf_op;
 	int err;
@@ -9521,6 +9524,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 		return -EBUSY;
 	}
 
+	/* don't allow if an upper device already has a program */
+	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+		if (dev_xdp_prog_count(upper) > 0) {
+			NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
+			return -EEXIST;
+		}
+	}
+
 	cur_prog = dev_xdp_prog(dev, mode);
 	/* can't replace attached prog with link */
 	if (link && cur_prog) {
diff --git a/net/core/filter.c b/net/core/filter.c
index faf29fd82276..ff62cd39046d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3950,6 +3950,31 @@ void bpf_clear_redirect_map(struct bpf_map *map)
 	}
 }
 
+DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
+EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
+
+u32 xdp_master_redirect(struct xdp_buff *xdp)
+{
+	struct net_device *master, *slave;
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
+	slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
+	if (slave && slave != xdp->rxq->dev) {
+		/* The target device is different from the receiving device, so
+		 * redirect it to the new device.
+		 * Using XDP_REDIRECT gets the correct behaviour from XDP enabled
+		 * drivers to unmap the packet from their rx ring.
+		 */
+		ri->tgt_index = slave->ifindex;
+		ri->map_id = INT_MAX;
+		ri->map_type = BPF_MAP_TYPE_UNSPEC;
+		return XDP_REDIRECT;
+	}
+	return XDP_TX;
+}
+EXPORT_SYMBOL_GPL(xdp_master_redirect);
+
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 		    struct bpf_prog *xdp_prog)
 {
-- 
cgit v1.2.3


From 9e2ee5c7e7c35d195e2aa0692a7241d47a433d1e Mon Sep 17 00:00:00 2001
From: Jussi Maki <joamaki@gmail.com>
Date: Sat, 31 Jul 2021 05:57:34 +0000
Subject: net, bonding: Add XDP support to the bonding driver

XDP is implemented in the bonding driver by transparently delegating
the XDP program loading, removal and xmit operations to the bonding
slave devices. The overall goal of this work is that XDP programs
can be attached to a bond device *without* any further changes (or
awareness) necessary to the program itself, meaning the same XDP
program can be attached to a native device but also a bonding device.

Semantics of XDP_TX when attached to a bond are equivalent in such
setting to the case when a tc/BPF program would be attached to the
bond, meaning transmitting the packet out of the bond itself using one
of the bond's configured xmit methods to select a slave device (rather
than XDP_TX on the slave itself). Handling of XDP_TX to transmit
using the configured bonding mechanism is therefore implemented by
rewriting the BPF program return value in bpf_prog_run_xdp. To avoid
performance impact this check is guarded by a static key, which is
incremented when a XDP program is loaded onto a bond device. This
approach was chosen to avoid changes to drivers implementing XDP. If
the slave device does not match the receive device, then XDP_REDIRECT
is transparently used to perform the redirection in order to have
the network driver release the packet from its RX ring. The bonding
driver hashing functions have been refactored to allow reuse with
xdp_buff's to avoid code duplication.

The motivation for this change is to enable use of bonding (and
802.3ad) in hairpinning L4 load-balancers such as [1] implemented with
XDP and also to transparently support bond devices for projects that
use XDP given most modern NICs have dual port adapters. An alternative
to this approach would be to implement 802.3ad in user-space and
implement the bonding load-balancing in the XDP program itself, but
is rather a cumbersome endeavor in terms of slave device management
(e.g. by watching netlink) and requires separate programs for native
vs bond cases for the orchestrator. A native in-kernel implementation
overcomes these issues and provides more flexibility.

Below are benchmark results done on two machines with 100Gbit
Intel E810 (ice) NIC and with 32-core 3970X on sending machine, and
16-core 3950X on receiving machine. 64 byte packets were sent with
pktgen-dpdk at full rate. Two issues [2, 3] were identified with the
ice driver, so the tests were performed with iommu=off and patch [2]
applied. Additionally the bonding round robin algorithm was modified
to use per-cpu tx counters as high CPU load (50% vs 10%) and high rate
of cache misses were caused by the shared rr_tx_counter (see patch
2/3). The statistics were collected using "sar -n dev -u 1 10". On top
of that, for ice, further work is in progress on improving the XDP_TX
numbers [4].

 -----------------------|  CPU  |--| rxpck/s |--| txpck/s |----
 without patch (1 dev):
   XDP_DROP:              3.15%      48.6Mpps
   XDP_TX:                3.12%      18.3Mpps     18.3Mpps
   XDP_DROP (RSS):        9.47%      116.5Mpps
   XDP_TX (RSS):          9.67%      25.3Mpps     24.2Mpps
 -----------------------
 with patch, bond (1 dev):
   XDP_DROP:              3.14%      46.7Mpps
   XDP_TX:                3.15%      13.9Mpps     13.9Mpps
   XDP_DROP (RSS):        10.33%     117.2Mpps
   XDP_TX (RSS):          10.64%     25.1Mpps     24.0Mpps
 -----------------------
 with patch, bond (2 devs):
   XDP_DROP:              6.27%      92.7Mpps
   XDP_TX:                6.26%      17.6Mpps     17.5Mpps
   XDP_DROP (RSS):       11.38%      117.2Mpps
   XDP_TX (RSS):         14.30%      28.7Mpps     27.4Mpps
 --------------------------------------------------------------

RSS: Receive Side Scaling, e.g. the packets were sent to a range of
destination IPs.

  [1]: https://cilium.io/blog/2021/05/20/cilium-110#standalonelb
  [2]: https://lore.kernel.org/bpf/20210601113236.42651-1-maciej.fijalkowski@intel.com/T/#t
  [3]: https://lore.kernel.org/bpf/CAHn8xckNXci+X_Eb2WMv4uVYjO2331UWB2JLtXr_58z0Av8+8A@mail.gmail.com/
  [4]: https://lore.kernel.org/bpf/20210805230046.28715-1-maciej.fijalkowski@intel.com/T/#t

Signed-off-by: Jussi Maki <joamaki@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Cc: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/bpf/20210731055738.16820-4-joamaki@gmail.com
---
 drivers/net/bonding/bond_main.c | 309 +++++++++++++++++++++++++++++++++++++++-
 include/net/bonding.h           |   1 +
 2 files changed, 309 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 487c71fcec58..04cf78fa1721 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -317,6 +317,19 @@ bool bond_sk_check(struct bonding *bond)
 	}
 }
 
+static bool bond_xdp_check(struct bonding *bond)
+{
+	switch (BOND_MODE(bond)) {
+	case BOND_MODE_ROUNDROBIN:
+	case BOND_MODE_ACTIVEBACKUP:
+	case BOND_MODE_8023AD:
+	case BOND_MODE_XOR:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /*---------------------------------- VLAN -----------------------------------*/
 
 /* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid,
@@ -2133,6 +2146,41 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 		bond_update_slave_arr(bond, NULL);
 
 
+	if (!slave_dev->netdev_ops->ndo_bpf ||
+	    !slave_dev->netdev_ops->ndo_xdp_xmit) {
+		if (bond->xdp_prog) {
+			NL_SET_ERR_MSG(extack, "Slave does not support XDP");
+			slave_err(bond_dev, slave_dev, "Slave does not support XDP\n");
+			res = -EOPNOTSUPP;
+			goto err_sysfs_del;
+		}
+	} else {
+		struct netdev_bpf xdp = {
+			.command = XDP_SETUP_PROG,
+			.flags   = 0,
+			.prog    = bond->xdp_prog,
+			.extack  = extack,
+		};
+
+		if (dev_xdp_prog_count(slave_dev) > 0) {
+			NL_SET_ERR_MSG(extack,
+				       "Slave has XDP program loaded, please unload before enslaving");
+			slave_err(bond_dev, slave_dev,
+				  "Slave has XDP program loaded, please unload before enslaving\n");
+			res = -EOPNOTSUPP;
+			goto err_sysfs_del;
+		}
+
+		res = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
+		if (res < 0) {
+			/* ndo_bpf() sets extack error message */
+			slave_dbg(bond_dev, slave_dev, "Error %d calling ndo_bpf\n", res);
+			goto err_sysfs_del;
+		}
+		if (bond->xdp_prog)
+			bpf_prog_inc(bond->xdp_prog);
+	}
+
 	slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n",
 		   bond_is_active_slave(new_slave) ? "an active" : "a backup",
 		   new_slave->link != BOND_LINK_DOWN ? "an up" : "a down");
@@ -2252,6 +2300,17 @@ static int __bond_release_one(struct net_device *bond_dev,
 	/* recompute stats just before removing the slave */
 	bond_get_stats(bond->dev, &bond->bond_stats);
 
+	if (bond->xdp_prog) {
+		struct netdev_bpf xdp = {
+			.command = XDP_SETUP_PROG,
+			.flags   = 0,
+			.prog	 = NULL,
+			.extack  = NULL,
+		};
+		if (slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp))
+			slave_warn(bond_dev, slave_dev, "failed to unload XDP program\n");
+	}
+
 	/* unregister rx_handler early so bond_handle_frame wouldn't be called
 	 * for this slave anymore.
 	 */
@@ -3638,7 +3697,7 @@ static inline u32 bond_eth_hash(struct sk_buff *skb, const void *data, int mhoff
 		return 0;
 
 	ep = (struct ethhdr *)(data + mhoff);
-	return ep->h_dest[5] ^ ep->h_source[5] ^ ep->h_proto;
+	return ep->h_dest[5] ^ ep->h_source[5] ^ be16_to_cpu(ep->h_proto);
 }
 
 static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, const void *data,
@@ -3807,6 +3866,26 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 				skb_headlen(skb));
 }
 
+/**
+ * bond_xmit_hash_xdp - generate a hash value based on the xmit policy
+ * @bond: bonding device
+ * @xdp: buffer to use for headers
+ *
+ * The XDP variant of bond_xmit_hash.
+ */
+static u32 bond_xmit_hash_xdp(struct bonding *bond, struct xdp_buff *xdp)
+{
+	struct ethhdr *eth;
+
+	if (xdp->data + sizeof(struct ethhdr) > xdp->data_end)
+		return 0;
+
+	eth = (struct ethhdr *)xdp->data;
+
+	return __bond_xmit_hash(bond, NULL, xdp->data, eth->h_proto, 0,
+				sizeof(struct ethhdr), xdp->data_end - xdp->data);
+}
+
 /*-------------------------- Device entry points ----------------------------*/
 
 void bond_work_init_all(struct bonding *bond)
@@ -4455,6 +4534,47 @@ non_igmp:
 	return NULL;
 }
 
+static struct slave *bond_xdp_xmit_roundrobin_slave_get(struct bonding *bond,
+							struct xdp_buff *xdp)
+{
+	struct slave *slave;
+	int slave_cnt;
+	u32 slave_id;
+	const struct ethhdr *eth;
+	void *data = xdp->data;
+
+	if (data + sizeof(struct ethhdr) > xdp->data_end)
+		goto non_igmp;
+
+	eth = (struct ethhdr *)data;
+	data += sizeof(struct ethhdr);
+
+	/* See comment on IGMP in bond_xmit_roundrobin_slave_get() */
+	if (eth->h_proto == htons(ETH_P_IP)) {
+		const struct iphdr *iph;
+
+		if (data + sizeof(struct iphdr) > xdp->data_end)
+			goto non_igmp;
+
+		iph = (struct iphdr *)data;
+
+		if (iph->protocol == IPPROTO_IGMP) {
+			slave = rcu_dereference(bond->curr_active_slave);
+			if (slave)
+				return slave;
+			return bond_get_slave_by_id(bond, 0);
+		}
+	}
+
+non_igmp:
+	slave_cnt = READ_ONCE(bond->slave_cnt);
+	if (likely(slave_cnt)) {
+		slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
+		return bond_get_slave_by_id(bond, slave_id);
+	}
+	return NULL;
+}
+
 static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
 					struct net_device *bond_dev)
 {
@@ -4670,6 +4790,22 @@ static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond,
 	return slave;
 }
 
+static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond,
+						     struct xdp_buff *xdp)
+{
+	struct bond_up_slave *slaves;
+	unsigned int count;
+	u32 hash;
+
+	hash = bond_xmit_hash_xdp(bond, xdp);
+	slaves = rcu_dereference(bond->usable_slaves);
+	count = slaves ? READ_ONCE(slaves->count) : 0;
+	if (unlikely(!count))
+		return NULL;
+
+	return slaves->arr[hash % count];
+}
+
 /* Use this Xmit function for 3AD as well as XOR modes. The current
  * usable slave array is formed in the control path. The xmit function
  * just calculates hash and sends the packet out.
@@ -4954,6 +5090,174 @@ static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	return ret;
 }
 
+static struct net_device *
+bond_xdp_get_xmit_slave(struct net_device *bond_dev, struct xdp_buff *xdp)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave;
+
+	/* Caller needs to hold rcu_read_lock() */
+
+	switch (BOND_MODE(bond)) {
+	case BOND_MODE_ROUNDROBIN:
+		slave = bond_xdp_xmit_roundrobin_slave_get(bond, xdp);
+		break;
+
+	case BOND_MODE_ACTIVEBACKUP:
+		slave = bond_xmit_activebackup_slave_get(bond);
+		break;
+
+	case BOND_MODE_8023AD:
+	case BOND_MODE_XOR:
+		slave = bond_xdp_xmit_3ad_xor_slave_get(bond, xdp);
+		break;
+
+	default:
+		/* Should never happen. Mode guarded by bond_xdp_check() */
+		netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n", BOND_MODE(bond));
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+
+	if (slave)
+		return slave->dev;
+
+	return NULL;
+}
+
+static int bond_xdp_xmit(struct net_device *bond_dev,
+			 int n, struct xdp_frame **frames, u32 flags)
+{
+	int nxmit, err = -ENXIO;
+
+	rcu_read_lock();
+
+	for (nxmit = 0; nxmit < n; nxmit++) {
+		struct xdp_frame *frame = frames[nxmit];
+		struct xdp_frame *frames1[] = {frame};
+		struct net_device *slave_dev;
+		struct xdp_buff xdp;
+
+		xdp_convert_frame_to_buff(frame, &xdp);
+
+		slave_dev = bond_xdp_get_xmit_slave(bond_dev, &xdp);
+		if (!slave_dev) {
+			err = -ENXIO;
+			break;
+		}
+
+		err = slave_dev->netdev_ops->ndo_xdp_xmit(slave_dev, 1, frames1, flags);
+		if (err < 1)
+			break;
+	}
+
+	rcu_read_unlock();
+
+	/* If error happened on the first frame then we can pass the error up, otherwise
+	 * report the number of frames that were xmitted.
+	 */
+	if (err < 0)
+		return (nxmit == 0 ? err : nxmit);
+
+	return nxmit;
+}
+
+static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+			struct netlink_ext_ack *extack)
+{
+	struct bonding *bond = netdev_priv(dev);
+	struct list_head *iter;
+	struct slave *slave, *rollback_slave;
+	struct bpf_prog *old_prog;
+	struct netdev_bpf xdp = {
+		.command = XDP_SETUP_PROG,
+		.flags   = 0,
+		.prog    = prog,
+		.extack  = extack,
+	};
+	int err;
+
+	ASSERT_RTNL();
+
+	if (!bond_xdp_check(bond))
+		return -EOPNOTSUPP;
+
+	old_prog = bond->xdp_prog;
+	bond->xdp_prog = prog;
+
+	bond_for_each_slave(bond, slave, iter) {
+		struct net_device *slave_dev = slave->dev;
+
+		if (!slave_dev->netdev_ops->ndo_bpf ||
+		    !slave_dev->netdev_ops->ndo_xdp_xmit) {
+			NL_SET_ERR_MSG(extack, "Slave device does not support XDP");
+			slave_err(dev, slave_dev, "Slave does not support XDP\n");
+			err = -EOPNOTSUPP;
+			goto err;
+		}
+
+		if (dev_xdp_prog_count(slave_dev) > 0) {
+			NL_SET_ERR_MSG(extack,
+				       "Slave has XDP program loaded, please unload before enslaving");
+			slave_err(dev, slave_dev,
+				  "Slave has XDP program loaded, please unload before enslaving\n");
+			err = -EOPNOTSUPP;
+			goto err;
+		}
+
+		err = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
+		if (err < 0) {
+			/* ndo_bpf() sets extack error message */
+			slave_err(dev, slave_dev, "Error %d calling ndo_bpf\n", err);
+			goto err;
+		}
+		if (prog)
+			bpf_prog_inc(prog);
+	}
+
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	if (prog)
+		static_branch_inc(&bpf_master_redirect_enabled_key);
+	else
+		static_branch_dec(&bpf_master_redirect_enabled_key);
+
+	return 0;
+
+err:
+	/* unwind the program changes */
+	bond->xdp_prog = old_prog;
+	xdp.prog = old_prog;
+	xdp.extack = NULL; /* do not overwrite original error */
+
+	bond_for_each_slave(bond, rollback_slave, iter) {
+		struct net_device *slave_dev = rollback_slave->dev;
+		int err_unwind;
+
+		if (slave == rollback_slave)
+			break;
+
+		err_unwind = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
+		if (err_unwind < 0)
+			slave_err(dev, slave_dev,
+				  "Error %d when unwinding XDP program change\n", err_unwind);
+		else if (xdp.prog)
+			bpf_prog_inc(xdp.prog);
+	}
+	return err;
+}
+
+static int bond_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return bond_xdp_set(dev, xdp->prog, xdp->extack);
+	default:
+		return -EINVAL;
+	}
+}
+
 static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed)
 {
 	if (speed == 0 || speed == SPEED_UNKNOWN)
@@ -5042,6 +5346,9 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_features_check	= passthru_features_check,
 	.ndo_get_xmit_slave	= bond_xmit_get_slave,
 	.ndo_sk_get_lower_dev	= bond_sk_get_lower_dev,
+	.ndo_bpf		= bond_xdp,
+	.ndo_xdp_xmit           = bond_xdp_xmit,
+	.ndo_xdp_get_xmit_slave = bond_xdp_get_xmit_slave,
 };
 
 static const struct device_type bond_type = {
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 625d9c72dee3..b91c365e4e95 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -258,6 +258,7 @@ struct bonding {
 	/* protecting ipsec_list */
 	spinlock_t ipsec_lock;
 #endif /* CONFIG_XFRM_OFFLOAD */
+	struct bpf_prog *xdp_prog;
 };
 
 #define bond_slave_get_rcu(dev) \
-- 
cgit v1.2.3


From 57f05bc2ab2443b89c2e2562c05053bcc7d30e8b Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 6 Aug 2021 10:46:19 +0800
Subject: page_pool: keep pp info as long as page pool owns the page

Currently, page->pp is cleared and set everytime the page
is recycled, which is unnecessary.

So only set the page->pp when the page is added to the page
pool and only clear it when the page is released from the
page pool.

This is also a preparation to support allocating frag page
in page pool.

Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/mvneta.c           |  6 +-----
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c |  2 +-
 drivers/net/ethernet/ti/cpsw.c                  |  2 +-
 drivers/net/ethernet/ti/cpsw_new.c              |  2 +-
 include/linux/skbuff.h                          |  4 +---
 include/net/page_pool.h                         |  7 -------
 net/core/page_pool.c                            | 21 +++++++++++++++++----
 7 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index ff8db311963c..5d1007e1b5c9 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
 
-	skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);
+	skb_mark_for_recycle(skb);
 
 	skb_reserve(skb, xdp->data - xdp->data_hard_start);
 	skb_put(skb, xdp->data_end - xdp->data);
@@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
 		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
 				skb_frag_page(frag), skb_frag_off(frag),
 				skb_frag_size(frag), PAGE_SIZE);
-		/* We don't need to reset pp_recycle here. It's already set, so
-		 * just mark fragments for recycling.
-		 */
-		page_pool_store_mem_info(skb_frag_page(frag), pool);
 	}
 
 	return skb;
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 99bd8b8aa0e2..744f58f41ecc 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -3995,7 +3995,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,
 		}
 
 		if (pp)
-			skb_mark_for_recycle(skb, page, pp);
+			skb_mark_for_recycle(skb);
 		else
 			dma_unmap_single_attrs(dev->dev.parent, dma_addr,
 					       bm_pool->buf_size, DMA_FROM_DEVICE,
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 8e1e582a10c8..9f70e40779f6 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -431,7 +431,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
 	skb->protocol = eth_type_trans(skb, ndev);
 
 	/* mark skb for recycling */
-	skb_mark_for_recycle(skb, page, pool);
+	skb_mark_for_recycle(skb);
 	netif_receive_skb(skb);
 
 	ndev->stats.rx_bytes += len;
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 192394fe4c1c..ff3a96b084ee 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -375,7 +375,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
 	skb->protocol = eth_type_trans(skb, ndev);
 
 	/* mark skb for recycling */
-	skb_mark_for_recycle(skb, page, pool);
+	skb_mark_for_recycle(skb);
 	netif_receive_skb(skb);
 
 	ndev->stats.rx_bytes += len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 783cc2368bb1..6bdb0db3e825 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4712,11 +4712,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
 }
 
 #ifdef CONFIG_PAGE_POOL
-static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,
-					struct page_pool *pp)
+static inline void skb_mark_for_recycle(struct sk_buff *skb)
 {
 	skb->pp_recycle = 1;
-	page_pool_store_mem_info(page, pp);
 }
 #endif
 
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 3dd62dd73027..8d7744d1c7c1 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -253,11 +253,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)
 		spin_unlock_bh(&pool->ring.producer_lock);
 }
 
-/* Store mem_info on struct page and use it while recycling skb frags */
-static inline
-void page_pool_store_mem_info(struct page *page, struct page_pool *pp)
-{
-	page->pp = pp;
-}
-
 #endif /* _NET_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5e4eb45b139c..78838c6fe007 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -206,6 +206,19 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 	return true;
 }
 
+static void page_pool_set_pp_info(struct page_pool *pool,
+				  struct page *page)
+{
+	page->pp = pool;
+	page->pp_magic |= PP_SIGNATURE;
+}
+
+static void page_pool_clear_pp_info(struct page *page)
+{
+	page->pp_magic = 0;
+	page->pp = NULL;
+}
+
 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 						 gfp_t gfp)
 {
@@ -222,7 +235,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 		return NULL;
 	}
 
-	page->pp_magic |= PP_SIGNATURE;
+	page_pool_set_pp_info(pool, page);
 
 	/* Track how many pages are held 'in-flight' */
 	pool->pages_state_hold_cnt++;
@@ -266,7 +279,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 			put_page(page);
 			continue;
 		}
-		page->pp_magic |= PP_SIGNATURE;
+
+		page_pool_set_pp_info(pool, page);
 		pool->alloc.cache[pool->alloc.count++] = page;
 		/* Track how many pages are held 'in-flight' */
 		pool->pages_state_hold_cnt++;
@@ -345,7 +359,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 			     DMA_ATTR_SKIP_CPU_SYNC);
 	page_pool_set_dma_addr(page, 0);
 skip_dma_unmap:
-	page->pp_magic = 0;
+	page_pool_clear_pp_info(page);
 
 	/* This may be the last page returned, releasing the pool, so
 	 * it is not safe to reference pool afterwards.
@@ -644,7 +658,6 @@ bool page_pool_return_skb_page(struct page *page)
 	 * The page will be returned to the pool here regardless of the
 	 * 'flipped' fragment being in use or not.
 	 */
-	page->pp = NULL;
 	page_pool_put_full_page(pp, page, false);
 
 	return true;
-- 
cgit v1.2.3


From 0e9d2a0a3a836c37528899010e73b5be8111753e Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 6 Aug 2021 10:46:20 +0800
Subject: page_pool: add interface to manipulate frag count in page pool

For 32 bit systems with 64 bit dma, dma_addr[1] is used to
store the upper 32 bit dma addr, those system should be rare
those days.

For normal system, the dma_addr[1] in 'struct page' is not
used, so we can reuse dma_addr[1] for storing frag count,
which means how many frags this page might be splited to.

In order to simplify the page frag support in the page pool,
the PAGE_POOL_DMA_USE_PP_FRAG_COUNT macro is added to indicate
the 32 bit systems with 64 bit dma, and the page frag support
in page pool is disabled for such system.

The newly added page_pool_set_frag_count() is called to reserve
the maximum frag count before any page frag is passed to the
user. The page_pool_atomic_sub_frag_count_return() is called
when user is done with the page frag.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mm_types.h | 18 +++++++++++++-----
 include/net/page_pool.h  | 46 +++++++++++++++++++++++++++++++++++++++-------
 net/core/page_pool.c     |  4 ++++
 3 files changed, 56 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 52bbd2b7cb46..7f8ee09c711f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -103,11 +103,19 @@ struct page {
 			unsigned long pp_magic;
 			struct page_pool *pp;
 			unsigned long _pp_mapping_pad;
-			/**
-			 * @dma_addr: might require a 64-bit value on
-			 * 32-bit architectures.
-			 */
-			unsigned long dma_addr[2];
+			unsigned long dma_addr;
+			union {
+				/**
+				 * dma_addr_upper: might require a 64-bit
+				 * value on 32-bit architectures.
+				 */
+				unsigned long dma_addr_upper;
+				/**
+				 * For frag page support, not supported in
+				 * 32-bit architectures with 64-bit DMA.
+				 */
+				atomic_long_t pp_frag_count;
+			};
 		};
 		struct {	/* slab, slob and slub */
 			union {
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 8d7744d1c7c1..42e6997e637d 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -45,7 +45,10 @@
 					* Please note DMA-sync-for-CPU is still
 					* device driver responsibility
 					*/
-#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)
+#define PP_FLAG_PAGE_FRAG	BIT(2) /* for page frag feature */
+#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP |\
+				 PP_FLAG_DMA_SYNC_DEV |\
+				 PP_FLAG_PAGE_FRAG)
 
 /*
  * Fast allocation side cache array/stack
@@ -198,19 +201,48 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	page_pool_put_full_page(pool, page, true);
 }
 
+#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT	\
+		(sizeof(dma_addr_t) > sizeof(unsigned long))
+
 static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
-	dma_addr_t ret = page->dma_addr[0];
-	if (sizeof(dma_addr_t) > sizeof(unsigned long))
-		ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
+	dma_addr_t ret = page->dma_addr;
+
+	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
+		ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16;
+
 	return ret;
 }
 
 static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
 {
-	page->dma_addr[0] = addr;
-	if (sizeof(dma_addr_t) > sizeof(unsigned long))
-		page->dma_addr[1] = upper_32_bits(addr);
+	page->dma_addr = addr;
+	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
+		page->dma_addr_upper = upper_32_bits(addr);
+}
+
+static inline void page_pool_set_frag_count(struct page *page, long nr)
+{
+	atomic_long_set(&page->pp_frag_count, nr);
+}
+
+static inline long page_pool_atomic_sub_frag_count_return(struct page *page,
+							  long nr)
+{
+	long ret;
+
+	/* As suggested by Alexander, atomic_long_read() may cover up the
+	 * reference count errors, so avoid calling atomic_long_read() in
+	 * the cases of freeing or draining the page_frags, where we would
+	 * not expect it to match or that are slowpath anyway.
+	 */
+	if (__builtin_constant_p(nr) &&
+	    atomic_long_read(&page->pp_frag_count) == nr)
+		return 0;
+
+	ret = atomic_long_sub_return(nr, &page->pp_frag_count);
+	WARN_ON(ret < 0);
+	return ret;
 }
 
 static inline bool is_page_pool_compiled_in(void)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 78838c6fe007..68fab94ac422 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -67,6 +67,10 @@ static int page_pool_init(struct page_pool *pool,
 		 */
 	}
 
+	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
+	    pool->p.flags & PP_FLAG_PAGE_FRAG)
+		return -EINVAL;
+
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 53e0961da1c7bbdabd1abebb20de403ec237ec09 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 6 Aug 2021 10:46:21 +0800
Subject: page_pool: add frag page recycling support in page pool

Currently page pool only support page recycling when there
is only one user of the page, and the split page reusing
implemented in the most driver can not use the page pool as
bing-pong way of reusing requires the multi user support in
page pool.

Those reusing or recycling has below limitations:
1. page from page pool can only be used be one user in order
   for the page recycling to happen.
2. Bing-pong way of reusing in most driver does not support
   multi desc using different part of the same page in order
   to save memory.

So add multi-users support and frag page recycling in page
pool to overcome the above limitation.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool.h | 15 +++++++++
 net/core/page_pool.c    | 87 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)

(limited to 'include')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 42e6997e637d..a4082406a003 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -91,6 +91,9 @@ struct page_pool {
 	unsigned long defer_warn;
 
 	u32 pages_state_hold_cnt;
+	unsigned int frag_offset;
+	struct page *frag_page;
+	long frag_users;
 
 	/*
 	 * Data structure for allocation side
@@ -140,6 +143,18 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 	return page_pool_alloc_pages(pool, gfp);
 }
 
+struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
+				  unsigned int size, gfp_t gfp);
+
+static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
+						    unsigned int *offset,
+						    unsigned int size)
+{
+	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+	return page_pool_alloc_frag(pool, offset, size, gfp);
+}
+
 /* get the stored dma direction. A driver might decide to treat this locally and
  * avoid the extra cache line from page_pool to determine the direction
  */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 68fab94ac422..ac116041b35f 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -24,6 +24,8 @@
 #define DEFER_TIME (msecs_to_jiffies(1000))
 #define DEFER_WARN_INTERVAL (60 * HZ)
 
+#define BIAS_MAX	LONG_MAX
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -423,6 +425,11 @@ static __always_inline struct page *
 __page_pool_put_page(struct page_pool *pool, struct page *page,
 		     unsigned int dma_sync_size, bool allow_direct)
 {
+	/* It is not the last user for the page frag case */
+	if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
+	    page_pool_atomic_sub_frag_count_return(page, 1))
+		return NULL;
+
 	/* This allocator is optimized for the XDP mode that uses
 	 * one-frame-per-page, but have fallbacks that act like the
 	 * regular page allocator APIs.
@@ -515,6 +522,84 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 }
 EXPORT_SYMBOL(page_pool_put_page_bulk);
 
+static struct page *page_pool_drain_frag(struct page_pool *pool,
+					 struct page *page)
+{
+	long drain_count = BIAS_MAX - pool->frag_users;
+
+	/* Some user is still using the page frag */
+	if (likely(page_pool_atomic_sub_frag_count_return(page,
+							  drain_count)))
+		return NULL;
+
+	if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
+		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+			page_pool_dma_sync_for_device(pool, page, -1);
+
+		return page;
+	}
+
+	page_pool_return_page(pool, page);
+	return NULL;
+}
+
+static void page_pool_free_frag(struct page_pool *pool)
+{
+	long drain_count = BIAS_MAX - pool->frag_users;
+	struct page *page = pool->frag_page;
+
+	pool->frag_page = NULL;
+
+	if (!page ||
+	    page_pool_atomic_sub_frag_count_return(page, drain_count))
+		return;
+
+	page_pool_return_page(pool, page);
+}
+
+struct page *page_pool_alloc_frag(struct page_pool *pool,
+				  unsigned int *offset,
+				  unsigned int size, gfp_t gfp)
+{
+	unsigned int max_size = PAGE_SIZE << pool->p.order;
+	struct page *page = pool->frag_page;
+
+	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
+		    size > max_size))
+		return NULL;
+
+	size = ALIGN(size, dma_get_cache_alignment());
+	*offset = pool->frag_offset;
+
+	if (page && *offset + size > max_size) {
+		page = page_pool_drain_frag(pool, page);
+		if (page)
+			goto frag_reset;
+	}
+
+	if (!page) {
+		page = page_pool_alloc_pages(pool, gfp);
+		if (unlikely(!page)) {
+			pool->frag_page = NULL;
+			return NULL;
+		}
+
+		pool->frag_page = page;
+
+frag_reset:
+		pool->frag_users = 1;
+		*offset = 0;
+		pool->frag_offset = size;
+		page_pool_set_frag_count(page, BIAS_MAX);
+		return page;
+	}
+
+	pool->frag_users++;
+	pool->frag_offset = *offset + size;
+	return page;
+}
+EXPORT_SYMBOL(page_pool_alloc_frag);
+
 static void page_pool_empty_ring(struct page_pool *pool)
 {
 	struct page *page;
@@ -620,6 +705,8 @@ void page_pool_destroy(struct page_pool *pool)
 	if (!page_pool_put(pool))
 		return;
 
+	page_pool_free_frag(pool);
+
 	if (!page_pool_release(pool))
 		return;
 
-- 
cgit v1.2.3


From 8702997074363c294a1f83928cd0c33ca57bf813 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 5 Aug 2021 12:02:43 +0200
Subject: netfilter: nf_queue: move hookfn registration out of struct net

This was done to detect when the pernet->init() function was not called
yet, by checking if net->nf.queue_handler is NULL.

Once the nfnetlink_queue module is active, all struct net pointers
contain the same address.  So place this back in nf_queue.c.

Handle the 'netns error unwind' test by checking nfnl_queue_net for a
NULL pointer and add a comment for this.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_queue.h |  4 ++--
 include/net/netns/netfilter.h    |  1 -
 net/netfilter/nf_queue.c         | 19 +++++++++----------
 net/netfilter/nfnetlink_queue.c  | 15 +++++++++++++--
 4 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index e770bba00066..9eed51e920e8 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -33,8 +33,8 @@ struct nf_queue_handler {
 	void		(*nf_hook_drop)(struct net *net);
 };
 
-void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh);
-void nf_unregister_queue_handler(struct net *net);
+void nf_register_queue_handler(const struct nf_queue_handler *qh);
+void nf_unregister_queue_handler(void);
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict);
 
 void nf_queue_entry_get_refs(struct nf_queue_entry *entry);
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 15e2b13fb0c0..986a2a9cfdfa 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -12,7 +12,6 @@ struct netns_nf {
 #if defined CONFIG_PROC_FS
 	struct proc_dir_entry *proc_netfilter;
 #endif
-	const struct nf_queue_handler __rcu *queue_handler;
 	const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO];
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header *nf_log_dir_header;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index bbd1209694b8..4903da82dc04 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -21,6 +21,8 @@
 
 #include "nf_internals.h"
 
+static const struct nf_queue_handler __rcu *nf_queue_handler;
+
 /*
  * Hook for nfnetlink_queue to register its queue handler.
  * We do this so that most of the NFQUEUE code can be modular.
@@ -29,20 +31,18 @@
  * receives, no matter what.
  */
 
-/* return EBUSY when somebody else is registered, return EEXIST if the
- * same handler is registered, return 0 in case of success. */
-void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh)
+void nf_register_queue_handler(const struct nf_queue_handler *qh)
 {
 	/* should never happen, we only have one queueing backend in kernel */
-	WARN_ON(rcu_access_pointer(net->nf.queue_handler));
-	rcu_assign_pointer(net->nf.queue_handler, qh);
+	WARN_ON(rcu_access_pointer(nf_queue_handler));
+	rcu_assign_pointer(nf_queue_handler, qh);
 }
 EXPORT_SYMBOL(nf_register_queue_handler);
 
 /* The caller must flush their queue before this */
-void nf_unregister_queue_handler(struct net *net)
+void nf_unregister_queue_handler(void)
 {
-	RCU_INIT_POINTER(net->nf.queue_handler, NULL);
+	RCU_INIT_POINTER(nf_queue_handler, NULL);
 }
 EXPORT_SYMBOL(nf_unregister_queue_handler);
 
@@ -116,7 +116,7 @@ void nf_queue_nf_hook_drop(struct net *net)
 	const struct nf_queue_handler *qh;
 
 	rcu_read_lock();
-	qh = rcu_dereference(net->nf.queue_handler);
+	qh = rcu_dereference(nf_queue_handler);
 	if (qh)
 		qh->nf_hook_drop(net);
 	rcu_read_unlock();
@@ -157,12 +157,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 {
 	struct nf_queue_entry *entry = NULL;
 	const struct nf_queue_handler *qh;
-	struct net *net = state->net;
 	unsigned int route_key_size;
 	int status;
 
 	/* QUEUE == DROP if no one is waiting, to be safe. */
-	qh = rcu_dereference(net->nf.queue_handler);
+	qh = rcu_dereference(nf_queue_handler);
 	if (!qh)
 		return -ESRCH;
 
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f774de0fc24f..4c3fbaaeb103 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -951,6 +951,16 @@ static void nfqnl_nf_hook_drop(struct net *net)
 	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
 	int i;
 
+	/* This function is also called on net namespace error unwind,
+	 * when pernet_ops->init() failed and ->exit() functions of the
+	 * previous pernet_ops gets called.
+	 *
+	 * This may result in a call to nfqnl_nf_hook_drop() before
+	 * struct nfnl_queue_net was allocated.
+	 */
+	if (!q)
+		return;
+
 	for (i = 0; i < INSTANCE_BUCKETS; i++) {
 		struct nfqnl_instance *inst;
 		struct hlist_head *head = &q->instance_table[i];
@@ -1502,7 +1512,6 @@ static int __net_init nfnl_queue_net_init(struct net *net)
 			&nfqnl_seq_ops, sizeof(struct iter_state)))
 		return -ENOMEM;
 #endif
-	nf_register_queue_handler(net, &nfqh);
 	return 0;
 }
 
@@ -1511,7 +1520,6 @@ static void __net_exit nfnl_queue_net_exit(struct net *net)
 	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
 	unsigned int i;
 
-	nf_unregister_queue_handler(net);
 #ifdef CONFIG_PROC_FS
 	remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
 #endif
@@ -1555,6 +1563,8 @@ static int __init nfnetlink_queue_init(void)
 		goto cleanup_netlink_subsys;
 	}
 
+	nf_register_queue_handler(&nfqh);
+
 	return status;
 
 cleanup_netlink_subsys:
@@ -1568,6 +1578,7 @@ out:
 
 static void __exit nfnetlink_queue_fini(void)
 {
+	nf_unregister_queue_handler();
 	unregister_netdevice_notifier(&nfqnl_dev_notifier);
 	nfnetlink_subsys_unregister(&nfqnl_subsys);
 	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
-- 
cgit v1.2.3


From df271cd641f101decaa4f7c1dd5c62939900bd4c Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:19 +0300
Subject: net: bridge: vlan: add support for mcast igmp/mld version global
 options

Add support to change and retrieve global vlan IGMP/MLD versions.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  2 ++
 net/bridge/br_multicast.c      | 18 ++++++++++--------
 net/bridge/br_netlink.c        |  6 ++++--
 net/bridge/br_private.h        | 26 ++++++++++++++++++++++++--
 net/bridge/br_sysfs_br.c       |  4 ++--
 net/bridge/br_vlan_options.c   | 40 ++++++++++++++++++++++++++++++++++++++--
 6 files changed, 80 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 5aca85874447..5188b9f6da28 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -549,6 +549,8 @@ enum {
 	BRIDGE_VLANDB_GOPTS_ID,
 	BRIDGE_VLANDB_GOPTS_RANGE,
 	BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING,
+	BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION,
+	BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 470f1ec3b579..643b69d767f7 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -4327,7 +4327,8 @@ unlock:
 	return 0;
 }
 
-int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val)
+int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx,
+				  unsigned long val)
 {
 	/* Currently we support only version 2 and 3 */
 	switch (val) {
@@ -4338,15 +4339,16 @@ int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val)
 		return -EINVAL;
 	}
 
-	spin_lock_bh(&br->multicast_lock);
-	br->multicast_ctx.multicast_igmp_version = val;
-	spin_unlock_bh(&br->multicast_lock);
+	spin_lock_bh(&brmctx->br->multicast_lock);
+	brmctx->multicast_igmp_version = val;
+	spin_unlock_bh(&brmctx->br->multicast_lock);
 
 	return 0;
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val)
+int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx,
+				 unsigned long val)
 {
 	/* Currently we support version 1 and 2 */
 	switch (val) {
@@ -4357,9 +4359,9 @@ int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val)
 		return -EINVAL;
 	}
 
-	spin_lock_bh(&br->multicast_lock);
-	br->multicast_ctx.multicast_mld_version = val;
-	spin_unlock_bh(&br->multicast_lock);
+	spin_lock_bh(&brmctx->br->multicast_lock);
+	brmctx->multicast_mld_version = val;
+	spin_unlock_bh(&brmctx->br->multicast_lock);
 
 	return 0;
 }
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 616a1b6dec3c..ded1b244dfcd 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1380,7 +1380,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
 		__u8 igmp_version;
 
 		igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]);
-		err = br_multicast_set_igmp_version(br, igmp_version);
+		err = br_multicast_set_igmp_version(&br->multicast_ctx,
+						    igmp_version);
 		if (err)
 			return err;
 	}
@@ -1390,7 +1391,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
 		__u8 mld_version;
 
 		mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]);
-		err = br_multicast_set_mld_version(br, mld_version);
+		err = br_multicast_set_mld_version(&br->multicast_ctx,
+						   mld_version);
 		if (err)
 			return err;
 	}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 51991f1b3e5a..b0b1e1aa4db4 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -883,9 +883,11 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val,
 			struct netlink_ext_ack *extack);
 int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
 int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
-int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val);
+int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx,
+				  unsigned long val);
 #if IS_ENABLED(CONFIG_IPV6)
-int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val);
+int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx,
+				 unsigned long val);
 #endif
 struct net_bridge_mdb_entry *
 br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst);
@@ -1165,6 +1167,19 @@ br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx)
 	       (br_multicast_port_ctx_is_vlan(pmctx) &&
 		pmctx->vlan->state == BR_STATE_BLOCKING);
 }
+
+static inline bool
+br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
+			       const struct net_bridge_mcast *brmctx2)
+{
+	return brmctx1->multicast_igmp_version ==
+	       brmctx2->multicast_igmp_version &&
+#if IS_ENABLED(CONFIG_IPV6)
+	       brmctx1->multicast_mld_version ==
+	       brmctx2->multicast_mld_version &&
+#endif
+	       true;
+}
 #else
 static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx,
 				   struct net_bridge_mcast_port **pmctx,
@@ -1330,6 +1345,13 @@ static inline int br_mdb_replay(struct net_device *br_dev,
 {
 	return -EOPNOTSUPP;
 }
+
+static inline bool
+br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
+			       const struct net_bridge_mcast *brmctx2)
+{
+	return true;
+}
 #endif
 
 /* br_vlan.c */
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 953d544663d5..08e31debd6f2 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -520,7 +520,7 @@ static ssize_t multicast_igmp_version_show(struct device *d,
 static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val,
 				      struct netlink_ext_ack *extack)
 {
-	return br_multicast_set_igmp_version(br, val);
+	return br_multicast_set_igmp_version(&br->multicast_ctx, val);
 }
 
 static ssize_t multicast_igmp_version_store(struct device *d,
@@ -757,7 +757,7 @@ static ssize_t multicast_mld_version_show(struct device *d,
 static int set_multicast_mld_version(struct net_bridge *br, unsigned long val,
 				     struct netlink_ext_ack *extack)
 {
-	return br_multicast_set_mld_version(br, val);
+	return br_multicast_set_mld_version(&br->multicast_ctx, val);
 }
 
 static ssize_t multicast_mld_version_store(struct device *d,
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index 4ef975b20185..ac32fb40b7ba 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -264,7 +264,9 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
 {
 	return v_curr->vid - r_end->vid == 1 &&
 	       ((v_curr->priv_flags ^ r_end->priv_flags) &
-		BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0;
+		BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0 &&
+		br_multicast_ctx_options_equal(&v_curr->br_mcast_ctx,
+					       &r_end->br_mcast_ctx);
 }
 
 bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
@@ -285,8 +287,16 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING,
-		       !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)))
+		       !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) ||
+	    nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION,
+		       v_opts->br_mcast_ctx.multicast_igmp_version))
+		goto out_err;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
+		       v_opts->br_mcast_ctx.multicast_mld_version))
 		goto out_err;
+#endif
 #endif
 
 	nla_nest_end(skb, nest);
@@ -305,6 +315,8 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_ID */
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */
+		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */
+		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
@@ -359,6 +371,8 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 					   bool *changed,
 					   struct netlink_ext_ack *extack)
 {
+	int err __maybe_unused;
+
 	*changed = false;
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]) {
@@ -368,6 +382,26 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 		if (br_multicast_toggle_global_vlan(v, !!mc_snooping))
 			*changed = true;
 	}
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]) {
+		u8 ver;
+
+		ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]);
+		err = br_multicast_set_igmp_version(&v->br_mcast_ctx, ver);
+		if (err)
+			return err;
+		*changed = true;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
+		u8 ver;
+
+		ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]);
+		err = br_multicast_set_mld_version(&v->br_mcast_ctx, ver);
+		if (err)
+			return err;
+		*changed = true;
+	}
+#endif
 #endif
 
 	return 0;
@@ -377,6 +411,8 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_ID]	= { .type = NLA_U16 },
 	[BRIDGE_VLANDB_GOPTS_RANGE]	= { .type = NLA_U16 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]	= { .type = NLA_U8 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]	= { .type = NLA_U8 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]	= { .type = NLA_U8 },
 };
 
 int br_vlan_rtm_process_global_options(struct net_device *dev,
-- 
cgit v1.2.3


From 931ba87d2017f3869d656f3c705883549bfeb97f Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:20 +0300
Subject: net: bridge: vlan: add support for mcast last member count global
 option

Add support to change and retrieve global vlan multicast last member
count option.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_private.h        |  2 ++
 net/bridge/br_vlan_options.c   | 13 ++++++++++++-
 3 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 5188b9f6da28..d7a150034376 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -551,6 +551,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING,
 	BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION,
 	BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
+	BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b0b1e1aa4db4..53627a841ae1 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1174,6 +1174,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 {
 	return brmctx1->multicast_igmp_version ==
 	       brmctx2->multicast_igmp_version &&
+	       brmctx1->multicast_last_member_count ==
+	       brmctx2->multicast_last_member_count &&
 #if IS_ENABLED(CONFIG_IPV6)
 	       brmctx1->multicast_mld_version ==
 	       brmctx2->multicast_mld_version &&
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index ac32fb40b7ba..75733b5bc6f4 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -289,7 +289,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 	if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING,
 		       !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) ||
 	    nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION,
-		       v_opts->br_mcast_ctx.multicast_igmp_version))
+		       v_opts->br_mcast_ctx.multicast_igmp_version) ||
+	    nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT,
+			v_opts->br_mcast_ctx.multicast_last_member_count))
 		goto out_err;
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -317,6 +319,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */
 		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */
 		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */
+		+ nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
@@ -391,6 +394,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 			return err;
 		*changed = true;
 	}
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]) {
+		u32 cnt;
+
+		cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]);
+		v->br_mcast_ctx.multicast_last_member_count = cnt;
+		*changed = true;
+	}
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
 		u8 ver;
@@ -413,6 +423,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]	= { .type = NLA_U8 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]	= { .type = NLA_U32 },
 };
 
 int br_vlan_rtm_process_global_options(struct net_device *dev,
-- 
cgit v1.2.3


From 50725f6e6b217e7661ca696b7cc1f1b9aa7bda84 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:21 +0300
Subject: net: bridge: vlan: add support for mcast startup query count global
 option

Add support to change and retrieve global vlan multicast startup query
count option.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_private.h        |  2 ++
 net/bridge/br_vlan_options.c   | 13 ++++++++++++-
 3 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index d7a150034376..082b413e1342 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -552,6 +552,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION,
 	BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
 	BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT,
+	BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 53627a841ae1..9d59b60046a4 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1176,6 +1176,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 	       brmctx2->multicast_igmp_version &&
 	       brmctx1->multicast_last_member_count ==
 	       brmctx2->multicast_last_member_count &&
+	       brmctx1->multicast_startup_query_count ==
+	       brmctx2->multicast_startup_query_count &&
 #if IS_ENABLED(CONFIG_IPV6)
 	       brmctx1->multicast_mld_version ==
 	       brmctx2->multicast_mld_version &&
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index 75733b5bc6f4..81a0988b97c1 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -291,7 +291,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 	    nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION,
 		       v_opts->br_mcast_ctx.multicast_igmp_version) ||
 	    nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT,
-			v_opts->br_mcast_ctx.multicast_last_member_count))
+			v_opts->br_mcast_ctx.multicast_last_member_count) ||
+	    nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT,
+			v_opts->br_mcast_ctx.multicast_startup_query_count))
 		goto out_err;
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -320,6 +322,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */
 		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */
 		+ nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */
+		+ nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
@@ -401,6 +404,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 		v->br_mcast_ctx.multicast_last_member_count = cnt;
 		*changed = true;
 	}
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]) {
+		u32 cnt;
+
+		cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]);
+		v->br_mcast_ctx.multicast_startup_query_count = cnt;
+		*changed = true;
+	}
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
 		u8 ver;
@@ -424,6 +434,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]	= { .type = NLA_U32 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]	= { .type = NLA_U32 },
 };
 
 int br_vlan_rtm_process_global_options(struct net_device *dev,
-- 
cgit v1.2.3


From 77f6ababa299112092a264cac96bedf1a87015ef Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:22 +0300
Subject: net: bridge: vlan: add support for mcast last member interval global
 option

Add support to change and retrieve global vlan multicast last member
interval option.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  2 ++
 net/bridge/br_private.h        |  2 ++
 net/bridge/br_vlan_options.c   | 15 +++++++++++++++
 3 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 082b413e1342..950ad175610e 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -553,6 +553,8 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
 	BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT,
 	BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT,
+	BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL,
+	BRIDGE_VLANDB_GOPTS_PAD,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 9d59b60046a4..07b2241859ae 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1178,6 +1178,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 	       brmctx2->multicast_last_member_count &&
 	       brmctx1->multicast_startup_query_count ==
 	       brmctx2->multicast_startup_query_count &&
+	       brmctx1->multicast_last_member_interval ==
+	       brmctx2->multicast_last_member_interval &&
 #if IS_ENABLED(CONFIG_IPV6)
 	       brmctx1->multicast_mld_version ==
 	       brmctx2->multicast_mld_version &&
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index 81a0988b97c1..26f242acef75 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -272,6 +272,7 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
 bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 			      const struct net_bridge_vlan *v_opts)
 {
+	u64 clockval __maybe_unused;
 	struct nlattr *nest;
 
 	nest = nla_nest_start(skb, BRIDGE_VLANDB_GLOBAL_OPTIONS);
@@ -296,6 +297,11 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 			v_opts->br_mcast_ctx.multicast_startup_query_count))
 		goto out_err;
 
+	clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval);
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL,
+			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
+		goto out_err;
+
 #if IS_ENABLED(CONFIG_IPV6)
 	if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
 		       v_opts->br_mcast_ctx.multicast_mld_version))
@@ -323,6 +329,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */
 		+ nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */
 		+ nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */
+		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
@@ -411,6 +418,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 		v->br_mcast_ctx.multicast_startup_query_count = cnt;
 		*changed = true;
 	}
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]) {
+		u64 val;
+
+		val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]);
+		v->br_mcast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val);
+		*changed = true;
+	}
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
 		u8 ver;
@@ -435,6 +449,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]	= { .type = NLA_U32 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]	= { .type = NLA_U32 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]	= { .type = NLA_U64 },
 };
 
 int br_vlan_rtm_process_global_options(struct net_device *dev,
-- 
cgit v1.2.3


From 2da0aea21f1c40d003af6680551eaa5471103164 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:23 +0300
Subject: net: bridge: vlan: add support for mcast membership interval global
 option

Add support to change and retrieve global vlan multicast membership
interval option.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_private.h        |  2 ++
 net/bridge/br_vlan_options.c   | 13 +++++++++++++
 3 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 950ad175610e..93f1f16617c8 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -555,6 +555,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT,
 	BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL,
 	BRIDGE_VLANDB_GOPTS_PAD,
+	BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 07b2241859ae..6508340fd5fc 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1180,6 +1180,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 	       brmctx2->multicast_startup_query_count &&
 	       brmctx1->multicast_last_member_interval ==
 	       brmctx2->multicast_last_member_interval &&
+	       brmctx1->multicast_membership_interval ==
+	       brmctx2->multicast_membership_interval &&
 #if IS_ENABLED(CONFIG_IPV6)
 	       brmctx1->multicast_mld_version ==
 	       brmctx2->multicast_mld_version &&
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index 26f242acef75..9d695a63732c 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -301,6 +301,10 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL,
 			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
 		goto out_err;
+	clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_membership_interval);
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL,
+			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
+		goto out_err;
 
 #if IS_ENABLED(CONFIG_IPV6)
 	if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
@@ -330,6 +334,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */
 		+ nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */
+		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
@@ -425,6 +430,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 		v->br_mcast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val);
 		*changed = true;
 	}
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]) {
+		u64 val;
+
+		val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]);
+		v->br_mcast_ctx.multicast_membership_interval = clock_t_to_jiffies(val);
+		*changed = true;
+	}
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
 		u8 ver;
@@ -450,6 +462,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]	= { .type = NLA_U32 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]	= { .type = NLA_U32 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]	= { .type = NLA_U64 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]	= { .type = NLA_U64 },
 };
 
 int br_vlan_rtm_process_global_options(struct net_device *dev,
-- 
cgit v1.2.3


From cd9269d463107bc4a53a0965d90a57efeee9ae11 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:24 +0300
Subject: net: bridge: vlan: add support for mcast querier interval global
 option

Add support to change and retrieve global vlan multicast querier interval
option.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_private.h        |  2 ++
 net/bridge/br_vlan_options.c   | 13 +++++++++++++
 3 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 93f1f16617c8..fdc264c57009 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -556,6 +556,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL,
 	BRIDGE_VLANDB_GOPTS_PAD,
 	BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL,
+	BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 6508340fd5fc..94487e019e96 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1182,6 +1182,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 	       brmctx2->multicast_last_member_interval &&
 	       brmctx1->multicast_membership_interval ==
 	       brmctx2->multicast_membership_interval &&
+	       brmctx1->multicast_querier_interval ==
+	       brmctx2->multicast_querier_interval &&
 #if IS_ENABLED(CONFIG_IPV6)
 	       brmctx1->multicast_mld_version ==
 	       brmctx2->multicast_mld_version &&
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index 9d695a63732c..58ed4277cd1b 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -305,6 +305,10 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL,
 			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
 		goto out_err;
+	clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_querier_interval);
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL,
+			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
+		goto out_err;
 
 #if IS_ENABLED(CONFIG_IPV6)
 	if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
@@ -335,6 +339,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */
+		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
@@ -437,6 +442,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 		v->br_mcast_ctx.multicast_membership_interval = clock_t_to_jiffies(val);
 		*changed = true;
 	}
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]) {
+		u64 val;
+
+		val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]);
+		v->br_mcast_ctx.multicast_querier_interval = clock_t_to_jiffies(val);
+		*changed = true;
+	}
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
 		u8 ver;
@@ -463,6 +475,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]	= { .type = NLA_U32 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]	= { .type = NLA_U64 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]	= { .type = NLA_U64 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]	= { .type = NLA_U64 },
 };
 
 int br_vlan_rtm_process_global_options(struct net_device *dev,
-- 
cgit v1.2.3


From d6c08aba4f29f606769939eb6156efceb7dbb790 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:25 +0300
Subject: net: bridge: vlan: add support for mcast query interval global option

Add support to change and retrieve global vlan multicast query interval
option.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_private.h        |  2 ++
 net/bridge/br_vlan_options.c   | 13 +++++++++++++
 3 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index fdc264c57009..1517aea738f4 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -557,6 +557,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_PAD,
 	BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL,
+	BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 94487e019e96..1f353e2ddaec 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1184,6 +1184,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 	       brmctx2->multicast_membership_interval &&
 	       brmctx1->multicast_querier_interval ==
 	       brmctx2->multicast_querier_interval &&
+	       brmctx1->multicast_query_interval ==
+	       brmctx2->multicast_query_interval &&
 #if IS_ENABLED(CONFIG_IPV6)
 	       brmctx1->multicast_mld_version ==
 	       brmctx2->multicast_mld_version &&
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index 58ed4277cd1b..10ed84336fd7 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -309,6 +309,10 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL,
 			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
 		goto out_err;
+	clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_interval);
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL,
+			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
+		goto out_err;
 
 #if IS_ENABLED(CONFIG_IPV6)
 	if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
@@ -340,6 +344,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */
+		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
@@ -449,6 +454,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 		v->br_mcast_ctx.multicast_querier_interval = clock_t_to_jiffies(val);
 		*changed = true;
 	}
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]) {
+		u64 val;
+
+		val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]);
+		v->br_mcast_ctx.multicast_query_interval = clock_t_to_jiffies(val);
+		*changed = true;
+	}
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
 		u8 ver;
@@ -470,6 +482,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_RANGE]	= { .type = NLA_U16 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]	= { .type = NLA_U8 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]	= { .type = NLA_U64 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]	= { .type = NLA_U32 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]	= { .type = NLA_U32 },
-- 
cgit v1.2.3


From 425214508b1bd3596edb31da8d9aedee30f2b4f5 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:26 +0300
Subject: net: bridge: vlan: add support for mcast query response interval
 global option

Add support to change and retrieve global vlan multicast query response
interval option.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_private.h        |  2 ++
 net/bridge/br_vlan_options.c   | 13 +++++++++++++
 3 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 1517aea738f4..2627a657c3b3 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -558,6 +558,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL,
+	BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1f353e2ddaec..10b4eaf78f54 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1186,6 +1186,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 	       brmctx2->multicast_querier_interval &&
 	       brmctx1->multicast_query_interval ==
 	       brmctx2->multicast_query_interval &&
+	       brmctx1->multicast_query_response_interval ==
+	       brmctx2->multicast_query_response_interval &&
 #if IS_ENABLED(CONFIG_IPV6)
 	       brmctx1->multicast_mld_version ==
 	       brmctx2->multicast_mld_version &&
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index 10ed84336fd7..cd8320b22a89 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -313,6 +313,10 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL,
 			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
 		goto out_err;
+	clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_response_interval);
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL,
+			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
+		goto out_err;
 
 #if IS_ENABLED(CONFIG_IPV6)
 	if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
@@ -345,6 +349,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */
+		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
@@ -461,6 +466,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 		v->br_mcast_ctx.multicast_query_interval = clock_t_to_jiffies(val);
 		*changed = true;
 	}
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]) {
+		u64 val;
+
+		val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]);
+		v->br_mcast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val);
+		*changed = true;
+	}
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
 		u8 ver;
@@ -489,6 +501,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]	= { .type = NLA_U64 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]	= { .type = NLA_U64 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]	= { .type = NLA_U64 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 },
 };
 
 int br_vlan_rtm_process_global_options(struct net_device *dev,
-- 
cgit v1.2.3


From 941121ee22a69935252473f03976f1f1200b9ae9 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:27 +0300
Subject: net: bridge: vlan: add support for mcast startup query interval
 global option

Add support to change and retrieve global vlan multicast startup query
interval option.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_private.h        |  2 ++
 net/bridge/br_vlan_options.c   | 13 +++++++++++++
 3 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 2627a657c3b3..b5d01538acd4 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -559,6 +559,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL,
+	BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 10b4eaf78f54..8dde124d0464 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1188,6 +1188,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 	       brmctx2->multicast_query_interval &&
 	       brmctx1->multicast_query_response_interval ==
 	       brmctx2->multicast_query_response_interval &&
+	       brmctx1->multicast_startup_query_interval ==
+	       brmctx2->multicast_startup_query_interval &&
 #if IS_ENABLED(CONFIG_IPV6)
 	       brmctx1->multicast_mld_version ==
 	       brmctx2->multicast_mld_version &&
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index cd8320b22a89..7b8dfd138045 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -317,6 +317,10 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL,
 			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
 		goto out_err;
+	clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_startup_query_interval);
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL,
+			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
+		goto out_err;
 
 #if IS_ENABLED(CONFIG_IPV6)
 	if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
@@ -350,6 +354,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */
+		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
@@ -473,6 +478,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 		v->br_mcast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val);
 		*changed = true;
 	}
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]) {
+		u64 val;
+
+		val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]);
+		v->br_mcast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val);
+		*changed = true;
+	}
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
 		u8 ver;
@@ -501,6 +513,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]	= { .type = NLA_U64 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]	= { .type = NLA_U64 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]	= { .type = NLA_U64 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]	= { .type = NLA_U64 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 },
 };
 
-- 
cgit v1.2.3


From 62938182c35906c0ed4beb7845b93b8ffb937597 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:30 +0300
Subject: net: bridge: vlan: add support for mcast querier global option

Add support to change and retrieve global vlan multicast querier state.
We just need to pass multicast context to br_multicast_set_querier
instead of bridge device and the rest of the logic remains the same.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_multicast.c      |  7 +++----
 net/bridge/br_netlink.c        |  5 +++--
 net/bridge/br_private.h        |  5 +++--
 net/bridge/br_sysfs_br.c       |  4 ++--
 net/bridge/br_vlan_options.c   | 15 ++++++++++++++-
 6 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index b5d01538acd4..03fd14a4e377 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -560,6 +560,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL,
+	BRIDGE_VLANDB_GOPTS_MCAST_QUERIER,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f30c2e5d3142..a780ad8aca37 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -4297,14 +4297,13 @@ bool br_multicast_router(const struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(br_multicast_router);
 
-int br_multicast_set_querier(struct net_bridge *br, unsigned long val)
+int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val)
 {
-	struct net_bridge_mcast *brmctx = &br->multicast_ctx;
 	unsigned long max_delay;
 
 	val = !!val;
 
-	spin_lock_bh(&br->multicast_lock);
+	spin_lock_bh(&brmctx->br->multicast_lock);
 	if (brmctx->multicast_querier == val)
 		goto unlock;
 
@@ -4327,7 +4326,7 @@ int br_multicast_set_querier(struct net_bridge *br, unsigned long val)
 #endif
 
 unlock:
-	spin_unlock_bh(&br->multicast_lock);
+	spin_unlock_bh(&brmctx->br->multicast_lock);
 
 	return 0;
 }
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 12616e9bdd11..60616a4380b3 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1309,7 +1309,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
 	if (data[IFLA_BR_MCAST_QUERIER]) {
 		u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]);
 
-		err = br_multicast_set_querier(br, mcast_querier);
+		err = br_multicast_set_querier(&br->multicast_ctx,
+					       mcast_querier);
 		if (err)
 			return err;
 	}
@@ -1575,7 +1576,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	    nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR,
 		       br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) ||
 	    nla_put_u8(skb, IFLA_BR_MCAST_QUERIER,
-		       READ_ONCE(br->multicast_ctx.multicast_querier)) ||
+		       br->multicast_ctx.multicast_querier) ||
 	    nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
 		       br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) ||
 	    nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) ||
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a1529f0bafcc..40477885fb2a 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -881,7 +881,7 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val);
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val);
 int br_multicast_toggle(struct net_bridge *br, unsigned long val,
 			struct netlink_ext_ack *extack);
-int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
+int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val);
 int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
 int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx,
 				  unsigned long val);
@@ -1028,7 +1028,7 @@ __br_multicast_querier_exists(struct net_bridge_mcast *brmctx,
 {
 	bool own_querier_enabled;
 
-	if (READ_ONCE(brmctx->multicast_querier)) {
+	if (brmctx->multicast_querier) {
 		if (is_ipv6 && !br_opt_get(brmctx->br, BROPT_HAS_IPV6_ADDR))
 			own_querier_enabled = false;
 		else
@@ -1190,6 +1190,7 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 	       brmctx2->multicast_query_response_interval &&
 	       brmctx1->multicast_startup_query_interval ==
 	       brmctx2->multicast_startup_query_interval &&
+	       brmctx1->multicast_querier == brmctx2->multicast_querier &&
 #if IS_ENABLED(CONFIG_IPV6)
 	       brmctx1->multicast_mld_version ==
 	       brmctx2->multicast_mld_version &&
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 570edfd391f4..e1234bd8d5a0 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -447,13 +447,13 @@ static ssize_t multicast_querier_show(struct device *d,
 				      char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%d\n", READ_ONCE(br->multicast_ctx.multicast_querier));
+	return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier);
 }
 
 static int set_multicast_querier(struct net_bridge *br, unsigned long val,
 				 struct netlink_ext_ack *extack)
 {
-	return br_multicast_set_querier(br, val);
+	return br_multicast_set_querier(&br->multicast_ctx, val);
 }
 
 static ssize_t multicast_querier_store(struct device *d,
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index 7b8dfd138045..0d0db8ddae45 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -294,7 +294,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 	    nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT,
 			v_opts->br_mcast_ctx.multicast_last_member_count) ||
 	    nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT,
-			v_opts->br_mcast_ctx.multicast_startup_query_count))
+			v_opts->br_mcast_ctx.multicast_startup_query_count) ||
+	    nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER,
+		       v_opts->br_mcast_ctx.multicast_querier))
 		goto out_err;
 
 	clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval);
@@ -355,6 +357,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */
+		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
@@ -485,6 +488,15 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 		v->br_mcast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val);
 		*changed = true;
 	}
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]) {
+		u8 val;
+
+		val = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]);
+		err = br_multicast_set_querier(&v->br_mcast_ctx, val);
+		if (err)
+			return err;
+		*changed = true;
+	}
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
 		u8 ver;
@@ -507,6 +519,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]	= { .type = NLA_U64 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]	= { .type = NLA_U32 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]	= { .type = NLA_U32 },
-- 
cgit v1.2.3


From a97df080b6a86c105f98052ca3a9d66149b461b3 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:31 +0300
Subject: net: bridge: vlan: add support for mcast router global option

Add support to change and retrieve global vlan multicast router state
which is used for the bridge itself. We just need to pass multicast context
to br_multicast_set_router instead of bridge device and the rest of the
logic remains the same.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_multicast.c      | 11 +++++------
 net/bridge/br_netlink.c        |  3 ++-
 net/bridge/br_private.h        |  3 ++-
 net/bridge/br_sysfs_br.c       |  2 +-
 net/bridge/br_vlan_options.c   | 15 ++++++++++++++-
 6 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 03fd14a4e377..2104dd3557b4 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -561,6 +561,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERIER,
+	BRIDGE_VLANDB_GOPTS_MCAST_ROUTER,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index a780ad8aca37..df6bf6a237aa 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -4053,17 +4053,16 @@ void br_multicast_dev_del(struct net_bridge *br)
 	rcu_barrier();
 }
 
-int br_multicast_set_router(struct net_bridge *br, unsigned long val)
+int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val)
 {
-	struct net_bridge_mcast *brmctx = &br->multicast_ctx;
 	int err = -EINVAL;
 
-	spin_lock_bh(&br->multicast_lock);
+	spin_lock_bh(&brmctx->br->multicast_lock);
 
 	switch (val) {
 	case MDB_RTR_TYPE_DISABLED:
 	case MDB_RTR_TYPE_PERM:
-		br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM);
+		br_mc_router_state_change(brmctx->br, val == MDB_RTR_TYPE_PERM);
 		del_timer(&brmctx->ip4_mc_router_timer);
 #if IS_ENABLED(CONFIG_IPV6)
 		del_timer(&brmctx->ip6_mc_router_timer);
@@ -4073,13 +4072,13 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
 		break;
 	case MDB_RTR_TYPE_TEMP_QUERY:
 		if (brmctx->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
-			br_mc_router_state_change(br, false);
+			br_mc_router_state_change(brmctx->br, false);
 		brmctx->multicast_router = val;
 		err = 0;
 		break;
 	}
 
-	spin_unlock_bh(&br->multicast_lock);
+	spin_unlock_bh(&brmctx->br->multicast_lock);
 
 	return err;
 }
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 60616a4380b3..8ae026fa2ad7 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1286,7 +1286,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
 	if (data[IFLA_BR_MCAST_ROUTER]) {
 		u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]);
 
-		err = br_multicast_set_router(br, multicast_router);
+		err = br_multicast_set_router(&br->multicast_ctx,
+					      multicast_router);
 		if (err)
 			return err;
 	}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 40477885fb2a..6d5681ca8d2f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -877,7 +877,7 @@ void br_multicast_dev_del(struct net_bridge *br);
 void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb,
 			struct net_bridge_mcast *brmctx,
 			bool local_rcv, bool local_orig);
-int br_multicast_set_router(struct net_bridge *br, unsigned long val);
+int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val);
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val);
 int br_multicast_toggle(struct net_bridge *br, unsigned long val,
 			struct netlink_ext_ack *extack);
@@ -1191,6 +1191,7 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 	       brmctx1->multicast_startup_query_interval ==
 	       brmctx2->multicast_startup_query_interval &&
 	       brmctx1->multicast_querier == brmctx2->multicast_querier &&
+	       brmctx1->multicast_router == brmctx2->multicast_router &&
 #if IS_ENABLED(CONFIG_IPV6)
 	       brmctx1->multicast_mld_version ==
 	       brmctx2->multicast_mld_version &&
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index e1234bd8d5a0..d9a89ddd0331 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -390,7 +390,7 @@ static ssize_t multicast_router_show(struct device *d,
 static int set_multicast_router(struct net_bridge *br, unsigned long val,
 				struct netlink_ext_ack *extack)
 {
-	return br_multicast_set_router(br, val);
+	return br_multicast_set_router(&br->multicast_ctx, val);
 }
 
 static ssize_t multicast_router_store(struct device *d,
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index 0d0db8ddae45..6ba45b73931f 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -296,7 +296,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 	    nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT,
 			v_opts->br_mcast_ctx.multicast_startup_query_count) ||
 	    nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER,
-		       v_opts->br_mcast_ctx.multicast_querier))
+		       v_opts->br_mcast_ctx.multicast_querier) ||
+	    nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER,
+		       v_opts->br_mcast_ctx.multicast_router))
 		goto out_err;
 
 	clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval);
@@ -358,6 +360,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */
 		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */
+		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
@@ -497,6 +500,15 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 			return err;
 		*changed = true;
 	}
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_ROUTER]) {
+		u8 val;
+
+		val = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_ROUTER]);
+		err = br_multicast_set_router(&v->br_mcast_ctx, val);
+		if (err)
+			return err;
+		*changed = true;
+	}
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
 		u8 ver;
@@ -520,6 +532,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]	= { .type = NLA_U64 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]	= { .type = NLA_U8 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_ROUTER]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]	= { .type = NLA_U32 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]	= { .type = NLA_U32 },
-- 
cgit v1.2.3


From dc002875c22b56c795ec24dc987ac2dd2081588e Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 18:29:33 +0300
Subject: net: bridge: vlan: use br_rports_fill_info() to export mcast router
 ports

Embed the standard multicast router port export by br_rports_fill_info()
into a new global vlan attribute BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS.
In order to have the same format for the global bridge mcast context and
the per-vlan mcast context we need a double-nesting:
 - BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS
   - MDBA_ROUTER

Currently we don't compare router lists, if any router port exists in
the bridge mcast contexts we consider their option sets as different and
export them separately.

In addition we export the router port vlan id when dumping similar to
the router port notification format.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_mdb.c            | 17 ++++-------------
 net/bridge/br_private.h        | 15 +++++++++++++++
 net/bridge/br_vlan_options.c   | 18 ++++++++++++++++++
 4 files changed, 38 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 2104dd3557b4..620d86e825b8 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -562,6 +562,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERIER,
 	BRIDGE_VLANDB_GOPTS_MCAST_ROUTER,
+	BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 7c16e2c76220..389ff3c1e9d9 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -16,16 +16,6 @@
 
 #include "br_private.h"
 
-static bool br_rports_have_mc_router(const struct net_bridge_mcast *brmctx)
-{
-#if IS_ENABLED(CONFIG_IPV6)
-	return !hlist_empty(&brmctx->ip4_mc_router_list) ||
-	       !hlist_empty(&brmctx->ip6_mc_router_list);
-#else
-	return !hlist_empty(&brmctx->ip4_mc_router_list);
-#endif
-}
-
 static bool
 br_ip4_rports_get_timer(struct net_bridge_mcast_port *pmctx,
 			unsigned long *timer)
@@ -47,8 +37,8 @@ br_ip6_rports_get_timer(struct net_bridge_mcast_port *pmctx,
 #endif
 }
 
-static int br_rports_fill_info(struct sk_buff *skb,
-			       const struct net_bridge_mcast *brmctx)
+int br_rports_fill_info(struct sk_buff *skb,
+			const struct net_bridge_mcast *brmctx)
 {
 	u16 vid = brmctx->vlan ? brmctx->vlan->vid : 0;
 	bool have_ip4_mc_rtr, have_ip6_mc_rtr;
@@ -97,7 +87,8 @@ static int br_rports_fill_info(struct sk_buff *skb,
 				 ip4_timer)) ||
 		    (have_ip6_mc_rtr &&
 		     nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER,
-				 ip6_timer))) {
+				 ip6_timer)) ||
+		    (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid))) {
 			nla_nest_cancel(skb, port_nest);
 			goto fail;
 		}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 6d5681ca8d2f..32c218aa3f36 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -945,6 +945,8 @@ bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on);
 int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
 		  const void *ctx, bool adding, struct notifier_block *nb,
 		  struct netlink_ext_ack *extack);
+int br_rports_fill_info(struct sk_buff *skb,
+			const struct net_bridge_mcast *brmctx);
 
 static inline bool br_group_is_l2(const struct br_ip *group)
 {
@@ -1168,6 +1170,17 @@ br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx)
 		pmctx->vlan->state == BR_STATE_BLOCKING);
 }
 
+static inline bool
+br_rports_have_mc_router(const struct net_bridge_mcast *brmctx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	return !hlist_empty(&brmctx->ip4_mc_router_list) ||
+	       !hlist_empty(&brmctx->ip6_mc_router_list);
+#else
+	return !hlist_empty(&brmctx->ip4_mc_router_list);
+#endif
+}
+
 static inline bool
 br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 			       const struct net_bridge_mcast *brmctx2)
@@ -1192,6 +1205,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
 	       brmctx2->multicast_startup_query_interval &&
 	       brmctx1->multicast_querier == brmctx2->multicast_querier &&
 	       brmctx1->multicast_router == brmctx2->multicast_router &&
+	       !br_rports_have_mc_router(brmctx1) &&
+	       !br_rports_have_mc_router(brmctx2) &&
 #if IS_ENABLED(CONFIG_IPV6)
 	       brmctx1->multicast_mld_version ==
 	       brmctx2->multicast_mld_version &&
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index 6ba45b73931f..b4fd5fa441b7 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -272,6 +272,7 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
 bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 			      const struct net_bridge_vlan *v_opts)
 {
+	struct nlattr *nest2 __maybe_unused;
 	u64 clockval __maybe_unused;
 	struct nlattr *nest;
 
@@ -326,6 +327,23 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 			      clockval, BRIDGE_VLANDB_GOPTS_PAD))
 		goto out_err;
 
+	if (br_rports_have_mc_router(&v_opts->br_mcast_ctx)) {
+		nest2 = nla_nest_start(skb,
+				       BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS);
+		if (!nest2)
+			goto out_err;
+
+		rcu_read_lock();
+		if (br_rports_fill_info(skb, &v_opts->br_mcast_ctx)) {
+			rcu_read_unlock();
+			nla_nest_cancel(skb, nest2);
+			goto out_err;
+		}
+		rcu_read_unlock();
+
+		nla_nest_end(skb, nest2);
+	}
+
 #if IS_ENABLED(CONFIG_IPV6)
 	if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
 		       v_opts->br_mcast_ctx.multicast_mld_version))
-- 
cgit v1.2.3


From f13a5ad88186c142b4f6060fb06f0f8fb1674915 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 10 Aug 2021 16:24:15 +0300
Subject: devlink: Add new "enable_eth" generic device param

Add new device generic parameter to enable/disable creation of
Ethernet auxiliary device and associated device functionality
in the devlink instance.

User who prefers to disable such functionality can disable it using below
example.

$ devlink dev param set pci/0000:06:00.0 \
              name enable_eth value false cmode driverinit
$ devlink dev reload pci/0000:06:00.0

At this point devlink instance do not create auxiliary device for the
Ethernet functionality.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/devlink/devlink-params.rst | 4 ++++
 include/net/devlink.h                               | 4 ++++
 net/core/devlink.c                                  | 5 +++++
 3 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst
index 54c9f107c4b0..219c1272f2d6 100644
--- a/Documentation/networking/devlink/devlink-params.rst
+++ b/Documentation/networking/devlink/devlink-params.rst
@@ -97,6 +97,10 @@ own name.
    * - ``enable_roce``
      - Boolean
      - Enable handling of RoCE traffic in the device.
+   * - ``enable_eth``
+     - Boolean
+     - When enabled, the device driver will instantiate Ethernet specific
+       auxiliary device of the devlink device.
    * - ``internal_err_reset``
      - Boolean
      - When enabled, the device driver will reset the device on internal
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 0236c77f2fd0..1e3e183bb2c2 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -519,6 +519,7 @@ enum devlink_param_generic_id {
 	DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE,
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET,
+	DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
 
 	/* add new param generic ids above here*/
 	__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -559,6 +560,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME "enable_remote_dev_reset"
 #define DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE DEVLINK_PARAM_TYPE_BOOL
 
+#define DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME "enable_eth"
+#define DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE DEVLINK_PARAM_TYPE_BOOL
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate)	\
 {									\
 	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				\
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b02d54ab59ac..9a59f45c8bf9 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4277,6 +4277,11 @@ static const struct devlink_param devlink_param_generic[] = {
 		.name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME,
 		.type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE,
 	},
+	{
+		.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
+		.name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME,
+		.type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE,
+	},
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
cgit v1.2.3


From 8ddaabee3c7994854841a9b097fd94538126c12c Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 10 Aug 2021 16:24:16 +0300
Subject: devlink: Add new "enable_rdma" generic device param

Add new device generic parameter to enable/disable creation of
RDMA auxiliary device and associated device functionality
in the devlink instance.

User who prefers to disable such functionality can disable it using below
example.

$ devlink dev param set pci/0000:06:00.0 \
              name enable_rdma value false cmode driverinit
$ devlink dev reload pci/0000:06:00.0

At this point devlink instance do not create auxiliary device for the
RDMA functionality.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/devlink/devlink-params.rst | 4 ++++
 include/net/devlink.h                               | 4 ++++
 net/core/devlink.c                                  | 5 +++++
 3 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst
index 219c1272f2d6..a49da0b049b6 100644
--- a/Documentation/networking/devlink/devlink-params.rst
+++ b/Documentation/networking/devlink/devlink-params.rst
@@ -101,6 +101,10 @@ own name.
      - Boolean
      - When enabled, the device driver will instantiate Ethernet specific
        auxiliary device of the devlink device.
+   * - ``enable_rdma``
+     - Boolean
+     - When enabled, the device driver will instantiate RDMA specific
+       auxiliary device of the devlink device.
    * - ``internal_err_reset``
      - Boolean
      - When enabled, the device driver will reset the device on internal
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 1e3e183bb2c2..6f4f0416e598 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -520,6 +520,7 @@ enum devlink_param_generic_id {
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET,
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
+	DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA,
 
 	/* add new param generic ids above here*/
 	__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -563,6 +564,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME "enable_eth"
 #define DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE DEVLINK_PARAM_TYPE_BOOL
 
+#define DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME "enable_rdma"
+#define DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE DEVLINK_PARAM_TYPE_BOOL
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate)	\
 {									\
 	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				\
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 9a59f45c8bf9..b68d6921d34f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4282,6 +4282,11 @@ static const struct devlink_param devlink_param_generic[] = {
 		.name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME,
 		.type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE,
 	},
+	{
+		.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA,
+		.name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME,
+		.type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE,
+	},
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
cgit v1.2.3


From 076b2a9dbb28e8b3d9a264a8bca664794255d448 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 10 Aug 2021 16:24:17 +0300
Subject: devlink: Add new "enable_vnet" generic device param

Add new device generic parameter to enable/disable creation of
VDPA net auxiliary device and associated device functionality
in the devlink instance.

User who prefers to disable such functionality can disable it using below
example.

$ devlink dev param set pci/0000:06:00.0 \
              name enable_vnet value false cmode driverinit
$ devlink dev reload pci/0000:06:00.0

At this point devlink instance do not create auxiliary device for the
VDPA net functionality.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/devlink/devlink-params.rst | 4 ++++
 include/net/devlink.h                               | 4 ++++
 net/core/devlink.c                                  | 5 +++++
 3 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst
index a49da0b049b6..4878907e9232 100644
--- a/Documentation/networking/devlink/devlink-params.rst
+++ b/Documentation/networking/devlink/devlink-params.rst
@@ -105,6 +105,10 @@ own name.
      - Boolean
      - When enabled, the device driver will instantiate RDMA specific
        auxiliary device of the devlink device.
+   * - ``enable_vnet``
+     - Boolean
+     - When enabled, the device driver will instantiate VDPA networking
+       specific auxiliary device of the devlink device.
    * - ``internal_err_reset``
      - Boolean
      - When enabled, the device driver will reset the device on internal
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 6f4f0416e598..0a0becbcdc49 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -521,6 +521,7 @@ enum devlink_param_generic_id {
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET,
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA,
+	DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET,
 
 	/* add new param generic ids above here*/
 	__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -567,6 +568,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME "enable_rdma"
 #define DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE DEVLINK_PARAM_TYPE_BOOL
 
+#define DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME "enable_vnet"
+#define DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE DEVLINK_PARAM_TYPE_BOOL
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate)	\
 {									\
 	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				\
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b68d6921d34f..867ae7e39788 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4287,6 +4287,11 @@ static const struct devlink_param devlink_param_generic[] = {
 		.name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME,
 		.type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE,
 	},
+	{
+		.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET,
+		.name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME,
+		.type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE,
+	},
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
cgit v1.2.3


From b40c51efefbc4a3ddec682f118adefea1ccf70dc Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 10 Aug 2021 16:24:19 +0300
Subject: devlink: Add API to register and unregister single parameter

Currently device configuration parameters can be registered as an array.
Due to this a constant array must be registered. A single driver
supporting multiple devices each with different device capabilities end
up registering all parameters even if it doesn't support it.

One possible workaround a driver can do is, it registers multiple single
entry arrays to overcome such limitation.

Better is to provide a API that enables driver to register/unregister a
single parameter. This also further helps in two ways.
(1) to reduce the memory of devlink_param_entry by avoiding in registering
parameters which are not supported by the device.
(2) avoid generating multiple parameter add, delete, publish, unpublish,
init value notifications for such unsupported parameters

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |  4 ++++
 net/core/devlink.c    | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 0a0becbcdc49..f6459ee77114 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1645,6 +1645,10 @@ int devlink_params_register(struct devlink *devlink,
 void devlink_params_unregister(struct devlink *devlink,
 			       const struct devlink_param *params,
 			       size_t params_count);
+int devlink_param_register(struct devlink *devlink,
+			   const struct devlink_param *param);
+void devlink_param_unregister(struct devlink *devlink,
+			      const struct devlink_param *param);
 void devlink_params_publish(struct devlink *devlink);
 void devlink_params_unpublish(struct devlink *devlink);
 int devlink_port_params_register(struct devlink_port *devlink_port,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 050dd7271a45..629291175af3 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -9903,6 +9903,43 @@ void devlink_params_unregister(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devlink_params_unregister);
 
+/**
+ * devlink_param_register - register one configuration parameter
+ *
+ * @devlink: devlink
+ * @param: one configuration parameter
+ *
+ * Register the configuration parameter supported by the driver.
+ * Return: returns 0 on successful registration or error code otherwise.
+ */
+int devlink_param_register(struct devlink *devlink,
+			   const struct devlink_param *param)
+{
+	int err;
+
+	mutex_lock(&devlink->lock);
+	err = __devlink_param_register_one(devlink, 0, &devlink->param_list,
+					   param, DEVLINK_CMD_PARAM_NEW);
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_param_register);
+
+/**
+ * devlink_param_unregister - unregister one configuration parameter
+ * @devlink: devlink
+ * @param: configuration parameter to unregister
+ */
+void devlink_param_unregister(struct devlink *devlink,
+			      const struct devlink_param *param)
+{
+	mutex_lock(&devlink->lock);
+	devlink_param_unregister_one(devlink, 0, &devlink->param_list, param,
+				     DEVLINK_CMD_PARAM_DEL);
+	mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_param_unregister);
+
 /**
  *	devlink_params_publish - publish configuration parameters
  *
-- 
cgit v1.2.3


From 9c4a7665b4237621879caf115a78f69bad67b9c7 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 10 Aug 2021 16:24:20 +0300
Subject: devlink: Add APIs to publish, unpublish individual parameter

Enable drivers to publish/unpublish individual parameter.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |  4 ++++
 net/core/devlink.c    | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index f6459ee77114..1151497c0ec5 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1651,6 +1651,10 @@ void devlink_param_unregister(struct devlink *devlink,
 			      const struct devlink_param *param);
 void devlink_params_publish(struct devlink *devlink);
 void devlink_params_unpublish(struct devlink *devlink);
+void devlink_param_publish(struct devlink *devlink,
+			   const struct devlink_param *param);
+void devlink_param_unpublish(struct devlink *devlink,
+			     const struct devlink_param *param);
 int devlink_port_params_register(struct devlink_port *devlink_port,
 				 const struct devlink_param *params,
 				 size_t params_count);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 629291175af3..ee9787314cff 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -9982,6 +9982,54 @@ void devlink_params_unpublish(struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devlink_params_unpublish);
 
+/**
+ * devlink_param_publish - publish one configuration parameter
+ *
+ * @devlink: devlink
+ * @param: one configuration parameter
+ *
+ * Publish previously registered configuration parameter.
+ */
+void devlink_param_publish(struct devlink *devlink,
+			   const struct devlink_param *param)
+{
+	struct devlink_param_item *param_item;
+
+	list_for_each_entry(param_item, &devlink->param_list, list) {
+		if (param_item->param != param || param_item->published)
+			continue;
+		param_item->published = true;
+		devlink_param_notify(devlink, 0, param_item,
+				     DEVLINK_CMD_PARAM_NEW);
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(devlink_param_publish);
+
+/**
+ * devlink_param_unpublish - unpublish one configuration parameter
+ *
+ * @devlink: devlink
+ * @param: one configuration parameter
+ *
+ * Unpublish previously registered configuration parameter.
+ */
+void devlink_param_unpublish(struct devlink *devlink,
+			     const struct devlink_param *param)
+{
+	struct devlink_param_item *param_item;
+
+	list_for_each_entry(param_item, &devlink->param_list, list) {
+		if (param_item->param != param || !param_item->published)
+			continue;
+		param_item->published = false;
+		devlink_param_notify(devlink, 0, param_item,
+				     DEVLINK_CMD_PARAM_DEL);
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(devlink_param_unpublish);
+
 /**
  *	devlink_port_params_register - register port configuration parameters
  *
-- 
cgit v1.2.3


From 39c538d64479c949aaeca4fe73d2226f715adfb7 Mon Sep 17 00:00:00 2001
From: Cai Huoqing <caihuoqing@baidu.com>
Date: Fri, 30 Jul 2021 11:03:00 +0800
Subject: net/mlx5: Fix typo in comments

Fix typo:
*vectores  ==> vectors
*realeased  ==> released
*erros  ==> errors
*namepsace  ==> namespace
*trafic  ==> traffic
*proccessed  ==> processed
*retore  ==> restore
*Currenlty  ==> Currently
*crated  ==> created
*chane  ==> change
*cannnot  ==> cannot
*usuallly  ==> usually
*failes  ==> fails
*importent  ==> important
*reenabled  ==> re-enabled
*alocation  ==> allocation
*recived  ==> received
*tanslation  ==> translation

Signed-off-by: Cai Huoqing <caihuoqing@baidu.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c    | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_common.c    | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h       | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c        | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c      | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/events.c       | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c   | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c      | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/health.c       | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c    | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c    | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c      | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c   | 2 +-
 include/linux/mlx5/device.h                            | 2 +-
 include/linux/mlx5/driver.h                            | 4 ++--
 16 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
index 8f79f04eccd6..a61731cb6045 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
@@ -520,7 +520,7 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv,
 	e->out_dev = attr.out_dev;
 	e->route_dev_ifindex = attr.route_dev->ifindex;
 
-	/* It's importent to add the neigh to the hash table before checking
+	/* It's important to add the neigh to the hash table before checking
 	 * the neigh validity state. So if we'll get a notification, in case the
 	 * neigh changes it's validity state, we would find the relevant neigh
 	 * in the hash.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
index c06267477b27..538bc2419bd8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
@@ -126,7 +126,7 @@ int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params,
 	/* Create a separate SQ, so that when the buff pool is disabled, we could
 	 * close this SQ safely and stop receiving CQEs. In other case, e.g., if
 	 * the XDPSQ was used instead, we might run into trouble when the buff pool
-	 * is disabled and then reenabled, but the SQ continues receiving CQEs
+	 * is disabled and then re-enabled, but the SQ continues receiving CQEs
 	 * from the old buff pool.
 	 */
 	err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, pool, &c->xsksq, true);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
index c4db367d4baf..84eb7201c142 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -33,7 +33,7 @@
 #include "en.h"
 
 /* mlx5e global resources should be placed in this file.
- * Global resources are common to all the netdevices crated on the same nic.
+ * Global resources are common to all the netdevices created on the same nic.
  */
 
 void mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index 8f0c82448eec..756f806401d7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -146,7 +146,7 @@ struct mlx5e_neigh_hash_entry {
 	 */
 	refcount_t refcnt;
 
-	/* Save the last reported time offloaded trafic pass over one of the
+	/* Save the last reported time offloaded traffic pass over one of the
 	 * neigh hash entry flows. Use it to periodically update the neigh
 	 * 'used' value and avoid neigh deleting by the kernel.
 	 */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index e5c4344a114e..d6ad7328f298 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -97,7 +97,7 @@ struct mlx5e_tc_attr_to_reg_mapping mlx5e_tc_attr_to_reg_mappings[] = {
 	[MARK_TO_REG] = mark_to_reg_ct,
 	[LABELS_TO_REG] = labels_to_reg_ct,
 	[FTEID_TO_REG] = fteid_to_reg_ct,
-	/* For NIC rules we store the retore metadata directly
+	/* For NIC rules we store the restore metadata directly
 	 * into reg_b that is passed to SW since we don't
 	 * jump between steering domains.
 	 */
@@ -2448,7 +2448,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 			spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_3;
 		}
 	}
-	/* Currenlty supported only for MPLS over UDP */
+	/* Currently supported only for MPLS over UDP */
 	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS) &&
 	    !netif_is_bareudp(filter_dev)) {
 		NL_SET_ERR_MSG_MOD(extack,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 7ffea2350f44..2fde9f59e8b4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1492,7 +1492,7 @@ abort:
 /**
  * mlx5_eswitch_enable - Enable eswitch
  * @esw:	Pointer to eswitch
- * @num_vfs:	Enable eswitch swich for given number of VFs.
+ * @num_vfs:	Enable eswitch switch for given number of VFs.
  *		Caller must pass num_vfs > 0 when enabling eswitch for
  *		vf vports.
  * mlx5_eswitch_enable() returns 0 on success or error code on failure.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index d713ae24d6b6..a1ac3a654962 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -27,7 +27,7 @@ static int pcie_core(struct notifier_block *, unsigned long, void *);
 static int forward_event(struct notifier_block *, unsigned long, void *);
 
 static struct mlx5_nb events_nbs_ref[] = {
-	/* Events to be proccessed by mlx5_core */
+	/* Events to be processed by mlx5_core */
 	{.nb.notifier_call = any_notifier,  .event_type = MLX5_EVENT_TYPE_NOTIFY_ANY },
 	{.nb.notifier_call = temp_warn,     .event_type = MLX5_EVENT_TYPE_TEMP_WARN_EVENT },
 	{.nb.notifier_call = port_module,   .event_type = MLX5_EVENT_TYPE_PORT_MODULE_EVENT },
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
index 0bba92cf5dc0..8ec148010d62 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
@@ -1516,7 +1516,7 @@ static int mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
 	mutex_lock(&fpga_xfrm->lock);
 
 	if (!fpga_xfrm->sa_ctx)
-		/* Unbounded xfrm, chane only sw attrs */
+		/* Unbounded xfrm, change only sw attrs */
 		goto change_sw_xfrm_attrs;
 
 	/* copy original hw sa */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 8481027e493c..fee51050ed64 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2493,7 +2493,7 @@ static void set_prio_attrs_in_prio(struct fs_prio *prio, int acc_level)
 		acc_level_ns = set_prio_attrs_in_ns(ns, acc_level);
 
 		/* If this a prio with chains, and we can jump from one chain
-		 * (namepsace) to another, so we accumulate the levels
+		 * (namespace) to another, so we accumulate the levels
 		 */
 		if (prio->node.type == FS_TYPE_PRIO_CHAINS)
 			acc_level = acc_level_ns;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 9abeb80ffa31..4a7de1259004 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -170,7 +170,7 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
 
 	/* The reset only needs to be issued by one PF. The health buffer is
 	 * shared between all functions, and will be cleared during a reset.
-	 * Check again to avoid a redundant 2nd reset. If the fatal erros was
+	 * Check again to avoid a redundant 2nd reset. If the fatal errors was
 	 * PCI related a reset won't help.
 	 */
 	fatal_error = mlx5_health_check_fatal_sensors(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
index ce696d523493..ffac8a0e7a23 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
@@ -749,7 +749,7 @@ static int mlx5_pps_event(struct notifier_block *nb,
 		} else {
 			ptp_event.type = PTP_CLOCK_EXTTS;
 		}
-		/* TODOL clock->ptp can be NULL if ptp_clock_register failes */
+		/* TODOL clock->ptp can be NULL if ptp_clock_register fails */
 		ptp_clock_event(clock->ptp, &ptp_event);
 		break;
 	case PTP_PF_PEROUT:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
index 38084400ee8f..e3b0a131c3e1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
@@ -40,7 +40,7 @@
 
 struct mlx5_vxlan {
 	struct mlx5_core_dev		*mdev;
-	/* max_num_ports is usuallly 4, 16 buckets is more than enough */
+	/* max_num_ports is usually 4, 16 buckets is more than enough */
 	DECLARE_HASHTABLE(htable, 4);
 	struct mutex                    sync_lock; /* sync add/del port HW operations */
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index b25f764daa08..9fb75d79bf08 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -18,7 +18,7 @@
 
 #define MLX5_SFS_PER_CTRL_IRQ 64
 #define MLX5_IRQ_CTRL_SF_MAX 8
-/* min num of vectores for SFs to be enabled */
+/* min num of vectors for SFs to be enabled */
 #define MLX5_IRQ_VEC_COMP_BASE_SF 2
 
 #define MLX5_EQ_SHARE_IRQ_MAX_COMP (8)
@@ -597,7 +597,7 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
 		return;
 
 	/* There are cases where IRQs still will be in used when we reaching
-	 * to here. Hence, making sure all the irqs are realeased.
+	 * to here. Hence, making sure all the irqs are released.
 	 */
 	irq_pools_destroy(table);
 	pci_free_irq_vectors(dev->pdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
index 720195c4be7c..13891fdc607e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
@@ -476,7 +476,7 @@ static void mlx5_sf_table_disable(struct mlx5_sf_table *table)
 		return;
 
 	/* Balances with refcount_set; drop the reference so that new user cmd cannot start
-	 * and new vhca event handler cannnot run.
+	 * and new vhca event handler cannot run.
 	 */
 	mlx5_sf_table_put(table);
 	wait_for_completion(&table->disable_complete);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 0025913505ab..1e9d55dc1a9c 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1038,7 +1038,7 @@ enum {
 struct mlx5_mkey_seg {
 	/* This is a two bit field occupying bits 31-30.
 	 * bit 31 is always 0,
-	 * bit 30 is zero for regular MRs and 1 (e.g free) for UMRs that do not have tanslation
+	 * bit 30 is zero for regular MRs and 1 (e.g free) for UMRs that do not have translation
 	 */
 	u8		status;
 	u8		pcie_control;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index af4dd6e9f97f..524051d1b2e3 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -581,7 +581,7 @@ struct mlx5_priv {
 	/* end: qp staff */
 
 	/* start: alloc staff */
-	/* protect buffer alocation according to numa node */
+	/* protect buffer allocation according to numa node */
 	struct mutex            alloc_mutex;
 	int                     numa_node;
 
@@ -1111,7 +1111,7 @@ static inline u8 mlx5_mkey_variant(u32 mkey)
 }
 
 /* Async-atomic event notifier used by mlx5 core to forward FW
- * evetns recived from event queue to mlx5 consumers.
+ * evetns received from event queue to mlx5 consumers.
  * Optimise event queue dipatching.
  */
 int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb);
-- 
cgit v1.2.3


From 8e792700b994a4b79abe1303eb379bbd1f4212be Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sun, 1 Aug 2021 11:37:57 +0300
Subject: net/mlx5: Delete impossible dev->state checks

New mlx5_core device structure is allocated through devlink_alloc
with\ kzalloc and that ensures that all fields are equal to zero
and it includes ->state too.

That means that checks of that field in the mlx5_init_one() is
completely redundant, because that function is called only once
in the begging of mlx5_core_dev lifetime.

PCI:
 .probe()
  -> probe_one()
   -> mlx5_init_one()

The recovery flow can't run at that time or before it, because relevant
work initialized later in mlx5_init_once().

Such initialization flow ensures that dev->state can't be
MLX5_DEVICE_STATE_UNINITIALIZED at all, so remove such impossible
checks.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 4 ----
 drivers/net/ethernet/mellanox/mlx5/core/main.c   | 6 ------
 include/linux/mlx5/driver.h                      | 3 +--
 3 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 4a7de1259004..037e18dd4be0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -213,10 +213,6 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
 	mutex_lock(&dev->intf_state_mutex);
 	if (!err_detected && dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
 		goto unlock;/* a previous error is still being handled */
-	if (dev->state == MLX5_DEVICE_STATE_UNINITIALIZED) {
-		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
-		goto unlock;
-	}
 
 	enter_error_state(dev, force);
 unlock:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 6fe560307c05..1a65e744d2e2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1249,11 +1249,6 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
 	int err = 0;
 
 	mutex_lock(&dev->intf_state_mutex);
-	if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
-		mlx5_core_warn(dev, "interface is up, NOP\n");
-		goto out;
-	}
-	/* remove any previous indication of internal error */
 	dev->state = MLX5_DEVICE_STATE_UP;
 
 	err = mlx5_function_setup(dev, true);
@@ -1294,7 +1289,6 @@ function_teardown:
 	mlx5_function_teardown(dev, true);
 err_function:
 	dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
-out:
 	mutex_unlock(&dev->intf_state_mutex);
 	return err;
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 524051d1b2e3..2b5c5604b091 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -623,8 +623,7 @@ struct mlx5_priv {
 };
 
 enum mlx5_device_state {
-	MLX5_DEVICE_STATE_UNINITIALIZED,
-	MLX5_DEVICE_STATE_UP,
+	MLX5_DEVICE_STATE_UP = 1,
 	MLX5_DEVICE_STATE_INTERNAL_ERROR,
 };
 
-- 
cgit v1.2.3


From 5958a6fad623ad3b67a9e4d8dbd5f1874cc7039e Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 13 Jul 2021 12:36:05 +0300
Subject: net/mlx5: Reorganize current and maximal capabilities to be per-type

In the current code, the current and maximal capabilities are
maintained in separate arrays which are both per type. In order to
allow the creation of such a basic structure as a dynamically
allocated array, we move curr and max fields to a unified
structure so that specific capabilities can be allocated as one unit.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c    | 10 ++--
 include/linux/mlx5/device.h                       | 66 +++++++++++------------
 include/linux/mlx5/driver.h                       |  8 ++-
 4 files changed, 45 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index fee51050ed64..813ff8186829 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2343,7 +2343,7 @@ static int create_leaf_prios(struct mlx5_flow_namespace *ns, int prio,
 
 #define FLOW_TABLE_BIT_SZ 1
 #define GET_FLOW_TABLE_CAP(dev, offset) \
-	((be32_to_cpu(*((__be32 *)(dev->caps.hca_cur[MLX5_CAP_FLOW_TABLE]) +	\
+	((be32_to_cpu(*((__be32 *)(dev->caps.hca[MLX5_CAP_FLOW_TABLE].cur) +	\
 			offset / 32)) >>					\
 	  (32 - FLOW_TABLE_BIT_SZ - (offset & 0x1f))) & FLOW_TABLE_BIT_SZ)
 static bool has_required_caps(struct mlx5_core_dev *dev, struct node_caps *caps)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 1a65e744d2e2..6cefe2a981c7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -389,11 +389,11 @@ static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev,
 
 	switch (cap_mode) {
 	case HCA_CAP_OPMOD_GET_MAX:
-		memcpy(dev->caps.hca_max[cap_type], hca_caps,
+		memcpy(dev->caps.hca[cap_type].max, hca_caps,
 		       MLX5_UN_SZ_BYTES(hca_cap_union));
 		break;
 	case HCA_CAP_OPMOD_GET_CUR:
-		memcpy(dev->caps.hca_cur[cap_type], hca_caps,
+		memcpy(dev->caps.hca[cap_type].cur, hca_caps,
 		       MLX5_UN_SZ_BYTES(hca_cap_union));
 		break;
 	default:
@@ -469,7 +469,7 @@ static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx)
 		return err;
 
 	set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability);
-	memcpy(set_hca_cap, dev->caps.hca_cur[MLX5_CAP_ODP],
+	memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_ODP].cur,
 	       MLX5_ST_SZ_BYTES(odp_cap));
 
 #define ODP_CAP_SET_MAX(dev, field)                                            \
@@ -514,7 +514,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
 
 	set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx,
 				   capability);
-	memcpy(set_hca_cap, dev->caps.hca_cur[MLX5_CAP_GENERAL],
+	memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_GENERAL].cur,
 	       MLX5_ST_SZ_BYTES(cmd_hca_cap));
 
 	mlx5_core_dbg(dev, "Current Pkey table size %d Setting new size %d\n",
@@ -596,7 +596,7 @@ static int handle_hca_cap_roce(struct mlx5_core_dev *dev, void *set_ctx)
 		return 0;
 
 	set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability);
-	memcpy(set_hca_cap, dev->caps.hca_cur[MLX5_CAP_ROCE],
+	memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_ROCE].cur,
 	       MLX5_ST_SZ_BYTES(roce_cap));
 	MLX5_SET(roce_cap, set_hca_cap, sw_r_roce_src_udp_port, 1);
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 1e9d55dc1a9c..2736f12bb57c 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1213,55 +1213,55 @@ enum mlx5_qcam_feature_groups {
 
 /* GET Dev Caps macros */
 #define MLX5_CAP_GEN(mdev, cap) \
-	MLX5_GET(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap)
+	MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].cur, cap)
 
 #define MLX5_CAP_GEN_64(mdev, cap) \
-	MLX5_GET64(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap)
+	MLX5_GET64(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].cur, cap)
 
 #define MLX5_CAP_GEN_MAX(mdev, cap) \
-	MLX5_GET(cmd_hca_cap, mdev->caps.hca_max[MLX5_CAP_GENERAL], cap)
+	MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].max, cap)
 
 #define MLX5_CAP_GEN_2(mdev, cap) \
-	MLX5_GET(cmd_hca_cap_2, mdev->caps.hca_cur[MLX5_CAP_GENERAL_2], cap)
+	MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].cur, cap)
 
 #define MLX5_CAP_GEN_2_64(mdev, cap) \
-	MLX5_GET64(cmd_hca_cap_2, mdev->caps.hca_cur[MLX5_CAP_GENERAL_2], cap)
+	MLX5_GET64(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].cur, cap)
 
 #define MLX5_CAP_GEN_2_MAX(mdev, cap) \
-	MLX5_GET(cmd_hca_cap_2, mdev->caps.hca_max[MLX5_CAP_GENERAL_2], cap)
+	MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].max, cap)
 
 #define MLX5_CAP_ETH(mdev, cap) \
 	MLX5_GET(per_protocol_networking_offload_caps,\
-		 mdev->caps.hca_cur[MLX5_CAP_ETHERNET_OFFLOADS], cap)
+		 mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS].cur, cap)
 
 #define MLX5_CAP_ETH_MAX(mdev, cap) \
 	MLX5_GET(per_protocol_networking_offload_caps,\
-		 mdev->caps.hca_max[MLX5_CAP_ETHERNET_OFFLOADS], cap)
+		 mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS].max, cap)
 
 #define MLX5_CAP_IPOIB_ENHANCED(mdev, cap) \
 	MLX5_GET(per_protocol_networking_offload_caps,\
-		 mdev->caps.hca_cur[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS], cap)
+		 mdev->caps.hca[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS].cur, cap)
 
 #define MLX5_CAP_ROCE(mdev, cap) \
-	MLX5_GET(roce_cap, mdev->caps.hca_cur[MLX5_CAP_ROCE], cap)
+	MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE].cur, cap)
 
 #define MLX5_CAP_ROCE_MAX(mdev, cap) \
-	MLX5_GET(roce_cap, mdev->caps.hca_max[MLX5_CAP_ROCE], cap)
+	MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE].max, cap)
 
 #define MLX5_CAP_ATOMIC(mdev, cap) \
-	MLX5_GET(atomic_caps, mdev->caps.hca_cur[MLX5_CAP_ATOMIC], cap)
+	MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC].cur, cap)
 
 #define MLX5_CAP_ATOMIC_MAX(mdev, cap) \
-	MLX5_GET(atomic_caps, mdev->caps.hca_max[MLX5_CAP_ATOMIC], cap)
+	MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC].max, cap)
 
 #define MLX5_CAP_FLOWTABLE(mdev, cap) \
-	MLX5_GET(flow_table_nic_cap, mdev->caps.hca_cur[MLX5_CAP_FLOW_TABLE], cap)
+	MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE].cur, cap)
 
 #define MLX5_CAP64_FLOWTABLE(mdev, cap) \
-	MLX5_GET64(flow_table_nic_cap, (mdev)->caps.hca_cur[MLX5_CAP_FLOW_TABLE], cap)
+	MLX5_GET64(flow_table_nic_cap, (mdev)->caps.hca[MLX5_CAP_FLOW_TABLE].cur, cap)
 
 #define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \
-	MLX5_GET(flow_table_nic_cap, mdev->caps.hca_max[MLX5_CAP_FLOW_TABLE], cap)
+	MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE].max, cap)
 
 #define MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) \
 	MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.cap)
@@ -1301,11 +1301,11 @@ enum mlx5_qcam_feature_groups {
 
 #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \
 	MLX5_GET(flow_table_eswitch_cap, \
-		 mdev->caps.hca_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap)
+		 mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].cur, cap)
 
 #define MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, cap) \
 	MLX5_GET(flow_table_eswitch_cap, \
-		 mdev->caps.hca_max[MLX5_CAP_ESWITCH_FLOW_TABLE], cap)
+		 mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].max, cap)
 
 #define MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) \
 	MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_nic_esw_fdb.cap)
@@ -1327,31 +1327,31 @@ enum mlx5_qcam_feature_groups {
 
 #define MLX5_CAP_ESW(mdev, cap) \
 	MLX5_GET(e_switch_cap, \
-		 mdev->caps.hca_cur[MLX5_CAP_ESWITCH], cap)
+		 mdev->caps.hca[MLX5_CAP_ESWITCH].cur, cap)
 
 #define MLX5_CAP64_ESW_FLOWTABLE(mdev, cap) \
 	MLX5_GET64(flow_table_eswitch_cap, \
-		(mdev)->caps.hca_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap)
+		(mdev)->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].cur, cap)
 
 #define MLX5_CAP_ESW_MAX(mdev, cap) \
 	MLX5_GET(e_switch_cap, \
-		 mdev->caps.hca_max[MLX5_CAP_ESWITCH], cap)
+		 mdev->caps.hca[MLX5_CAP_ESWITCH].max, cap)
 
 #define MLX5_CAP_ODP(mdev, cap)\
-	MLX5_GET(odp_cap, mdev->caps.hca_cur[MLX5_CAP_ODP], cap)
+	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP].cur, cap)
 
 #define MLX5_CAP_ODP_MAX(mdev, cap)\
-	MLX5_GET(odp_cap, mdev->caps.hca_max[MLX5_CAP_ODP], cap)
+	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP].max, cap)
 
 #define MLX5_CAP_VECTOR_CALC(mdev, cap) \
 	MLX5_GET(vector_calc_cap, \
-		 mdev->caps.hca_cur[MLX5_CAP_VECTOR_CALC], cap)
+		 mdev->caps.hca[MLX5_CAP_VECTOR_CALC].cur, cap)
 
 #define MLX5_CAP_QOS(mdev, cap)\
-	MLX5_GET(qos_cap, mdev->caps.hca_cur[MLX5_CAP_QOS], cap)
+	MLX5_GET(qos_cap, mdev->caps.hca[MLX5_CAP_QOS].cur, cap)
 
 #define MLX5_CAP_DEBUG(mdev, cap)\
-	MLX5_GET(debug_cap, mdev->caps.hca_cur[MLX5_CAP_DEBUG], cap)
+	MLX5_GET(debug_cap, mdev->caps.hca[MLX5_CAP_DEBUG].cur, cap)
 
 #define MLX5_CAP_PCAM_FEATURE(mdev, fld) \
 	MLX5_GET(pcam_reg, (mdev)->caps.pcam, feature_cap_mask.enhanced_features.fld)
@@ -1387,27 +1387,27 @@ enum mlx5_qcam_feature_groups {
 	MLX5_GET64(fpga_cap, (mdev)->caps.fpga, cap)
 
 #define MLX5_CAP_DEV_MEM(mdev, cap)\
-	MLX5_GET(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap)
+	MLX5_GET(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM].cur, cap)
 
 #define MLX5_CAP64_DEV_MEM(mdev, cap)\
-	MLX5_GET64(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap)
+	MLX5_GET64(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM].cur, cap)
 
 #define MLX5_CAP_TLS(mdev, cap) \
-	MLX5_GET(tls_cap, (mdev)->caps.hca_cur[MLX5_CAP_TLS], cap)
+	MLX5_GET(tls_cap, (mdev)->caps.hca[MLX5_CAP_TLS].cur, cap)
 
 #define MLX5_CAP_DEV_EVENT(mdev, cap)\
-	MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca_cur[MLX5_CAP_DEV_EVENT], cap)
+	MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca[MLX5_CAP_DEV_EVENT].cur, cap)
 
 #define MLX5_CAP_DEV_VDPA_EMULATION(mdev, cap)\
 	MLX5_GET(virtio_emulation_cap, \
-		(mdev)->caps.hca_cur[MLX5_CAP_VDPA_EMULATION], cap)
+		(mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION].cur, cap)
 
 #define MLX5_CAP64_DEV_VDPA_EMULATION(mdev, cap)\
 	MLX5_GET64(virtio_emulation_cap, \
-		(mdev)->caps.hca_cur[MLX5_CAP_VDPA_EMULATION], cap)
+		(mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION].cur, cap)
 
 #define MLX5_CAP_IPSEC(mdev, cap)\
-	MLX5_GET(ipsec_cap, (mdev)->caps.hca_cur[MLX5_CAP_IPSEC], cap)
+	MLX5_GET(ipsec_cap, (mdev)->caps.hca[MLX5_CAP_IPSEC].cur, cap)
 
 enum {
 	MLX5_CMD_STAT_OK			= 0x0,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2b5c5604b091..854443ea812c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -729,6 +729,11 @@ struct mlx5_profile {
 	} mr_cache[MAX_MR_CACHE_ENTRIES];
 };
 
+struct mlx5_hca_cap {
+	u32 cur[MLX5_UN_SZ_DW(hca_cap_union)];
+	u32 max[MLX5_UN_SZ_DW(hca_cap_union)];
+};
+
 struct mlx5_core_dev {
 	struct device *device;
 	enum mlx5_coredev_type coredev_type;
@@ -740,8 +745,7 @@ struct mlx5_core_dev {
 	char			board_id[MLX5_BOARD_ID_LEN];
 	struct mlx5_cmd		cmd;
 	struct {
-		u32 hca_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
-		u32 hca_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
+		struct mlx5_hca_cap hca[MLX5_CAP_NUM];
 		u32 pcam[MLX5_ST_SZ_DW(pcam_reg)];
 		u32 mcam[MLX5_MCAM_REGS_NUM][MLX5_ST_SZ_DW(mcam_reg)];
 		u32 fpga[MLX5_ST_SZ_DW(fpga_cap)];
-- 
cgit v1.2.3


From 48f02eef7f764f33e520ed8009d293396ca690cd Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 13 Jul 2021 14:17:03 +0300
Subject: net/mlx5: Allocate individual capability

Currently mlx5_core_dev contains array of capabilities. It contains 19
valid capabilities of the device, 2 reserved entries and 12 holes.
Due to this for 14 unused entries, mlx5_core_dev allocates 14 * 8K = 112K
bytes of memory which is never used. Due to this mlx5_core_dev structure
size is 270Kbytes odd. This allocation further aligns to next power of 2
to 512Kbytes.

By skipping non-existent entries,
(a) 112Kbyte is saved,
(b) mlx5_core_dev reduces to 8KB with alignment
(c) 350KB saved in alignment

In future individual capability allocation can be used to skip its
allocation when such capability is disabled at the device level. This
patch prepares mlx5_core_dev to hold capability using a pointer instead
of inline array.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c    | 71 +++++++++++++++++++++--
 include/linux/mlx5/device.h                       | 69 +++++++++++-----------
 include/linux/mlx5/driver.h                       |  2 +-
 4 files changed, 104 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 813ff8186829..9fe8e3c204d6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2343,7 +2343,7 @@ static int create_leaf_prios(struct mlx5_flow_namespace *ns, int prio,
 
 #define FLOW_TABLE_BIT_SZ 1
 #define GET_FLOW_TABLE_CAP(dev, offset) \
-	((be32_to_cpu(*((__be32 *)(dev->caps.hca[MLX5_CAP_FLOW_TABLE].cur) +	\
+	((be32_to_cpu(*((__be32 *)(dev->caps.hca[MLX5_CAP_FLOW_TABLE]->cur) +	\
 			offset / 32)) >>					\
 	  (32 - FLOW_TABLE_BIT_SZ - (offset & 0x1f))) & FLOW_TABLE_BIT_SZ)
 static bool has_required_caps(struct mlx5_core_dev *dev, struct node_caps *caps)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 6cefe2a981c7..20f693cf58cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -389,11 +389,11 @@ static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev,
 
 	switch (cap_mode) {
 	case HCA_CAP_OPMOD_GET_MAX:
-		memcpy(dev->caps.hca[cap_type].max, hca_caps,
+		memcpy(dev->caps.hca[cap_type]->max, hca_caps,
 		       MLX5_UN_SZ_BYTES(hca_cap_union));
 		break;
 	case HCA_CAP_OPMOD_GET_CUR:
-		memcpy(dev->caps.hca[cap_type].cur, hca_caps,
+		memcpy(dev->caps.hca[cap_type]->cur, hca_caps,
 		       MLX5_UN_SZ_BYTES(hca_cap_union));
 		break;
 	default:
@@ -469,7 +469,7 @@ static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx)
 		return err;
 
 	set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability);
-	memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_ODP].cur,
+	memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_ODP]->cur,
 	       MLX5_ST_SZ_BYTES(odp_cap));
 
 #define ODP_CAP_SET_MAX(dev, field)                                            \
@@ -514,7 +514,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
 
 	set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx,
 				   capability);
-	memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_GENERAL].cur,
+	memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_GENERAL]->cur,
 	       MLX5_ST_SZ_BYTES(cmd_hca_cap));
 
 	mlx5_core_dbg(dev, "Current Pkey table size %d Setting new size %d\n",
@@ -596,7 +596,7 @@ static int handle_hca_cap_roce(struct mlx5_core_dev *dev, void *set_ctx)
 		return 0;
 
 	set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability);
-	memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_ROCE].cur,
+	memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_ROCE]->cur,
 	       MLX5_ST_SZ_BYTES(roce_cap));
 	MLX5_SET(roce_cap, set_hca_cap, sw_r_roce_src_udp_port, 1);
 
@@ -1375,6 +1375,60 @@ out:
 	mutex_unlock(&dev->intf_state_mutex);
 }
 
+static const int types[] = {
+	MLX5_CAP_GENERAL,
+	MLX5_CAP_GENERAL_2,
+	MLX5_CAP_ETHERNET_OFFLOADS,
+	MLX5_CAP_IPOIB_ENHANCED_OFFLOADS,
+	MLX5_CAP_ODP,
+	MLX5_CAP_ATOMIC,
+	MLX5_CAP_ROCE,
+	MLX5_CAP_IPOIB_OFFLOADS,
+	MLX5_CAP_FLOW_TABLE,
+	MLX5_CAP_ESWITCH_FLOW_TABLE,
+	MLX5_CAP_ESWITCH,
+	MLX5_CAP_VECTOR_CALC,
+	MLX5_CAP_QOS,
+	MLX5_CAP_DEBUG,
+	MLX5_CAP_DEV_MEM,
+	MLX5_CAP_DEV_EVENT,
+	MLX5_CAP_TLS,
+	MLX5_CAP_VDPA_EMULATION,
+	MLX5_CAP_IPSEC,
+};
+
+static void mlx5_hca_caps_free(struct mlx5_core_dev *dev)
+{
+	int type;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(types); i++) {
+		type = types[i];
+		kfree(dev->caps.hca[type]);
+	}
+}
+
+static int mlx5_hca_caps_alloc(struct mlx5_core_dev *dev)
+{
+	struct mlx5_hca_cap *cap;
+	int type;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(types); i++) {
+		cap = kzalloc(sizeof(*cap), GFP_KERNEL);
+		if (!cap)
+			goto err;
+		type = types[i];
+		dev->caps.hca[type] = cap;
+	}
+
+	return 0;
+
+err:
+	mlx5_hca_caps_free(dev);
+	return -ENOMEM;
+}
+
 int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx)
 {
 	struct mlx5_priv *priv = &dev->priv;
@@ -1410,8 +1464,14 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx)
 	if (err)
 		goto err_adev_init;
 
+	err = mlx5_hca_caps_alloc(dev);
+	if (err)
+		goto err_hca_caps;
+
 	return 0;
 
+err_hca_caps:
+	mlx5_adev_cleanup(dev);
 err_adev_init:
 	mlx5_pagealloc_cleanup(dev);
 err_pagealloc_init:
@@ -1430,6 +1490,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
 
+	mlx5_hca_caps_free(dev);
 	mlx5_adev_cleanup(dev);
 	mlx5_pagealloc_cleanup(dev);
 	mlx5_health_cleanup(dev);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 2736f12bb57c..66eaf0aa7f69 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1157,6 +1157,9 @@ enum mlx5_cap_mode {
 	HCA_CAP_OPMOD_GET_CUR	= 1,
 };
 
+/* Any new cap addition must update mlx5_hca_caps_alloc() to allocate
+ * capability memory.
+ */
 enum mlx5_cap_type {
 	MLX5_CAP_GENERAL = 0,
 	MLX5_CAP_ETHERNET_OFFLOADS,
@@ -1213,55 +1216,55 @@ enum mlx5_qcam_feature_groups {
 
 /* GET Dev Caps macros */
 #define MLX5_CAP_GEN(mdev, cap) \
-	MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].cur, cap)
+	MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL]->cur, cap)
 
 #define MLX5_CAP_GEN_64(mdev, cap) \
-	MLX5_GET64(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].cur, cap)
+	MLX5_GET64(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL]->cur, cap)
 
 #define MLX5_CAP_GEN_MAX(mdev, cap) \
-	MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].max, cap)
+	MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL]->max, cap)
 
 #define MLX5_CAP_GEN_2(mdev, cap) \
-	MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].cur, cap)
+	MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2]->cur, cap)
 
 #define MLX5_CAP_GEN_2_64(mdev, cap) \
-	MLX5_GET64(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].cur, cap)
+	MLX5_GET64(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2]->cur, cap)
 
 #define MLX5_CAP_GEN_2_MAX(mdev, cap) \
-	MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].max, cap)
+	MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2]->max, cap)
 
 #define MLX5_CAP_ETH(mdev, cap) \
 	MLX5_GET(per_protocol_networking_offload_caps,\
-		 mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS].cur, cap)
+		 mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS]->cur, cap)
 
 #define MLX5_CAP_ETH_MAX(mdev, cap) \
 	MLX5_GET(per_protocol_networking_offload_caps,\
-		 mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS].max, cap)
+		 mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS]->max, cap)
 
 #define MLX5_CAP_IPOIB_ENHANCED(mdev, cap) \
 	MLX5_GET(per_protocol_networking_offload_caps,\
-		 mdev->caps.hca[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS].cur, cap)
+		 mdev->caps.hca[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS]->cur, cap)
 
 #define MLX5_CAP_ROCE(mdev, cap) \
-	MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE].cur, cap)
+	MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE]->cur, cap)
 
 #define MLX5_CAP_ROCE_MAX(mdev, cap) \
-	MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE].max, cap)
+	MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE]->max, cap)
 
 #define MLX5_CAP_ATOMIC(mdev, cap) \
-	MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC].cur, cap)
+	MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC]->cur, cap)
 
 #define MLX5_CAP_ATOMIC_MAX(mdev, cap) \
-	MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC].max, cap)
+	MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC]->max, cap)
 
 #define MLX5_CAP_FLOWTABLE(mdev, cap) \
-	MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE].cur, cap)
+	MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE]->cur, cap)
 
 #define MLX5_CAP64_FLOWTABLE(mdev, cap) \
-	MLX5_GET64(flow_table_nic_cap, (mdev)->caps.hca[MLX5_CAP_FLOW_TABLE].cur, cap)
+	MLX5_GET64(flow_table_nic_cap, (mdev)->caps.hca[MLX5_CAP_FLOW_TABLE]->cur, cap)
 
 #define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \
-	MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE].max, cap)
+	MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE]->max, cap)
 
 #define MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) \
 	MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.cap)
@@ -1301,11 +1304,11 @@ enum mlx5_qcam_feature_groups {
 
 #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \
 	MLX5_GET(flow_table_eswitch_cap, \
-		 mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].cur, cap)
+		 mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE]->cur, cap)
 
 #define MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, cap) \
 	MLX5_GET(flow_table_eswitch_cap, \
-		 mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].max, cap)
+		 mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE]->max, cap)
 
 #define MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) \
 	MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_nic_esw_fdb.cap)
@@ -1327,31 +1330,31 @@ enum mlx5_qcam_feature_groups {
 
 #define MLX5_CAP_ESW(mdev, cap) \
 	MLX5_GET(e_switch_cap, \
-		 mdev->caps.hca[MLX5_CAP_ESWITCH].cur, cap)
+		 mdev->caps.hca[MLX5_CAP_ESWITCH]->cur, cap)
 
 #define MLX5_CAP64_ESW_FLOWTABLE(mdev, cap) \
 	MLX5_GET64(flow_table_eswitch_cap, \
-		(mdev)->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].cur, cap)
+		(mdev)->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE]->cur, cap)
 
 #define MLX5_CAP_ESW_MAX(mdev, cap) \
 	MLX5_GET(e_switch_cap, \
-		 mdev->caps.hca[MLX5_CAP_ESWITCH].max, cap)
+		 mdev->caps.hca[MLX5_CAP_ESWITCH]->max, cap)
 
 #define MLX5_CAP_ODP(mdev, cap)\
-	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP].cur, cap)
+	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, cap)
 
 #define MLX5_CAP_ODP_MAX(mdev, cap)\
-	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP].max, cap)
+	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->max, cap)
 
 #define MLX5_CAP_VECTOR_CALC(mdev, cap) \
 	MLX5_GET(vector_calc_cap, \
-		 mdev->caps.hca[MLX5_CAP_VECTOR_CALC].cur, cap)
+		 mdev->caps.hca[MLX5_CAP_VECTOR_CALC]->cur, cap)
 
 #define MLX5_CAP_QOS(mdev, cap)\
-	MLX5_GET(qos_cap, mdev->caps.hca[MLX5_CAP_QOS].cur, cap)
+	MLX5_GET(qos_cap, mdev->caps.hca[MLX5_CAP_QOS]->cur, cap)
 
 #define MLX5_CAP_DEBUG(mdev, cap)\
-	MLX5_GET(debug_cap, mdev->caps.hca[MLX5_CAP_DEBUG].cur, cap)
+	MLX5_GET(debug_cap, mdev->caps.hca[MLX5_CAP_DEBUG]->cur, cap)
 
 #define MLX5_CAP_PCAM_FEATURE(mdev, fld) \
 	MLX5_GET(pcam_reg, (mdev)->caps.pcam, feature_cap_mask.enhanced_features.fld)
@@ -1387,27 +1390,27 @@ enum mlx5_qcam_feature_groups {
 	MLX5_GET64(fpga_cap, (mdev)->caps.fpga, cap)
 
 #define MLX5_CAP_DEV_MEM(mdev, cap)\
-	MLX5_GET(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM].cur, cap)
+	MLX5_GET(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM]->cur, cap)
 
 #define MLX5_CAP64_DEV_MEM(mdev, cap)\
-	MLX5_GET64(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM].cur, cap)
+	MLX5_GET64(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM]->cur, cap)
 
 #define MLX5_CAP_TLS(mdev, cap) \
-	MLX5_GET(tls_cap, (mdev)->caps.hca[MLX5_CAP_TLS].cur, cap)
+	MLX5_GET(tls_cap, (mdev)->caps.hca[MLX5_CAP_TLS]->cur, cap)
 
 #define MLX5_CAP_DEV_EVENT(mdev, cap)\
-	MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca[MLX5_CAP_DEV_EVENT].cur, cap)
+	MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca[MLX5_CAP_DEV_EVENT]->cur, cap)
 
 #define MLX5_CAP_DEV_VDPA_EMULATION(mdev, cap)\
 	MLX5_GET(virtio_emulation_cap, \
-		(mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION].cur, cap)
+		(mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION]->cur, cap)
 
 #define MLX5_CAP64_DEV_VDPA_EMULATION(mdev, cap)\
 	MLX5_GET64(virtio_emulation_cap, \
-		(mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION].cur, cap)
+		(mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION]->cur, cap)
 
 #define MLX5_CAP_IPSEC(mdev, cap)\
-	MLX5_GET(ipsec_cap, (mdev)->caps.hca[MLX5_CAP_IPSEC].cur, cap)
+	MLX5_GET(ipsec_cap, (mdev)->caps.hca[MLX5_CAP_IPSEC]->cur, cap)
 
 enum {
 	MLX5_CMD_STAT_OK			= 0x0,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 854443ea812c..90e5f42baa50 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -745,7 +745,7 @@ struct mlx5_core_dev {
 	char			board_id[MLX5_BOARD_ID_LEN];
 	struct mlx5_cmd		cmd;
 	struct {
-		struct mlx5_hca_cap hca[MLX5_CAP_NUM];
+		struct mlx5_hca_cap *hca[MLX5_CAP_NUM];
 		u32 pcam[MLX5_ST_SZ_DW(pcam_reg)];
 		u32 mcam[MLX5_MCAM_REGS_NUM][MLX5_ST_SZ_DW(mcam_reg)];
 		u32 fpga[MLX5_ST_SZ_DW(fpga_cap)];
-- 
cgit v1.2.3


From 891a88f4f5768b1e6ff52a2386d48558aa5a3f63 Mon Sep 17 00:00:00 2001
From: Jonathan Toppins <jtoppins@redhat.com>
Date: Tue, 10 Aug 2021 22:53:30 -0400
Subject: bonding: remove extraneous definitions from bonding.h

All of the symbols either only exist in bond_options.c or nowhere at
all. These symbols were verified to not exist in the code base by
using `git grep` and their removal was verified by compiling bonding.ko.

Signed-off-by: Jonathan Toppins <jtoppins@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/bonding.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include')

diff --git a/include/net/bonding.h b/include/net/bonding.h
index 9f3fdc180c6c..15e083e18f75 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -150,11 +150,6 @@ struct bond_params {
 	u8 ad_actor_system[ETH_ALEN + 2];
 };
 
-struct bond_parm_tbl {
-	char *modename;
-	int mode;
-};
-
 struct slave {
 	struct net_device *dev; /* first - useful for panic debug */
 	struct bonding *bond; /* our master */
@@ -755,13 +750,6 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip)
 
 /* exported from bond_main.c */
 extern unsigned int bond_net_id;
-extern const struct bond_parm_tbl bond_lacp_tbl[];
-extern const struct bond_parm_tbl xmit_hashtype_tbl[];
-extern const struct bond_parm_tbl arp_validate_tbl[];
-extern const struct bond_parm_tbl arp_all_targets_tbl[];
-extern const struct bond_parm_tbl fail_over_mac_tbl[];
-extern const struct bond_parm_tbl pri_reselect_tbl[];
-extern struct bond_parm_tbl ad_select_tbl[];
 
 /* exported from bond_netlink.c */
 extern struct rtnl_link_ops bond_link_ops;
-- 
cgit v1.2.3


From 83f0a0b7285b299e006b0698a0ddc1ffacff3e43 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Tue, 10 Aug 2021 10:38:34 +0800
Subject: mctp: Specify route types, require rtm_type in RTM_*ROUTE messages

This change adds a 'type' attribute to routes, which can be parsed from
a RTM_NEWROUTE message. This will help to distinguish local vs. peer
routes in a future change.

This means userspace will need to set a correct rtm_type in RTM_NEWROUTE
and RTM_DELROUTE messages; we currently only accept RTN_UNICAST.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Link: https://lore.kernel.org/r/20210810023834.2231088-1-jk@codeconstruct.com.au
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/mctp.h |  1 +
 net/mctp/route.c   | 27 ++++++++++++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index 54bbe042c973..a824d47c3c6d 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -173,6 +173,7 @@ struct mctp_route {
 
 	struct mctp_dev		*dev;
 	unsigned int		mtu;
+	unsigned char		type;
 	int			(*output)(struct mctp_route *route,
 					  struct sk_buff *skb);
 
diff --git a/net/mctp/route.c b/net/mctp/route.c
index b3101375c8e7..5265525011ad 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -710,8 +710,9 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 /* route management */
 static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start,
 			  unsigned int daddr_extent, unsigned int mtu,
-			  bool is_local)
+			  unsigned char type)
 {
+	int (*rtfn)(struct mctp_route *rt, struct sk_buff *skb);
 	struct net *net = dev_net(mdev->dev);
 	struct mctp_route *rt, *ert;
 
@@ -721,6 +722,17 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start,
 	if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255)
 		return -EINVAL;
 
+	switch (type) {
+	case RTN_LOCAL:
+		rtfn = mctp_route_input;
+		break;
+	case RTN_UNICAST:
+		rtfn = mctp_route_output;
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	rt = mctp_route_alloc();
 	if (!rt)
 		return -ENOMEM;
@@ -730,7 +742,8 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start,
 	rt->mtu = mtu;
 	rt->dev = mdev;
 	dev_hold(rt->dev->dev);
-	rt->output = is_local ? mctp_route_input : mctp_route_output;
+	rt->type = type;
+	rt->output = rtfn;
 
 	ASSERT_RTNL();
 	/* Prevent duplicate identical routes. */
@@ -777,7 +790,7 @@ static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start,
 
 int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr)
 {
-	return mctp_route_add(mdev, addr, 0, 0, true);
+	return mctp_route_add(mdev, addr, 0, 0, RTN_LOCAL);
 }
 
 int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr)
@@ -936,7 +949,11 @@ static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	/* TODO: parse mtu from nlparse */
 	mtu = 0;
 
-	rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu, false);
+	if (rtm->rtm_type != RTN_UNICAST)
+		return -EINVAL;
+
+	rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu,
+			    rtm->rtm_type);
 	return rc;
 }
 
@@ -985,7 +1002,7 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt,
 	hdr->rtm_table = RT_TABLE_DEFAULT;
 	hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */
 	hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? */
-	hdr->rtm_type = RTN_ANYCAST; /* TODO: type from route */
+	hdr->rtm_type = rt->type;
 
 	if (nla_put_u8(skb, RTA_DST, rt->min))
 		goto cancel;
-- 
cgit v1.2.3


From 3d2a2544eae93987f0688c2d6ec06c76f9e1477b Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Fri, 25 Jun 2021 11:17:16 +0300
Subject: nl80211: vendor-cmd: add Intel vendor commands for iwlmei usage

iwlmei allows to integrate with the CSME firmware. There are
flows that are prioprietary for this purpose:

* Get the information of the AP the CSME firmware is connected
  to. This is useful when we need to speed up the connection
  process in case the CSME firmware has a TCP connection
  that must be kept alive across the ownership transition.
* Forbid roaming, which will happen when the CSME firmware
  wants to tell the user space not disrupt the connection.
* Request ownership, upon driver boot when the CSME firmware
  owns the device. This is a notification sent by the kernel.

All those commands are expected to be used by any software
managing the connection (mainly NetworkManager). Those commands
are expected to be used only in case the CSME firmware owns
the device and doesn't want to release the device unless the
host made sure that it can keep the connectivity.

Here are the steps of the expected flow:

1) The machine boots while AMT has an active TCP connection
2) iwlwifi starts and tries to access the device
3) The device is not available because of the active TCP
   connection. (If there are no active connections, the CSME
   firmware would have allowed iwlwifi to use the device)

Note that all the steps up to here don't involve iwlmei. All
this happens in iwlwifi (in iwl_pcie_prepare_card_hw).

4) iwlmei establishes a connection to the CSME firmware (through
   SAP)

Here iwlwifi uses iwlmei to access the device's capabilities
(since it can't touch the device), but this is not relevant
for the vendor commands.

5) The CSME firmware tells iwlmei that it uses the NIC and
   that there is an acitve TCP connection, and hence, the
   host needs to think twice before asking the CSME firmware
   to release the device
6) iwlmei tells iwlwifi to report HW RFKILL with a special
   reason

Up to here, there was no user space involved.

7) The user space (NetworkManager) boots and sees that the
   device is in RFKILL because the host doesn't own the
   device
8) The user space asks the kernel what AP the CSME firmware
   is connected to (with the first vendor command mentionned
   above)
9) The user space checks if it has a profile that matches the
   reply from the CSME firmware
10) The user space installs a network to the wpa_supplicant
    with a specific BSSID and a specific frequency
11) The user space prevents any type of full scan
12) The user space asks iwlmei to request ownership on the
    device (with the third vendor command)
13) iwlmei request ownership from the CSME firmware
14) The CSME firmware grants ownership
15) iwlmei tells iwlwifi to lift the RFKILL
16) RFKILL OFF is reported to userspace
17) The host boots the device, loads the firwmare, and
    connect to a specific BSSID without scanning including IP
    in less than 600ms (this is what I measured, of course
    it depends on many factors)
18) The host reports to the CSME firmware that there is a
    connection
19) The TCP connection is preserved and the host has now
    connectivity

20) Later, the TCP connection to the CSME firmware is
    terminated
21) The CSME firmware tells iwlmei that it is now free to
    do whatever it likes
22) iwlwifi sends the second vendor command to tell the
    user space that it can remove the special network
    configuration and pick any SSID / BSSID it likes.

Co-Developed-by: Ayala Beker <ayala.beker@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Link: https://lore.kernel.org/r/20210625081717.7680-4-emmanuel.grumbach@intel.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211-vnd-intel.h | 77 ++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 include/uapi/linux/nl80211-vnd-intel.h

(limited to 'include')

diff --git a/include/uapi/linux/nl80211-vnd-intel.h b/include/uapi/linux/nl80211-vnd-intel.h
new file mode 100644
index 000000000000..0bf177b84fd9
--- /dev/null
+++ b/include/uapi/linux/nl80211-vnd-intel.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2012-2014, 2018-2021 Intel Corporation
+ * Copyright (C) 2013-2015 Intel Mobile Communications GmbH
+ * Copyright (C) 2016-2017 Intel Deutschland GmbH
+ */
+#ifndef __VENDOR_CMD_INTEL_H__
+#define __VENDOR_CMD_INTEL_H__
+
+#define INTEL_OUI	0x001735
+
+/**
+ * enum iwl_mvm_vendor_cmd - supported vendor commands
+ * @IWL_MVM_VENDOR_CMD_GET_CSME_CONN_INFO: reports CSME connection info.
+ * @IWL_MVM_VENDOR_CMD_HOST_GET_OWNERSHIP: asks for ownership on the device.
+ * @IWL_MVM_VENDOR_CMD_ROAMING_FORBIDDEN_EVENT: notifies if roaming is allowed.
+ *	It contains a &IWL_MVM_VENDOR_ATTR_ROAMING_FORBIDDEN and a
+ *	&IWL_MVM_VENDOR_ATTR_VIF_ADDR attributes.
+ */
+
+enum iwl_mvm_vendor_cmd {
+	IWL_MVM_VENDOR_CMD_GET_CSME_CONN_INFO			= 0x2d,
+	IWL_MVM_VENDOR_CMD_HOST_GET_OWNERSHIP			= 0x30,
+	IWL_MVM_VENDOR_CMD_ROAMING_FORBIDDEN_EVENT		= 0x32,
+};
+
+enum iwl_vendor_auth_akm_mode {
+	IWL_VENDOR_AUTH_OPEN,
+	IWL_VENDOR_AUTH_RSNA = 0x6,
+	IWL_VENDOR_AUTH_RSNA_PSK,
+	IWL_VENDOR_AUTH_SAE = 0x9,
+	IWL_VENDOR_AUTH_MAX,
+};
+
+/**
+ * enum iwl_mvm_vendor_attr - attributes used in vendor commands
+ * @__IWL_MVM_VENDOR_ATTR_INVALID: attribute 0 is invalid
+ * @IWL_MVM_VENDOR_ATTR_VIF_ADDR: interface MAC address
+ * @IWL_MVM_VENDOR_ATTR_ADDR: MAC address
+ * @IWL_MVM_VENDOR_ATTR_SSID: SSID (binary attribute, 0..32 octets)
+ * @IWL_MVM_VENDOR_ATTR_STA_CIPHER: the cipher to use for the station with the
+ *	mac address specified in &IWL_MVM_VENDOR_ATTR_ADDR.
+ * @IWL_MVM_VENDOR_ATTR_ROAMING_FORBIDDEN: u8 attribute. Indicates whether
+ *	roaming is forbidden or not. Value 1 means roaming is forbidden,
+ *	0 mean roaming is allowed.
+ * @IWL_MVM_VENDOR_ATTR_AUTH_MODE: u32 attribute. Authentication mode type
+ *	as specified in &enum iwl_vendor_auth_akm_mode.
+ * @IWL_MVM_VENDOR_ATTR_CHANNEL_NUM: u8 attribute. Contains channel number.
+ * @IWL_MVM_VENDOR_ATTR_BAND: u8 attribute.
+ *	0 for 2.4 GHz band, 1 for 5.2GHz band and 2 for 6GHz band.
+ * @IWL_MVM_VENDOR_ATTR_COLLOC_CHANNEL: u32 attribute. Channel number of
+ *	collocated AP. Relevant for 6GHz AP info.
+ * @IWL_MVM_VENDOR_ATTR_COLLOC_ADDR: MAC address of a collocated AP.
+ *	Relevant for 6GHz AP info.
+ *
+ * @NUM_IWL_MVM_VENDOR_ATTR: number of vendor attributes
+ * @MAX_IWL_MVM_VENDOR_ATTR: highest vendor attribute number
+
+ */
+enum iwl_mvm_vendor_attr {
+	__IWL_MVM_VENDOR_ATTR_INVALID				= 0x00,
+	IWL_MVM_VENDOR_ATTR_VIF_ADDR				= 0x02,
+	IWL_MVM_VENDOR_ATTR_ADDR				= 0x0a,
+	IWL_MVM_VENDOR_ATTR_SSID				= 0x3d,
+	IWL_MVM_VENDOR_ATTR_STA_CIPHER				= 0x51,
+	IWL_MVM_VENDOR_ATTR_ROAMING_FORBIDDEN			= 0x64,
+	IWL_MVM_VENDOR_ATTR_AUTH_MODE				= 0x65,
+	IWL_MVM_VENDOR_ATTR_CHANNEL_NUM				= 0x66,
+	IWL_MVM_VENDOR_ATTR_BAND				= 0x69,
+	IWL_MVM_VENDOR_ATTR_COLLOC_CHANNEL			= 0x70,
+	IWL_MVM_VENDOR_ATTR_COLLOC_ADDR				= 0x71,
+
+	NUM_IWL_MVM_VENDOR_ATTR,
+	MAX_IWL_MVM_VENDOR_ATTR = NUM_IWL_MVM_VENDOR_ATTR - 1,
+};
+
+#endif /* __VENDOR_CMD_INTEL_H__ */
-- 
cgit v1.2.3


From 8c89f7b3d3f2880c57b0bc96c72ccd98fe354399 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 6 Aug 2021 14:53:05 -0700
Subject: mac80211: Use flex-array for radiotap header bitmap

In preparation for FORTIFY_SOURCE performing compile-time and run-time
field bounds checking for memcpy(), memmove(), and memset(), avoid
intentionally writing across neighboring fields.

The it_present member of struct ieee80211_radiotap_header is treated as a
flexible array (multiple u32s can be conditionally present). In order for
memcpy() to reason (or really, not reason) about the size of operations
against this struct, use of bytes beyond it_present need to be treated
as part of the flexible array. Add a trailing flexible array and
initialize its initial index via pointer arithmetic.

Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: linux-wireless@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210806215305.2875621-1-keescook@chromium.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/ieee80211_radiotap.h | 5 +++++
 net/mac80211/rx.c                | 7 ++++++-
 net/wireless/radiotap.c          | 5 ++---
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index c0854933e24f..11630351c978 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -43,6 +43,11 @@ struct ieee80211_radiotap_header {
 	 * @it_present: (first) present word
 	 */
 	__le32 it_present;
+
+	/**
+	 * @it_optional: all remaining presence bitmaps
+	 */
+	__le32 it_optional[];
 } __packed;
 
 /* version is always 0 */
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3eb7b03b23c6..33c56eab07fc 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -359,7 +359,12 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 
 	put_unaligned_le32(it_present_val, it_present);
 
-	pos = (void *)(it_present + 1);
+	/* This references through an offset into it_optional[] rather
+	 * than via it_present otherwise later uses of pos will cause
+	 * the compiler to think we have walked past the end of the
+	 * struct member.
+	 */
+	pos = (void *)&rthdr->it_optional[it_present - rthdr->it_optional];
 
 	/* the order of the following fields is important */
 
diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c
index 8099c9564a59..ae2e1a896461 100644
--- a/net/wireless/radiotap.c
+++ b/net/wireless/radiotap.c
@@ -115,10 +115,9 @@ int ieee80211_radiotap_iterator_init(
 	iterator->_max_length = get_unaligned_le16(&radiotap_header->it_len);
 	iterator->_arg_index = 0;
 	iterator->_bitmap_shifter = get_unaligned_le32(&radiotap_header->it_present);
-	iterator->_arg = (uint8_t *)radiotap_header + sizeof(*radiotap_header);
+	iterator->_arg = (uint8_t *)radiotap_header->it_optional;
 	iterator->_reset_on_ext = 0;
-	iterator->_next_bitmap = &radiotap_header->it_present;
-	iterator->_next_bitmap++;
+	iterator->_next_bitmap = radiotap_header->it_optional;
 	iterator->_vns = vns;
 	iterator->current_namespace = &radiotap_ns;
 	iterator->is_radiotap_ns = 1;
-- 
cgit v1.2.3


From afa79d08c6c8e1901cb1547591e3ccd3ec6965d9 Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@gmail.com>
Date: Fri, 13 Aug 2021 22:57:49 +0800
Subject: net: in_irq() cleanup

Replace the obsolete and ambiguos macro in_irq() with new
macro in_hardirq().

Signed-off-by: Changbin Du <changbin.du@gmail.com>
Link: https://lore.kernel.org/r/20210813145749.86512-1-changbin.du@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 +-
 net/core/bpf_sk_storage.c | 4 ++--
 net/core/dev.c            | 2 +-
 net/core/skbuff.c         | 6 +++---
 net/nfc/rawsock.c         | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bd8d5b8e2de3..2f03cd9e371a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3956,7 +3956,7 @@ void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason);
 /*
  * It is not allowed to call kfree_skb() or consume_skb() from hardware
  * interrupt context or with hardware interrupts being disabled.
- * (in_irq() || irqs_disabled())
+ * (in_hardirq() || irqs_disabled())
  *
  * We provide four helpers that can be used in following contexts :
  *
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index f564f82e91d9..68d2cbf8331a 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -416,7 +416,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
 BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
 	   void *, value, u64, flags)
 {
-	if (in_irq() || in_nmi())
+	if (in_hardirq() || in_nmi())
 		return (unsigned long)NULL;
 
 	return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
@@ -425,7 +425,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
 BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
 	   struct sock *, sk)
 {
-	if (in_irq() || in_nmi())
+	if (in_hardirq() || in_nmi())
 		return -EPERM;
 
 	return ____bpf_sk_storage_delete(map, sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index 88650791c360..74fd402d26dd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3107,7 +3107,7 @@ EXPORT_SYMBOL(__dev_kfree_skb_irq);
 
 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
 {
-	if (in_irq() || irqs_disabled())
+	if (in_hardirq() || irqs_disabled())
 		__dev_kfree_skb_irq(skb, reason);
 	else
 		dev_kfree_skb(skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9240af2ea8c9..f9311762cc47 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -156,7 +156,7 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 	void *data;
 
 	fragsz = SKB_DATA_ALIGN(fragsz);
-	if (in_irq() || irqs_disabled()) {
+	if (in_hardirq() || irqs_disabled()) {
 		nc = this_cpu_ptr(&netdev_alloc_cache);
 		data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
 	} else {
@@ -502,7 +502,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
 
-	if (in_irq() || irqs_disabled()) {
+	if (in_hardirq() || irqs_disabled()) {
 		nc = this_cpu_ptr(&netdev_alloc_cache);
 		data = page_frag_alloc(nc, len, gfp_mask);
 		pfmemalloc = nc->pfmemalloc;
@@ -724,7 +724,7 @@ void skb_release_head_state(struct sk_buff *skb)
 {
 	skb_dst_drop(skb);
 	if (skb->destructor) {
-		WARN_ON(in_irq());
+		WARN_ON(in_hardirq());
 		skb->destructor(skb);
 	}
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 5e39640becdb..0ca214ab5aef 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -140,7 +140,7 @@ static void rawsock_data_exchange_complete(void *context, struct sk_buff *skb,
 {
 	struct sock *sk = (struct sock *) context;
 
-	BUG_ON(in_irq());
+	BUG_ON(in_hardirq());
 
 	pr_debug("sk=%p err=%d\n", sk, err);
 
-- 
cgit v1.2.3


From e5f31552674e88bff3a4e3ca3e5357668b5f2973 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 12 Aug 2021 20:33:58 +0200
Subject: ethernet: fix PTP_1588_CLOCK dependencies

The 'imply' keyword does not do what most people think it does, it only
politely asks Kconfig to turn on another symbol, but does not prevent
it from being disabled manually or built as a loadable module when the
user is built-in. In the ICE driver, the latter now causes a link failure:

aarch64-linux-ld: drivers/net/ethernet/intel/ice/ice_main.o: in function `ice_eth_ioctl':
ice_main.c:(.text+0x13b0): undefined reference to `ice_ptp_get_ts_config'
ice_main.c:(.text+0x13b0): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `ice_ptp_get_ts_config'
aarch64-linux-ld: ice_main.c:(.text+0x13bc): undefined reference to `ice_ptp_set_ts_config'
ice_main.c:(.text+0x13bc): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `ice_ptp_set_ts_config'
aarch64-linux-ld: drivers/net/ethernet/intel/ice/ice_main.o: in function `ice_prepare_for_reset':
ice_main.c:(.text+0x31fc): undefined reference to `ice_ptp_release'
ice_main.c:(.text+0x31fc): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `ice_ptp_release'
aarch64-linux-ld: drivers/net/ethernet/intel/ice/ice_main.o: in function `ice_rebuild':

This is a recurring problem in many drivers, and we have discussed
it several times befores, without reaching a consensus. I'm providing
a link to the previous email thread for reference, which discusses
some related problems.

To solve the dependency issue better than the 'imply' keyword, introduce a
separate Kconfig symbol "CONFIG_PTP_1588_CLOCK_OPTIONAL" that any driver
can depend on if it is able to use PTP support when available, but works
fine without it. Whenever CONFIG_PTP_1588_CLOCK=m, those drivers are
then prevented from being built-in, the same way as with a 'depends on
PTP_1588_CLOCK || !PTP_1588_CLOCK' dependency that does the same trick,
but that can be rather confusing when you first see it.

Since this should cover the dependencies correctly, the IS_REACHABLE()
hack in the header is no longer needed now, and can be turned back
into a normal IS_ENABLED() check. Any driver that gets the dependency
wrong will now cause a link time failure rather than being unable to use
PTP support when that is in a loadable module.

However, the two recently added ptp_get_vclocks_index() and
ptp_convert_timestamp() interfaces are only called from builtin code with
ethtool and socket timestamps, so keep the current behavior by stubbing
those out completely when PTP is in a loadable module. This should be
addressed properly in a follow-up.

As Richard suggested, we may want to actually turn PTP support into a
'bool' option later on, preventing it from being a loadable module
altogether, which would be one way to solve the problem with the ethtool
interface.

Fixes: 06c16d89d2cb ("ice: register 1588 PTP clock device object for E810 devices")
Link: https://lore.kernel.org/netdev/20210804121318.337276-1-arnd@kernel.org/
Link: https://lore.kernel.org/netdev/CAK8P3a06enZOf=XyZ+zcAwBczv41UuCTz+=0FMf2gBz1_cOnZQ@mail.gmail.com/
Link: https://lore.kernel.org/netdev/CAK8P3a3=eOxE-K25754+fB_-i_0BZzf9a9RfPTX3ppSwu9WZXw@mail.gmail.com/
Link: https://lore.kernel.org/netdev/20210726084540.3282344-1-arnd@kernel.org/
Acked-by: Shannon Nelson <snelson@pensando.io>
Acked-by: Jacob Keller <jacob.e.keller@intel.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20210812183509.1362782-1-arnd@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/mv88e6xxx/Kconfig               |  1 +
 drivers/net/dsa/ocelot/Kconfig                  |  2 ++
 drivers/net/dsa/sja1105/Kconfig                 |  1 +
 drivers/net/ethernet/amd/Kconfig                |  2 +-
 drivers/net/ethernet/broadcom/Kconfig           |  6 ++--
 drivers/net/ethernet/cadence/Kconfig            |  1 +
 drivers/net/ethernet/cavium/Kconfig             |  4 +--
 drivers/net/ethernet/chelsio/Kconfig            |  1 +
 drivers/net/ethernet/freescale/Kconfig          |  2 +-
 drivers/net/ethernet/hisilicon/Kconfig          |  2 +-
 drivers/net/ethernet/intel/Kconfig              | 12 +++----
 drivers/net/ethernet/marvell/octeontx2/Kconfig  |  2 ++
 drivers/net/ethernet/mellanox/mlx4/Kconfig      |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig |  2 +-
 drivers/net/ethernet/mellanox/mlxsw/Kconfig     |  2 +-
 drivers/net/ethernet/microchip/Kconfig          |  1 +
 drivers/net/ethernet/mscc/Kconfig               |  1 +
 drivers/net/ethernet/oki-semi/pch_gbe/Kconfig   |  1 +
 drivers/net/ethernet/pensando/Kconfig           |  2 +-
 drivers/net/ethernet/qlogic/Kconfig             |  2 +-
 drivers/net/ethernet/renesas/Kconfig            |  2 +-
 drivers/net/ethernet/samsung/Kconfig            |  2 +-
 drivers/net/ethernet/sfc/Kconfig                |  2 +-
 drivers/net/ethernet/stmicro/stmmac/Kconfig     |  2 +-
 drivers/net/phy/Kconfig                         |  2 ++
 drivers/ptp/Kconfig                             | 15 +++++++-
 drivers/ptp/ptp_vclock.c                        |  2 ++
 drivers/scsi/cxgbi/cxgb4i/Kconfig               |  1 +
 include/linux/ptp_clock_kernel.h                | 48 ++++++++++++++-----------
 29 files changed, 81 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/mv88e6xxx/Kconfig b/drivers/net/dsa/mv88e6xxx/Kconfig
index 634a48e6616b..7a2445a34eb7 100644
--- a/drivers/net/dsa/mv88e6xxx/Kconfig
+++ b/drivers/net/dsa/mv88e6xxx/Kconfig
@@ -2,6 +2,7 @@
 config NET_DSA_MV88E6XXX
 	tristate "Marvell 88E6xxx Ethernet switch fabric support"
 	depends on NET_DSA
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select IRQ_DOMAIN
 	select NET_DSA_TAG_EDSA
 	select NET_DSA_TAG_DSA
diff --git a/drivers/net/dsa/ocelot/Kconfig b/drivers/net/dsa/ocelot/Kconfig
index 932b6b6fe817..9948544ba1c4 100644
--- a/drivers/net/dsa/ocelot/Kconfig
+++ b/drivers/net/dsa/ocelot/Kconfig
@@ -5,6 +5,7 @@ config NET_DSA_MSCC_FELIX
 	depends on NET_VENDOR_MICROSEMI
 	depends on NET_VENDOR_FREESCALE
 	depends on HAS_IOMEM
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select MSCC_OCELOT_SWITCH_LIB
 	select NET_DSA_TAG_OCELOT_8021Q
 	select NET_DSA_TAG_OCELOT
@@ -19,6 +20,7 @@ config NET_DSA_MSCC_SEVILLE
 	depends on NET_DSA
 	depends on NET_VENDOR_MICROSEMI
 	depends on HAS_IOMEM
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select MSCC_OCELOT_SWITCH_LIB
 	select NET_DSA_TAG_OCELOT_8021Q
 	select NET_DSA_TAG_OCELOT
diff --git a/drivers/net/dsa/sja1105/Kconfig b/drivers/net/dsa/sja1105/Kconfig
index b29d41e5e1e7..1291bba3f3b6 100644
--- a/drivers/net/dsa/sja1105/Kconfig
+++ b/drivers/net/dsa/sja1105/Kconfig
@@ -2,6 +2,7 @@
 config NET_DSA_SJA1105
 tristate "NXP SJA1105 Ethernet switch family support"
 	depends on NET_DSA && SPI
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select NET_DSA_TAG_SJA1105
 	select PCS_XPCS
 	select PACKING
diff --git a/drivers/net/ethernet/amd/Kconfig b/drivers/net/ethernet/amd/Kconfig
index c6a3abec86f5..4786f0504691 100644
--- a/drivers/net/ethernet/amd/Kconfig
+++ b/drivers/net/ethernet/amd/Kconfig
@@ -170,11 +170,11 @@ config AMD_XGBE
 	tristate "AMD 10GbE Ethernet driver"
 	depends on ((OF_NET && OF_ADDRESS) || ACPI || PCI) && HAS_IOMEM
 	depends on X86 || ARM64 || COMPILE_TEST
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select BITREVERSE
 	select CRC32
 	select PHYLIB
 	select AMD_XGBE_HAVE_ECC if X86
-	imply PTP_1588_CLOCK
 	help
 	  This driver supports the AMD 10GbE Ethernet device found on an
 	  AMD SoC.
diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
index 1a02ca600b71..56e0fb07aec7 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -122,8 +122,8 @@ config SB1250_MAC
 config TIGON3
 	tristate "Broadcom Tigon3 support"
 	depends on PCI
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select PHYLIB
-	imply PTP_1588_CLOCK
 	help
 	  This driver supports Broadcom Tigon3 based gigabit Ethernet cards.
 
@@ -140,7 +140,7 @@ config TIGON3_HWMON
 config BNX2X
 	tristate "Broadcom NetXtremeII 10Gb support"
 	depends on PCI
-	imply PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select FW_LOADER
 	select ZLIB_INFLATE
 	select LIBCRC32C
@@ -206,7 +206,7 @@ config SYSTEMPORT
 config BNXT
 	tristate "Broadcom NetXtreme-C/E support"
 	depends on PCI
-	imply PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select FW_LOADER
 	select LIBCRC32C
 	select NET_DEVLINK
diff --git a/drivers/net/ethernet/cadence/Kconfig b/drivers/net/ethernet/cadence/Kconfig
index e432a68ac520..5b2a461dfd28 100644
--- a/drivers/net/ethernet/cadence/Kconfig
+++ b/drivers/net/ethernet/cadence/Kconfig
@@ -22,6 +22,7 @@ if NET_VENDOR_CADENCE
 config MACB
 	tristate "Cadence MACB/GEM support"
 	depends on HAS_DMA && COMMON_CLK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select PHYLINK
 	select CRC32
 	help
diff --git a/drivers/net/ethernet/cavium/Kconfig b/drivers/net/ethernet/cavium/Kconfig
index 4875cdae622e..1c76c95b0b27 100644
--- a/drivers/net/ethernet/cavium/Kconfig
+++ b/drivers/net/ethernet/cavium/Kconfig
@@ -66,7 +66,7 @@ config LIQUIDIO
 	tristate "Cavium LiquidIO support"
 	depends on 64BIT && PCI
 	depends on PCI
-	imply PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select FW_LOADER
 	select LIBCRC32C
 	select NET_DEVLINK
@@ -91,7 +91,7 @@ config OCTEON_MGMT_ETHERNET
 config LIQUIDIO_VF
 	tristate "Cavium LiquidIO VF support"
 	depends on 64BIT && PCI_MSI
-	imply PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	help
 	  This driver supports Cavium LiquidIO Intelligent Server Adapter
 	  based on CN23XX chips.
diff --git a/drivers/net/ethernet/chelsio/Kconfig b/drivers/net/ethernet/chelsio/Kconfig
index 8ba0e08e5e64..c931ec8cac40 100644
--- a/drivers/net/ethernet/chelsio/Kconfig
+++ b/drivers/net/ethernet/chelsio/Kconfig
@@ -69,6 +69,7 @@ config CHELSIO_T3
 config CHELSIO_T4
 	tristate "Chelsio Communications T4/T5/T6 Ethernet support"
 	depends on PCI && (IPV6 || IPV6=n) && (TLS || TLS=n)
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select FW_LOADER
 	select MDIO
 	select ZLIB_DEFLATE
diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig
index 2d1abdd58fab..e04e1c5cb013 100644
--- a/drivers/net/ethernet/freescale/Kconfig
+++ b/drivers/net/ethernet/freescale/Kconfig
@@ -25,10 +25,10 @@ config FEC
 	depends on (M523x || M527x || M5272 || M528x || M520x || M532x || \
 		   ARCH_MXC || SOC_IMX28 || COMPILE_TEST)
 	default ARCH_MXC || SOC_IMX28 if ARM
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select CRC32
 	select PHYLIB
 	imply NET_SELFTESTS
-	imply PTP_1588_CLOCK
 	help
 	  Say Y here if you want to use the built-in 10/100 Fast ethernet
 	  controller on some Motorola ColdFire and Freescale i.MX processors.
diff --git a/drivers/net/ethernet/hisilicon/Kconfig b/drivers/net/ethernet/hisilicon/Kconfig
index 2ba0e7bd3466..3312e1d93c3b 100644
--- a/drivers/net/ethernet/hisilicon/Kconfig
+++ b/drivers/net/ethernet/hisilicon/Kconfig
@@ -104,7 +104,7 @@ config HNS3_HCLGE
 	tristate "Hisilicon HNS3 HCLGE Acceleration Engine & Compatibility Layer Support"
 	default m
 	depends on PCI_MSI
-	imply PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	help
 	  This selects the HNS3_HCLGE network acceleration engine & its hardware
 	  compatibility layer. The engine would be used in Hisilicon hip08 family of
diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index 82744a7501c7..b0b6f90deb7d 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -58,8 +58,8 @@ config E1000
 config E1000E
 	tristate "Intel(R) PRO/1000 PCI-Express Gigabit Ethernet support"
 	depends on PCI && (!SPARC32 || BROKEN)
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select CRC32
-	imply PTP_1588_CLOCK
 	help
 	  This driver supports the PCI-Express Intel(R) PRO/1000 gigabit
 	  ethernet family of adapters. For PCI or PCI-X e1000 adapters,
@@ -87,7 +87,7 @@ config E1000E_HWTS
 config IGB
 	tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support"
 	depends on PCI
-	imply PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select I2C
 	select I2C_ALGOBIT
 	help
@@ -159,9 +159,9 @@ config IXGB
 config IXGBE
 	tristate "Intel(R) 10GbE PCI Express adapters support"
 	depends on PCI
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select MDIO
 	select PHYLIB
-	imply PTP_1588_CLOCK
 	help
 	  This driver supports Intel(R) 10GbE PCI Express family of
 	  adapters.  For more information on how to identify your adapter, go
@@ -239,7 +239,7 @@ config IXGBEVF_IPSEC
 
 config I40E
 	tristate "Intel(R) Ethernet Controller XL710 Family support"
-	imply PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	depends on PCI
 	select AUXILIARY_BUS
 	help
@@ -295,11 +295,11 @@ config ICE
 	tristate "Intel(R) Ethernet Connection E800 Series Support"
 	default n
 	depends on PCI_MSI
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select AUXILIARY_BUS
 	select DIMLIB
 	select NET_DEVLINK
 	select PLDMFW
-	imply PTP_1588_CLOCK
 	help
 	  This driver supports Intel(R) Ethernet Connection E800 Series of
 	  devices.  For more information on how to identify your adapter, go
@@ -317,7 +317,7 @@ config FM10K
 	tristate "Intel(R) FM10000 Ethernet Switch Host Interface Support"
 	default n
 	depends on PCI_MSI
-	imply PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	help
 	  This driver supports Intel(R) FM10000 Ethernet Switch Host
 	  Interface.  For more information on how to identify your adapter,
diff --git a/drivers/net/ethernet/marvell/octeontx2/Kconfig b/drivers/net/ethernet/marvell/octeontx2/Kconfig
index 16caa02095fe..2aa0ae8abfbb 100644
--- a/drivers/net/ethernet/marvell/octeontx2/Kconfig
+++ b/drivers/net/ethernet/marvell/octeontx2/Kconfig
@@ -12,6 +12,7 @@ config OCTEONTX2_AF
 	select NET_DEVLINK
 	depends on (64BIT && COMPILE_TEST) || ARM64
 	depends on PCI
+	depends on PTP_1588_CLOCK_OPTIONAL
 	help
 	  This driver supports Marvell's OcteonTX2 Resource Virtualization
 	  Unit's admin function manager which manages all RVU HW resources
@@ -32,6 +33,7 @@ config OCTEONTX2_PF
 	select OCTEONTX2_MBOX
 	depends on (64BIT && COMPILE_TEST) || ARM64
 	depends on PCI
+	depends on PTP_1588_CLOCK_OPTIONAL
 	help
 	  This driver supports Marvell's OcteonTX2 NIC physical function.
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig
index 400e611ba041..1b4b1f642317 100644
--- a/drivers/net/ethernet/mellanox/mlx4/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig
@@ -6,8 +6,8 @@
 config MLX4_EN
 	tristate "Mellanox Technologies 1/10/40Gbit Ethernet support"
 	depends on PCI && NETDEVICES && ETHERNET && INET
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select MLX4_CORE
-	imply PTP_1588_CLOCK
 	help
 	  This driver supports Mellanox Technologies ConnectX Ethernet
 	  devices.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index e1a5a79e27c7..92056452a9e3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -10,7 +10,7 @@ config MLX5_CORE
 	select NET_DEVLINK
 	depends on VXLAN || !VXLAN
 	depends on MLXFW || !MLXFW
-	depends on PTP_1588_CLOCK || !PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	depends on PCI_HYPERV_INTERFACE || !PCI_HYPERV_INTERFACE
 	help
 	  Core driver for low level functionality of the ConnectX-4 and
diff --git a/drivers/net/ethernet/mellanox/mlxsw/Kconfig b/drivers/net/ethernet/mellanox/mlxsw/Kconfig
index 12871c8dc7c1..d1ae248e125c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlxsw/Kconfig
@@ -58,10 +58,10 @@ config MLXSW_SPECTRUM
 	depends on NET_IPGRE || NET_IPGRE=n
 	depends on IPV6_GRE || IPV6_GRE=n
 	depends on VXLAN || VXLAN=n
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select GENERIC_ALLOCATOR
 	select PARMAN
 	select OBJAGG
-	imply PTP_1588_CLOCK
 	select NET_PTP_CLASSIFY if PTP_1588_CLOCK
 	default m
 	help
diff --git a/drivers/net/ethernet/microchip/Kconfig b/drivers/net/ethernet/microchip/Kconfig
index d54aa164c4e9..735eea1dacf1 100644
--- a/drivers/net/ethernet/microchip/Kconfig
+++ b/drivers/net/ethernet/microchip/Kconfig
@@ -45,6 +45,7 @@ config ENCX24J600
 config LAN743X
 	tristate "LAN743x support"
 	depends on PCI
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select PHYLIB
 	select CRC16
 	select CRC32
diff --git a/drivers/net/ethernet/mscc/Kconfig b/drivers/net/ethernet/mscc/Kconfig
index 2d3157e4d081..b1d68e197258 100644
--- a/drivers/net/ethernet/mscc/Kconfig
+++ b/drivers/net/ethernet/mscc/Kconfig
@@ -24,6 +24,7 @@ config MSCC_OCELOT_SWITCH_LIB
 
 config MSCC_OCELOT_SWITCH
 	tristate "Ocelot switch driver"
+	depends on PTP_1588_CLOCK_OPTIONAL
 	depends on BRIDGE || BRIDGE=n
 	depends on NET_SWITCHDEV
 	depends on HAS_IOMEM
diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/Kconfig b/drivers/net/ethernet/oki-semi/pch_gbe/Kconfig
index af84f72bf08e..4e18b64dceb9 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/Kconfig
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/Kconfig
@@ -6,6 +6,7 @@
 config PCH_GBE
 	tristate "OKI SEMICONDUCTOR IOH(ML7223/ML7831) GbE"
 	depends on PCI && (X86_32 || COMPILE_TEST)
+	depends on PTP_1588_CLOCK
 	select MII
 	select PTP_1588_CLOCK_PCH
 	select NET_PTP_CLASSIFY
diff --git a/drivers/net/ethernet/pensando/Kconfig b/drivers/net/ethernet/pensando/Kconfig
index 202973a82712..3f7519e435b8 100644
--- a/drivers/net/ethernet/pensando/Kconfig
+++ b/drivers/net/ethernet/pensando/Kconfig
@@ -20,7 +20,7 @@ if NET_VENDOR_PENSANDO
 config IONIC
 	tristate "Pensando Ethernet IONIC Support"
 	depends on 64BIT && PCI
-	depends on PTP_1588_CLOCK || !PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select NET_DEVLINK
 	select DIMLIB
 	help
diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig
index 98f430905ffa..1203353238e5 100644
--- a/drivers/net/ethernet/qlogic/Kconfig
+++ b/drivers/net/ethernet/qlogic/Kconfig
@@ -99,7 +99,7 @@ config QED_SRIOV
 config QEDE
 	tristate "QLogic QED 25/40/100Gb Ethernet NIC"
 	depends on QED
-	imply PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK_OPTIONAL
 	help
 	  This enables the support for Marvell FastLinQ adapters family,
 	  ethernet driver.
diff --git a/drivers/net/ethernet/renesas/Kconfig b/drivers/net/ethernet/renesas/Kconfig
index 5a2a4af31812..8008b2f45934 100644
--- a/drivers/net/ethernet/renesas/Kconfig
+++ b/drivers/net/ethernet/renesas/Kconfig
@@ -32,11 +32,11 @@ config SH_ETH
 config RAVB
 	tristate "Renesas Ethernet AVB support"
 	depends on ARCH_RENESAS || COMPILE_TEST
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select CRC32
 	select MII
 	select MDIO_BITBANG
 	select PHYLIB
-	imply PTP_1588_CLOCK
 	help
 	  Renesas Ethernet AVB device driver.
 	  This driver supports the following SoCs:
diff --git a/drivers/net/ethernet/samsung/Kconfig b/drivers/net/ethernet/samsung/Kconfig
index 0582e110b1c0..2a6c2658d284 100644
--- a/drivers/net/ethernet/samsung/Kconfig
+++ b/drivers/net/ethernet/samsung/Kconfig
@@ -20,9 +20,9 @@ if NET_VENDOR_SAMSUNG
 config SXGBE_ETH
 	tristate "Samsung 10G/2.5G/1G SXGBE Ethernet driver"
 	depends on HAS_IOMEM && HAS_DMA
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select PHYLIB
 	select CRC32
-	imply PTP_1588_CLOCK
 	help
 	  This is the driver for the SXGBE 10G Ethernet IP block found on
 	  Samsung platforms.
diff --git a/drivers/net/ethernet/sfc/Kconfig b/drivers/net/ethernet/sfc/Kconfig
index 5e37c8313725..97ce64079855 100644
--- a/drivers/net/ethernet/sfc/Kconfig
+++ b/drivers/net/ethernet/sfc/Kconfig
@@ -19,9 +19,9 @@ if NET_VENDOR_SOLARFLARE
 config SFC
 	tristate "Solarflare SFC9000/SFC9100/EF100-family support"
 	depends on PCI
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select MDIO
 	select CRC32
-	imply PTP_1588_CLOCK
 	help
 	  This driver supports 10/40-gigabit Ethernet cards based on
 	  the Solarflare SFC9000-family and SFC9100-family controllers.
diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig
index ac3c248d4f9b..929cfc22cd0c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/Kconfig
+++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig
@@ -2,12 +2,12 @@
 config STMMAC_ETH
 	tristate "STMicroelectronics Multi-Gigabit Ethernet driver"
 	depends on HAS_IOMEM && HAS_DMA
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select MII
 	select PCS_XPCS
 	select PAGE_POOL
 	select PHYLINK
 	select CRC32
-	imply PTP_1588_CLOCK
 	select RESET_CONTROLLER
 	help
 	  This is the driver for the Ethernet IPs built around a
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 7564ae0c1997..902495afcb38 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -236,6 +236,7 @@ config MICROCHIP_T1_PHY
 config MICROSEMI_PHY
 	tristate "Microsemi PHYs"
 	depends on MACSEC || MACSEC=n
+	depends on PTP_1588_CLOCK_OPTIONAL || !NETWORK_PHY_TIMESTAMPING
 	select CRYPTO_LIB_AES if MACSEC
 	help
 	  Currently supports VSC8514, VSC8530, VSC8531, VSC8540 and VSC8541 PHYs
@@ -253,6 +254,7 @@ config NATIONAL_PHY
 
 config NXP_C45_TJA11XX_PHY
 	tristate "NXP C45 TJA11XX PHYs"
+	depends on PTP_1588_CLOCK_OPTIONAL
 	help
 	  Enable support for NXP C45 TJA11XX PHYs.
 	  Currently supports only the TJA1103 PHY.
diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index 8b08745e1ca1..e82b4a678acb 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -8,6 +8,7 @@ menu "PTP clock support"
 config PTP_1588_CLOCK
 	tristate "PTP clock support"
 	depends on NET && POSIX_TIMERS
+	default ETHERNET
 	select PPS
 	select NET_PTP_CLASSIFY
 	help
@@ -26,6 +27,18 @@ config PTP_1588_CLOCK
 	  To compile this driver as a module, choose M here: the module
 	  will be called ptp.
 
+config PTP_1588_CLOCK_OPTIONAL
+	tristate
+	default y if PTP_1588_CLOCK=n
+	default PTP_1588_CLOCK
+	help
+	  Drivers that can optionally use the PTP_1588_CLOCK framework
+	  should depend on this symbol to prevent them from being built
+	  into vmlinux while the PTP support itself is in a loadable
+	  module.
+	  If PTP support is disabled, this dependency will still be
+	  met, and drivers refer to dummy helpers.
+
 config PTP_1588_CLOCK_DTE
 	tristate "Broadcom DTE as PTP clock"
 	depends on PTP_1588_CLOCK
@@ -91,7 +104,7 @@ config PTP_1588_CLOCK_PCH
 	tristate "Intel PCH EG20T as PTP clock"
 	depends on X86_32 || COMPILE_TEST
 	depends on HAS_IOMEM && NET
-	imply PTP_1588_CLOCK
+	depends on PTP_1588_CLOCK
 	help
 	  This driver adds support for using the PCH EG20T as a PTP
 	  clock. The hardware supports time stamping of PTP packets
diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
index e0f87c57749a..baee0379482b 100644
--- a/drivers/ptp/ptp_vclock.c
+++ b/drivers/ptp/ptp_vclock.c
@@ -149,6 +149,7 @@ void ptp_vclock_unregister(struct ptp_vclock *vclock)
 	kfree(vclock);
 }
 
+#if IS_BUILTIN(CONFIG_PTP_1588_CLOCK)
 int ptp_get_vclocks_index(int pclock_index, int **vclock_index)
 {
 	char name[PTP_CLOCK_NAME_LEN] = "";
@@ -217,3 +218,4 @@ void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
 	hwtstamps->hwtstamp = ns_to_ktime(ns);
 }
 EXPORT_SYMBOL(ptp_convert_timestamp);
+#endif
diff --git a/drivers/scsi/cxgbi/cxgb4i/Kconfig b/drivers/scsi/cxgbi/cxgb4i/Kconfig
index 8b0deece9758..63c8a0f3cd0c 100644
--- a/drivers/scsi/cxgbi/cxgb4i/Kconfig
+++ b/drivers/scsi/cxgbi/cxgb4i/Kconfig
@@ -2,6 +2,7 @@
 config SCSI_CXGB4_ISCSI
 	tristate "Chelsio T4 iSCSI support"
 	depends on PCI && INET && (IPV6 || IPV6=n)
+	depends on PTP_1588_CLOCK_OPTIONAL
 	depends on THERMAL || !THERMAL
 	depends on ETHERNET
 	depends on TLS || TLS=n
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 71fac9237725..2e5565067355 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -215,7 +215,7 @@ static inline long scaled_ppm_to_ppb(long ppm)
 	return (long)ppb;
 }
 
-#if IS_REACHABLE(CONFIG_PTP_1588_CLOCK)
+#if IS_ENABLED(CONFIG_PTP_1588_CLOCK)
 
 /**
  * ptp_clock_register() - register a PTP hardware clock driver
@@ -307,6 +307,33 @@ int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay);
  */
 void ptp_cancel_worker_sync(struct ptp_clock *ptp);
 
+#else
+static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
+						   struct device *parent)
+{ return NULL; }
+static inline int ptp_clock_unregister(struct ptp_clock *ptp)
+{ return 0; }
+static inline void ptp_clock_event(struct ptp_clock *ptp,
+				   struct ptp_clock_event *event)
+{ }
+static inline int ptp_clock_index(struct ptp_clock *ptp)
+{ return -1; }
+static inline int ptp_find_pin(struct ptp_clock *ptp,
+			       enum ptp_pin_function func, unsigned int chan)
+{ return -1; }
+static inline int ptp_schedule_worker(struct ptp_clock *ptp,
+				      unsigned long delay)
+{ return -EOPNOTSUPP; }
+static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp)
+{ }
+#endif
+
+#if IS_BUILTIN(CONFIG_PTP_1588_CLOCK)
+/*
+ * These are called by the network core, and don't work if PTP is in
+ * a loadable module.
+ */
+
 /**
  * ptp_get_vclocks_index() - get all vclocks index on pclock, and
  *                           caller is responsible to free memory
@@ -327,26 +354,7 @@ int ptp_get_vclocks_index(int pclock_index, int **vclock_index);
  */
 void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
 			   int vclock_index);
-
 #else
-static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
-						   struct device *parent)
-{ return NULL; }
-static inline int ptp_clock_unregister(struct ptp_clock *ptp)
-{ return 0; }
-static inline void ptp_clock_event(struct ptp_clock *ptp,
-				   struct ptp_clock_event *event)
-{ }
-static inline int ptp_clock_index(struct ptp_clock *ptp)
-{ return -1; }
-static inline int ptp_find_pin(struct ptp_clock *ptp,
-			       enum ptp_pin_function func, unsigned int chan)
-{ return -1; }
-static inline int ptp_schedule_worker(struct ptp_clock *ptp,
-				      unsigned long delay)
-{ return -EOPNOTSUPP; }
-static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp)
-{ }
 static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index)
 { return 0; }
 static inline void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
-- 
cgit v1.2.3


From 437ebfd90a2567aab19dce47bafc81ebd8a63324 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sat, 14 Aug 2021 12:57:28 +0300
Subject: devlink: Count struct devlink consumers

The struct devlink itself is protected by internal lock and doesn't
need global lock during operation. That global lock is used to protect
addition/removal new devlink instances from the global list in use by
all devlink consumers in the system.

The future conversion of linked list to be xarray will allow us to
actually delete that lock, but first we need to count all struct devlink
users.

The reference counting provides us a way to ensure that no new user
space commands success to grab devlink instance which is going to be
destroyed makes it is safe to access it without lock.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |   2 +
 net/core/devlink.c    | 205 +++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 172 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 1151497c0ec5..4c60d61d92da 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -56,6 +56,8 @@ struct devlink {
 			    */
 	u8 reload_failed:1,
 	   reload_enabled:1;
+	refcount_t refcount;
+	struct completion comp;
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index c8a8eecad1c5..76f459da6e05 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -108,10 +108,22 @@ struct net *devlink_net(const struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devlink_net);
 
+static void devlink_put(struct devlink *devlink)
+{
+	if (refcount_dec_and_test(&devlink->refcount))
+		complete(&devlink->comp);
+}
+
+static bool __must_check devlink_try_get(struct devlink *devlink)
+{
+	return refcount_inc_not_zero(&devlink->refcount);
+}
+
 static struct devlink *devlink_get_from_attrs(struct net *net,
 					      struct nlattr **attrs)
 {
 	struct devlink *devlink;
+	bool found = false;
 	char *busname;
 	char *devname;
 
@@ -126,16 +138,16 @@ static struct devlink *devlink_get_from_attrs(struct net *net,
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (strcmp(devlink->dev->bus->name, busname) == 0 &&
 		    strcmp(dev_name(devlink->dev), devname) == 0 &&
-		    net_eq(devlink_net(devlink), net))
-			return devlink;
+		    net_eq(devlink_net(devlink), net)) {
+			found = true;
+			break;
+		}
 	}
 
-	return ERR_PTR(-ENODEV);
-}
+	if (!found || !devlink_try_get(devlink))
+		devlink = ERR_PTR(-ENODEV);
 
-static struct devlink *devlink_get_from_info(struct genl_info *info)
-{
-	return devlink_get_from_attrs(genl_info_net(info), info->attrs);
+	return devlink;
 }
 
 static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
@@ -486,7 +498,7 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	devlink = devlink_get_from_info(info);
+	devlink = devlink_get_from_attrs(genl_info_net(info), info->attrs);
 	if (IS_ERR(devlink)) {
 		mutex_unlock(&devlink_mutex);
 		return PTR_ERR(devlink);
@@ -529,6 +541,7 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
 unlock:
 	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
 		mutex_unlock(&devlink->lock);
+	devlink_put(devlink);
 	mutex_unlock(&devlink_mutex);
 	return err;
 }
@@ -541,6 +554,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
 	devlink = info->user_ptr[0];
 	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
 		mutex_unlock(&devlink->lock);
+	devlink_put(devlink);
 	mutex_unlock(&devlink_mutex);
 }
 
@@ -1078,8 +1092,12 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry;
+
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
 			enum devlink_command cmd = DEVLINK_CMD_RATE_NEW;
@@ -1094,11 +1112,14 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg,
 						   NLM_F_MULTI, NULL);
 			if (err) {
 				mutex_unlock(&devlink->lock);
+				devlink_put(devlink);
 				goto out;
 			}
 			idx++;
 		}
 		mutex_unlock(&devlink->lock);
+retry:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -1173,15 +1194,24 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) {
+			devlink_put(devlink);
+			continue;
+		}
+
 		if (idx < start) {
 			idx++;
+			devlink_put(devlink);
 			continue;
 		}
+
 		err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
 				      NETLINK_CB(cb->skb).portid,
 				      cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		devlink_put(devlink);
 		if (err)
 			goto out;
 		idx++;
@@ -1226,8 +1256,12 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry;
+
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_port, &devlink->port_list, list) {
 			if (idx < start) {
@@ -1241,11 +1275,14 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
 						   NLM_F_MULTI, cb->extack);
 			if (err) {
 				mutex_unlock(&devlink->lock);
+				devlink_put(devlink);
 				goto out;
 			}
 			idx++;
 		}
 		mutex_unlock(&devlink->lock);
+retry:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -1884,8 +1921,12 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry;
+
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			if (idx < start) {
@@ -1899,11 +1940,14 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
 						 NLM_F_MULTI);
 			if (err) {
 				mutex_unlock(&devlink->lock);
+				devlink_put(devlink);
 				goto out;
 			}
 			idx++;
 		}
 		mutex_unlock(&devlink->lock);
+retry:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -2028,9 +2072,13 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!devlink_try_get(devlink))
+			continue;
+
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops->sb_pool_get)
-			continue;
+			goto retry;
+
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_pool_get_dumpit(msg, start, &idx, devlink,
@@ -2041,10 +2089,13 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
 				err = 0;
 			} else if (err) {
 				mutex_unlock(&devlink->lock);
+				devlink_put(devlink);
 				goto out;
 			}
 		}
 		mutex_unlock(&devlink->lock);
+retry:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -2241,9 +2292,13 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!devlink_try_get(devlink))
+			continue;
+
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops->sb_port_pool_get)
-			continue;
+			goto retry;
+
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_port_pool_get_dumpit(msg, start, &idx,
@@ -2254,10 +2309,13 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
 				err = 0;
 			} else if (err) {
 				mutex_unlock(&devlink->lock);
+				devlink_put(devlink);
 				goto out;
 			}
 		}
 		mutex_unlock(&devlink->lock);
+retry:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -2482,9 +2540,12 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!devlink_try_get(devlink))
+			continue;
+
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops->sb_tc_pool_bind_get)
-			continue;
+			goto retry;
 
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
@@ -2497,10 +2558,13 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
 				err = 0;
 			} else if (err) {
 				mutex_unlock(&devlink->lock);
+				devlink_put(devlink);
 				goto out;
 			}
 		}
 		mutex_unlock(&devlink->lock);
+retry:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -4552,8 +4616,12 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry;
+
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(param_item, &devlink->param_list, list) {
 			if (idx < start) {
@@ -4569,11 +4637,14 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
 				err = 0;
 			} else if (err) {
 				mutex_unlock(&devlink->lock);
+				devlink_put(devlink);
 				goto out;
 			}
 			idx++;
 		}
 		mutex_unlock(&devlink->lock);
+retry:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -4820,8 +4891,12 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry;
+
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_port, &devlink->port_list, list) {
 			list_for_each_entry(param_item,
@@ -4841,12 +4916,15 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
 					err = 0;
 				} else if (err) {
 					mutex_unlock(&devlink->lock);
+					devlink_put(devlink);
 					goto out;
 				}
 				idx++;
 			}
 		}
 		mutex_unlock(&devlink->lock);
+retry:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -5385,14 +5463,20 @@ static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
 	struct devlink *devlink;
 	int start = cb->args[0];
 	int idx = 0;
-	int err;
+	int err = 0;
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry;
+
 		err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink,
 							       &idx, start);
+retry:
+		devlink_put(devlink);
 		if (err)
 			goto out;
 	}
@@ -5755,6 +5839,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
 	nla_nest_end(skb, chunks_attr);
 	genlmsg_end(skb, hdr);
 	mutex_unlock(&devlink->lock);
+	devlink_put(devlink);
 	mutex_unlock(&devlink_mutex);
 
 	return skb->len;
@@ -5763,6 +5848,7 @@ nla_put_failure:
 	genlmsg_cancel(skb, hdr);
 out_unlock:
 	mutex_unlock(&devlink->lock);
+	devlink_put(devlink);
 out_dev:
 	mutex_unlock(&devlink_mutex);
 	return err;
@@ -5914,17 +6000,14 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
-		if (idx < start) {
-			idx++;
-			continue;
-		}
 
-		if (!devlink->ops->info_get) {
-			idx++;
-			continue;
-		}
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry;
+
+		if (idx < start || !devlink->ops->info_get)
+			goto inc;
 
 		mutex_lock(&devlink->lock);
 		err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
@@ -5934,9 +6017,14 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg,
 		mutex_unlock(&devlink->lock);
 		if (err == -EOPNOTSUPP)
 			err = 0;
-		else if (err)
+		else if (err) {
+			devlink_put(devlink);
 			break;
+		}
+inc:
 		idx++;
+retry:
+		devlink_put(devlink);
 	}
 	mutex_unlock(&devlink_mutex);
 
@@ -7021,6 +7109,7 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb)
 		goto unlock;
 
 	reporter = devlink_health_reporter_get_from_attrs(devlink, attrs);
+	devlink_put(devlink);
 	mutex_unlock(&devlink_mutex);
 	return reporter;
 unlock:
@@ -7092,8 +7181,12 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry_rep;
+
 		mutex_lock(&devlink->reporters_lock);
 		list_for_each_entry(reporter, &devlink->reporter_list,
 				    list) {
@@ -7107,16 +7200,23 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
 				NLM_F_MULTI);
 			if (err) {
 				mutex_unlock(&devlink->reporters_lock);
+				devlink_put(devlink);
 				goto out;
 			}
 			idx++;
 		}
 		mutex_unlock(&devlink->reporters_lock);
+retry_rep:
+		devlink_put(devlink);
 	}
 
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry_port;
+
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(port, &devlink->port_list, list) {
 			mutex_lock(&port->reporters_lock);
@@ -7133,6 +7233,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
 				if (err) {
 					mutex_unlock(&port->reporters_lock);
 					mutex_unlock(&devlink->lock);
+					devlink_put(devlink);
 					goto out;
 				}
 				idx++;
@@ -7140,6 +7241,8 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
 			mutex_unlock(&port->reporters_lock);
 		}
 		mutex_unlock(&devlink->lock);
+retry_port:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -7673,8 +7776,12 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry;
+
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(trap_item, &devlink->trap_list, list) {
 			if (idx < start) {
@@ -7688,11 +7795,14 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg,
 						   NLM_F_MULTI);
 			if (err) {
 				mutex_unlock(&devlink->lock);
+				devlink_put(devlink);
 				goto out;
 			}
 			idx++;
 		}
 		mutex_unlock(&devlink->lock);
+retry:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -7892,8 +8002,12 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry;
+
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(group_item, &devlink->trap_group_list,
 				    list) {
@@ -7908,11 +8022,14 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg,
 							 NLM_F_MULTI);
 			if (err) {
 				mutex_unlock(&devlink->lock);
+				devlink_put(devlink);
 				goto out;
 			}
 			idx++;
 		}
 		mutex_unlock(&devlink->lock);
+retry:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -8198,8 +8315,12 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg,
 
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+		if (!devlink_try_get(devlink))
 			continue;
+
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			goto retry;
+
 		mutex_lock(&devlink->lock);
 		list_for_each_entry(policer_item, &devlink->trap_policer_list,
 				    list) {
@@ -8214,11 +8335,14 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg,
 							   NLM_F_MULTI);
 			if (err) {
 				mutex_unlock(&devlink->lock);
+				devlink_put(devlink);
 				goto out;
 			}
 			idx++;
 		}
 		mutex_unlock(&devlink->lock);
+retry:
+		devlink_put(devlink);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -8801,6 +8925,9 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
 	INIT_LIST_HEAD(&devlink->trap_policer_list);
 	mutex_init(&devlink->lock);
 	mutex_init(&devlink->reporters_lock);
+	refcount_set(&devlink->refcount, 1);
+	init_completion(&devlink->comp);
+
 	return devlink;
 }
 EXPORT_SYMBOL_GPL(devlink_alloc_ns);
@@ -8827,6 +8954,9 @@ EXPORT_SYMBOL_GPL(devlink_register);
  */
 void devlink_unregister(struct devlink *devlink)
 {
+	devlink_put(devlink);
+	wait_for_completion(&devlink->comp);
+
 	mutex_lock(&devlink_mutex);
 	WARN_ON(devlink_reload_supported(devlink->ops) &&
 		devlink->reload_enabled);
@@ -11374,9 +11504,12 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
 	 */
 	mutex_lock(&devlink_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), net))
+		if (!devlink_try_get(devlink))
 			continue;
 
+		if (!net_eq(devlink_net(devlink), net))
+			goto retry;
+
 		WARN_ON(!devlink_reload_supported(devlink->ops));
 		err = devlink_reload(devlink, &init_net,
 				     DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
@@ -11384,6 +11517,8 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
 				     &actions_performed, NULL);
 		if (err && err != -EOPNOTSUPP)
 			pr_warn("Failed to reload devlink instance into init_net\n");
+retry:
+		devlink_put(devlink);
 	}
 	mutex_unlock(&devlink_mutex);
 }
-- 
cgit v1.2.3


From 11a861d767cdd87a34397821b0fd2095893b84b3 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sat, 14 Aug 2021 12:57:29 +0300
Subject: devlink: Use xarray to store devlink instances

We can use xarray instead of linearly organized linked lists for the
devlink instances. This will let us revise the locking scheme in favour
of internal xarray locking that protects database.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |  2 +-
 net/core/devlink.c    | 70 +++++++++++++++++++++++++++++++++++----------------
 2 files changed, 50 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4c60d61d92da..154cf0dbca37 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -32,7 +32,7 @@ struct devlink_dev_stats {
 struct devlink_ops;
 
 struct devlink {
-	struct list_head list;
+	u32 index;
 	struct list_head port_list;
 	struct list_head rate_list;
 	struct list_head sb_list;
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 76f459da6e05..d218f57ad8cf 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -92,7 +92,8 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_
 				 DEVLINK_PORT_FN_STATE_ACTIVE),
 };
 
-static LIST_HEAD(devlink_list);
+static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC);
+#define DEVLINK_REGISTERED XA_MARK_1
 
 /* devlink_mutex
  *
@@ -123,6 +124,7 @@ static struct devlink *devlink_get_from_attrs(struct net *net,
 					      struct nlattr **attrs)
 {
 	struct devlink *devlink;
+	unsigned long index;
 	bool found = false;
 	char *busname;
 	char *devname;
@@ -135,7 +137,7 @@ static struct devlink *devlink_get_from_attrs(struct net *net,
 
 	lockdep_assert_held(&devlink_mutex);
 
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (strcmp(devlink->dev->bus->name, busname) == 0 &&
 		    strcmp(dev_name(devlink->dev), devname) == 0 &&
 		    net_eq(devlink_net(devlink), net)) {
@@ -1087,11 +1089,12 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg,
 	struct devlink_rate *devlink_rate;
 	struct devlink *devlink;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err = 0;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -1189,11 +1192,12 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
 {
 	struct devlink *devlink;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -1251,11 +1255,12 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
 	struct devlink *devlink;
 	struct devlink_port *devlink_port;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -1916,11 +1921,12 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
 	struct devlink *devlink;
 	struct devlink_sb *devlink_sb;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -2067,11 +2073,12 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
 	struct devlink *devlink;
 	struct devlink_sb *devlink_sb;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err = 0;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -2287,11 +2294,12 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
 	struct devlink *devlink;
 	struct devlink_sb *devlink_sb;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err = 0;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -2535,11 +2543,12 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
 	struct devlink *devlink;
 	struct devlink_sb *devlink_sb;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err = 0;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -4611,11 +4620,12 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
 	struct devlink_param_item *param_item;
 	struct devlink *devlink;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err = 0;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -4886,11 +4896,12 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
 	struct devlink_port *devlink_port;
 	struct devlink *devlink;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err = 0;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -5462,11 +5473,12 @@ static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
 {
 	struct devlink *devlink;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err = 0;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -5995,11 +6007,12 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg,
 {
 	struct devlink *devlink;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err = 0;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -7176,11 +7189,12 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
 	struct devlink_port *port;
 	struct devlink *devlink;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -7210,7 +7224,7 @@ retry_rep:
 		devlink_put(devlink);
 	}
 
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -7771,11 +7785,12 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg,
 	struct devlink_trap_item *trap_item;
 	struct devlink *devlink;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -7997,11 +8012,12 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg,
 	u32 portid = NETLINK_CB(cb->skb).portid;
 	struct devlink *devlink;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -8310,11 +8326,12 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg,
 	u32 portid = NETLINK_CB(cb->skb).portid;
 	struct devlink *devlink;
 	int start = cb->args[0];
+	unsigned long index;
 	int idx = 0;
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
@@ -8899,6 +8916,8 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
 				 struct device *dev)
 {
 	struct devlink *devlink;
+	static u32 last_id;
+	int ret;
 
 	WARN_ON(!ops || !dev);
 	if (!devlink_reload_actions_valid(ops))
@@ -8908,6 +8927,13 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
 	if (!devlink)
 		return NULL;
 
+	ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b,
+			      &last_id, GFP_KERNEL);
+	if (ret < 0) {
+		kfree(devlink);
+		return NULL;
+	}
+
 	devlink->dev = dev;
 	devlink->ops = ops;
 	xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
@@ -8940,7 +8966,7 @@ EXPORT_SYMBOL_GPL(devlink_alloc_ns);
 int devlink_register(struct devlink *devlink)
 {
 	mutex_lock(&devlink_mutex);
-	list_add_tail(&devlink->list, &devlink_list);
+	xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
 	devlink_notify(devlink, DEVLINK_CMD_NEW);
 	mutex_unlock(&devlink_mutex);
 	return 0;
@@ -8961,7 +8987,7 @@ void devlink_unregister(struct devlink *devlink)
 	WARN_ON(devlink_reload_supported(devlink->ops) &&
 		devlink->reload_enabled);
 	devlink_notify(devlink, DEVLINK_CMD_DEL);
-	list_del(&devlink->list);
+	xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
 	mutex_unlock(&devlink_mutex);
 }
 EXPORT_SYMBOL_GPL(devlink_unregister);
@@ -9023,6 +9049,7 @@ void devlink_free(struct devlink *devlink)
 	WARN_ON(!list_empty(&devlink->port_list));
 
 	xa_destroy(&devlink->snapshot_ids);
+	xa_erase(&devlinks, devlink->index);
 
 	kfree(devlink);
 }
@@ -11497,13 +11524,14 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
 {
 	struct devlink *devlink;
 	u32 actions_performed;
+	unsigned long index;
 	int err;
 
 	/* In case network namespace is getting destroyed, reload
 	 * all devlink instances from this namespace into init_net.
 	 */
 	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
+	xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
 		if (!devlink_try_get(devlink))
 			continue;
 
-- 
cgit v1.2.3


From c7fa1d9b1fb179375e889ff076a1566ecc997bfc Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Fri, 13 Aug 2021 18:00:00 +0300
Subject: net: bridge: mcast: dump ipv4 querier state

Add support for dumping global IPv4 querier state, we dump the state
only if our own querier is enabled or there has been another external
querier which has won the election. For the bridge global state we use
a new attribute IFLA_BR_MCAST_QUERIER_STATE and embed the state inside.
The structure is:
 [IFLA_BR_MCAST_QUERIER_STATE]
  `[BRIDGE_QUERIER_IP_ADDRESS] - ip address of the querier
  `[BRIDGE_QUERIER_IP_PORT]    - bridge port ifindex where the querier was
                                 seen (set only if external querier)
  `[BRIDGE_QUERIER_IP_OTHER_TIMER]   -  other querier timeout

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 10 ++++++
 include/uapi/linux/if_link.h   |  1 +
 net/bridge/br_multicast.c      | 73 ++++++++++++++++++++++++++++++++++++++++++
 net/bridge/br_netlink.c        |  5 ++-
 net/bridge/br_private.h        |  4 +++
 5 files changed, 92 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 620d86e825b8..e0fff67fcd88 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -770,4 +770,14 @@ struct br_boolopt_multi {
 	__u32 optval;
 	__u32 optmask;
 };
+
+enum {
+	BRIDGE_QUERIER_UNSPEC,
+	BRIDGE_QUERIER_IP_ADDRESS,
+	BRIDGE_QUERIER_IP_PORT,
+	BRIDGE_QUERIER_IP_OTHER_TIMER,
+	BRIDGE_QUERIER_PAD,
+	__BRIDGE_QUERIER_MAX
+};
+#define BRIDGE_QUERIER_MAX (__BRIDGE_QUERIER_MAX - 1)
 #endif /* _UAPI_LINUX_IF_BRIDGE_H */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 5310003523ce..8aad65b69054 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -479,6 +479,7 @@ enum {
 	IFLA_BR_MCAST_MLD_VERSION,
 	IFLA_BR_VLAN_STATS_PER_PORT,
 	IFLA_BR_MULTI_BOOLOPT,
+	IFLA_BR_MCAST_QUERIER_STATE,
 	__IFLA_BR_MAX,
 };
 
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 3705b7ace62d..4513bc13b6d3 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -2905,6 +2905,79 @@ update:
 	return true;
 }
 
+static struct net_bridge_port *
+__br_multicast_get_querier_port(struct net_bridge *br,
+				const struct bridge_mcast_querier *querier)
+{
+	int port_ifidx = READ_ONCE(querier->port_ifidx);
+	struct net_bridge_port *p;
+	struct net_device *dev;
+
+	if (port_ifidx == 0)
+		return NULL;
+
+	dev = dev_get_by_index_rcu(dev_net(br->dev), port_ifidx);
+	if (!dev)
+		return NULL;
+	p = br_port_get_rtnl_rcu(dev);
+	if (!p || p->br != br)
+		return NULL;
+
+	return p;
+}
+
+size_t br_multicast_querier_state_size(void)
+{
+	return nla_total_size(sizeof(0)) +      /* nest attribute */
+	       nla_total_size(sizeof(__be32)) + /* BRIDGE_QUERIER_IP_ADDRESS */
+	       nla_total_size(sizeof(int)) +    /* BRIDGE_QUERIER_IP_PORT */
+	       nla_total_size_64bit(sizeof(u64)); /* BRIDGE_QUERIER_IP_OTHER_TIMER */
+}
+
+/* protected by rtnl or rcu */
+int br_multicast_dump_querier_state(struct sk_buff *skb,
+				    const struct net_bridge_mcast *brmctx,
+				    int nest_attr)
+{
+	struct bridge_mcast_querier querier = {};
+	struct net_bridge_port *p;
+	struct nlattr *nest;
+
+	if (!brmctx->multicast_querier &&
+	    !timer_pending(&brmctx->ip4_other_query.timer))
+		return 0;
+
+	nest = nla_nest_start(skb, nest_attr);
+	if (!nest)
+		return -EMSGSIZE;
+
+	rcu_read_lock();
+	br_multicast_read_querier(&brmctx->ip4_querier, &querier);
+	if (nla_put_in_addr(skb, BRIDGE_QUERIER_IP_ADDRESS,
+			    querier.addr.src.ip4)) {
+		rcu_read_unlock();
+		goto out_err;
+	}
+
+	p = __br_multicast_get_querier_port(brmctx->br, &querier);
+	if (timer_pending(&brmctx->ip4_other_query.timer) &&
+	    (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IP_OTHER_TIMER,
+			       br_timer_value(&brmctx->ip4_other_query.timer),
+			       BRIDGE_QUERIER_PAD) ||
+	     (p && nla_put_u32(skb, BRIDGE_QUERIER_IP_PORT, p->dev->ifindex)))) {
+		rcu_read_unlock();
+		goto out_err;
+	}
+	rcu_read_unlock();
+	nla_nest_end(skb, nest);
+
+	return 0;
+
+out_err:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
 static void
 br_multicast_update_query_timer(struct net_bridge_mcast *brmctx,
 				struct bridge_mcast_other_query *query,
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 8ae026fa2ad7..2f184ad8ae29 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1501,6 +1501,7 @@ static size_t br_get_size(const struct net_device *brdev)
 	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */
 	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_MCAST_IGMP_VERSION */
 	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_MCAST_MLD_VERSION */
+	       br_multicast_querier_state_size() + /* IFLA_BR_MCAST_QUERIER_STATE */
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	       nla_total_size(sizeof(u8)) +     /* IFLA_BR_NF_CALL_IPTABLES */
@@ -1587,7 +1588,9 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	    nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT,
 			br->multicast_ctx.multicast_startup_query_count) ||
 	    nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION,
-		       br->multicast_ctx.multicast_igmp_version))
+		       br->multicast_ctx.multicast_igmp_version) ||
+	    br_multicast_dump_querier_state(skb, &br->multicast_ctx,
+					    IFLA_BR_MCAST_QUERIER_STATE))
 		return -EMSGSIZE;
 #if IS_ENABLED(CONFIG_IPV6)
 	if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 70e2d9a7f105..9b1bf98a2c5a 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -948,6 +948,10 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
 		  struct netlink_ext_ack *extack);
 int br_rports_fill_info(struct sk_buff *skb,
 			const struct net_bridge_mcast *brmctx);
+int br_multicast_dump_querier_state(struct sk_buff *skb,
+				    const struct net_bridge_mcast *brmctx,
+				    int nest_attr);
+size_t br_multicast_querier_state_size(void);
 
 static inline bool br_group_is_l2(const struct br_ip *group)
 {
-- 
cgit v1.2.3


From 85b4108211742c5dd4f9f56c1d0704b4e0d4c98e Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Fri, 13 Aug 2021 18:00:01 +0300
Subject: net: bridge: mcast: dump ipv6 querier state

Add support for dumping global IPv6 querier state, we dump the state
only if our own querier is enabled or there has been another external
querier which has won the election. For the bridge global state we use
a new attribute IFLA_BR_MCAST_QUERIER_STATE and embed the state inside.
The structure is:
  [IFLA_BR_MCAST_QUERIER_STATE]
   `[BRIDGE_QUERIER_IPV6_ADDRESS] - ip address of the querier
   `[BRIDGE_QUERIER_IPV6_PORT]    - bridge port ifindex where the querier
                                    was seen (set only if external querier)
   `[BRIDGE_QUERIER_IPV6_OTHER_TIMER]   -  other querier timeout

IPv4 and IPv6 attributes are embedded at the same level of
IFLA_BR_MCAST_QUERIER_STATE. If we didn't dump anything we cancel the nest
and return.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  3 +++
 net/bridge/br_multicast.c      | 36 ++++++++++++++++++++++++++++++++----
 2 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index e0fff67fcd88..eceaad200bf6 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -777,6 +777,9 @@ enum {
 	BRIDGE_QUERIER_IP_PORT,
 	BRIDGE_QUERIER_IP_OTHER_TIMER,
 	BRIDGE_QUERIER_PAD,
+	BRIDGE_QUERIER_IPV6_ADDRESS,
+	BRIDGE_QUERIER_IPV6_PORT,
+	BRIDGE_QUERIER_IPV6_OTHER_TIMER,
 	__BRIDGE_QUERIER_MAX
 };
 #define BRIDGE_QUERIER_MAX (__BRIDGE_QUERIER_MAX - 1)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 4513bc13b6d3..0e5d6ba03457 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -2943,15 +2943,15 @@ int br_multicast_dump_querier_state(struct sk_buff *skb,
 	struct net_bridge_port *p;
 	struct nlattr *nest;
 
-	if (!brmctx->multicast_querier &&
-	    !timer_pending(&brmctx->ip4_other_query.timer))
-		return 0;
-
 	nest = nla_nest_start(skb, nest_attr);
 	if (!nest)
 		return -EMSGSIZE;
 
 	rcu_read_lock();
+	if (!brmctx->multicast_querier &&
+	    !timer_pending(&brmctx->ip4_other_query.timer))
+		goto out_v6;
+
 	br_multicast_read_querier(&brmctx->ip4_querier, &querier);
 	if (nla_put_in_addr(skb, BRIDGE_QUERIER_IP_ADDRESS,
 			    querier.addr.src.ip4)) {
@@ -2968,8 +2968,36 @@ int br_multicast_dump_querier_state(struct sk_buff *skb,
 		rcu_read_unlock();
 		goto out_err;
 	}
+
+out_v6:
+#if IS_ENABLED(CONFIG_IPV6)
+	if (!brmctx->multicast_querier &&
+	    !timer_pending(&brmctx->ip6_other_query.timer))
+		goto out;
+
+	br_multicast_read_querier(&brmctx->ip6_querier, &querier);
+	if (nla_put_in6_addr(skb, BRIDGE_QUERIER_IPV6_ADDRESS,
+			     &querier.addr.src.ip6)) {
+		rcu_read_unlock();
+		goto out_err;
+	}
+
+	p = __br_multicast_get_querier_port(brmctx->br, &querier);
+	if (timer_pending(&brmctx->ip6_other_query.timer) &&
+	    (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IPV6_OTHER_TIMER,
+			       br_timer_value(&brmctx->ip6_other_query.timer),
+			       BRIDGE_QUERIER_PAD) ||
+	     (p && nla_put_u32(skb, BRIDGE_QUERIER_IPV6_PORT,
+			       p->dev->ifindex)))) {
+		rcu_read_unlock();
+		goto out_err;
+	}
+out:
+#endif
 	rcu_read_unlock();
 	nla_nest_end(skb, nest);
+	if (!nla_len(nest))
+		nla_nest_cancel(skb, nest);
 
 	return 0;
 
-- 
cgit v1.2.3


From ddc649d158c560c6685be1701900a6e456ecceac Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Fri, 13 Aug 2021 18:00:02 +0300
Subject: net: bridge: vlan: dump mcast ctx querier state

Use the new mcast querier state dump infrastructure and export vlans'
mcast context querier state embedded in attribute
BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 1 +
 net/bridge/br_vlan_options.c   | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index eceaad200bf6..f71a81fdbbc6 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -563,6 +563,7 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERIER,
 	BRIDGE_VLANDB_GOPTS_MCAST_ROUTER,
 	BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS,
+	BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE,
 	__BRIDGE_VLANDB_GOPTS_MAX
 };
 #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1)
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index b4fd5fa441b7..49dec53a4a74 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -299,7 +299,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 	    nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER,
 		       v_opts->br_mcast_ctx.multicast_querier) ||
 	    nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER,
-		       v_opts->br_mcast_ctx.multicast_router))
+		       v_opts->br_mcast_ctx.multicast_router) ||
+	    br_multicast_dump_querier_state(skb, &v_opts->br_mcast_ctx,
+					    BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE))
 		goto out_err;
 
 	clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval);
@@ -379,6 +381,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void)
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */
 		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */
 		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER */
+		+ br_multicast_querier_state_size() /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE */
 #endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
-- 
cgit v1.2.3


From 6c9b40844751ea30c72f7a2f92f4d704bc6b2927 Mon Sep 17 00:00:00 2001
From: Cai Huoqing <caihuoqing@baidu.com>
Date: Fri, 13 Aug 2021 20:08:02 +0800
Subject: net: Remove net/ipx.h and uapi/linux/ipx.h header files

commit <47595e32869f> ("<MAINTAINERS: Mark some staging directories>")
indicated the ipx network layer as obsolete in Jan 2018,
updated in the MAINTAINERS file

now, after being exposed for 3 years to refactoring, so to
delete uapi/linux/ipx.h and net/ipx.h header files for good.
additionally, there is no module that depends on ipx.h except
a broken staging driver(r8188eu)

Signed-off-by: Cai Huoqing <caihuoqing@baidu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipx.h        | 171 -----------------------------------------------
 include/uapi/linux/ipx.h |  87 ------------------------
 2 files changed, 258 deletions(-)
 delete mode 100644 include/net/ipx.h
 delete mode 100644 include/uapi/linux/ipx.h

(limited to 'include')

diff --git a/include/net/ipx.h b/include/net/ipx.h
deleted file mode 100644
index 9d1342807b59..000000000000
--- a/include/net/ipx.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NET_INET_IPX_H_
-#define _NET_INET_IPX_H_
-/*
- *	The following information is in its entirety obtained from:
- *
- *	Novell 'IPX Router Specification' Version 1.10 
- *		Part No. 107-000029-001
- *
- *	Which is available from ftp.novell.com
- */
-
-#include <linux/netdevice.h>
-#include <net/datalink.h>
-#include <linux/ipx.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/refcount.h>
-
-struct ipx_address {
-	__be32  net;
-	__u8    node[IPX_NODE_LEN]; 
-	__be16  sock;
-};
-
-#define ipx_broadcast_node	"\377\377\377\377\377\377"
-#define ipx_this_node           "\0\0\0\0\0\0"
-
-#define IPX_MAX_PPROP_HOPS 8
-
-struct ipxhdr {
-	__be16			ipx_checksum __packed;
-#define IPX_NO_CHECKSUM	cpu_to_be16(0xFFFF)
-	__be16			ipx_pktsize __packed;
-	__u8			ipx_tctrl;
-	__u8			ipx_type;
-#define IPX_TYPE_UNKNOWN	0x00
-#define IPX_TYPE_RIP		0x01	/* may also be 0 */
-#define IPX_TYPE_SAP		0x04	/* may also be 0 */
-#define IPX_TYPE_SPX		0x05	/* SPX protocol */
-#define IPX_TYPE_NCP		0x11	/* $lots for docs on this (SPIT) */
-#define IPX_TYPE_PPROP		0x14	/* complicated flood fill brdcast */
-	struct ipx_address	ipx_dest __packed;
-	struct ipx_address	ipx_source __packed;
-};
-
-/* From af_ipx.c */
-extern int sysctl_ipx_pprop_broadcasting;
-
-struct ipx_interface {
-	/* IPX address */
-	__be32			if_netnum;
-	unsigned char		if_node[IPX_NODE_LEN];
-	refcount_t		refcnt;
-
-	/* physical device info */
-	struct net_device	*if_dev;
-	struct datalink_proto	*if_dlink;
-	__be16			if_dlink_type;
-
-	/* socket support */
-	unsigned short		if_sknum;
-	struct hlist_head	if_sklist;
-	spinlock_t		if_sklist_lock;
-
-	/* administrative overhead */
-	int			if_ipx_offset;
-	unsigned char		if_internal;
-	unsigned char		if_primary;
-	
-	struct list_head	node; /* node in ipx_interfaces list */
-};
-
-struct ipx_route {
-	__be32			ir_net;
-	struct ipx_interface	*ir_intrfc;
-	unsigned char		ir_routed;
-	unsigned char		ir_router_node[IPX_NODE_LEN];
-	struct list_head	node; /* node in ipx_routes list */
-	refcount_t		refcnt;
-};
-
-struct ipx_cb {
-	u8	ipx_tctrl;
-	__be32	ipx_dest_net;
-	__be32	ipx_source_net;
-	struct {
-		__be32 netnum;
-		int index;
-	} last_hop;
-};
-
-#include <net/sock.h>
-
-struct ipx_sock {
-	/* struct sock has to be the first member of ipx_sock */
-	struct sock		sk;
-	struct ipx_address	dest_addr;
-	struct ipx_interface	*intrfc;
-	__be16			port;
-#ifdef CONFIG_IPX_INTERN
-	unsigned char		node[IPX_NODE_LEN];
-#endif
-	unsigned short		type;
-	/*
-	 * To handle special ncp connection-handling sockets for mars_nwe,
- 	 * the connection number must be stored in the socket.
-	 */
-	unsigned short		ipx_ncp_conn;
-};
-
-static inline struct ipx_sock *ipx_sk(struct sock *sk)
-{
-	return (struct ipx_sock *)sk;
-}
-
-#define IPX_SKB_CB(__skb) ((struct ipx_cb *)&((__skb)->cb[0]))
-
-#define IPX_MIN_EPHEMERAL_SOCKET	0x4000
-#define IPX_MAX_EPHEMERAL_SOCKET	0x7fff
-
-extern struct list_head ipx_routes;
-extern rwlock_t ipx_routes_lock;
-
-extern struct list_head ipx_interfaces;
-struct ipx_interface *ipx_interfaces_head(void);
-extern spinlock_t ipx_interfaces_lock;
-
-extern struct ipx_interface *ipx_primary_net;
-
-int ipx_proc_init(void);
-void ipx_proc_exit(void);
-
-const char *ipx_frame_name(__be16);
-const char *ipx_device_name(struct ipx_interface *intrfc);
-
-static __inline__ void ipxitf_hold(struct ipx_interface *intrfc)
-{
-	refcount_inc(&intrfc->refcnt);
-}
-
-void ipxitf_down(struct ipx_interface *intrfc);
-struct ipx_interface *ipxitf_find_using_net(__be32 net);
-int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node);
-__be16 ipx_cksum(struct ipxhdr *packet, int length);
-int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc,
-		     unsigned char *node);
-void ipxrtr_del_routes(struct ipx_interface *intrfc);
-int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx,
-			struct msghdr *msg, size_t len, int noblock);
-int ipxrtr_route_skb(struct sk_buff *skb);
-struct ipx_route *ipxrtr_lookup(__be32 net);
-int ipxrtr_ioctl(unsigned int cmd, void __user *arg);
-
-static __inline__ void ipxitf_put(struct ipx_interface *intrfc)
-{
-	if (refcount_dec_and_test(&intrfc->refcnt))
-		ipxitf_down(intrfc);
-}
-
-static __inline__ void ipxrtr_hold(struct ipx_route *rt)
-{
-	        refcount_inc(&rt->refcnt);
-}
-
-static __inline__ void ipxrtr_put(struct ipx_route *rt)
-{
-	        if (refcount_dec_and_test(&rt->refcnt))
-			                kfree(rt);
-}
-#endif /* _NET_INET_IPX_H_ */
diff --git a/include/uapi/linux/ipx.h b/include/uapi/linux/ipx.h
deleted file mode 100644
index 3168137adae8..000000000000
--- a/include/uapi/linux/ipx.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _IPX_H_
-#define _IPX_H_
-#include <linux/libc-compat.h>	/* for compatibility with glibc netipx/ipx.h */
-#include <linux/types.h>
-#include <linux/sockios.h>
-#include <linux/socket.h>
-#define IPX_NODE_LEN	6
-#define IPX_MTU		576
-
-#if __UAPI_DEF_SOCKADDR_IPX
-struct sockaddr_ipx {
-	__kernel_sa_family_t sipx_family;
-	__be16		sipx_port;
-	__be32		sipx_network;
-	unsigned char 	sipx_node[IPX_NODE_LEN];
-	__u8		sipx_type;
-	unsigned char	sipx_zero;	/* 16 byte fill */
-};
-#endif /* __UAPI_DEF_SOCKADDR_IPX */
-
-/*
- * So we can fit the extra info for SIOCSIFADDR into the address nicely
- */
-#define sipx_special	sipx_port
-#define sipx_action	sipx_zero
-#define IPX_DLTITF	0
-#define IPX_CRTITF	1
-
-#if __UAPI_DEF_IPX_ROUTE_DEFINITION
-struct ipx_route_definition {
-	__be32        ipx_network;
-	__be32        ipx_router_network;
-	unsigned char ipx_router_node[IPX_NODE_LEN];
-};
-#endif /* __UAPI_DEF_IPX_ROUTE_DEFINITION */
-
-#if __UAPI_DEF_IPX_INTERFACE_DEFINITION
-struct ipx_interface_definition {
-	__be32        ipx_network;
-	unsigned char ipx_device[16];
-	unsigned char ipx_dlink_type;
-#define IPX_FRAME_NONE		0
-#define IPX_FRAME_SNAP		1
-#define IPX_FRAME_8022		2
-#define IPX_FRAME_ETHERII	3
-#define IPX_FRAME_8023		4
-#define IPX_FRAME_TR_8022       5 /* obsolete */
-	unsigned char ipx_special;
-#define IPX_SPECIAL_NONE	0
-#define IPX_PRIMARY		1
-#define IPX_INTERNAL		2
-	unsigned char ipx_node[IPX_NODE_LEN];
-};
-#endif /* __UAPI_DEF_IPX_INTERFACE_DEFINITION */
-
-#if __UAPI_DEF_IPX_CONFIG_DATA
-struct ipx_config_data {
-	unsigned char	ipxcfg_auto_select_primary;
-	unsigned char	ipxcfg_auto_create_interfaces;
-};
-#endif /* __UAPI_DEF_IPX_CONFIG_DATA */
-
-/*
- * OLD Route Definition for backward compatibility.
- */
-
-#if __UAPI_DEF_IPX_ROUTE_DEF
-struct ipx_route_def {
-	__be32		ipx_network;
-	__be32		ipx_router_network;
-#define IPX_ROUTE_NO_ROUTER	0
-	unsigned char	ipx_router_node[IPX_NODE_LEN];
-	unsigned char	ipx_device[16];
-	unsigned short	ipx_flags;
-#define IPX_RT_SNAP		8
-#define IPX_RT_8022		4
-#define IPX_RT_BLUEBOOK		2
-#define IPX_RT_ROUTED		1
-};
-#endif /* __UAPI_DEF_IPX_ROUTE_DEF */
-
-#define SIOCAIPXITFCRT		(SIOCPROTOPRIVATE)
-#define SIOCAIPXPRISLT		(SIOCPROTOPRIVATE + 1)
-#define SIOCIPXCFGDATA		(SIOCPROTOPRIVATE + 2)
-#define SIOCIPXNCPCONN		(SIOCPROTOPRIVATE + 3)
-#endif /* _IPX_H_ */
-- 
cgit v1.2.3


From 2c860a43dd77f969bb959336a2f743d7103a8f63 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 14 Aug 2021 10:57:15 +0900
Subject: bpf: af_unix: Implement BPF iterator for UNIX domain socket.

This patch implements the BPF iterator for the UNIX domain socket.

Currently, the batch optimisation introduced for the TCP iterator in the
commit 04c7820b776f ("bpf: tcp: Bpf iter batching and lock_sock") is not
used for the UNIX domain socket.  It will require replacing the big lock
for the hash table with small locks for each hash list not to block other
processes.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210814015718.42704-2-kuniyu@amazon.co.jp
---
 include/linux/btf_ids.h |  3 +-
 net/unix/af_unix.c      | 93 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 57890b357f85..bed4b9964581 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -172,7 +172,8 @@ extern struct btf_id_set name;
 	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock)		\
 	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock)			\
 	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock)			\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock)
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock)			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock)
 
 enum {
 #define BTF_SOCK_TYPE(name, str) name,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 1c2224f05b51..bad8f19174e3 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -113,6 +113,7 @@
 #include <linux/security.h>
 #include <linux/freezer.h>
 #include <linux/file.h>
+#include <linux/btf_ids.h>
 
 #include "scm.h"
 
@@ -3143,6 +3144,64 @@ static const struct seq_operations unix_seq_ops = {
 	.stop   = unix_seq_stop,
 	.show   = unix_seq_show,
 };
+
+#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
+struct bpf_iter__unix {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct unix_sock *, unix_sk);
+	uid_t uid __aligned(8);
+};
+
+static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
+			      struct unix_sock *unix_sk, uid_t uid)
+{
+	struct bpf_iter__unix ctx;
+
+	meta->seq_num--;  /* skip SEQ_START_TOKEN */
+	ctx.meta = meta;
+	ctx.unix_sk = unix_sk;
+	ctx.uid = uid;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	struct sock *sk = v;
+	uid_t uid;
+
+	if (v == SEQ_START_TOKEN)
+		return 0;
+
+	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, false);
+	return unix_prog_seq_show(prog, &meta, v, uid);
+}
+
+static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	if (!v) {
+		meta.seq = seq;
+		prog = bpf_iter_get_info(&meta, true);
+		if (prog)
+			(void)unix_prog_seq_show(prog, &meta, v, 0);
+	}
+
+	unix_seq_stop(seq, v);
+}
+
+static const struct seq_operations bpf_iter_unix_seq_ops = {
+	.start	= unix_seq_start,
+	.next	= unix_seq_next,
+	.stop	= bpf_iter_unix_seq_stop,
+	.show	= bpf_iter_unix_seq_show,
+};
+#endif
 #endif
 
 static const struct net_proto_family unix_family_ops = {
@@ -3183,6 +3242,35 @@ static struct pernet_operations unix_net_ops = {
 	.exit = unix_net_exit,
 };
 
+#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
+		     struct unix_sock *unix_sk, uid_t uid)
+
+static const struct bpf_iter_seq_info unix_seq_info = {
+	.seq_ops		= &bpf_iter_unix_seq_ops,
+	.init_seq_private	= bpf_iter_init_seq_net,
+	.fini_seq_private	= bpf_iter_fini_seq_net,
+	.seq_priv_size		= sizeof(struct seq_net_private),
+};
+
+static struct bpf_iter_reg unix_reg_info = {
+	.target			= "unix",
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__unix, unix_sk),
+		  PTR_TO_BTF_ID_OR_NULL },
+	},
+	.seq_info		= &unix_seq_info,
+};
+
+static void __init bpf_iter_register(void)
+{
+	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
+	if (bpf_iter_reg_target(&unix_reg_info))
+		pr_warn("Warning: could not register bpf iterator unix\n");
+}
+#endif
+
 static int __init af_unix_init(void)
 {
 	int rc = -1;
@@ -3198,6 +3286,11 @@ static int __init af_unix_init(void)
 	sock_register(&unix_family_ops);
 	register_pernet_subsys(&unix_net_ops);
 	unix_bpf_build_proto();
+
+#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+	bpf_iter_register();
+#endif
+
 out:
 	return rc;
 }
-- 
cgit v1.2.3


From 46efe4efb9d1fd05e46574819884a362e6a4d7b2 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 15 Aug 2021 04:47:47 +0300
Subject: net: dsa: felix: stop calling ocelot_port_{enable,disable}

ocelot_port_enable touches ANA_PORT_PORT_CFG, which has the following
fields:

- LOCKED_PORTMOVE_CPU, LEARNDROP, LEARNCPU, LEARNAUTO, RECV_ENA, all of
  which are written with their hardware default values, also runtime
  invariants. So it makes no sense to write these during every .ndo_open.

- PORTID_VAL: this field has an out-of-reset value of zero for all ports
  and must be initialized by software. Additionally, the
  ocelot_setup_logical_port_ids() code path sets up different logical
  port IDs for the ports in a hardware LAG, and we absolutely don't want
  .ndo_open to interfere there and reset those values.

So in fact the write from ocelot_port_enable can better be moved to
ocelot_init_port, and the .ndo_open hook deleted.

ocelot_port_disable touches DEV_MAC_ENA_CFG and QSYS_SWITCH_PORT_MODE_PORT_ENA,
in an attempt to undo what ocelot_adjust_link did. But since .ndo_stop
does not get called each time the link falls (i.e. this isn't a
substitute for .phylink_mac_link_down), felix already does better at
this by writing those registers already in felix_phylink_mac_link_down.

So keep ocelot_port_disable (for now, until ocelot is converted to
phylink too), and just delete the felix call to it, which is not
necessary.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c         | 19 -------------------
 drivers/net/ethernet/mscc/ocelot.c     | 22 +++++++++-------------
 drivers/net/ethernet/mscc/ocelot_net.c |  4 ----
 include/soc/mscc/ocelot.h              |  2 --
 4 files changed, 9 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 583a22d901b3..0050bb5b10aa 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -796,23 +796,6 @@ static int felix_vlan_del(struct dsa_switch *ds, int port,
 	return ocelot_vlan_del(ocelot, port, vlan->vid);
 }
 
-static int felix_port_enable(struct dsa_switch *ds, int port,
-			     struct phy_device *phy)
-{
-	struct ocelot *ocelot = ds->priv;
-
-	ocelot_port_enable(ocelot, port, phy);
-
-	return 0;
-}
-
-static void felix_port_disable(struct dsa_switch *ds, int port)
-{
-	struct ocelot *ocelot = ds->priv;
-
-	return ocelot_port_disable(ocelot, port);
-}
-
 static void felix_phylink_validate(struct dsa_switch *ds, int port,
 				   unsigned long *supported,
 				   struct phylink_link_state *state)
@@ -1615,8 +1598,6 @@ const struct dsa_switch_ops felix_switch_ops = {
 	.phylink_mac_config		= felix_phylink_mac_config,
 	.phylink_mac_link_down		= felix_phylink_mac_link_down,
 	.phylink_mac_link_up		= felix_phylink_mac_link_up,
-	.port_enable			= felix_port_enable,
-	.port_disable			= felix_port_disable,
 	.port_fdb_dump			= felix_fdb_dump,
 	.port_fdb_add			= felix_fdb_add,
 	.port_fdb_del			= felix_fdb_del,
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index adfb9781799e..a948c807349d 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -514,19 +514,6 @@ void ocelot_adjust_link(struct ocelot *ocelot, int port,
 }
 EXPORT_SYMBOL(ocelot_adjust_link);
 
-void ocelot_port_enable(struct ocelot *ocelot, int port,
-			struct phy_device *phy)
-{
-	/* Enable receiving frames on the port, and activate auto-learning of
-	 * MAC addresses.
-	 */
-	ocelot_write_gix(ocelot, ANA_PORT_PORT_CFG_LEARNAUTO |
-			 ANA_PORT_PORT_CFG_RECV_ENA |
-			 ANA_PORT_PORT_CFG_PORTID_VAL(port),
-			 ANA_PORT_PORT_CFG, port);
-}
-EXPORT_SYMBOL(ocelot_port_enable);
-
 void ocelot_port_disable(struct ocelot *ocelot, int port)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
@@ -1956,6 +1943,15 @@ void ocelot_init_port(struct ocelot *ocelot, int port)
 	/* Disable source address learning for standalone mode */
 	ocelot_port_set_learning(ocelot, port, false);
 
+	/* Set the port's initial logical port ID value, enable receiving
+	 * frames on it, and configure the MAC address learning type to
+	 * automatic.
+	 */
+	ocelot_write_gix(ocelot, ANA_PORT_PORT_CFG_LEARNAUTO |
+			 ANA_PORT_PORT_CFG_RECV_ENA |
+			 ANA_PORT_PORT_CFG_PORTID_VAL(port),
+			 ANA_PORT_PORT_CFG, port);
+
 	/* Enable vcap lookups */
 	ocelot_vcap_enable(ocelot, port);
 }
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index de900ea70fd4..6a196cd6a61a 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -449,8 +449,6 @@ static int ocelot_port_open(struct net_device *dev)
 {
 	struct ocelot_port_private *priv = netdev_priv(dev);
 	struct ocelot_port *ocelot_port = &priv->port;
-	struct ocelot *ocelot = ocelot_port->ocelot;
-	int port = priv->chip_port;
 	int err;
 
 	if (priv->serdes) {
@@ -474,8 +472,6 @@ static int ocelot_port_open(struct net_device *dev)
 	phy_attached_info(priv->phy);
 	phy_start(priv->phy);
 
-	ocelot_port_enable(ocelot, port, priv->phy);
-
 	return 0;
 }
 
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 2f5ce4d4fdbf..2c2dcb954f23 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -798,8 +798,6 @@ void ocelot_init_port(struct ocelot *ocelot, int port);
 void ocelot_deinit_port(struct ocelot *ocelot, int port);
 
 /* DSA callbacks */
-void ocelot_port_enable(struct ocelot *ocelot, int port,
-			struct phy_device *phy);
 void ocelot_port_disable(struct ocelot *ocelot, int port);
 void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset, u8 *data);
 void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data);
-- 
cgit v1.2.3


From e6e12df625f23e8011a804250fc22a0c3c9e2463 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 15 Aug 2021 04:47:48 +0300
Subject: net: mscc: ocelot: convert to phylink

The felix DSA driver, which is a wrapper over the same hardware class as
ocelot, is integrated with phylink, but ocelot is using the plain PHY
library. It makes sense to bring together the two implementations, which
is what this patch achieves.

This is a large patch and hard to break up, but it does the following:

The existing ocelot_adjust_link writes some registers, and
felix_phylink_mac_link_up writes some registers, some of them are
common, but both functions write to some registers to which the other
doesn't.

The main reasons for this are:
- Felix switches so far have used an NXP PCS so they had no need to
  write the PCS1G registers that ocelot_adjust_link writes
- Felix switches have the MAC fixed at 1G, so some of the MAC speed
  changes actually break the link and must be avoided.

The naming conventions for the functions introduced in this patch are:
- vsc7514_phylink_{mac_config,validate} are specific to the Ocelot
  instantiations and placed in ocelot_net.c which is built only for the
  ocelot switchdev driver.
- ocelot_phylink_mac_link_{up,down} are shared between the ocelot
  switchdev driver and the felix DSA driver (they are put in the common
  lib).

One by one, the registers written by ocelot_adjust_link are:

DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this
                   register since its out-of-reset value was fine and
                   did not need changing. The write is moved to the
                   common ocelot_phylink_mac_link_up and on felix it is
                   guarded by a quirk bit that makes the written value
                   identical with the out-of-reset one
DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config
PCS1G_MODE_CFG - same as above
PCS1G_SD_CFG - same as above
PCS1G_CFG - same as above
PCS1G_ANEG_CFG - same as above
PCS1G_LB_CFG - same as above
DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable
                  touched this. felix_phylink_mac_link_{up,down} also
                  do. We go with what felix does and put it in
                  ocelot_phylink_mac_link_up.
DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both
                write this, but to different values. Move to the common
                ocelot_phylink_mac_link_up and make sure via the quirk
                that the old values are preserved for both.
ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up
                  did not. Runtime invariant, speed does not matter since
                  PFC is disabled via the RX_PFC_ENA bits which are cleared.
                  Move to vsc7514_phylink_mac_config.
QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and
                                 felix_phylink_mac_link_{up,down} wrote
                                 this. Ocelot also wrote this register
                                 from ocelot_port_disable. Keep what
                                 felix did, move in ocelot_phylink_mac_link_{up,down}
                                 and delete ocelot_port_disable.
ANA_POL_FLOWC - same as above
SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas
                 ocelot always enabled RX and TX flow control, felix
                 listened to phylink (for the most part, at least - see
                 the 2500base-X comment).

The registers which only felix_phylink_mac_link_up wrote are:

SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control
                          worked on ocelot. Not it should, since the
                          code is shared with felix where it does.
ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink
                    should be the one touching them, deleted.

Other changes:

- The old phylib registration code was in mscc_ocelot_init_ports. It is
  hard to work with 2 levels of indentation already in, and with hard to
  follow teardown logic. The new phylink registration code was moved
  inside ocelot_probe_port(), right between alloc_etherdev() and
  register_netdev(). It could not be done before (=> outside of)
  ocelot_probe_port() because ocelot_probe_port() allocates the struct
  ocelot_port which we then use to assign ocelot_port->phy_mode to. It
  is more preferable to me to have all PHY handling logic inside the
  same function.
- On the same topic: struct ocelot_port_private :: serdes is only used
  in ocelot_port_open to set the SERDES protocol to Ethernet. This is
  logically a runtime invariant and can be done just once, when the port
  registers with phylink. We therefore don't even need to keep the
  serdes reference inside struct ocelot_port_private, or to use the devm
  variant of of_phy_get().
- Phylink needs a valid phy-mode for phylink_create() to succeed, and
  the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts
  don't define one for the internal PHY ports. So we patch
  PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL.
- There was a strategically placed:

	switch (priv->phy_mode) {
	case PHY_INTERFACE_MODE_NA:
	        continue;

  which made the code skip the serdes initialization for the internal
  PHY ports. Frankly that is not all that obvious, so now we explicitly
  initialize the serdes under an "if" condition and not rely on code
  jumps, so everything is clearer.
- There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII
  ports. Since that is in fact the default value for the register field
  DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear
  the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out
  of reset, which does match the comment. I don't even want to know why
  this code is placed there, but if there is indeed an issue that all
  ports that share a QSGMII lane must all be up, then this logic is
  already buggy, since mscc_ocelot_init_ports iterates using
  for_each_available_child_of_node, so nobody prevents the user from
  putting a 'status = "disabled";' for some QSGMII ports which would
  break the driver's assumption.
  In any case, in the eventuality that I'm right, we would have yet
  another issue if ocelot_phylink_mac_link_down would reset those ports
  and that would be forbidden, so since the ocelot_adjust_link logic did
  not do that (maybe for a reason), add another quirk to preserve the
  old logic.

The ocelot driver teardown goes through all ports in one fell swoop.
When initialization of one port fails, the ocelot->ports[port] pointer
for that is reset to NULL, and teardown is done only for non-NULL ports,
so there is no reason to do partial teardowns, let the central
mscc_ocelot_release_ports() do its job.

Tested bind, unbind, rebind, link up, link down, speed change on mock-up
hardware (modified the driver to probe on Felix VSC9959). Also
regression tested the felix DSA driver. Could not test the Ocelot
specific bits (PCS1G, SERDES, device tree bindings).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c             |  90 +----------
 drivers/net/dsa/ocelot/felix.h             |   1 +
 drivers/net/ethernet/mscc/Kconfig          |   2 +-
 drivers/net/ethernet/mscc/ocelot.c         | 151 ++++++++++-------
 drivers/net/ethernet/mscc/ocelot.h         |  11 +-
 drivers/net/ethernet/mscc/ocelot_net.c     | 250 ++++++++++++++++++++++++-----
 drivers/net/ethernet/mscc/ocelot_vsc7514.c |  59 +------
 include/soc/mscc/ocelot.h                  |  19 ++-
 8 files changed, 329 insertions(+), 254 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 0050bb5b10aa..cbe23b20f3fa 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -824,25 +824,9 @@ static void felix_phylink_mac_link_down(struct dsa_switch *ds, int port,
 					phy_interface_t interface)
 {
 	struct ocelot *ocelot = ds->priv;
-	struct ocelot_port *ocelot_port = ocelot->ports[port];
-	int err;
-
-	ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA,
-			 DEV_MAC_ENA_CFG);
-
-	ocelot_fields_write(ocelot, port, QSYS_SWITCH_PORT_MODE_PORT_ENA, 0);
 
-	err = ocelot_port_flush(ocelot, port);
-	if (err)
-		dev_err(ocelot->dev, "failed to flush port %d: %d\n",
-			port, err);
-
-	/* Put the port in reset. */
-	ocelot_port_writel(ocelot_port,
-			   DEV_CLOCK_CFG_MAC_TX_RST |
-			   DEV_CLOCK_CFG_MAC_RX_RST |
-			   DEV_CLOCK_CFG_LINK_SPEED(OCELOT_SPEED_1000),
-			   DEV_CLOCK_CFG);
+	ocelot_phylink_mac_link_down(ocelot, port, link_an_mode, interface,
+				     FELIX_MAC_QUIRKS);
 }
 
 static void felix_phylink_mac_link_up(struct dsa_switch *ds, int port,
@@ -853,75 +837,11 @@ static void felix_phylink_mac_link_up(struct dsa_switch *ds, int port,
 				      bool tx_pause, bool rx_pause)
 {
 	struct ocelot *ocelot = ds->priv;
-	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	struct felix *felix = ocelot_to_felix(ocelot);
-	u32 mac_fc_cfg;
-
-	/* Take port out of reset by clearing the MAC_TX_RST, MAC_RX_RST and
-	 * PORT_RST bits in DEV_CLOCK_CFG. Note that the way this system is
-	 * integrated is that the MAC speed is fixed and it's the PCS who is
-	 * performing the rate adaptation, so we have to write "1000Mbps" into
-	 * the LINK_SPEED field of DEV_CLOCK_CFG (which is also its default
-	 * value).
-	 */
-	ocelot_port_writel(ocelot_port,
-			   DEV_CLOCK_CFG_LINK_SPEED(OCELOT_SPEED_1000),
-			   DEV_CLOCK_CFG);
 
-	switch (speed) {
-	case SPEED_10:
-		mac_fc_cfg = SYS_MAC_FC_CFG_FC_LINK_SPEED(3);
-		break;
-	case SPEED_100:
-		mac_fc_cfg = SYS_MAC_FC_CFG_FC_LINK_SPEED(2);
-		break;
-	case SPEED_1000:
-	case SPEED_2500:
-		mac_fc_cfg = SYS_MAC_FC_CFG_FC_LINK_SPEED(1);
-		break;
-	default:
-		dev_err(ocelot->dev, "Unsupported speed on port %d: %d\n",
-			port, speed);
-		return;
-	}
-
-	/* handle Rx pause in all cases, with 2500base-X this is used for rate
-	 * adaptation.
-	 */
-	mac_fc_cfg |= SYS_MAC_FC_CFG_RX_FC_ENA;
-
-	if (tx_pause)
-		mac_fc_cfg |= SYS_MAC_FC_CFG_TX_FC_ENA |
-			      SYS_MAC_FC_CFG_PAUSE_VAL_CFG(0xffff) |
-			      SYS_MAC_FC_CFG_FC_LATENCY_CFG(0x7) |
-			      SYS_MAC_FC_CFG_ZERO_PAUSE_ENA;
-
-	/* Flow control. Link speed is only used here to evaluate the time
-	 * specification in incoming pause frames.
-	 */
-	ocelot_write_rix(ocelot, mac_fc_cfg, SYS_MAC_FC_CFG, port);
-
-	ocelot_write_rix(ocelot, 0, ANA_POL_FLOWC, port);
-
-	ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, tx_pause);
-
-	/* Undo the effects of felix_phylink_mac_link_down:
-	 * enable MAC module
-	 */
-	ocelot_port_writel(ocelot_port, DEV_MAC_ENA_CFG_RX_ENA |
-			   DEV_MAC_ENA_CFG_TX_ENA, DEV_MAC_ENA_CFG);
-
-	/* Enable receiving frames on the port, and activate auto-learning of
-	 * MAC addresses.
-	 */
-	ocelot_write_gix(ocelot, ANA_PORT_PORT_CFG_LEARNAUTO |
-			 ANA_PORT_PORT_CFG_RECV_ENA |
-			 ANA_PORT_PORT_CFG_PORTID_VAL(port),
-			 ANA_PORT_PORT_CFG, port);
-
-	/* Core: Enable port for frame transfer */
-	ocelot_fields_write(ocelot, port,
-			    QSYS_SWITCH_PORT_MODE_PORT_ENA, 1);
+	ocelot_phylink_mac_link_up(ocelot, port, phydev, link_an_mode,
+				   interface, speed, duplex, tx_pause, rx_pause,
+				   FELIX_MAC_QUIRKS);
 
 	if (felix->info->port_sched_speed_set)
 		felix->info->port_sched_speed_set(ocelot, port, speed);
diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h
index 9da3c6a94c6e..5854bab43327 100644
--- a/drivers/net/dsa/ocelot/felix.h
+++ b/drivers/net/dsa/ocelot/felix.h
@@ -5,6 +5,7 @@
 #define _MSCC_FELIX_H
 
 #define ocelot_to_felix(o)		container_of((o), struct felix, ocelot)
+#define FELIX_MAC_QUIRKS		OCELOT_QUIRK_PCS_PERFORMS_RATE_ADAPTATION
 
 /* Platform-specific information */
 struct felix_info {
diff --git a/drivers/net/ethernet/mscc/Kconfig b/drivers/net/ethernet/mscc/Kconfig
index b1d68e197258..b6a73d151dec 100644
--- a/drivers/net/ethernet/mscc/Kconfig
+++ b/drivers/net/ethernet/mscc/Kconfig
@@ -16,7 +16,7 @@ config MSCC_OCELOT_SWITCH_LIB
 	select NET_DEVLINK
 	select REGMAP_MMIO
 	select PACKING
-	select PHYLIB
+	select PHYLINK
 	tristate
 	help
 	  This is a hardware support library for Ocelot network switches. It is
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index a948c807349d..5209650fd25f 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -377,7 +377,7 @@ static u32 ocelot_read_eq_avail(struct ocelot *ocelot, int port)
 	return ocelot_read_rix(ocelot, QSYS_SW_STATUS, port);
 }
 
-int ocelot_port_flush(struct ocelot *ocelot, int port)
+static int ocelot_port_flush(struct ocelot *ocelot, int port)
 {
 	unsigned int pause_ena;
 	int err, val;
@@ -429,63 +429,118 @@ int ocelot_port_flush(struct ocelot *ocelot, int port)
 
 	return err;
 }
-EXPORT_SYMBOL(ocelot_port_flush);
 
-void ocelot_adjust_link(struct ocelot *ocelot, int port,
-			struct phy_device *phydev)
+void ocelot_phylink_mac_link_down(struct ocelot *ocelot, int port,
+				  unsigned int link_an_mode,
+				  phy_interface_t interface,
+				  unsigned long quirks)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
-	int speed, mode = 0;
+	int err;
+
+	ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA,
+			 DEV_MAC_ENA_CFG);
+
+	ocelot_fields_write(ocelot, port, QSYS_SWITCH_PORT_MODE_PORT_ENA, 0);
+
+	err = ocelot_port_flush(ocelot, port);
+	if (err)
+		dev_err(ocelot->dev, "failed to flush port %d: %d\n",
+			port, err);
+
+	/* Put the port in reset. */
+	if (interface != PHY_INTERFACE_MODE_QSGMII ||
+	    !(quirks & OCELOT_QUIRK_QSGMII_PORTS_MUST_BE_UP))
+		ocelot_port_rmwl(ocelot_port,
+				 DEV_CLOCK_CFG_MAC_TX_RST |
+				 DEV_CLOCK_CFG_MAC_TX_RST,
+				 DEV_CLOCK_CFG_MAC_TX_RST |
+				 DEV_CLOCK_CFG_MAC_TX_RST,
+				 DEV_CLOCK_CFG);
+}
+EXPORT_SYMBOL_GPL(ocelot_phylink_mac_link_down);
+
+void ocelot_phylink_mac_link_up(struct ocelot *ocelot, int port,
+				struct phy_device *phydev,
+				unsigned int link_an_mode,
+				phy_interface_t interface,
+				int speed, int duplex,
+				bool tx_pause, bool rx_pause,
+				unsigned long quirks)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	int mac_speed, mode = 0;
+	u32 mac_fc_cfg;
+
+	/* The MAC might be integrated in systems where the MAC speed is fixed
+	 * and it's the PCS who is performing the rate adaptation, so we have
+	 * to write "1000Mbps" into the LINK_SPEED field of DEV_CLOCK_CFG
+	 * (which is also its default value).
+	 */
+	if ((quirks & OCELOT_QUIRK_PCS_PERFORMS_RATE_ADAPTATION) ||
+	    speed == SPEED_1000) {
+		mac_speed = OCELOT_SPEED_1000;
+		mode = DEV_MAC_MODE_CFG_GIGA_MODE_ENA;
+	} else if (speed == SPEED_2500) {
+		mac_speed = OCELOT_SPEED_2500;
+		mode = DEV_MAC_MODE_CFG_GIGA_MODE_ENA;
+	} else if (speed == SPEED_100) {
+		mac_speed = OCELOT_SPEED_100;
+	} else {
+		mac_speed = OCELOT_SPEED_10;
+	}
+
+	if (duplex == DUPLEX_FULL)
+		mode |= DEV_MAC_MODE_CFG_FDX_ENA;
+
+	ocelot_port_writel(ocelot_port, mode, DEV_MAC_MODE_CFG);
+
+	/* Take port out of reset by clearing the MAC_TX_RST, MAC_RX_RST and
+	 * PORT_RST bits in DEV_CLOCK_CFG.
+	 */
+	ocelot_port_writel(ocelot_port, DEV_CLOCK_CFG_LINK_SPEED(mac_speed),
+			   DEV_CLOCK_CFG);
 
-	switch (phydev->speed) {
+	switch (speed) {
 	case SPEED_10:
-		speed = OCELOT_SPEED_10;
+		mac_fc_cfg = SYS_MAC_FC_CFG_FC_LINK_SPEED(OCELOT_SPEED_10);
 		break;
 	case SPEED_100:
-		speed = OCELOT_SPEED_100;
+		mac_fc_cfg = SYS_MAC_FC_CFG_FC_LINK_SPEED(OCELOT_SPEED_100);
 		break;
 	case SPEED_1000:
-		speed = OCELOT_SPEED_1000;
-		mode = DEV_MAC_MODE_CFG_GIGA_MODE_ENA;
-		break;
 	case SPEED_2500:
-		speed = OCELOT_SPEED_2500;
-		mode = DEV_MAC_MODE_CFG_GIGA_MODE_ENA;
+		mac_fc_cfg = SYS_MAC_FC_CFG_FC_LINK_SPEED(OCELOT_SPEED_1000);
 		break;
 	default:
-		dev_err(ocelot->dev, "Unsupported PHY speed on port %d: %d\n",
-			port, phydev->speed);
+		dev_err(ocelot->dev, "Unsupported speed on port %d: %d\n",
+			port, speed);
 		return;
 	}
 
-	phy_print_status(phydev);
-
-	if (!phydev->link)
-		return;
-
-	/* Only full duplex supported for now */
-	ocelot_port_writel(ocelot_port, DEV_MAC_MODE_CFG_FDX_ENA |
-			   mode, DEV_MAC_MODE_CFG);
-
-	/* Disable HDX fast control */
-	ocelot_port_writel(ocelot_port, DEV_PORT_MISC_HDX_FAST_DIS,
-			   DEV_PORT_MISC);
+	/* Handle RX pause in all cases, with 2500base-X this is used for rate
+	 * adaptation.
+	 */
+	mac_fc_cfg |= SYS_MAC_FC_CFG_RX_FC_ENA;
 
-	/* SGMII only for now */
-	ocelot_port_writel(ocelot_port, PCS1G_MODE_CFG_SGMII_MODE_ENA,
-			   PCS1G_MODE_CFG);
-	ocelot_port_writel(ocelot_port, PCS1G_SD_CFG_SD_SEL, PCS1G_SD_CFG);
+	if (tx_pause)
+		mac_fc_cfg |= SYS_MAC_FC_CFG_TX_FC_ENA |
+			      SYS_MAC_FC_CFG_PAUSE_VAL_CFG(0xffff) |
+			      SYS_MAC_FC_CFG_FC_LATENCY_CFG(0x7) |
+			      SYS_MAC_FC_CFG_ZERO_PAUSE_ENA;
 
-	/* Enable PCS */
-	ocelot_port_writel(ocelot_port, PCS1G_CFG_PCS_ENA, PCS1G_CFG);
+	/* Flow control. Link speed is only used here to evaluate the time
+	 * specification in incoming pause frames.
+	 */
+	ocelot_write_rix(ocelot, mac_fc_cfg, SYS_MAC_FC_CFG, port);
 
-	/* No aneg on SGMII */
-	ocelot_port_writel(ocelot_port, 0, PCS1G_ANEG_CFG);
+	ocelot_write_rix(ocelot, 0, ANA_POL_FLOWC, port);
 
-	/* No loopback */
-	ocelot_port_writel(ocelot_port, 0, PCS1G_LB_CFG);
+	ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, tx_pause);
 
-	/* Enable MAC module */
+	/* Undo the effects of ocelot_phylink_mac_link_down:
+	 * enable MAC module
+	 */
 	ocelot_port_writel(ocelot_port, DEV_MAC_ENA_CFG_RX_ENA |
 			   DEV_MAC_ENA_CFG_TX_ENA, DEV_MAC_ENA_CFG);
 
@@ -502,26 +557,8 @@ void ocelot_adjust_link(struct ocelot *ocelot, int port,
 	/* Core: Enable port for frame transfer */
 	ocelot_fields_write(ocelot, port,
 			    QSYS_SWITCH_PORT_MODE_PORT_ENA, 1);
-
-	/* Flow control */
-	ocelot_write_rix(ocelot, SYS_MAC_FC_CFG_PAUSE_VAL_CFG(0xffff) |
-			 SYS_MAC_FC_CFG_RX_FC_ENA | SYS_MAC_FC_CFG_TX_FC_ENA |
-			 SYS_MAC_FC_CFG_ZERO_PAUSE_ENA |
-			 SYS_MAC_FC_CFG_FC_LATENCY_CFG(0x7) |
-			 SYS_MAC_FC_CFG_FC_LINK_SPEED(speed),
-			 SYS_MAC_FC_CFG, port);
-	ocelot_write_rix(ocelot, 0, ANA_POL_FLOWC, port);
-}
-EXPORT_SYMBOL(ocelot_adjust_link);
-
-void ocelot_port_disable(struct ocelot *ocelot, int port)
-{
-	struct ocelot_port *ocelot_port = ocelot->ports[port];
-
-	ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG);
-	ocelot_fields_write(ocelot, port, QSYS_SWITCH_PORT_MODE_PORT_ENA, 0);
 }
-EXPORT_SYMBOL(ocelot_port_disable);
+EXPORT_SYMBOL_GPL(ocelot_phylink_mac_link_up);
 
 static void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port,
 					 struct sk_buff *clone)
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index db6b1a4c3926..1952d6a1b98a 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -12,8 +12,7 @@
 #include <linux/etherdevice.h>
 #include <linux/if_vlan.h>
 #include <linux/net_tstamp.h>
-#include <linux/phy.h>
-#include <linux/phy/phy.h>
+#include <linux/phylink.h>
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 
@@ -42,11 +41,9 @@ struct ocelot_port_tc {
 struct ocelot_port_private {
 	struct ocelot_port port;
 	struct net_device *dev;
-	struct phy_device *phy;
+	struct phylink *phylink;
+	struct phylink_config phylink_config;
 	u8 chip_port;
-
-	struct phy *serdes;
-
 	struct ocelot_port_tc tc;
 };
 
@@ -107,7 +104,7 @@ u32 ocelot_port_readl(struct ocelot_port *port, u32 reg);
 void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg);
 
 int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target,
-		      struct phy_device *phy);
+		      struct device_node *portnp);
 void ocelot_release_port(struct ocelot_port *ocelot_port);
 int ocelot_devlink_init(struct ocelot *ocelot);
 void ocelot_devlink_teardown(struct ocelot *ocelot);
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 6a196cd6a61a..5e8965be968a 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -9,10 +9,14 @@
  */
 
 #include <linux/if_bridge.h>
+#include <linux/of_net.h>
+#include <linux/phy/phy.h>
 #include <net/pkt_cls.h>
 #include "ocelot.h"
 #include "ocelot_vcap.h"
 
+#define OCELOT_MAC_QUIRKS	OCELOT_QUIRK_QSGMII_PORTS_MUST_BE_UP
+
 static struct ocelot *devlink_port_to_ocelot(struct devlink_port *dlp)
 {
 	return devlink_priv(dlp->devlink);
@@ -381,15 +385,6 @@ static int ocelot_setup_tc(struct net_device *dev, enum tc_setup_type type,
 	return 0;
 }
 
-static void ocelot_port_adjust_link(struct net_device *dev)
-{
-	struct ocelot_port_private *priv = netdev_priv(dev);
-	struct ocelot *ocelot = priv->port.ocelot;
-	int port = priv->chip_port;
-
-	ocelot_adjust_link(ocelot, port, dev->phydev);
-}
-
 static int ocelot_vlan_vid_prepare(struct net_device *dev, u16 vid, bool pvid,
 				   bool untagged)
 {
@@ -448,29 +443,8 @@ static int ocelot_vlan_vid_del(struct net_device *dev, u16 vid)
 static int ocelot_port_open(struct net_device *dev)
 {
 	struct ocelot_port_private *priv = netdev_priv(dev);
-	struct ocelot_port *ocelot_port = &priv->port;
-	int err;
-
-	if (priv->serdes) {
-		err = phy_set_mode_ext(priv->serdes, PHY_MODE_ETHERNET,
-				       ocelot_port->phy_mode);
-		if (err) {
-			netdev_err(dev, "Could not set mode of SerDes\n");
-			return err;
-		}
-	}
-
-	err = phy_connect_direct(dev, priv->phy, &ocelot_port_adjust_link,
-				 ocelot_port->phy_mode);
-	if (err) {
-		netdev_err(dev, "Could not attach to PHY\n");
-		return err;
-	}
-
-	dev->phydev = priv->phy;
 
-	phy_attached_info(priv->phy);
-	phy_start(priv->phy);
+	phylink_start(priv->phylink);
 
 	return 0;
 }
@@ -478,14 +452,8 @@ static int ocelot_port_open(struct net_device *dev)
 static int ocelot_port_stop(struct net_device *dev)
 {
 	struct ocelot_port_private *priv = netdev_priv(dev);
-	struct ocelot *ocelot = priv->port.ocelot;
-	int port = priv->chip_port;
-
-	phy_disconnect(priv->phy);
 
-	dev->phydev = NULL;
-
-	ocelot_port_disable(ocelot, port);
+	phylink_stop(priv->phylink);
 
 	return 0;
 }
@@ -1524,8 +1492,188 @@ struct notifier_block ocelot_switchdev_blocking_nb __read_mostly = {
 	.notifier_call = ocelot_switchdev_blocking_event,
 };
 
+static void vsc7514_phylink_validate(struct phylink_config *config,
+				     unsigned long *supported,
+				     struct phylink_link_state *state)
+{
+	struct net_device *ndev = to_net_dev(config->dev);
+	struct ocelot_port_private *priv = netdev_priv(ndev);
+	struct ocelot_port *ocelot_port = &priv->port;
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = {};
+
+	if (state->interface != PHY_INTERFACE_MODE_NA &&
+	    state->interface != ocelot_port->phy_mode) {
+		bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS);
+		return;
+	}
+
+	phylink_set_port_modes(mask);
+
+	phylink_set(mask, Pause);
+	phylink_set(mask, Autoneg);
+	phylink_set(mask, Asym_Pause);
+	phylink_set(mask, 10baseT_Half);
+	phylink_set(mask, 10baseT_Full);
+	phylink_set(mask, 100baseT_Half);
+	phylink_set(mask, 100baseT_Full);
+	phylink_set(mask, 1000baseT_Half);
+	phylink_set(mask, 1000baseT_Full);
+	phylink_set(mask, 1000baseX_Full);
+	phylink_set(mask, 2500baseT_Full);
+	phylink_set(mask, 2500baseX_Full);
+
+	bitmap_and(supported, supported, mask, __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_and(state->advertising, state->advertising, mask,
+		   __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static void vsc7514_phylink_mac_config(struct phylink_config *config,
+				       unsigned int link_an_mode,
+				       const struct phylink_link_state *state)
+{
+	struct net_device *ndev = to_net_dev(config->dev);
+	struct ocelot_port_private *priv = netdev_priv(ndev);
+	struct ocelot_port *ocelot_port = &priv->port;
+
+	/* Disable HDX fast control */
+	ocelot_port_writel(ocelot_port, DEV_PORT_MISC_HDX_FAST_DIS,
+			   DEV_PORT_MISC);
+
+	/* SGMII only for now */
+	ocelot_port_writel(ocelot_port, PCS1G_MODE_CFG_SGMII_MODE_ENA,
+			   PCS1G_MODE_CFG);
+	ocelot_port_writel(ocelot_port, PCS1G_SD_CFG_SD_SEL, PCS1G_SD_CFG);
+
+	/* Enable PCS */
+	ocelot_port_writel(ocelot_port, PCS1G_CFG_PCS_ENA, PCS1G_CFG);
+
+	/* No aneg on SGMII */
+	ocelot_port_writel(ocelot_port, 0, PCS1G_ANEG_CFG);
+
+	/* No loopback */
+	ocelot_port_writel(ocelot_port, 0, PCS1G_LB_CFG);
+}
+
+static void vsc7514_phylink_mac_link_down(struct phylink_config *config,
+					  unsigned int link_an_mode,
+					  phy_interface_t interface)
+{
+	struct net_device *ndev = to_net_dev(config->dev);
+	struct ocelot_port_private *priv = netdev_priv(ndev);
+	struct ocelot *ocelot = priv->port.ocelot;
+	int port = priv->chip_port;
+
+	ocelot_phylink_mac_link_down(ocelot, port, link_an_mode, interface,
+				     OCELOT_MAC_QUIRKS);
+}
+
+static void vsc7514_phylink_mac_link_up(struct phylink_config *config,
+					struct phy_device *phydev,
+					unsigned int link_an_mode,
+					phy_interface_t interface,
+					int speed, int duplex,
+					bool tx_pause, bool rx_pause)
+{
+	struct net_device *ndev = to_net_dev(config->dev);
+	struct ocelot_port_private *priv = netdev_priv(ndev);
+	struct ocelot *ocelot = priv->port.ocelot;
+	int port = priv->chip_port;
+
+	ocelot_phylink_mac_link_up(ocelot, port, phydev, link_an_mode,
+				   interface, speed, duplex,
+				   tx_pause, rx_pause, OCELOT_MAC_QUIRKS);
+}
+
+static const struct phylink_mac_ops ocelot_phylink_ops = {
+	.validate		= vsc7514_phylink_validate,
+	.mac_config		= vsc7514_phylink_mac_config,
+	.mac_link_down		= vsc7514_phylink_mac_link_down,
+	.mac_link_up		= vsc7514_phylink_mac_link_up,
+};
+
+static int ocelot_port_phylink_create(struct ocelot *ocelot, int port,
+				      struct device_node *portnp)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	struct ocelot_port_private *priv;
+	struct device *dev = ocelot->dev;
+	phy_interface_t phy_mode;
+	struct phylink *phylink;
+	int err;
+
+	of_get_phy_mode(portnp, &phy_mode);
+	/* DT bindings of internal PHY ports are broken and don't
+	 * specify a phy-mode
+	 */
+	if (phy_mode == PHY_INTERFACE_MODE_NA)
+		phy_mode = PHY_INTERFACE_MODE_INTERNAL;
+
+	if (phy_mode != PHY_INTERFACE_MODE_SGMII &&
+	    phy_mode != PHY_INTERFACE_MODE_QSGMII &&
+	    phy_mode != PHY_INTERFACE_MODE_INTERNAL) {
+		dev_err(dev, "unsupported phy mode %s for port %d\n",
+			phy_modes(phy_mode), port);
+		return -EINVAL;
+	}
+
+	/* Ensure clock signals and speed are set on all QSGMII links */
+	if (phy_mode == PHY_INTERFACE_MODE_QSGMII)
+		ocelot_port_rmwl(ocelot_port, 0,
+				 DEV_CLOCK_CFG_MAC_TX_RST |
+				 DEV_CLOCK_CFG_MAC_TX_RST,
+				 DEV_CLOCK_CFG);
+
+	ocelot_port->phy_mode = phy_mode;
+
+	if (phy_mode != PHY_INTERFACE_MODE_INTERNAL) {
+		struct phy *serdes = of_phy_get(portnp, NULL);
+
+		if (IS_ERR(serdes)) {
+			err = PTR_ERR(serdes);
+			dev_err_probe(dev, err,
+				      "missing SerDes phys for port %d\n",
+				      port);
+			return err;
+		}
+
+		err = phy_set_mode_ext(serdes, PHY_MODE_ETHERNET, phy_mode);
+		of_phy_put(serdes);
+		if (err) {
+			dev_err(dev, "Could not SerDes mode on port %d: %pe\n",
+				port, ERR_PTR(err));
+			return err;
+		}
+	}
+
+	priv = container_of(ocelot_port, struct ocelot_port_private, port);
+
+	priv->phylink_config.dev = &priv->dev->dev;
+	priv->phylink_config.type = PHYLINK_NETDEV;
+
+	phylink = phylink_create(&priv->phylink_config,
+				 of_fwnode_handle(portnp),
+				 phy_mode, &ocelot_phylink_ops);
+	if (IS_ERR(phylink)) {
+		err = PTR_ERR(phylink);
+		dev_err(dev, "Could not create phylink (%pe)\n", phylink);
+		return err;
+	}
+
+	priv->phylink = phylink;
+
+	err = phylink_of_phy_connect(phylink, portnp, 0);
+	if (err) {
+		dev_err(dev, "Could not connect to PHY: %pe\n", ERR_PTR(err));
+		phylink_destroy(phylink);
+		priv->phylink = NULL;
+		return err;
+	}
+
+	return 0;
+}
+
 int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target,
-		      struct phy_device *phy)
+		      struct device_node *portnp)
 {
 	struct ocelot_port_private *priv;
 	struct ocelot_port *ocelot_port;
@@ -1538,7 +1686,6 @@ int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target,
 	SET_NETDEV_DEV(dev, ocelot->dev);
 	priv = netdev_priv(dev);
 	priv->dev = dev;
-	priv->phy = phy;
 	priv->chip_port = port;
 	ocelot_port = &priv->port;
 	ocelot_port->ocelot = ocelot;
@@ -1559,15 +1706,23 @@ int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target,
 
 	ocelot_init_port(ocelot, port);
 
+	err = ocelot_port_phylink_create(ocelot, port, portnp);
+	if (err)
+		goto out;
+
 	err = register_netdev(dev);
 	if (err) {
 		dev_err(ocelot->dev, "register_netdev failed\n");
-		free_netdev(dev);
-		ocelot->ports[port] = NULL;
-		return err;
+		goto out;
 	}
 
 	return 0;
+
+out:
+	ocelot->ports[port] = NULL;
+	free_netdev(dev);
+
+	return err;
 }
 
 void ocelot_release_port(struct ocelot_port *ocelot_port)
@@ -1577,5 +1732,14 @@ void ocelot_release_port(struct ocelot_port *ocelot_port)
 						port);
 
 	unregister_netdev(priv->dev);
+
+	if (priv->phylink) {
+		rtnl_lock();
+		phylink_disconnect_phy(priv->phylink);
+		rtnl_unlock();
+
+		phylink_destroy(priv->phylink);
+	}
+
 	free_netdev(priv->dev);
 }
diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
index aa41c9cde643..18aed504f45d 100644
--- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
+++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/of_net.h>
 #include <linux/netdevice.h>
+#include <linux/phylink.h>
 #include <linux/of_mdio.h>
 #include <linux/of_platform.h>
 #include <linux/mfd/syscon.h>
@@ -945,13 +946,9 @@ static int mscc_ocelot_init_ports(struct platform_device *pdev,
 	for_each_available_child_of_node(ports, portnp) {
 		struct ocelot_port_private *priv;
 		struct ocelot_port *ocelot_port;
-		struct device_node *phy_node;
 		struct devlink_port *dlp;
-		phy_interface_t phy_mode;
-		struct phy_device *phy;
 		struct regmap *target;
 		struct resource *res;
-		struct phy *serdes;
 		char res_name[8];
 
 		if (of_property_read_u32(portnp, "reg", &reg))
@@ -975,15 +972,6 @@ static int mscc_ocelot_init_ports(struct platform_device *pdev,
 			goto out_teardown;
 		}
 
-		phy_node = of_parse_phandle(portnp, "phy-handle", 0);
-		if (!phy_node)
-			continue;
-
-		phy = of_phy_find_device(phy_node);
-		of_node_put(phy_node);
-		if (!phy)
-			continue;
-
 		err = ocelot_port_devlink_init(ocelot, port,
 					       DEVLINK_PORT_FLAVOUR_PHYSICAL);
 		if (err) {
@@ -992,7 +980,7 @@ static int mscc_ocelot_init_ports(struct platform_device *pdev,
 		}
 		devlink_ports_registered |= BIT(port);
 
-		err = ocelot_probe_port(ocelot, port, target, phy);
+		err = ocelot_probe_port(ocelot, port, target, portnp);
 		if (err) {
 			of_node_put(portnp);
 			goto out_teardown;
@@ -1003,49 +991,6 @@ static int mscc_ocelot_init_ports(struct platform_device *pdev,
 				    port);
 		dlp = &ocelot->devlink_ports[port];
 		devlink_port_type_eth_set(dlp, priv->dev);
-
-		of_get_phy_mode(portnp, &phy_mode);
-
-		ocelot_port->phy_mode = phy_mode;
-
-		switch (ocelot_port->phy_mode) {
-		case PHY_INTERFACE_MODE_NA:
-			continue;
-		case PHY_INTERFACE_MODE_SGMII:
-			break;
-		case PHY_INTERFACE_MODE_QSGMII:
-			/* Ensure clock signals and speed is set on all
-			 * QSGMII links
-			 */
-			ocelot_port_writel(ocelot_port,
-					   DEV_CLOCK_CFG_LINK_SPEED
-					   (OCELOT_SPEED_1000),
-					   DEV_CLOCK_CFG);
-			break;
-		default:
-			dev_err(ocelot->dev,
-				"invalid phy mode for port%d, (Q)SGMII only\n",
-				port);
-			of_node_put(portnp);
-			err = -EINVAL;
-			goto out_teardown;
-		}
-
-		serdes = devm_of_phy_get(ocelot->dev, portnp, NULL);
-		if (IS_ERR(serdes)) {
-			err = PTR_ERR(serdes);
-			if (err == -EPROBE_DEFER)
-				dev_dbg(ocelot->dev, "deferring probe\n");
-			else
-				dev_err(ocelot->dev,
-					"missing SerDes phys for port%d\n",
-					port);
-
-			of_node_put(portnp);
-			goto out_teardown;
-		}
-
-		priv->serdes = serdes;
 	}
 
 	/* Initialize unused devlink ports at the end */
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 2c2dcb954f23..fb5681f7e61b 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -589,6 +589,9 @@ enum ocelot_sb_pool {
 	OCELOT_SB_POOL_NUM,
 };
 
+#define OCELOT_QUIRK_PCS_PERFORMS_RATE_ADAPTATION	BIT(0)
+#define OCELOT_QUIRK_QSGMII_PORTS_MUST_BE_UP		BIT(1)
+
 struct ocelot_port {
 	struct ocelot			*ocelot;
 
@@ -798,16 +801,12 @@ void ocelot_init_port(struct ocelot *ocelot, int port);
 void ocelot_deinit_port(struct ocelot *ocelot, int port);
 
 /* DSA callbacks */
-void ocelot_port_disable(struct ocelot *ocelot, int port);
 void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset, u8 *data);
 void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data);
 int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset);
 int ocelot_get_ts_info(struct ocelot *ocelot, int port,
 		       struct ethtool_ts_info *info);
 void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs);
-int ocelot_port_flush(struct ocelot *ocelot, int port);
-void ocelot_adjust_link(struct ocelot *ocelot, int port,
-			struct phy_device *phydev);
 int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, bool enabled);
 void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state);
 void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot);
@@ -892,6 +891,18 @@ int ocelot_sb_occ_tc_port_bind_get(struct ocelot *ocelot, int port,
 				   enum devlink_sb_pool_type pool_type,
 				   u32 *p_cur, u32 *p_max);
 
+void ocelot_phylink_mac_link_down(struct ocelot *ocelot, int port,
+				  unsigned int link_an_mode,
+				  phy_interface_t interface,
+				  unsigned long quirks);
+void ocelot_phylink_mac_link_up(struct ocelot *ocelot, int port,
+				struct phy_device *phydev,
+				unsigned int link_an_mode,
+				phy_interface_t interface,
+				int speed, int duplex,
+				bool tx_pause, bool rx_pause,
+				unsigned long quirks);
+
 #if IS_ENABLED(CONFIG_BRIDGE_MRP)
 int ocelot_mrp_add(struct ocelot *ocelot, int port,
 		   const struct switchdev_obj_mrp *mrp);
-- 
cgit v1.2.3


From 7087c4f69487f017722df7d299ef9b7709996b79 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 11 Aug 2021 16:20:16 -0700
Subject: Bluetooth: Store advertising handle so it can be re-enabled

This stores the advertising handle/instance into hci_conn so it is
accessible when re-enabling the advertising once disconnected.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h |  1 +
 net/bluetooth/hci_event.c        | 45 ++++++++++++++++++++++++++--------------
 2 files changed, 30 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index a7d06d7da602..a7360c8c72f8 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -629,6 +629,7 @@ struct hci_conn {
 	__u8		init_addr_type;
 	bdaddr_t	resp_addr;
 	__u8		resp_addr_type;
+	__u8		adv_instance;
 	__u16		handle;
 	__u16		state;
 	__u8		mode;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index a8a199cfef60..1ee89d9b2ed4 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2348,19 +2348,20 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
 
 	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
 	if (conn) {
-		u8 type = conn->type;
-
 		mgmt_disconnect_failed(hdev, &conn->dst, conn->type,
 				       conn->dst_type, status);
 
+		if (conn->type == LE_LINK) {
+			hdev->cur_adv_instance = conn->adv_instance;
+			hci_req_reenable_advertising(hdev);
+		}
+
 		/* If the disconnection failed for any reason, the upper layer
 		 * does not retry to disconnect in current implementation.
 		 * Hence, we need to do some basic cleanup here and re-enable
 		 * advertising if necessary.
 		 */
 		hci_conn_del(conn);
-		if (type == LE_LINK)
-			hci_req_reenable_advertising(hdev);
 	}
 
 	hci_dev_unlock(hdev);
@@ -2886,7 +2887,6 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
 	struct hci_conn_params *params;
 	struct hci_conn *conn;
 	bool mgmt_connected;
-	u8 type;
 
 	BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
 
@@ -2941,10 +2941,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
 		}
 	}
 
-	type = conn->type;
-
 	hci_disconn_cfm(conn, ev->reason);
-	hci_conn_del(conn);
 
 	/* The suspend notifier is waiting for all devices to disconnect so
 	 * clear the bit from pending tasks and inform the wait queue.
@@ -2964,8 +2961,12 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
 	 * or until a connection is created or until the Advertising
 	 * is timed out due to Directed Advertising."
 	 */
-	if (type == LE_LINK)
+	if (conn->type == LE_LINK) {
+		hdev->cur_adv_instance = conn->adv_instance;
 		hci_req_reenable_advertising(hdev);
+	}
+
+	hci_conn_del(conn);
 
 unlock:
 	hci_dev_unlock(hdev);
@@ -5323,6 +5324,13 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 	conn->handle = handle;
 	conn->state = BT_CONFIG;
 
+	/* Store current advertising instance as connection advertising instance
+	 * when sotfware rotation is in use so it can be re-enabled when
+	 * disconnected.
+	 */
+	if (!ext_adv_capable(hdev))
+		conn->adv_instance = hdev->cur_adv_instance;
+
 	conn->le_conn_interval = interval;
 	conn->le_conn_latency = latency;
 	conn->le_supv_timeout = supervision_timeout;
@@ -5406,13 +5414,13 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb)
 {
 	struct hci_evt_le_ext_adv_set_term *ev = (void *) skb->data;
 	struct hci_conn *conn;
+	struct adv_info *adv;
 
 	BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
 
-	if (ev->status) {
-		struct adv_info *adv;
+	adv = hci_find_adv_instance(hdev, ev->handle);
 
-		adv = hci_find_adv_instance(hdev, ev->handle);
+	if (ev->status) {
 		if (!adv)
 			return;
 
@@ -5423,9 +5431,15 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb)
 		return;
 	}
 
+	if (adv)
+		adv->enabled = false;
+
 	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->conn_handle));
 	if (conn) {
-		struct adv_info *adv_instance;
+		/* Store handle in the connection so the correct advertising
+		 * instance can be re-enabled when disconnected.
+		 */
+		conn->adv_instance = ev->handle;
 
 		if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM ||
 		    bacmp(&conn->resp_addr, BDADDR_ANY))
@@ -5436,9 +5450,8 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb)
 			return;
 		}
 
-		adv_instance = hci_find_adv_instance(hdev, ev->handle);
-		if (adv_instance)
-			bacpy(&conn->resp_addr, &adv_instance->random_addr);
+		if (adv)
+			bacpy(&conn->resp_addr, &adv->random_addr);
 	}
 }
 
-- 
cgit v1.2.3


From 5b4ecc3d4c4aab8d002fe6358885c10e7b57e432 Mon Sep 17 00:00:00 2001
From: Guangbin Huang <huangguangbin2@huawei.com>
Date: Mon, 16 Aug 2021 10:15:27 +0800
Subject: ethtool: add two link extended substates of bad signal integrity

ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST means the input
external clock signal for SerDes is too weak or lost.

ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS means the received signal for
SerDes is too weak because analog loss of signal.

Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 67aa7134b301..b6db6590baf0 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -639,6 +639,8 @@ enum ethtool_link_ext_substate_link_logical_mismatch {
 enum ethtool_link_ext_substate_bad_signal_integrity {
 	ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1,
 	ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE,
+	ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST,
+	ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS,
 };
 
 /* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */
-- 
cgit v1.2.3


From fb7dd8bca0139fd73d3f4a6cd257b11731317ded Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 15 Aug 2021 00:05:54 -0700
Subject: bpf: Refactor BPF_PROG_RUN into a function

Turn BPF_PROG_RUN into a proper always inlined function. No functional and
performance changes are intended, but it makes it much easier to understand
what's going on with how BPF programs are actually get executed. It's more
obvious what types and callbacks are expected. Also extra () around input
parameters can be dropped, as well as `__` variable prefixes intended to avoid
naming collisions, which makes the code simpler to read and write.

This refactoring also highlighted one extra issue. BPF_PROG_RUN is both
a macro and an enum value (BPF_PROG_RUN == BPF_PROG_TEST_RUN). Turning
BPF_PROG_RUN into a function causes naming conflict compilation error. So
rename BPF_PROG_RUN into lower-case bpf_prog_run(), similar to
bpf_prog_run_xdp(), bpf_prog_run_pin_on_cpu(), etc. All existing callers of
BPF_PROG_RUN, the macro, are switched to bpf_prog_run() explicitly.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210815070609.987780-2-andrii@kernel.org
---
 Documentation/networking/filter.rst      |  4 +--
 drivers/media/rc/bpf-lirc.c              |  2 +-
 drivers/net/ppp/ppp_generic.c            |  8 ++---
 drivers/net/team/team_mode_loadbalance.c |  2 +-
 include/linux/bpf.h                      |  2 +-
 include/linux/filter.h                   | 61 +++++++++++++++++++-------------
 kernel/bpf/bpf_iter.c                    |  2 +-
 kernel/bpf/cgroup.c                      | 16 ++++-----
 kernel/bpf/core.c                        |  2 +-
 kernel/bpf/trampoline.c                  |  2 +-
 kernel/bpf/verifier.c                    |  2 +-
 kernel/events/core.c                     |  2 +-
 kernel/trace/bpf_trace.c                 |  4 +--
 lib/test_bpf.c                           |  2 +-
 net/bpf/test_run.c                       |  6 ++--
 net/core/filter.c                        |  4 +--
 net/core/ptp_classifier.c                |  2 +-
 net/netfilter/xt_bpf.c                   |  2 +-
 net/sched/act_bpf.c                      |  4 +--
 net/sched/cls_bpf.c                      |  4 +--
 20 files changed, 73 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst
index 5f13905b12e0..ce2b8e8bb9ab 100644
--- a/Documentation/networking/filter.rst
+++ b/Documentation/networking/filter.rst
@@ -638,8 +638,8 @@ extension, PTP dissector/classifier, and much more. They are all internally
 converted by the kernel into the new instruction set representation and run
 in the eBPF interpreter. For in-kernel handlers, this all works transparently
 by using bpf_prog_create() for setting up the filter, resp.
-bpf_prog_destroy() for destroying it. The macro
-BPF_PROG_RUN(filter, ctx) transparently invokes eBPF interpreter or JITed
+bpf_prog_destroy() for destroying it. The function
+bpf_prog_run(filter, ctx) transparently invokes eBPF interpreter or JITed
 code to run the filter. 'filter' is a pointer to struct bpf_prog that we
 got from bpf_prog_create(), and 'ctx' the given context (e.g.
 skb pointer). All constraints and restrictions from bpf_check_classic() apply
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index afae0afe3f81..bb5a9dc78f1b 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -217,7 +217,7 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample)
 	raw->bpf_sample = sample;
 
 	if (raw->progs)
-		BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, BPF_PROG_RUN);
+		BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, bpf_prog_run);
 }
 
 /*
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index e9e81573f21e..fb52cd175b45 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -1744,7 +1744,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb)
 		   a four-byte PPP header on each packet */
 		*(u8 *)skb_push(skb, 2) = 1;
 		if (ppp->pass_filter &&
-		    BPF_PROG_RUN(ppp->pass_filter, skb) == 0) {
+		    bpf_prog_run(ppp->pass_filter, skb) == 0) {
 			if (ppp->debug & 1)
 				netdev_printk(KERN_DEBUG, ppp->dev,
 					      "PPP: outbound frame "
@@ -1754,7 +1754,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb)
 		}
 		/* if this packet passes the active filter, record the time */
 		if (!(ppp->active_filter &&
-		      BPF_PROG_RUN(ppp->active_filter, skb) == 0))
+		      bpf_prog_run(ppp->active_filter, skb) == 0))
 			ppp->last_xmit = jiffies;
 		skb_pull(skb, 2);
 #else
@@ -2468,7 +2468,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 
 			*(u8 *)skb_push(skb, 2) = 0;
 			if (ppp->pass_filter &&
-			    BPF_PROG_RUN(ppp->pass_filter, skb) == 0) {
+			    bpf_prog_run(ppp->pass_filter, skb) == 0) {
 				if (ppp->debug & 1)
 					netdev_printk(KERN_DEBUG, ppp->dev,
 						      "PPP: inbound frame "
@@ -2477,7 +2477,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 				return;
 			}
 			if (!(ppp->active_filter &&
-			      BPF_PROG_RUN(ppp->active_filter, skb) == 0))
+			      bpf_prog_run(ppp->active_filter, skb) == 0))
 				ppp->last_recv = jiffies;
 			__skb_pull(skb, 2);
 		} else
diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c
index 32aef8ac4a14..b095a4b4957b 100644
--- a/drivers/net/team/team_mode_loadbalance.c
+++ b/drivers/net/team/team_mode_loadbalance.c
@@ -197,7 +197,7 @@ static unsigned int lb_get_skb_hash(struct lb_priv *lb_priv,
 	fp = rcu_dereference_bh(lb_priv->fp);
 	if (unlikely(!fp))
 		return 0;
-	lhash = BPF_PROG_RUN(fp, skb);
+	lhash = bpf_prog_run(fp, skb);
 	c = (char *) &lhash;
 	return c[0] ^ c[1] ^ c[2] ^ c[3];
 }
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c8cc09013210..968fea98087a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1103,7 +1103,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 /* an array of programs to be executed under rcu_lock.
  *
  * Typical usage:
- * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN);
+ * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, bpf_prog_run);
  *
  * the structure returned by bpf_prog_array_alloc() should be populated
  * with program pointers and the last pointer must be NULL.
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1797e8506929..954373db20e7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -600,25 +600,38 @@ struct sk_filter {
 
 DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 
-#define __BPF_PROG_RUN(prog, ctx, dfunc)	({			\
-	u32 __ret;							\
-	cant_migrate();							\
-	if (static_branch_unlikely(&bpf_stats_enabled_key)) {		\
-		struct bpf_prog_stats *__stats;				\
-		u64 __start = sched_clock();				\
-		__ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);	\
-		__stats = this_cpu_ptr(prog->stats);			\
-		u64_stats_update_begin(&__stats->syncp);		\
-		__stats->cnt++;						\
-		__stats->nsecs += sched_clock() - __start;		\
-		u64_stats_update_end(&__stats->syncp);			\
-	} else {							\
-		__ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);	\
-	}								\
-	__ret; })
-
-#define BPF_PROG_RUN(prog, ctx)						\
-	__BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func)
+typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
+					  const struct bpf_insn *insnsi,
+					  unsigned int (*bpf_func)(const void *,
+								   const struct bpf_insn *));
+
+static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
+					  const void *ctx,
+					  bpf_dispatcher_fn dfunc)
+{
+	u32 ret;
+
+	cant_migrate();
+	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
+		struct bpf_prog_stats *stats;
+		u64 start = sched_clock();
+
+		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+		stats = this_cpu_ptr(prog->stats);
+		u64_stats_update_begin(&stats->syncp);
+		stats->cnt++;
+		stats->nsecs += sched_clock() - start;
+		u64_stats_update_end(&stats->syncp);
+	} else {
+		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+	}
+	return ret;
+}
+
+static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx)
+{
+	return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);
+}
 
 /*
  * Use in preemptible and therefore migratable context to make sure that
@@ -637,7 +650,7 @@ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
 	u32 ret;
 
 	migrate_disable();
-	ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func);
+	ret = bpf_prog_run(prog, ctx);
 	migrate_enable();
 	return ret;
 }
@@ -742,7 +755,7 @@ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
 		memset(cb_data, 0, sizeof(cb_saved));
 	}
 
-	res = BPF_PROG_RUN(prog, skb);
+	res = bpf_prog_run(prog, skb);
 
 	if (unlikely(prog->cb_access))
 		memcpy(cb_data, cb_saved, sizeof(cb_saved));
@@ -787,7 +800,7 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 	 * under local_bh_disable(), which provides the needed RCU protection
 	 * for accessing map entries.
 	 */
-	u32 act = __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp));
+	u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp));
 
 	if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) {
 		if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev))
@@ -1440,7 +1453,7 @@ static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
 		};
 		u32 act;
 
-		act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN);
+		act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run);
 		if (act == SK_PASS) {
 			selected_sk = ctx.selected_sk;
 			no_reuseport = ctx.no_reuseport;
@@ -1478,7 +1491,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 		};
 		u32 act;
 
-		act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN);
+		act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run);
 		if (act == SK_PASS) {
 			selected_sk = ctx.selected_sk;
 			no_reuseport = ctx.no_reuseport;
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 2e9d47bb40ff..b2ee45064e06 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -686,7 +686,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
 
 	rcu_read_lock();
 	migrate_disable();
-	ret = BPF_PROG_RUN(prog, ctx);
+	ret = bpf_prog_run(prog, ctx);
 	migrate_enable();
 	rcu_read_unlock();
 
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9f6070369caa..16dc467adfa0 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1043,7 +1043,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	int ret;
 
-	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, bpf_prog_run);
 	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1091,7 +1091,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx,
-				       BPF_PROG_RUN, flags);
+				       bpf_prog_run, flags);
 
 	return ret == 1 ? 0 : -EPERM;
 }
@@ -1121,7 +1121,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 	int ret;
 
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
-				 BPF_PROG_RUN);
+				 bpf_prog_run);
 	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
@@ -1140,7 +1140,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
 	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
-				   BPF_PROG_RUN);
+				   bpf_prog_run);
 	rcu_read_unlock();
 
 	return !allow;
@@ -1271,7 +1271,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, bpf_prog_run);
 	rcu_read_unlock();
 
 	kfree(ctx.cur_val);
@@ -1386,7 +1386,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 
 	lock_sock(sk);
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
-				 &ctx, BPF_PROG_RUN);
+				 &ctx, bpf_prog_run);
 	release_sock(sk);
 
 	if (!ret) {
@@ -1496,7 +1496,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 
 	lock_sock(sk);
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
-				 &ctx, BPF_PROG_RUN);
+				 &ctx, bpf_prog_run);
 	release_sock(sk);
 
 	if (!ret) {
@@ -1557,7 +1557,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 	 */
 
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
-				 &ctx, BPF_PROG_RUN);
+				 &ctx, bpf_prog_run);
 	if (!ret)
 		return -EPERM;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 82af6279992d..5ee2ec27c3d4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1879,7 +1879,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
  *	@err: pointer to error variable
  *
  * Try to JIT eBPF program, if JIT is not available, use interpreter.
- * The BPF program will be executed via BPF_PROG_RUN() macro.
+ * The BPF program will be executed via bpf_prog_run() function.
  *
  * Return: the &fp argument along with &err set to 0 for success or
  * a negative errno code on failure
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index b2535acfe9db..fe1e857324e6 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -548,7 +548,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
 	u64_stats_update_end(&stats->syncp);
 }
 
-/* The logic is similar to BPF_PROG_RUN, but with an explicit
+/* The logic is similar to bpf_prog_run(), but with an explicit
  * rcu_read_lock() and migrate_disable() which are required
  * for the trampoline. The macro is split into
  * call __bpf_prog_enter
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5ea2238a6656..f5a0077c9981 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12383,7 +12383,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		subprog_end = env->subprog_info[i + 1].start;
 
 		len = subprog_end - subprog_start;
-		/* BPF_PROG_RUN doesn't call subprogs directly,
+		/* bpf_prog_run() doesn't call subprogs directly,
 		 * hence main prog stats include the runtime of subprogs.
 		 * subprogs don't have IDs and not reachable via prog_get_next_id
 		 * func[i]->stats will never be accessed and stays NULL
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1cb1f9b8392e..7d20743b48e1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9913,7 +9913,7 @@ static void bpf_overflow_handler(struct perf_event *event,
 	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
 		goto out;
 	rcu_read_lock();
-	ret = BPF_PROG_RUN(event->prog, &ctx);
+	ret = bpf_prog_run(event->prog, &ctx);
 	rcu_read_unlock();
 out:
 	__this_cpu_dec(bpf_prog_active);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0da94e1d6af9..05a5a556671d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -124,7 +124,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 	 * out of events when it was updated in between this and the
 	 * rcu_dereference() which is accepted risk.
 	 */
-	ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
+	ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, bpf_prog_run);
 
  out:
 	__this_cpu_dec(bpf_prog_active);
@@ -1816,7 +1816,7 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
 {
 	cant_sleep();
 	rcu_read_lock();
-	(void) BPF_PROG_RUN(prog, args);
+	(void) bpf_prog_run(prog, args);
 	rcu_read_unlock();
 }
 
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 77fe6fde56c5..830a18ecffc8 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -8616,7 +8616,7 @@ static int __run_one(const struct bpf_prog *fp, const void *data,
 	start = ktime_get_ns();
 
 	for (i = 0; i < runs; i++)
-		ret = BPF_PROG_RUN(fp, data);
+		ret = bpf_prog_run(fp, data);
 
 	finish = ktime_get_ns();
 	migrate_enable();
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 4b855af267b1..2eb0e55ef54d 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -116,7 +116,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 		if (xdp)
 			*retval = bpf_prog_run_xdp(prog, ctx);
 		else
-			*retval = BPF_PROG_RUN(prog, ctx);
+			*retval = bpf_prog_run(prog, ctx);
 	} while (bpf_test_timer_continue(&t, repeat, &ret, time));
 	bpf_reset_run_ctx(old_ctx);
 	bpf_test_timer_leave(&t);
@@ -327,7 +327,7 @@ __bpf_prog_test_run_raw_tp(void *data)
 	struct bpf_raw_tp_test_run_info *info = data;
 
 	rcu_read_lock();
-	info->retval = BPF_PROG_RUN(info->prog, info->ctx);
+	info->retval = bpf_prog_run(info->prog, info->ctx);
 	rcu_read_unlock();
 }
 
@@ -989,7 +989,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat
 	bpf_test_timer_enter(&t);
 	do {
 		ctx.selected_sk = NULL;
-		retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN);
+		retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run);
 	} while (bpf_test_timer_continue(&t, repeat, &ret, &duration));
 	bpf_test_timer_leave(&t);
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 3aca07c44fad..5cf38e8886f1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -114,7 +114,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
  * Run the eBPF program and then cut skb->data to correct size returned by
  * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
  * than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
+ * wrapper to bpf_prog_run. It returns 0 if the packet should
  * be accepted or -EPERM if the packet should be tossed.
  *
  */
@@ -10115,7 +10115,7 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 	enum sk_action action;
 
 	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
-	action = BPF_PROG_RUN(prog, &reuse_kern);
+	action = bpf_prog_run(prog, &reuse_kern);
 
 	if (action == SK_PASS)
 		return reuse_kern.selected_sk;
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
index e33fde06d528..dd4cf01d1e0a 100644
--- a/net/core/ptp_classifier.c
+++ b/net/core/ptp_classifier.c
@@ -103,7 +103,7 @@ static struct bpf_prog *ptp_insns __read_mostly;
 
 unsigned int ptp_classify_raw(const struct sk_buff *skb)
 {
-	return BPF_PROG_RUN(ptp_insns, skb);
+	return bpf_prog_run(ptp_insns, skb);
 }
 EXPORT_SYMBOL_GPL(ptp_classify_raw);
 
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 13cf3f9b5938..849ac552a154 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -90,7 +90,7 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_bpf_info *info = par->matchinfo;
 
-	return BPF_PROG_RUN(info->filter, skb);
+	return bpf_prog_run(info->filter, skb);
 }
 
 static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 040807aa15b9..5c36013339e1 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -47,11 +47,11 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
 	if (at_ingress) {
 		__skb_push(skb, skb->mac_len);
 		bpf_compute_data_pointers(skb);
-		filter_res = BPF_PROG_RUN(filter, skb);
+		filter_res = bpf_prog_run(filter, skb);
 		__skb_pull(skb, skb->mac_len);
 	} else {
 		bpf_compute_data_pointers(skb);
-		filter_res = BPF_PROG_RUN(filter, skb);
+		filter_res = bpf_prog_run(filter, skb);
 	}
 	if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
 		skb_orphan(skb);
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 3b472bafdc9d..df19a847829e 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -96,11 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			/* It is safe to push/pull even if skb_shared() */
 			__skb_push(skb, skb->mac_len);
 			bpf_compute_data_pointers(skb);
-			filter_res = BPF_PROG_RUN(prog->filter, skb);
+			filter_res = bpf_prog_run(prog->filter, skb);
 			__skb_pull(skb, skb->mac_len);
 		} else {
 			bpf_compute_data_pointers(skb);
-			filter_res = BPF_PROG_RUN(prog->filter, skb);
+			filter_res = bpf_prog_run(prog->filter, skb);
 		}
 
 		if (prog->exts_integrated) {
-- 
cgit v1.2.3


From 7d08c2c9117113fee118487425ed55efa50cbfa9 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 15 Aug 2021 00:05:55 -0700
Subject: bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions

Similar to BPF_PROG_RUN, turn BPF_PROG_RUN_ARRAY macros into proper functions
with all the same readability and maintainability benefits. Making them into
functions required shuffling around bpf_set_run_ctx/bpf_reset_run_ctx
functions. Also, explicitly specifying the type of the BPF prog run callback
required adjusting __bpf_prog_run_save_cb() to accept const void *, casted
internally to const struct sk_buff.

Further, split out a cgroup-specific BPF_PROG_RUN_ARRAY_CG and
BPF_PROG_RUN_ARRAY_CG_FLAGS from the more generic BPF_PROG_RUN_ARRAY due to
the differences in bpf_run_ctx used for those two different use cases.

I think BPF_PROG_RUN_ARRAY_CG would benefit from further refactoring to accept
struct cgroup and enum bpf_attach_type instead of bpf_prog_array, fetching
cgrp->bpf.effective[type] and RCU-dereferencing it internally. But that
required including include/linux/cgroup-defs.h, which I wasn't sure is ok with
everyone.

The remaining generic BPF_PROG_RUN_ARRAY function will be extended to
pass-through user-provided context value in the next patch.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210815070609.987780-3-andrii@kernel.org
---
 include/linux/bpf.h      | 179 +++++++++++++++++++++++++++--------------------
 include/linux/filter.h   |   5 +-
 kernel/bpf/cgroup.c      |  32 ++++-----
 kernel/trace/bpf_trace.c |   2 +-
 4 files changed, 124 insertions(+), 94 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 968fea98087a..344e0d4d8ef6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1146,67 +1146,116 @@ struct bpf_run_ctx {};
 
 struct bpf_cg_run_ctx {
 	struct bpf_run_ctx run_ctx;
-	struct bpf_prog_array_item *prog_item;
+	const struct bpf_prog_array_item *prog_item;
 };
 
+static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
+{
+	struct bpf_run_ctx *old_ctx = NULL;
+
+#ifdef CONFIG_BPF_SYSCALL
+	old_ctx = current->bpf_ctx;
+	current->bpf_ctx = new_ctx;
+#endif
+	return old_ctx;
+}
+
+static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx)
+{
+#ifdef CONFIG_BPF_SYSCALL
+	current->bpf_ctx = old_ctx;
+#endif
+}
+
 /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */
 #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE			(1 << 0)
 /* BPF program asks to set CN on the packet. */
 #define BPF_RET_SET_CN						(1 << 0)
 
-#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags)		\
-	({								\
-		struct bpf_prog_array_item *_item;			\
-		struct bpf_prog *_prog;					\
-		struct bpf_prog_array *_array;				\
-		struct bpf_run_ctx *old_run_ctx;			\
-		struct bpf_cg_run_ctx run_ctx;				\
-		u32 _ret = 1;						\
-		u32 func_ret;						\
-		migrate_disable();					\
-		rcu_read_lock();					\
-		_array = rcu_dereference(array);			\
-		_item = &_array->items[0];				\
-		old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);	\
-		while ((_prog = READ_ONCE(_item->prog))) {		\
-			run_ctx.prog_item = _item;			\
-			func_ret = func(_prog, ctx);			\
-			_ret &= (func_ret & 1);				\
-			*(ret_flags) |= (func_ret >> 1);		\
-			_item++;					\
-		}							\
-		bpf_reset_run_ctx(old_run_ctx);				\
-		rcu_read_unlock();					\
-		migrate_enable();					\
-		_ret;							\
-	 })
-
-#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage)	\
-	({						\
-		struct bpf_prog_array_item *_item;	\
-		struct bpf_prog *_prog;			\
-		struct bpf_prog_array *_array;		\
-		struct bpf_run_ctx *old_run_ctx;	\
-		struct bpf_cg_run_ctx run_ctx;		\
-		u32 _ret = 1;				\
-		migrate_disable();			\
-		rcu_read_lock();			\
-		_array = rcu_dereference(array);	\
-		if (unlikely(check_non_null && !_array))\
-			goto _out;			\
-		_item = &_array->items[0];		\
-		old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\
-		while ((_prog = READ_ONCE(_item->prog))) {	\
-			run_ctx.prog_item = _item;	\
-			_ret &= func(_prog, ctx);	\
-			_item++;			\
-		}					\
-		bpf_reset_run_ctx(old_run_ctx);		\
-_out:							\
-		rcu_read_unlock();			\
-		migrate_enable();			\
-		_ret;					\
-	 })
+typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx);
+
+static __always_inline u32
+BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu,
+			    const void *ctx, bpf_prog_run_fn run_prog,
+			    u32 *ret_flags)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_cg_run_ctx run_ctx;
+	u32 ret = 1;
+	u32 func_ret;
+
+	migrate_disable();
+	rcu_read_lock();
+	array = rcu_dereference(array_rcu);
+	item = &array->items[0];
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	while ((prog = READ_ONCE(item->prog))) {
+		run_ctx.prog_item = item;
+		func_ret = run_prog(prog, ctx);
+		ret &= (func_ret & 1);
+		*(ret_flags) |= (func_ret >> 1);
+		item++;
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+	rcu_read_unlock();
+	migrate_enable();
+	return ret;
+}
+
+static __always_inline u32
+BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu,
+		      const void *ctx, bpf_prog_run_fn run_prog)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_cg_run_ctx run_ctx;
+	u32 ret = 1;
+
+	migrate_disable();
+	rcu_read_lock();
+	array = rcu_dereference(array_rcu);
+	item = &array->items[0];
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	while ((prog = READ_ONCE(item->prog))) {
+		run_ctx.prog_item = item;
+		ret &= run_prog(prog, ctx);
+		item++;
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+	rcu_read_unlock();
+	migrate_enable();
+	return ret;
+}
+
+static __always_inline u32
+BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu,
+		   const void *ctx, bpf_prog_run_fn run_prog)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	u32 ret = 1;
+
+	migrate_disable();
+	rcu_read_lock();
+	array = rcu_dereference(array_rcu);
+	if (unlikely(!array))
+		goto out;
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		ret &= run_prog(prog, ctx);
+		item++;
+	}
+out:
+	rcu_read_unlock();
+	migrate_enable();
+	return ret;
+}
 
 /* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs
  * so BPF programs can request cwr for TCP packets.
@@ -1235,7 +1284,7 @@ _out:							\
 		u32 _flags = 0;				\
 		bool _cn;				\
 		u32 _ret;				\
-		_ret = BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, &_flags); \
+		_ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \
 		_cn = _flags & BPF_RET_SET_CN;		\
 		if (_ret)				\
 			_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);	\
@@ -1244,12 +1293,6 @@ _out:							\
 		_ret;					\
 	})
 
-#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
-	__BPF_PROG_RUN_ARRAY(array, ctx, func, false, true)
-
-#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)	\
-	__BPF_PROG_RUN_ARRAY(array, ctx, func, true, false)
-
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 extern struct mutex bpf_stats_enabled_mutex;
@@ -1284,20 +1327,6 @@ static inline void bpf_enable_instrumentation(void)
 	migrate_enable();
 }
 
-static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
-{
-	struct bpf_run_ctx *old_ctx;
-
-	old_ctx = current->bpf_ctx;
-	current->bpf_ctx = new_ctx;
-	return old_ctx;
-}
-
-static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx)
-{
-	current->bpf_ctx = old_ctx;
-}
-
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 954373db20e7..7d248941ecea 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -723,7 +723,7 @@ static inline void bpf_restore_data_end(
 	cb->data_end = saved_data_end;
 }
 
-static inline u8 *bpf_skb_cb(struct sk_buff *skb)
+static inline u8 *bpf_skb_cb(const struct sk_buff *skb)
 {
 	/* eBPF programs may read/write skb->cb[] area to transfer meta
 	 * data between tail calls. Since this also needs to work with
@@ -744,8 +744,9 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
 
 /* Must be invoked with migration disabled */
 static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
-					 struct sk_buff *skb)
+					 const void *ctx)
 {
+	const struct sk_buff *skb = ctx;
 	u8 *cb_data = bpf_skb_cb(skb);
 	u8 cb_saved[BPF_SKB_CB_LEN];
 	u32 res;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 16dc467adfa0..a1dedba4c174 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1012,8 +1012,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
 			cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
 	} else {
-		ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
-					  __bpf_prog_run_save_cb);
+		ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], skb,
+					    __bpf_prog_run_save_cb);
 		ret = (ret == 1 ? 0 : -EPERM);
 	}
 	bpf_restore_data_end(skb, saved_data_end);
@@ -1043,7 +1043,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	int ret;
 
-	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, bpf_prog_run);
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sk, bpf_prog_run);
 	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1090,8 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	}
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx,
-				       bpf_prog_run, flags);
+	ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[type], &ctx,
+				          bpf_prog_run, flags);
 
 	return ret == 1 ? 0 : -EPERM;
 }
@@ -1120,8 +1120,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	int ret;
 
-	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
-				 bpf_prog_run);
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sock_ops,
+				    bpf_prog_run);
 	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
@@ -1139,8 +1139,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
-				   bpf_prog_run);
+	allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx,
+				      bpf_prog_run);
 	rcu_read_unlock();
 
 	return !allow;
@@ -1271,7 +1271,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, bpf_prog_run);
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, bpf_prog_run);
 	rcu_read_unlock();
 
 	kfree(ctx.cur_val);
@@ -1385,8 +1385,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	}
 
 	lock_sock(sk);
-	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
-				 &ctx, bpf_prog_run);
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+				    &ctx, bpf_prog_run);
 	release_sock(sk);
 
 	if (!ret) {
@@ -1495,8 +1495,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	}
 
 	lock_sock(sk);
-	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
-				 &ctx, bpf_prog_run);
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+				    &ctx, bpf_prog_run);
 	release_sock(sk);
 
 	if (!ret) {
@@ -1556,8 +1556,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 	 * be called if that data shouldn't be "exported".
 	 */
 
-	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
-				 &ctx, bpf_prog_run);
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+				    &ctx, bpf_prog_run);
 	if (!ret)
 		return -EPERM;
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 05a5a556671d..91867b14b222 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -124,7 +124,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 	 * out of events when it was updated in between this and the
 	 * rcu_dereference() which is accepted risk.
 	 */
-	ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, bpf_prog_run);
+	ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run);
 
  out:
 	__this_cpu_dec(bpf_prog_active);
-- 
cgit v1.2.3


From b89fbfbb854c9afc3047e8273cc3a694650b802e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 15 Aug 2021 00:05:57 -0700
Subject: bpf: Implement minimal BPF perf link

Introduce a new type of BPF link - BPF perf link. This brings perf_event-based
BPF program attachments (perf_event, tracepoints, kprobes, and uprobes) into
the common BPF link infrastructure, allowing to list all active perf_event
based attachments, auto-detaching BPF program from perf_event when link's FD
is closed, get generic BPF link fdinfo/get_info functionality.

BPF_LINK_CREATE command expects perf_event's FD as target_fd. No extra flags
are currently supported.

Force-detaching and atomic BPF program updates are not yet implemented, but
with perf_event-based BPF links we now have common framework for this without
the need to extend ioctl()-based perf_event interface.

One interesting consideration is a new value for bpf_attach_type, which
BPF_LINK_CREATE command expects. Generally, it's either 1-to-1 mapping from
bpf_attach_type to bpf_prog_type, or many-to-1 mapping from a subset of
bpf_attach_types to one bpf_prog_type (e.g., see BPF_PROG_TYPE_SK_SKB or
BPF_PROG_TYPE_CGROUP_SOCK). In this case, though, we have three different
program types (KPROBE, TRACEPOINT, PERF_EVENT) using the same perf_event-based
mechanism, so it's many bpf_prog_types to one bpf_attach_type. I chose to
define a single BPF_PERF_EVENT attach type for all of them and adjust
link_create()'s logic for checking correspondence between attach type and
program type.

The alternative would be to define three new attach types (e.g., BPF_KPROBE,
BPF_TRACEPOINT, and BPF_PERF_EVENT), but that seemed like unnecessary overkill
and BPF_KPROBE will cause naming conflicts with BPF_KPROBE() macro, defined by
libbpf. I chose to not do this to avoid unnecessary proliferation of
bpf_attach_type enum values and not have to deal with naming conflicts.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/bpf/20210815070609.987780-5-andrii@kernel.org
---
 include/linux/bpf_types.h      |   3 ++
 include/linux/trace_events.h   |   3 ++
 include/uapi/linux/bpf.h       |   2 +
 kernel/bpf/syscall.c           | 105 ++++++++++++++++++++++++++++++++++++++---
 kernel/events/core.c           |  10 ++--
 tools/include/uapi/linux/bpf.h |   2 +
 6 files changed, 112 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index ae3ac3a2018c..9c81724e4b98 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -136,3 +136,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
 BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
 BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp)
 #endif
+#ifdef CONFIG_PERF_EVENTS
+BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf)
+#endif
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index ad413b382a3c..8ac92560d3a3 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -803,6 +803,9 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
 void perf_trace_buf_update(void *record, u16 type);
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
+void perf_event_free_bpf_prog(struct perf_event *event);
+
 void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
 void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
 void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2db6925e04f4..94fe8329b28f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -993,6 +993,7 @@ enum bpf_attach_type {
 	BPF_SK_SKB_VERDICT,
 	BPF_SK_REUSEPORT_SELECT,
 	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
+	BPF_PERF_EVENT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1006,6 +1007,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_ITER = 4,
 	BPF_LINK_TYPE_NETNS = 5,
 	BPF_LINK_TYPE_XDP = 6,
+	BPF_LINK_TYPE_PERF_EVENT = 7,
 
 	MAX_BPF_LINK_TYPE,
 };
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9a2068e39d23..80c03bedd6e6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2906,6 +2906,79 @@ static const struct bpf_link_ops bpf_raw_tp_link_lops = {
 	.fill_link_info = bpf_raw_tp_link_fill_link_info,
 };
 
+#ifdef CONFIG_PERF_EVENTS
+struct bpf_perf_link {
+	struct bpf_link link;
+	struct file *perf_file;
+};
+
+static void bpf_perf_link_release(struct bpf_link *link)
+{
+	struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+	struct perf_event *event = perf_link->perf_file->private_data;
+
+	perf_event_free_bpf_prog(event);
+	fput(perf_link->perf_file);
+}
+
+static void bpf_perf_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+
+	kfree(perf_link);
+}
+
+static const struct bpf_link_ops bpf_perf_link_lops = {
+	.release = bpf_perf_link_release,
+	.dealloc = bpf_perf_link_dealloc,
+};
+
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_link_primer link_primer;
+	struct bpf_perf_link *link;
+	struct perf_event *event;
+	struct file *perf_file;
+	int err;
+
+	if (attr->link_create.flags)
+		return -EINVAL;
+
+	perf_file = perf_event_get(attr->link_create.target_fd);
+	if (IS_ERR(perf_file))
+		return PTR_ERR(perf_file);
+
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link) {
+		err = -ENOMEM;
+		goto out_put_file;
+	}
+	bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
+	link->perf_file = perf_file;
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err) {
+		kfree(link);
+		goto out_put_file;
+	}
+
+	event = perf_file->private_data;
+	err = perf_event_set_bpf_prog(event, prog);
+	if (err) {
+		bpf_link_cleanup(&link_primer);
+		goto out_put_file;
+	}
+	/* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
+	bpf_prog_inc(prog);
+
+	return bpf_link_settle(&link_primer);
+
+out_put_file:
+	fput(perf_file);
+	return err;
+}
+#endif /* CONFIG_PERF_EVENTS */
+
 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
 
 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
@@ -4147,15 +4220,26 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	if (ret)
 		goto out;
 
-	if (prog->type == BPF_PROG_TYPE_EXT) {
+	switch (prog->type) {
+	case BPF_PROG_TYPE_EXT:
 		ret = tracing_bpf_link_attach(attr, uattr, prog);
 		goto out;
-	}
-
-	ptype = attach_type_to_prog_type(attr->link_create.attach_type);
-	if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
-		ret = -EINVAL;
-		goto out;
+	case BPF_PROG_TYPE_PERF_EVENT:
+	case BPF_PROG_TYPE_KPROBE:
+	case BPF_PROG_TYPE_TRACEPOINT:
+		if (attr->link_create.attach_type != BPF_PERF_EVENT) {
+			ret = -EINVAL;
+			goto out;
+		}
+		ptype = prog->type;
+		break;
+	default:
+		ptype = attach_type_to_prog_type(attr->link_create.attach_type);
+		if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
+			ret = -EINVAL;
+			goto out;
+		}
+		break;
 	}
 
 	switch (ptype) {
@@ -4179,6 +4263,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_PROG_TYPE_XDP:
 		ret = bpf_xdp_link_attach(attr, prog);
 		break;
+#endif
+#ifdef CONFIG_PERF_EVENTS
+	case BPF_PROG_TYPE_PERF_EVENT:
+	case BPF_PROG_TYPE_TRACEPOINT:
+	case BPF_PROG_TYPE_KPROBE:
+		ret = bpf_perf_link_attach(attr, prog);
+		break;
 #endif
 	default:
 		ret = -EINVAL;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2f07718bd41c..9fd65667bcb2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4697,7 +4697,6 @@ errout:
 }
 
 static void perf_event_free_filter(struct perf_event *event);
-static void perf_event_free_bpf_prog(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
@@ -5574,7 +5573,6 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
-static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
 static int perf_copy_attr(struct perf_event_attr __user *uattr,
 			  struct perf_event_attr *attr);
 
@@ -10013,7 +10011,7 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
 	return false;
 }
 
-static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
 {
 	bool is_kprobe, is_tracepoint, is_syscall_tp;
 
@@ -10047,7 +10045,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *pr
 	return perf_event_attach_bpf_prog(event, prog);
 }
 
-static void perf_event_free_bpf_prog(struct perf_event *event)
+void perf_event_free_bpf_prog(struct perf_event *event)
 {
 	if (!perf_event_is_tracing(event)) {
 		perf_event_free_bpf_handler(event);
@@ -10066,12 +10064,12 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
-static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
 {
 	return -ENOENT;
 }
 
-static void perf_event_free_bpf_prog(struct perf_event *event)
+void perf_event_free_bpf_prog(struct perf_event *event)
 {
 }
 #endif /* CONFIG_EVENT_TRACING */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2db6925e04f4..94fe8329b28f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -993,6 +993,7 @@ enum bpf_attach_type {
 	BPF_SK_SKB_VERDICT,
 	BPF_SK_REUSEPORT_SELECT,
 	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
+	BPF_PERF_EVENT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1006,6 +1007,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_ITER = 4,
 	BPF_LINK_TYPE_NETNS = 5,
 	BPF_LINK_TYPE_XDP = 6,
+	BPF_LINK_TYPE_PERF_EVENT = 7,
 
 	MAX_BPF_LINK_TYPE,
 };
-- 
cgit v1.2.3


From 82e6b1eee6a8875ef4eacfd60711cce6965c6b04 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 15 Aug 2021 00:05:58 -0700
Subject: bpf: Allow to specify user-provided bpf_cookie for BPF perf links

Add ability for users to specify custom u64 value (bpf_cookie) when creating
BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event,
tracepoints).

This is useful for cases when the same BPF program is used for attaching and
processing invocation of different tracepoints/kprobes/uprobes in a generic
fashion, but such that each invocation is distinguished from each other (e.g.,
BPF program can look up additional information associated with a specific
kernel function without having to rely on function IP lookups). This enables
new use cases to be implemented simply and efficiently that previously were
possible only through code generation (and thus multiple instances of almost
identical BPF program) or compilation at runtime (BCC-style) on target hosts
(even more expensive resource-wise). For uprobes it is not even possible in
some cases to know function IP before hand (e.g., when attaching to shared
library without PID filtering, in which case base load address is not known
for a library).

This is done by storing u64 bpf_cookie in struct bpf_prog_array_item,
corresponding to each attached and run BPF program. Given cgroup BPF programs
already use two 8-byte pointers for their needs and cgroup BPF programs don't
have (yet?) support for bpf_cookie, reuse that space through union of
cgroup_storage and new bpf_cookie field.

Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx.
This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF
program execution code, which luckily is now also split from
BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper
giving access to this user-provided cookie value from inside a BPF program.
Generic perf_event BPF programs will access this value from perf_event itself
through passed in BPF program context.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org
---
 drivers/media/rc/bpf-lirc.c    |  4 ++--
 include/linux/bpf.h            | 16 +++++++++++++++-
 include/linux/perf_event.h     |  1 +
 include/linux/trace_events.h   |  6 +++---
 include/uapi/linux/bpf.h       |  7 +++++++
 kernel/bpf/core.c              | 29 ++++++++++++++++++-----------
 kernel/bpf/syscall.c           |  2 +-
 kernel/events/core.c           | 21 ++++++++++++++-------
 kernel/trace/bpf_trace.c       |  8 +++++---
 tools/include/uapi/linux/bpf.h |  7 +++++++
 10 files changed, 73 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index bb5a9dc78f1b..3eff08d7b8e5 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -160,7 +160,7 @@ static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog)
 		goto unlock;
 	}
 
-	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+	ret = bpf_prog_array_copy(old_array, NULL, prog, 0, &new_array);
 	if (ret < 0)
 		goto unlock;
 
@@ -193,7 +193,7 @@ static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog)
 	}
 
 	old_array = lirc_rcu_dereference(raw->progs);
-	ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array);
+	ret = bpf_prog_array_copy(old_array, prog, NULL, 0, &new_array);
 	/*
 	 * Do not use bpf_prog_array_delete_safe() as we would end up
 	 * with a dummy entry in the array, and the we would free the
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 344e0d4d8ef6..83c3cc5e90df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1114,7 +1114,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
  */
 struct bpf_prog_array_item {
 	struct bpf_prog *prog;
-	struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+	union {
+		struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+		u64 bpf_cookie;
+	};
 };
 
 struct bpf_prog_array {
@@ -1140,6 +1143,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array,
 int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *exclude_prog,
 			struct bpf_prog *include_prog,
+			u64 bpf_cookie,
 			struct bpf_prog_array **new_array);
 
 struct bpf_run_ctx {};
@@ -1149,6 +1153,11 @@ struct bpf_cg_run_ctx {
 	const struct bpf_prog_array_item *prog_item;
 };
 
+struct bpf_trace_run_ctx {
+	struct bpf_run_ctx run_ctx;
+	u64 bpf_cookie;
+};
+
 static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
 {
 	struct bpf_run_ctx *old_ctx = NULL;
@@ -1239,6 +1248,8 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu,
 	const struct bpf_prog_array_item *item;
 	const struct bpf_prog *prog;
 	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_trace_run_ctx run_ctx;
 	u32 ret = 1;
 
 	migrate_disable();
@@ -1246,11 +1257,14 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu,
 	array = rcu_dereference(array_rcu);
 	if (unlikely(!array))
 		goto out;
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	item = &array->items[0];
 	while ((prog = READ_ONCE(item->prog))) {
+		run_ctx.bpf_cookie = item->bpf_cookie;
 		ret &= run_prog(prog, ctx);
 		item++;
 	}
+	bpf_reset_run_ctx(old_run_ctx);
 out:
 	rcu_read_unlock();
 	migrate_enable();
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2d510ad750ed..fe156a8170aa 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -762,6 +762,7 @@ struct perf_event {
 #ifdef CONFIG_BPF_SYSCALL
 	perf_overflow_handler_t		orig_overflow_handler;
 	struct bpf_prog			*prog;
+	u64				bpf_cookie;
 #endif
 
 #ifdef CONFIG_EVENT_TRACING
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8ac92560d3a3..8e0631a4b046 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -675,7 +675,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
 
 #ifdef CONFIG_BPF_EVENTS
 unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
-int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
+int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
@@ -692,7 +692,7 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c
 }
 
 static inline int
-perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
+perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie)
 {
 	return -EOPNOTSUPP;
 }
@@ -803,7 +803,7 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
 void perf_trace_buf_update(void *record, u16 type);
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_free_bpf_prog(struct perf_event *event);
 
 void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 94fe8329b28f..63ee482d50e1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1448,6 +1448,13 @@ union bpf_attr {
 				__aligned_u64	iter_info;	/* extra bpf_iter_link_info */
 				__u32		iter_info_len;	/* iter_info length */
 			};
+			struct {
+				/* black box user-provided value passed through
+				 * to BPF program at the execution time and
+				 * accessible through bpf_get_attach_cookie() BPF helper
+				 */
+				__u64		bpf_cookie;
+			} perf_event;
 		};
 	} link_create;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5ee2ec27c3d4..91f24c7b38a1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2119,13 +2119,13 @@ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
 int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *exclude_prog,
 			struct bpf_prog *include_prog,
+			u64 bpf_cookie,
 			struct bpf_prog_array **new_array)
 {
 	int new_prog_cnt, carry_prog_cnt = 0;
-	struct bpf_prog_array_item *existing;
+	struct bpf_prog_array_item *existing, *new;
 	struct bpf_prog_array *array;
 	bool found_exclude = false;
-	int new_prog_idx = 0;
 
 	/* Figure out how many existing progs we need to carry over to
 	 * the new array.
@@ -2162,20 +2162,27 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 	array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
 	if (!array)
 		return -ENOMEM;
+	new = array->items;
 
 	/* Fill in the new prog array */
 	if (carry_prog_cnt) {
 		existing = old_array->items;
-		for (; existing->prog; existing++)
-			if (existing->prog != exclude_prog &&
-			    existing->prog != &dummy_bpf_prog.prog) {
-				array->items[new_prog_idx++].prog =
-					existing->prog;
-			}
+		for (; existing->prog; existing++) {
+			if (existing->prog == exclude_prog ||
+			    existing->prog == &dummy_bpf_prog.prog)
+				continue;
+
+			new->prog = existing->prog;
+			new->bpf_cookie = existing->bpf_cookie;
+			new++;
+		}
 	}
-	if (include_prog)
-		array->items[new_prog_idx++].prog = include_prog;
-	array->items[new_prog_idx].prog = NULL;
+	if (include_prog) {
+		new->prog = include_prog;
+		new->bpf_cookie = bpf_cookie;
+		new++;
+	}
+	new->prog = NULL;
 	*new_array = array;
 	return 0;
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 80c03bedd6e6..7420e1334ab2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2963,7 +2963,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
 	}
 
 	event = perf_file->private_data;
-	err = perf_event_set_bpf_prog(event, prog);
+	err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
 		goto out_put_file;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9fd65667bcb2..2d1e63dd97f2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5643,7 +5643,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
 
-		err = perf_event_set_bpf_prog(event, prog);
+		err = perf_event_set_bpf_prog(event, prog, 0);
 		if (err) {
 			bpf_prog_put(prog);
 			return err;
@@ -9936,7 +9936,9 @@ out:
 	event->orig_overflow_handler(event, data, regs);
 }
 
-static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog)
+static int perf_event_set_bpf_handler(struct perf_event *event,
+				      struct bpf_prog *prog,
+				      u64 bpf_cookie)
 {
 	if (event->overflow_handler_context)
 		/* hw breakpoint or kernel counter */
@@ -9966,6 +9968,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog
 	}
 
 	event->prog = prog;
+	event->bpf_cookie = bpf_cookie;
 	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
 	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
 	return 0;
@@ -9983,7 +9986,9 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
 	bpf_prog_put(prog);
 }
 #else
-static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog)
+static int perf_event_set_bpf_handler(struct perf_event *event,
+				      struct bpf_prog *prog,
+				      u64 bpf_cookie)
 {
 	return -EOPNOTSUPP;
 }
@@ -10011,12 +10016,13 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
 	return false;
 }
 
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+			    u64 bpf_cookie)
 {
 	bool is_kprobe, is_tracepoint, is_syscall_tp;
 
 	if (!perf_event_is_tracing(event))
-		return perf_event_set_bpf_handler(event, prog);
+		return perf_event_set_bpf_handler(event, prog, bpf_cookie);
 
 	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
 	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
@@ -10042,7 +10048,7 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
 			return -EACCES;
 	}
 
-	return perf_event_attach_bpf_prog(event, prog);
+	return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
 }
 
 void perf_event_free_bpf_prog(struct perf_event *event)
@@ -10064,7 +10070,8 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+			    u64 bpf_cookie)
 {
 	return -ENOENT;
 }
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 91867b14b222..57879d28f824 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1675,7 +1675,8 @@ static DEFINE_MUTEX(bpf_event_mutex);
 #define BPF_TRACE_MAX_PROGS 64
 
 int perf_event_attach_bpf_prog(struct perf_event *event,
-			       struct bpf_prog *prog)
+			       struct bpf_prog *prog,
+			       u64 bpf_cookie)
 {
 	struct bpf_prog_array *old_array;
 	struct bpf_prog_array *new_array;
@@ -1702,12 +1703,13 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 		goto unlock;
 	}
 
-	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+	ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
 	if (ret < 0)
 		goto unlock;
 
 	/* set the new array to event->tp_event and set event->prog */
 	event->prog = prog;
+	event->bpf_cookie = bpf_cookie;
 	rcu_assign_pointer(event->tp_event->prog_array, new_array);
 	bpf_prog_array_free(old_array);
 
@@ -1728,7 +1730,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
 		goto unlock;
 
 	old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
-	ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+	ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
 	if (ret == -ENOENT)
 		goto unlock;
 	if (ret < 0) {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 94fe8329b28f..63ee482d50e1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1448,6 +1448,13 @@ union bpf_attr {
 				__aligned_u64	iter_info;	/* extra bpf_iter_link_info */
 				__u32		iter_info_len;	/* iter_info length */
 			};
+			struct {
+				/* black box user-provided value passed through
+				 * to BPF program at the execution time and
+				 * accessible through bpf_get_attach_cookie() BPF helper
+				 */
+				__u64		bpf_cookie;
+			} perf_event;
 		};
 	} link_create;
 
-- 
cgit v1.2.3


From 7adfc6c9b315e174cf8743b21b7b691c8766791b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 15 Aug 2021 00:05:59 -0700
Subject: bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie
 value

Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs
to get access to a user-provided bpf_cookie value, specified during BPF
program attachment (BPF link creation) time.

Naming is hard, though. With the concept being named "BPF cookie", I've
considered calling the helper:
  - bpf_get_cookie() -- seems too unspecific and easily mistaken with socket
    cookie;
  - bpf_get_bpf_cookie() -- too much tautology;
  - bpf_get_link_cookie() -- would be ok, but while we create a BPF link to
    attach BPF program to BPF hook, it's still an "attachment" and the
    bpf_cookie is associated with BPF program attachment to a hook, not a BPF
    link itself. Technically, we could support bpf_cookie with old-style
    cgroup programs.So I ultimately rejected it in favor of
    bpf_get_attach_cookie().

Currently all perf_event-backed BPF program types support
bpf_get_attach_cookie() helper. Follow-up patches will add support for
fentry/fexit programs as well.

While at it, mark bpf_tracing_func_proto() as static to make it obvious that
it's only used from within the kernel/trace/bpf_trace.c.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org
---
 include/linux/bpf.h            |  3 ---
 include/uapi/linux/bpf.h       | 16 ++++++++++++++++
 kernel/trace/bpf_trace.c       | 35 ++++++++++++++++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h | 16 ++++++++++++++++
 4 files changed, 66 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 83c3cc5e90df..f4c16f19f83e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2102,9 +2102,6 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
 
-const struct bpf_func_proto *bpf_tracing_func_proto(
-	enum bpf_func_id func_id, const struct bpf_prog *prog);
-
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 63ee482d50e1..c4f7892edb2b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4856,6 +4856,21 @@ union bpf_attr {
  * 		Get address of the traced function (for tracing and kprobe programs).
  * 	Return
  * 		Address of the traced function.
+ *
+ * u64 bpf_get_attach_cookie(void *ctx)
+ * 	Description
+ * 		Get bpf_cookie value provided (optionally) during the program
+ * 		attachment. It might be different for each individual
+ * 		attachment, even if BPF program itself is the same.
+ * 		Expects BPF program context *ctx* as a first argument.
+ *
+ * 		Supported for the following program types:
+ *			- kprobe/uprobe;
+ *			- tracepoint;
+ *			- perf_event.
+ * 	Return
+ *		Value specified by user at BPF link creation/attachment time
+ *		or 0, if it was not specified.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5032,6 +5047,7 @@ union bpf_attr {
 	FN(timer_start),		\
 	FN(timer_cancel),		\
 	FN(get_func_ip),		\
+	FN(get_attach_cookie),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 57879d28f824..cbc73c08c4a4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -975,7 +975,34 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
-const struct bpf_func_proto *
+BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
+{
+	struct bpf_trace_run_ctx *run_ctx;
+
+	run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+	return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = {
+	.func		= bpf_get_attach_cookie_trace,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx)
+{
+	return ctx->event->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
+	.func		= bpf_get_attach_cookie_pe,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
+static const struct bpf_func_proto *
 bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
@@ -1109,6 +1136,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 	case BPF_FUNC_get_func_ip:
 		return &bpf_get_func_ip_proto_kprobe;
+	case BPF_FUNC_get_attach_cookie:
+		return &bpf_get_attach_cookie_proto_trace;
 	default:
 		return bpf_tracing_func_proto(func_id, prog);
 	}
@@ -1219,6 +1248,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_stackid_proto_tp;
 	case BPF_FUNC_get_stack:
 		return &bpf_get_stack_proto_tp;
+	case BPF_FUNC_get_attach_cookie:
+		return &bpf_get_attach_cookie_proto_trace;
 	default:
 		return bpf_tracing_func_proto(func_id, prog);
 	}
@@ -1326,6 +1357,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_prog_read_value_proto;
 	case BPF_FUNC_read_branch_records:
 		return &bpf_read_branch_records_proto;
+	case BPF_FUNC_get_attach_cookie:
+		return &bpf_get_attach_cookie_proto_pe;
 	default:
 		return bpf_tracing_func_proto(func_id, prog);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 63ee482d50e1..c4f7892edb2b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4856,6 +4856,21 @@ union bpf_attr {
  * 		Get address of the traced function (for tracing and kprobe programs).
  * 	Return
  * 		Address of the traced function.
+ *
+ * u64 bpf_get_attach_cookie(void *ctx)
+ * 	Description
+ * 		Get bpf_cookie value provided (optionally) during the program
+ * 		attachment. It might be different for each individual
+ * 		attachment, even if BPF program itself is the same.
+ * 		Expects BPF program context *ctx* as a first argument.
+ *
+ * 		Supported for the following program types:
+ *			- kprobe/uprobe;
+ *			- tracepoint;
+ *			- perf_event.
+ * 	Return
+ *		Value specified by user at BPF link creation/attachment time
+ *		or 0, if it was not specified.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5032,6 +5047,7 @@ union bpf_attr {
 	FN(timer_start),		\
 	FN(timer_cancel),		\
 	FN(get_func_ip),		\
+	FN(get_attach_cookie),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From 94531cfcbe79c3598acf96806627b2137ca32eb9 Mon Sep 17 00:00:00 2001
From: Jiang Wang <jiang.wang@bytedance.com>
Date: Mon, 16 Aug 2021 19:03:21 +0000
Subject: af_unix: Add unix_stream_proto for sockmap

Previously, sockmap for AF_UNIX protocol only supports
dgram type. This patch add unix stream type support, which
is similar to unix_dgram_proto. To support sockmap, dgram
and stream cannot share the same unix_proto anymore, because
they have different implementations, such as unhash for stream
type (which will remove closed or disconnected sockets from the map),
so rename unix_proto to unix_dgram_proto and add a new
unix_stream_proto.

Also implement stream related sockmap functions.
And add dgram key words to those dgram specific functions.

Signed-off-by: Jiang Wang <jiang.wang@bytedance.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Cong Wang <cong.wang@bytedance.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210816190327.2739291-3-jiang.wang@bytedance.com
---
 include/net/af_unix.h |  8 +++--
 net/core/sock_map.c   |  1 +
 net/unix/af_unix.c    | 83 ++++++++++++++++++++++++++++++++++++++-------
 net/unix/unix_bpf.c   | 93 +++++++++++++++++++++++++++++++++++++++------------
 4 files changed, 148 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 4757d7f53f13..7d142e8a0550 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -87,6 +87,8 @@ long unix_outq_len(struct sock *sk);
 
 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
 			 int flags);
+int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
+			  int flags);
 #ifdef CONFIG_SYSCTL
 int unix_sysctl_register(struct net *net);
 void unix_sysctl_unregister(struct net *net);
@@ -96,9 +98,11 @@ static inline void unix_sysctl_unregister(struct net *net) {}
 #endif
 
 #ifdef CONFIG_BPF_SYSCALL
-extern struct proto unix_proto;
+extern struct proto unix_dgram_proto;
+extern struct proto unix_stream_proto;
 
-int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
+int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
+int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
 void __init unix_bpf_build_proto(void);
 #else
 static inline void __init unix_bpf_build_proto(void)
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index ae5fa4338d9c..e252b8ec2b85 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1494,6 +1494,7 @@ void sock_map_unhash(struct sock *sk)
 	rcu_read_unlock();
 	saved_unhash(sk);
 }
+EXPORT_SYMBOL_GPL(sock_map_unhash);
 
 void sock_map_close(struct sock *sk, long timeout)
 {
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4455b62317d4..443c49081636 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -798,17 +798,35 @@ static void unix_close(struct sock *sk, long timeout)
 	 */
 }
 
-struct proto unix_proto = {
-	.name			= "UNIX",
+static void unix_unhash(struct sock *sk)
+{
+	/* Nothing to do here, unix socket does not need a ->unhash().
+	 * This is merely for sockmap.
+	 */
+}
+
+struct proto unix_dgram_proto = {
+	.name			= "UNIX-DGRAM",
+	.owner			= THIS_MODULE,
+	.obj_size		= sizeof(struct unix_sock),
+	.close			= unix_close,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
+#endif
+};
+
+struct proto unix_stream_proto = {
+	.name			= "UNIX-STREAM",
 	.owner			= THIS_MODULE,
 	.obj_size		= sizeof(struct unix_sock),
 	.close			= unix_close,
+	.unhash			= unix_unhash,
 #ifdef CONFIG_BPF_SYSCALL
-	.psock_update_sk_prot	= unix_bpf_update_proto,
+	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
 #endif
 };
 
-static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
+static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 {
 	struct sock *sk = NULL;
 	struct unix_sock *u;
@@ -817,7 +835,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 		goto out;
 
-	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
+	if (type == SOCK_STREAM)
+		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
+	else /*dgram and  seqpacket */
+		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
+
 	if (!sk)
 		goto out;
 
@@ -879,7 +901,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol,
 		return -ESOCKTNOSUPPORT;
 	}
 
-	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
+	return unix_create1(net, sock, kern, sock->type) ? 0 : -ENOMEM;
 }
 
 static int unix_release(struct socket *sock)
@@ -1293,7 +1315,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	err = -ENOMEM;
 
 	/* create new sock for complete connection */
-	newsk = unix_create1(sock_net(sk), NULL, 0);
+	newsk = unix_create1(sock_net(sk), NULL, 0, sock->type);
 	if (newsk == NULL)
 		goto out;
 
@@ -2323,8 +2345,10 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si
 	struct sock *sk = sock->sk;
 
 #ifdef CONFIG_BPF_SYSCALL
-	if (sk->sk_prot != &unix_proto)
-		return sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+	const struct proto *prot = READ_ONCE(sk->sk_prot);
+
+	if (prot != &unix_dgram_proto)
+		return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
 					    flags & ~MSG_DONTWAIT, NULL);
 #endif
 	return __unix_dgram_recvmsg(sk, msg, size, flags);
@@ -2728,6 +2752,20 @@ static int unix_stream_read_actor(struct sk_buff *skb,
 	return ret ?: chunk;
 }
 
+int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
+			  size_t size, int flags)
+{
+	struct unix_stream_read_state state = {
+		.recv_actor = unix_stream_read_actor,
+		.socket = sk->sk_socket,
+		.msg = msg,
+		.size = size,
+		.flags = flags
+	};
+
+	return unix_stream_read_generic(&state, true);
+}
+
 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
 			       size_t size, int flags)
 {
@@ -2739,6 +2777,14 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
 		.flags = flags
 	};
 
+#ifdef CONFIG_BPF_SYSCALL
+	struct sock *sk = sock->sk;
+	const struct proto *prot = READ_ONCE(sk->sk_prot);
+
+	if (prot != &unix_stream_proto)
+		return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+					    flags & ~MSG_DONTWAIT, NULL);
+#endif
 	return unix_stream_read_generic(&state, true);
 }
 
@@ -2799,7 +2845,9 @@ static int unix_shutdown(struct socket *sock, int mode)
 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
 
 		int peer_mode = 0;
+		const struct proto *prot = READ_ONCE(other->sk_prot);
 
+		prot->unhash(other);
 		if (mode&RCV_SHUTDOWN)
 			peer_mode |= SEND_SHUTDOWN;
 		if (mode&SEND_SHUTDOWN)
@@ -2808,10 +2856,12 @@ static int unix_shutdown(struct socket *sock, int mode)
 		other->sk_shutdown |= peer_mode;
 		unix_state_unlock(other);
 		other->sk_state_change(other);
-		if (peer_mode == SHUTDOWN_MASK)
+		if (peer_mode == SHUTDOWN_MASK) {
 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
-		else if (peer_mode & RCV_SHUTDOWN)
+			other->sk_state = TCP_CLOSE;
+		} else if (peer_mode & RCV_SHUTDOWN) {
 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
+		}
 	}
 	if (other)
 		sock_put(other);
@@ -3289,7 +3339,13 @@ static int __init af_unix_init(void)
 
 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
 
-	rc = proto_register(&unix_proto, 1);
+	rc = proto_register(&unix_dgram_proto, 1);
+	if (rc != 0) {
+		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
+		goto out;
+	}
+
+	rc = proto_register(&unix_stream_proto, 1);
 	if (rc != 0) {
 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
 		goto out;
@@ -3310,7 +3366,8 @@ out:
 static void __exit af_unix_exit(void)
 {
 	sock_unregister(PF_UNIX);
-	proto_unregister(&unix_proto);
+	proto_unregister(&unix_dgram_proto);
+	proto_unregister(&unix_stream_proto);
 	unregister_pernet_subsys(&unix_net_ops);
 }
 
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
index 20f53575b5c9..b927e2baae50 100644
--- a/net/unix/unix_bpf.c
+++ b/net/unix/unix_bpf.c
@@ -38,9 +38,18 @@ static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock,
 	return ret;
 }
 
-static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
-				  size_t len, int nonblock, int flags,
-				  int *addr_len)
+static int __unix_recvmsg(struct sock *sk, struct msghdr *msg,
+			  size_t len, int flags)
+{
+	if (sk->sk_type == SOCK_DGRAM)
+		return __unix_dgram_recvmsg(sk, msg, len, flags);
+	else
+		return __unix_stream_recvmsg(sk, msg, len, flags);
+}
+
+static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
+			    size_t len, int nonblock, int flags,
+			    int *addr_len)
 {
 	struct unix_sock *u = unix_sk(sk);
 	struct sk_psock *psock;
@@ -48,14 +57,14 @@ static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
 
 	psock = sk_psock_get(sk);
 	if (unlikely(!psock))
-		return __unix_dgram_recvmsg(sk, msg, len, flags);
+		return __unix_recvmsg(sk, msg, len, flags);
 
 	mutex_lock(&u->iolock);
 	if (!skb_queue_empty(&sk->sk_receive_queue) &&
 	    sk_psock_queue_empty(psock)) {
 		mutex_unlock(&u->iolock);
 		sk_psock_put(sk, psock);
-		return __unix_dgram_recvmsg(sk, msg, len, flags);
+		return __unix_recvmsg(sk, msg, len, flags);
 	}
 
 msg_bytes_ready:
@@ -71,7 +80,7 @@ msg_bytes_ready:
 				goto msg_bytes_ready;
 			mutex_unlock(&u->iolock);
 			sk_psock_put(sk, psock);
-			return __unix_dgram_recvmsg(sk, msg, len, flags);
+			return __unix_recvmsg(sk, msg, len, flags);
 		}
 		copied = -EAGAIN;
 	}
@@ -80,30 +89,55 @@ msg_bytes_ready:
 	return copied;
 }
 
-static struct proto *unix_prot_saved __read_mostly;
-static DEFINE_SPINLOCK(unix_prot_lock);
-static struct proto unix_bpf_prot;
+static struct proto *unix_dgram_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(unix_dgram_prot_lock);
+static struct proto unix_dgram_bpf_prot;
+
+static struct proto *unix_stream_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(unix_stream_prot_lock);
+static struct proto unix_stream_bpf_prot;
 
-static void unix_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
 {
 	*prot        = *base;
 	prot->close  = sock_map_close;
-	prot->recvmsg = unix_dgram_bpf_recvmsg;
+	prot->recvmsg = unix_bpf_recvmsg;
+}
+
+static void unix_stream_bpf_rebuild_protos(struct proto *prot,
+					   const struct proto *base)
+{
+	*prot        = *base;
+	prot->close  = sock_map_close;
+	prot->recvmsg = unix_bpf_recvmsg;
+	prot->unhash  = sock_map_unhash;
+}
+
+static void unix_dgram_bpf_check_needs_rebuild(struct proto *ops)
+{
+	if (unlikely(ops != smp_load_acquire(&unix_dgram_prot_saved))) {
+		spin_lock_bh(&unix_dgram_prot_lock);
+		if (likely(ops != unix_dgram_prot_saved)) {
+			unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, ops);
+			smp_store_release(&unix_dgram_prot_saved, ops);
+		}
+		spin_unlock_bh(&unix_dgram_prot_lock);
+	}
 }
 
-static void unix_bpf_check_needs_rebuild(struct proto *ops)
+static void unix_stream_bpf_check_needs_rebuild(struct proto *ops)
 {
-	if (unlikely(ops != smp_load_acquire(&unix_prot_saved))) {
-		spin_lock_bh(&unix_prot_lock);
-		if (likely(ops != unix_prot_saved)) {
-			unix_bpf_rebuild_protos(&unix_bpf_prot, ops);
-			smp_store_release(&unix_prot_saved, ops);
+	if (unlikely(ops != smp_load_acquire(&unix_stream_prot_saved))) {
+		spin_lock_bh(&unix_stream_prot_lock);
+		if (likely(ops != unix_stream_prot_saved)) {
+			unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, ops);
+			smp_store_release(&unix_stream_prot_saved, ops);
 		}
-		spin_unlock_bh(&unix_prot_lock);
+		spin_unlock_bh(&unix_stream_prot_lock);
 	}
 }
 
-int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 {
 	if (sk->sk_type != SOCK_DGRAM)
 		return -EOPNOTSUPP;
@@ -114,12 +148,27 @@ int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 		return 0;
 	}
 
-	unix_bpf_check_needs_rebuild(psock->sk_proto);
-	WRITE_ONCE(sk->sk_prot, &unix_bpf_prot);
+	unix_dgram_bpf_check_needs_rebuild(psock->sk_proto);
+	WRITE_ONCE(sk->sk_prot, &unix_dgram_bpf_prot);
+	return 0;
+}
+
+int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+	if (restore) {
+		sk->sk_write_space = psock->saved_write_space;
+		WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+		return 0;
+	}
+
+	unix_stream_bpf_check_needs_rebuild(psock->sk_proto);
+	WRITE_ONCE(sk->sk_prot, &unix_stream_bpf_prot);
 	return 0;
 }
 
 void __init unix_bpf_build_proto(void)
 {
-	unix_bpf_rebuild_protos(&unix_bpf_prot, &unix_proto);
+	unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, &unix_dgram_proto);
+	unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, &unix_stream_proto);
+
 }
-- 
cgit v1.2.3


From 0d2ab3aea50bb02ff0c9c3d53c7b2b4b21cdd59d Mon Sep 17 00:00:00 2001
From: John Crispin <john@phrozen.org>
Date: Fri, 2 Jul 2021 19:44:07 +0200
Subject: nl80211: add support for BSS coloring

This patch adds support for BSS color collisions to the wireless subsystem.
Add the required functionality to nl80211 that will notify about color
collisions, triggering the color change and notifying when it is completed.

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: John Crispin <john@phrozen.org>
Link: https://lore.kernel.org/r/500b3582aec8fe2c42ef46f3117b148cb7cbceb5.1625247619.git.lorenzo@kernel.org
[remove unnecessary NULL initialisation]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  92 +++++++++++++++++++++++++
 include/uapi/linux/nl80211.h |  43 ++++++++++++
 net/wireless/nl80211.c       | 157 +++++++++++++++++++++++++++++++++++++++++++
 net/wireless/rdev-ops.h      |  13 ++++
 net/wireless/trace.h         |  46 +++++++++++++
 5 files changed, 351 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 161cdf7df1a0..62dd8422e0dc 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1252,6 +1252,27 @@ struct cfg80211_csa_settings {
 	u8 count;
 };
 
+/**
+ * struct cfg80211_color_change_settings - color change settings
+ *
+ * Used for bss color change
+ *
+ * @beacon_color_change: beacon data while performing the color countdown
+ * @counter_offsets_beacon: offsets of the counters within the beacon (tail)
+ * @counter_offsets_presp: offsets of the counters within the probe response
+ * @beacon_next: beacon data to be used after the color change
+ * @count: number of beacons until the color change
+ * @color: the color used after the change
+ */
+struct cfg80211_color_change_settings {
+	struct cfg80211_beacon_data beacon_color_change;
+	u16 counter_offset_beacon;
+	u16 counter_offset_presp;
+	struct cfg80211_beacon_data beacon_next;
+	u8 count;
+	u8 color;
+};
+
 /**
  * struct iface_combination_params - input parameters for interface combinations
  *
@@ -3995,6 +4016,8 @@ struct mgmt_frame_regs {
  *	given TIDs. This callback may sleep.
  *
  * @set_sar_specs: Update the SAR (TX power) settings.
+ *
+ * @color_change: Initiate a color change.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -4322,6 +4345,9 @@ struct cfg80211_ops {
 				    const u8 *peer, u8 tids);
 	int	(*set_sar_specs)(struct wiphy *wiphy,
 				 struct cfg80211_sar_specs *sar);
+	int	(*color_change)(struct wiphy *wiphy,
+				struct net_device *dev,
+				struct cfg80211_color_change_settings *params);
 };
 
 /*
@@ -8218,4 +8244,70 @@ void cfg80211_update_owe_info_event(struct net_device *netdev,
  */
 void cfg80211_bss_flush(struct wiphy *wiphy);
 
+/**
+ * cfg80211_bss_color_notify - notify about bss color event
+ * @dev: network device
+ * @gfp: allocation flags
+ * @cmd: the actual event we want to notify
+ * @count: the number of TBTTs until the color change happens
+ * @color_bitmap: representations of the colors that the local BSS is aware of
+ */
+int cfg80211_bss_color_notify(struct net_device *dev, gfp_t gfp,
+			      enum nl80211_commands cmd, u8 count,
+			      u64 color_bitmap);
+
+/**
+ * cfg80211_obss_color_collision_notify - notify about bss color collision
+ * @dev: network device
+ * @color_bitmap: representations of the colors that the local BSS is aware of
+ */
+static inline int cfg80211_obss_color_collision_notify(struct net_device *dev,
+						       u64 color_bitmap)
+{
+	return cfg80211_bss_color_notify(dev, GFP_KERNEL,
+					 NL80211_CMD_OBSS_COLOR_COLLISION,
+					 0, color_bitmap);
+}
+
+/**
+ * cfg80211_color_change_started_notify - notify color change start
+ * @dev: the device on which the color is switched
+ * @count: the number of TBTTs until the color change happens
+ *
+ * Inform the userspace about the color change that has started.
+ */
+static inline int cfg80211_color_change_started_notify(struct net_device *dev,
+						       u8 count)
+{
+	return cfg80211_bss_color_notify(dev, GFP_KERNEL,
+					 NL80211_CMD_COLOR_CHANGE_STARTED,
+					 count, 0);
+}
+
+/**
+ * cfg80211_color_change_aborted_notify - notify color change abort
+ * @dev: the device on which the color is switched
+ *
+ * Inform the userspace about the color change that has aborted.
+ */
+static inline int cfg80211_color_change_aborted_notify(struct net_device *dev)
+{
+	return cfg80211_bss_color_notify(dev, GFP_KERNEL,
+					 NL80211_CMD_COLOR_CHANGE_ABORTED,
+					 0, 0);
+}
+
+/**
+ * cfg80211_color_change_notify - notify color change completion
+ * @dev: the device on which the color was switched
+ *
+ * Inform the userspace about the color change that has completed.
+ */
+static inline int cfg80211_color_change_notify(struct net_device *dev)
+{
+	return cfg80211_bss_color_notify(dev, GFP_KERNEL,
+					 NL80211_CMD_COLOR_CHANGE_COMPLETED,
+					 0, 0);
+}
+
 #endif /* __NET_CFG80211_H */
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index db474994fa73..c2efea98e060 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1185,6 +1185,21 @@
  *	passed using %NL80211_ATTR_SAR_SPEC. %NL80211_ATTR_WIPHY is used to
  *	specify the wiphy index to be applied to.
  *
+ * @NL80211_CMD_OBSS_COLOR_COLLISION: This notification is sent out whenever
+ *	mac80211/drv detects a bss color collision.
+ *
+ * @NL80211_CMD_COLOR_CHANGE_REQUEST: This command is used to indicate that
+ *	userspace wants to change the BSS color.
+ *
+ * @NL80211_CMD_COLOR_CHANGE_STARTED: Notify userland, that a color change has
+ *	started
+ *
+ * @NL80211_CMD_COLOR_CHANGE_ABORTED: Notify userland, that the color change has
+ *	been aborted
+ *
+ * @NL80211_CMD_COLOR_CHANGE_COMPLETED: Notify userland that the color change
+ *	has completed
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1417,6 +1432,14 @@ enum nl80211_commands {
 
 	NL80211_CMD_SET_SAR_SPECS,
 
+	NL80211_CMD_OBSS_COLOR_COLLISION,
+
+	NL80211_CMD_COLOR_CHANGE_REQUEST,
+
+	NL80211_CMD_COLOR_CHANGE_STARTED,
+	NL80211_CMD_COLOR_CHANGE_ABORTED,
+	NL80211_CMD_COLOR_CHANGE_COMPLETED,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -2560,6 +2583,16 @@ enum nl80211_commands {
  *	disassoc events to indicate that an immediate reconnect to the AP
  *	is desired.
  *
+ * @NL80211_ATTR_OBSS_COLOR_BITMAP: bitmap of the u64 BSS colors for the
+ *	%NL80211_CMD_OBSS_COLOR_COLLISION event.
+ *
+ * @NL80211_ATTR_COLOR_CHANGE_COUNT: u8 attribute specifying the number of TBTT's
+ *	until the color switch event.
+ * @NL80211_ATTR_COLOR_CHANGE_COLOR: u8 attribute specifying the color that we are
+ *	switching to
+ * @NL80211_ATTR_COLOR_CHANGE_ELEMS: Nested set of attributes containing the IE
+ *	information for the time while performing a color switch.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3057,6 +3090,12 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_DISABLE_HE,
 
+	NL80211_ATTR_OBSS_COLOR_BITMAP,
+
+	NL80211_ATTR_COLOR_CHANGE_COUNT,
+	NL80211_ATTR_COLOR_CHANGE_COLOR,
+	NL80211_ATTR_COLOR_CHANGE_ELEMS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -5953,6 +5992,9 @@ enum nl80211_feature_flags {
  *      frame protection for all management frames exchanged during the
  *      negotiation and range measurement procedure.
  *
+ * @NL80211_EXT_FEATURE_BSS_COLOR: The driver supports BSS color collision
+ *	detection and change announcemnts.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -6017,6 +6059,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_SECURE_LTF,
 	NL80211_EXT_FEATURE_SECURE_RTT,
 	NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE,
+	NL80211_EXT_FEATURE_BSS_COLOR,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index dceed5b5b226..bf7cd4752547 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -759,6 +759,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_RECONNECT_REQUESTED] = { .type = NLA_REJECT },
 	[NL80211_ATTR_SAR_SPEC] = NLA_POLICY_NESTED(sar_policy),
 	[NL80211_ATTR_DISABLE_HE] = { .type = NLA_FLAG },
+	[NL80211_ATTR_OBSS_COLOR_BITMAP] = { .type = NLA_U64 },
+	[NL80211_ATTR_COLOR_CHANGE_COUNT] = { .type = NLA_U8 },
+	[NL80211_ATTR_COLOR_CHANGE_COLOR] = { .type = NLA_U8 },
+	[NL80211_ATTR_COLOR_CHANGE_ELEMS] = NLA_POLICY_NESTED(nl80211_policy),
 };
 
 /* policy for the key attributes */
@@ -14800,6 +14804,106 @@ bad_tid_conf:
 	return ret;
 }
 
+static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct cfg80211_color_change_settings params = {};
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct nlattr **tb;
+	u16 offset;
+	int err;
+
+	if (!rdev->ops->color_change)
+		return -EOPNOTSUPP;
+
+	if (!wiphy_ext_feature_isset(&rdev->wiphy,
+				     NL80211_EXT_FEATURE_BSS_COLOR))
+		return -EOPNOTSUPP;
+
+	if (wdev->iftype != NL80211_IFTYPE_AP)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT] ||
+	    !info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR] ||
+	    !info->attrs[NL80211_ATTR_COLOR_CHANGE_ELEMS])
+		return -EINVAL;
+
+	params.count = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT]);
+	params.color = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR]);
+
+	err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_next);
+	if (err)
+		return err;
+
+	tb = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*tb), GFP_KERNEL);
+	if (!tb)
+		return -ENOMEM;
+
+	err = nla_parse_nested(tb, NL80211_ATTR_MAX,
+			       info->attrs[NL80211_ATTR_COLOR_CHANGE_ELEMS],
+			       nl80211_policy, info->extack);
+	if (err)
+		goto out;
+
+	err = nl80211_parse_beacon(rdev, tb, &params.beacon_color_change);
+	if (err)
+		goto out;
+
+	if (!tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (nla_len(tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]) != sizeof(u16)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	offset = nla_get_u16(tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]);
+	if (offset >= params.beacon_color_change.tail_len) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (params.beacon_color_change.tail[offset] != params.count) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	params.counter_offset_beacon = offset;
+
+	if (tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]) {
+		if (nla_len(tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]) !=
+		    sizeof(u16)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		offset = nla_get_u16(tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]);
+		if (offset >= params.beacon_color_change.probe_resp_len) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (params.beacon_color_change.probe_resp[offset] !=
+		    params.count) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		params.counter_offset_presp = offset;
+	}
+
+	wdev_lock(wdev);
+	err = rdev_color_change(rdev, dev, &params);
+	wdev_unlock(wdev);
+
+out:
+	kfree(tb);
+	return err;
+}
+
 #define NL80211_FLAG_NEED_WIPHY		0x01
 #define NL80211_FLAG_NEED_NETDEV	0x02
 #define NL80211_FLAG_NEED_RTNL		0x04
@@ -15795,6 +15899,14 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.internal_flags = NL80211_FLAG_NEED_WIPHY |
 				  NL80211_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL80211_CMD_COLOR_CHANGE_REQUEST,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = nl80211_color_change,
+		.flags = GENL_UNS_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 };
 
 static struct genl_family nl80211_fam __ro_after_init = {
@@ -17424,6 +17536,51 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev,
 }
 EXPORT_SYMBOL(cfg80211_ch_switch_started_notify);
 
+int cfg80211_bss_color_notify(struct net_device *dev, gfp_t gfp,
+			      enum nl80211_commands cmd, u8 count,
+			      u64 color_bitmap)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+	struct sk_buff *msg;
+	void *hdr;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	trace_cfg80211_bss_color_notify(dev, cmd, count, color_bitmap);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, cmd);
+	if (!hdr)
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
+		goto nla_put_failure;
+
+	if (cmd == NL80211_CMD_COLOR_CHANGE_STARTED &&
+	    nla_put_u32(msg, NL80211_ATTR_COLOR_CHANGE_COUNT, count))
+		goto nla_put_failure;
+
+	if (cmd == NL80211_CMD_OBSS_COLOR_COLLISION &&
+	    nla_put_u64_64bit(msg, NL80211_ATTR_OBSS_COLOR_BITMAP,
+			      color_bitmap, NL80211_ATTR_PAD))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
+				       msg, 0, NL80211_MCGRP_MLME, gfp);
+
+nla_put_failure:
+	nlmsg_free(msg);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(cfg80211_bss_color_notify);
+
 void
 nl80211_radar_notify(struct cfg80211_registered_device *rdev,
 		     const struct cfg80211_chan_def *chandef,
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index b1d37f582dc6..ce6bf218a1a3 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1368,4 +1368,17 @@ static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev,
 	return ret;
 }
 
+static inline int rdev_color_change(struct cfg80211_registered_device *rdev,
+				    struct net_device *dev,
+				    struct cfg80211_color_change_settings *params)
+{
+	int ret;
+
+	trace_rdev_color_change(&rdev->wiphy, dev, params);
+	ret = rdev->ops->color_change(&rdev->wiphy, dev, params);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+
+	return ret;
+}
+
 #endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 440bce5f0274..911bb3b8969e 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3597,6 +3597,52 @@ TRACE_EVENT(rdev_set_sar_specs,
 		  WIPHY_PR_ARG, __entry->type, __entry->num)
 );
 
+TRACE_EVENT(rdev_color_change,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 struct cfg80211_color_change_settings *params),
+	TP_ARGS(wiphy, netdev, params),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		NETDEV_ENTRY
+		__field(u8, count)
+		__field(u16, bcn_ofs)
+		__field(u16, pres_ofs)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		__entry->count = params->count;
+		__entry->bcn_ofs = params->counter_offset_beacon;
+		__entry->pres_ofs = params->counter_offset_presp;
+	),
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
+		  ", count: %u",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG,
+		  __entry->count)
+);
+
+TRACE_EVENT(cfg80211_bss_color_notify,
+	TP_PROTO(struct net_device *netdev,
+		 enum nl80211_commands cmd,
+		 u8 count, u64 color_bitmap),
+	TP_ARGS(netdev, cmd, count, color_bitmap),
+	TP_STRUCT__entry(
+		NETDEV_ENTRY
+		__field(enum nl80211_bss_scan_width, cmd)
+		__field(u8, count)
+		__field(u64, color_bitmap)
+	),
+	TP_fast_assign(
+		NETDEV_ASSIGN;
+		__entry->cmd = cmd;
+		__entry->count = count;
+		__entry->color_bitmap = color_bitmap;
+	),
+	TP_printk(NETDEV_PR_FMT ", cmd: %x, count: %u, bitmap: %llx",
+		  NETDEV_PR_ARG, __entry->cmd, __entry->count,
+		  __entry->color_bitmap)
+);
+
 #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit v1.2.3


From 5f9404abdf2ac31c8f4768c39714bfcaca389e3a Mon Sep 17 00:00:00 2001
From: John Crispin <john@phrozen.org>
Date: Fri, 2 Jul 2021 19:44:08 +0200
Subject: mac80211: add support for BSS color change

The color change announcement is very similar to how CSA works where
we have an IE that includes a counter. When the counter hits 0, the new
color is applied via an updated beacon.

This patch makes the CSA counter functionality reusable, rather than
implementing it again. This also allows for future reuse incase support
for other counter IEs gets added.

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: John Crispin <john@phrozen.org>
Link: https://lore.kernel.org/r/057c1e67b82bee561ea44ce6a45a8462d3da6995.1625247619.git.lorenzo@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  29 ++++++
 net/mac80211/cfg.c         | 234 ++++++++++++++++++++++++++++++++++++++++++---
 net/mac80211/ieee80211_i.h |  11 +++
 net/mac80211/iface.c       |   2 +
 net/mac80211/tx.c          |  29 +++---
 5 files changed, 283 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index d8a1d09a2141..a23e6734d26b 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1711,6 +1711,10 @@ enum ieee80211_offload_flags {
  *	protected by fq->lock.
  * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see
  *	&enum ieee80211_offload_flags.
+ * @color_change_active: marks whether a color change is ongoing. Internally it is
+ *	write-protected by sdata_lock and local->mtx so holding either is fine
+ *	for read access.
+ * @color_change_color: the bss color that will be used after the change.
  */
 struct ieee80211_vif {
 	enum nl80211_iftype type;
@@ -1739,6 +1743,9 @@ struct ieee80211_vif {
 
 	bool txqs_stopped[IEEE80211_NUM_ACS];
 
+	bool color_change_active;
+	u8 color_change_color;
+
 	/* must be last */
 	u8 drv_priv[] __aligned(sizeof(void *));
 };
@@ -5007,6 +5014,16 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif);
  */
 bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif);
 
+/**
+ * ieee80211_color_change_finish - notify mac80211 about color change
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ *
+ * After a color change announcement was scheduled and the counter in this
+ * announcement hits 1, this function must be called by the driver to
+ * notify mac80211 that the color can be changed
+ */
+void ieee80211_color_change_finish(struct ieee80211_vif *vif);
+
 /**
  * ieee80211_proberesp_get - retrieve a Probe Response template
  * @hw: pointer obtained from ieee80211_alloc_hw().
@@ -6771,6 +6788,18 @@ struct sk_buff *
 ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw,
 					  struct ieee80211_vif *vif);
 
+/**
+ * ieeee80211_obss_color_collision_notify - notify userland about a BSS color
+ * collision.
+ *
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @color_bitmap: a 64 bit bitmap representing the colors that the local BSS is
+ *	aware of.
+ */
+void
+ieeee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
+				       u64 color_bitmap);
+
 /**
  * ieee80211_is_tx_data - check if frame is a data frame
  *
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 4e6f11e63df3..d69b31c20fe2 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -828,9 +828,11 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy,
 	return ret;
 }
 
-static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
-				    const u8 *resp, size_t resp_len,
-				    const struct ieee80211_csa_settings *csa)
+static int
+ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
+			 const u8 *resp, size_t resp_len,
+			 const struct ieee80211_csa_settings *csa,
+			 const struct ieee80211_color_change_settings *cca)
 {
 	struct probe_resp *new, *old;
 
@@ -850,6 +852,8 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
 		memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_presp,
 		       csa->n_counter_offsets_presp *
 		       sizeof(new->cntdwn_counter_offsets[0]));
+	else if (cca)
+		new->cntdwn_counter_offsets[0] = cca->counter_offset_presp;
 
 	rcu_assign_pointer(sdata->u.ap.probe_resp, new);
 	if (old)
@@ -955,7 +959,8 @@ static int ieee80211_set_ftm_responder_params(
 
 static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
 				   struct cfg80211_beacon_data *params,
-				   const struct ieee80211_csa_settings *csa)
+				   const struct ieee80211_csa_settings *csa,
+				   const struct ieee80211_color_change_settings *cca)
 {
 	struct beacon_data *new, *old;
 	int new_head_len, new_tail_len;
@@ -1004,6 +1009,9 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
 		memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_beacon,
 		       csa->n_counter_offsets_beacon *
 		       sizeof(new->cntdwn_counter_offsets[0]));
+	} else if (cca) {
+		new->cntdwn_current_counter = cca->count;
+		new->cntdwn_counter_offsets[0] = cca->counter_offset_beacon;
 	}
 
 	/* copy in head */
@@ -1020,7 +1028,7 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
 			memcpy(new->tail, old->tail, new_tail_len);
 
 	err = ieee80211_set_probe_resp(sdata, params->probe_resp,
-				       params->probe_resp_len, csa);
+				       params->probe_resp_len, csa, cca);
 	if (err < 0) {
 		kfree(new);
 		return err;
@@ -1175,7 +1183,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL))
 		sdata->vif.bss_conf.beacon_tx_rate = params->beacon_rate;
 
-	err = ieee80211_assign_beacon(sdata, &params->beacon, NULL);
+	err = ieee80211_assign_beacon(sdata, &params->beacon, NULL, NULL);
 	if (err < 0)
 		goto error;
 	changed |= err;
@@ -1230,17 +1238,17 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	sdata_assert_lock(sdata);
 
-	/* don't allow changing the beacon while CSA is in place - offset
+	/* don't allow changing the beacon while a countdown is in place - offset
 	 * of channel switch counter may change
 	 */
-	if (sdata->vif.csa_active)
+	if (sdata->vif.csa_active || sdata->vif.color_change_active)
 		return -EBUSY;
 
 	old = sdata_dereference(sdata->u.ap.beacon, sdata);
 	if (!old)
 		return -ENOENT;
 
-	err = ieee80211_assign_beacon(sdata, params, NULL);
+	err = ieee80211_assign_beacon(sdata, params, NULL, NULL);
 	if (err < 0)
 		return err;
 	ieee80211_bss_info_change_notify(sdata, err);
@@ -3156,7 +3164,7 @@ static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata,
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_AP:
 		err = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon,
-					      NULL);
+					      NULL, NULL);
 		kfree(sdata->u.ap.next_beacon);
 		sdata->u.ap.next_beacon = NULL;
 
@@ -3322,7 +3330,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
 		csa.n_counter_offsets_presp = params->n_counter_offsets_presp;
 		csa.count = params->count;
 
-		err = ieee80211_assign_beacon(sdata, &params->beacon_csa, &csa);
+		err = ieee80211_assign_beacon(sdata, &params->beacon_csa, &csa, NULL);
 		if (err < 0) {
 			kfree(sdata->u.ap.next_beacon);
 			return err;
@@ -3411,6 +3419,15 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
 	return 0;
 }
 
+static void ieee80211_color_change_abort(struct ieee80211_sub_if_data  *sdata)
+{
+	sdata->vif.color_change_active = false;
+	kfree(sdata->u.ap.next_beacon);
+	sdata->u.ap.next_beacon = NULL;
+
+	cfg80211_color_change_aborted_notify(sdata->dev);
+}
+
 static int
 __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 			   struct cfg80211_csa_settings *params)
@@ -3479,6 +3496,10 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 		goto out;
 	}
 
+	/* if there is a color change in progress, abort it */
+	if (sdata->vif.color_change_active)
+		ieee80211_color_change_abort(sdata);
+
 	err = ieee80211_set_csa_beacon(sdata, params, &changed);
 	if (err) {
 		ieee80211_vif_unreserve_chanctx(sdata);
@@ -4130,6 +4151,196 @@ static int ieee80211_set_sar_specs(struct wiphy *wiphy,
 	return local->ops->set_sar_specs(&local->hw, sar);
 }
 
+static int
+ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata,
+					u32 *changed)
+{
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP: {
+		int ret;
+
+		ret = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon,
+					      NULL, NULL);
+		kfree(sdata->u.ap.next_beacon);
+		sdata->u.ap.next_beacon = NULL;
+
+		if (ret < 0)
+			return ret;
+
+		*changed |= ret;
+		break;
+	}
+	default:
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata,
+				  struct cfg80211_color_change_settings *params,
+				  u32 *changed)
+{
+	struct ieee80211_color_change_settings color_change = {};
+	int err;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP:
+		sdata->u.ap.next_beacon =
+			cfg80211_beacon_dup(&params->beacon_next);
+		if (!sdata->u.ap.next_beacon)
+			return -ENOMEM;
+
+		if (params->count <= 1)
+			break;
+
+		color_change.counter_offset_beacon =
+			params->counter_offset_beacon;
+		color_change.counter_offset_presp =
+			params->counter_offset_presp;
+		color_change.count = params->count;
+
+		err = ieee80211_assign_beacon(sdata, &params->beacon_color_change,
+					      NULL, &color_change);
+		if (err < 0) {
+			kfree(sdata->u.ap.next_beacon);
+			return err;
+		}
+		*changed |= err;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static void
+ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata,
+					 u8 color, int enable, u32 changed)
+{
+	sdata->vif.bss_conf.he_bss_color.color = color;
+	sdata->vif.bss_conf.he_bss_color.enabled = enable;
+	changed |= BSS_CHANGED_HE_BSS_COLOR;
+
+	ieee80211_bss_info_change_notify(sdata, changed);
+}
+
+static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	u32 changed = 0;
+	int err;
+
+	sdata_assert_lock(sdata);
+	lockdep_assert_held(&local->mtx);
+
+	sdata->vif.color_change_active = false;
+
+	err = ieee80211_set_after_color_change_beacon(sdata, &changed);
+	if (err) {
+		cfg80211_color_change_aborted_notify(sdata->dev);
+		return err;
+	}
+
+	ieee80211_color_change_bss_config_notify(sdata,
+						 sdata->vif.color_change_color,
+						 1, changed);
+	cfg80211_color_change_notify(sdata->dev);
+
+	return 0;
+}
+
+void ieee80211_color_change_finalize_work(struct work_struct *work)
+{
+	struct ieee80211_sub_if_data *sdata =
+		container_of(work, struct ieee80211_sub_if_data,
+			     color_change_finalize_work);
+	struct ieee80211_local *local = sdata->local;
+
+	sdata_lock(sdata);
+	mutex_lock(&local->mtx);
+
+	/* AP might have been stopped while waiting for the lock. */
+	if (!sdata->vif.color_change_active)
+		goto unlock;
+
+	if (!ieee80211_sdata_running(sdata))
+		goto unlock;
+
+	ieee80211_color_change_finalize(sdata);
+
+unlock:
+	mutex_unlock(&local->mtx);
+	sdata_unlock(sdata);
+}
+
+void ieee80211_color_change_finish(struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+	ieee80211_queue_work(&sdata->local->hw,
+			     &sdata->color_change_finalize_work);
+}
+EXPORT_SYMBOL_GPL(ieee80211_color_change_finish);
+
+void
+ieeee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
+				       u64 color_bitmap)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+	if (sdata->vif.color_change_active || sdata->vif.csa_active)
+		return;
+
+	cfg80211_obss_color_collision_notify(sdata->dev, color_bitmap);
+}
+EXPORT_SYMBOL_GPL(ieeee80211_obss_color_collision_notify);
+
+static int
+ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev,
+		       struct cfg80211_color_change_settings *params)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	u32 changed = 0;
+	int err;
+
+	sdata_assert_lock(sdata);
+
+	mutex_lock(&local->mtx);
+
+	/* don't allow another color change if one is already active or if csa
+	 * is active
+	 */
+	if (sdata->vif.color_change_active || sdata->vif.csa_active) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = ieee80211_set_color_change_beacon(sdata, params, &changed);
+	if (err)
+		goto out;
+
+	sdata->vif.color_change_active = true;
+	sdata->vif.color_change_color = params->color;
+
+	cfg80211_color_change_started_notify(sdata->dev, params->count);
+
+	if (changed)
+		ieee80211_color_change_bss_config_notify(sdata, 0, 0, changed);
+	else
+		/* if the beacon didn't change, we can finalize immediately */
+		ieee80211_color_change_finalize(sdata);
+
+out:
+	mutex_unlock(&local->mtx);
+
+	return err;
+}
+
 const struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -4233,4 +4444,5 @@ const struct cfg80211_ops mac80211_config_ops = {
 	.set_tid_config = ieee80211_set_tid_config,
 	.reset_tid_config = ieee80211_reset_tid_config,
 	.set_sar_specs = ieee80211_set_sar_specs,
+	.color_change = ieee80211_color_change,
 };
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c57224d771b9..e8945c20688a 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -245,6 +245,12 @@ struct ieee80211_csa_settings {
 	u8 count;
 };
 
+struct ieee80211_color_change_settings {
+	u16 counter_offset_beacon;
+	u16 counter_offset_presp;
+	u8 count;
+};
+
 struct beacon_data {
 	u8 *head, *tail;
 	int head_len, tail_len;
@@ -924,6 +930,8 @@ struct ieee80211_sub_if_data {
 	bool csa_block_tx; /* write-protected by sdata_lock and local->mtx */
 	struct cfg80211_chan_def csa_chandef;
 
+	struct work_struct color_change_finalize_work;
+
 	struct list_head assigned_chanctx_list; /* protected by chanctx_mtx */
 	struct list_head reserved_chanctx_list; /* protected by chanctx_mtx */
 
@@ -1888,6 +1896,9 @@ void ieee80211_csa_finalize_work(struct work_struct *work);
 int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 			     struct cfg80211_csa_settings *params);
 
+/* color change handling */
+void ieee80211_color_change_finalize_work(struct work_struct *work);
+
 /* interface handling */
 #define MAC80211_SUPPORTED_FEATURES_TX	(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \
 					 NETIF_F_HW_CSUM | NETIF_F_SG | \
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index cd96cd337aa8..769f8f585c06 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -462,6 +462,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 	sdata_unlock(sdata);
 
 	cancel_work_sync(&sdata->csa_finalize_work);
+	cancel_work_sync(&sdata->color_change_finalize_work);
 
 	cancel_delayed_work_sync(&sdata->dfs_cac_timer_work);
 
@@ -1531,6 +1532,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 	INIT_WORK(&sdata->work, ieee80211_iface_work);
 	INIT_WORK(&sdata->recalc_smps, ieee80211_recalc_smps_work);
 	INIT_WORK(&sdata->csa_finalize_work, ieee80211_csa_finalize_work);
+	INIT_WORK(&sdata->color_change_finalize_work, ieee80211_color_change_finalize_work);
 	INIT_LIST_HEAD(&sdata->assigned_chanctx_list);
 	INIT_LIST_HEAD(&sdata->reserved_chanctx_list);
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 8509778ff31f..1f7571777fb1 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -4782,11 +4782,11 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata,
 					struct beacon_data *beacon)
 {
+	u8 *beacon_data, count, max_count = 1;
 	struct probe_resp *resp;
-	u8 *beacon_data;
 	size_t beacon_data_len;
+	u16 *bcn_offsets;
 	int i;
-	u8 count = beacon->cntdwn_current_counter;
 
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_AP:
@@ -4806,21 +4806,27 @@ static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata,
 	}
 
 	rcu_read_lock();
-	for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; ++i) {
-		resp = rcu_dereference(sdata->u.ap.probe_resp);
+	resp = rcu_dereference(sdata->u.ap.probe_resp);
+
+	bcn_offsets = beacon->cntdwn_counter_offsets;
+	count = beacon->cntdwn_current_counter;
+	if (sdata->vif.csa_active)
+		max_count = IEEE80211_MAX_CNTDWN_COUNTERS_NUM;
 
-		if (beacon->cntdwn_counter_offsets[i]) {
-			if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[i] >=
-					 beacon_data_len)) {
+	for (i = 0; i < max_count; ++i) {
+		if (bcn_offsets[i]) {
+			if (WARN_ON_ONCE(bcn_offsets[i] >= beacon_data_len)) {
 				rcu_read_unlock();
 				return;
 			}
-
-			beacon_data[beacon->cntdwn_counter_offsets[i]] = count;
+			beacon_data[bcn_offsets[i]] = count;
 		}
 
-		if (sdata->vif.type == NL80211_IFTYPE_AP && resp)
-			resp->data[resp->cntdwn_counter_offsets[i]] = count;
+		if (sdata->vif.type == NL80211_IFTYPE_AP && resp) {
+			u16 *resp_offsets = resp->cntdwn_counter_offsets;
+
+			resp->data[resp_offsets[i]] = count;
+		}
 	}
 	rcu_read_unlock();
 }
@@ -5030,6 +5036,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 			if (offs) {
 				offs->tim_offset = beacon->head_len;
 				offs->tim_length = skb->len - beacon->head_len;
+				offs->cntdwn_counter_offs[0] = beacon->cntdwn_counter_offsets[0];
 
 				/* for AP the csa offsets are from tail */
 				csa_off_base = skb->len;
-- 
cgit v1.2.3


From 2843ff6f36db7074e17bf5d637a14da08c54aed8 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@xiaomi.com>
Date: Tue, 17 Aug 2021 15:07:23 -0700
Subject: mptcp: remote addresses fullmesh

This patch added and managed a new per endpoint flag, named
MPTCP_PM_ADDR_FLAG_FULLMESH.

In mptcp_pm_create_subflow_or_signal_addr(), if such flag is set, instead
of:
        remote_address((struct sock_common *)sk, &remote);
fill a temporary allocated array of all known remote address. After
releaseing the pm lock loop on such array and create a subflow for each
remote address from the given local.

Note that the we could still use an array even for non 'fullmesh'
endpoint: with a single entry corresponding to the primary MPC subflow
remote address.

Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Geliang Tang <geliangtang@xiaomi.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/mptcp.h |  1 +
 net/mptcp/pm_netlink.c     | 59 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 56 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 7b05f7102321..f66038b9551f 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -73,6 +73,7 @@ enum {
 #define MPTCP_PM_ADDR_FLAG_SIGNAL			(1 << 0)
 #define MPTCP_PM_ADDR_FLAG_SUBFLOW			(1 << 1)
 #define MPTCP_PM_ADDR_FLAG_BACKUP			(1 << 2)
+#define MPTCP_PM_ADDR_FLAG_FULLMESH			(1 << 3)
 
 enum {
 	MPTCP_PM_CMD_UNSPEC,
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 64a39f30659f..bf5b9b475bb7 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -410,6 +410,55 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
 	}
 }
 
+static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr,
+				  struct mptcp_addr_info *addr)
+{
+	int i;
+
+	for (i = 0; i < nr; i++) {
+		if (addresses_equal(&addrs[i], addr, addr->port))
+			return true;
+	}
+
+	return false;
+}
+
+/* Fill all the remote addresses into the array addrs[],
+ * and return the array size.
+ */
+static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh,
+					      struct mptcp_addr_info *addrs)
+{
+	struct sock *sk = (struct sock *)msk, *ssk;
+	struct mptcp_subflow_context *subflow;
+	struct mptcp_addr_info remote = { 0 };
+	unsigned int subflows_max;
+	int i = 0;
+
+	subflows_max = mptcp_pm_get_subflows_max(msk);
+
+	/* Non-fullmesh endpoint, fill in the single entry
+	 * corresponding to the primary MPC subflow remote address
+	 */
+	if (!fullmesh) {
+		remote_address((struct sock_common *)sk, &remote);
+		msk->pm.subflows++;
+		addrs[i++] = remote;
+	} else {
+		mptcp_for_each_subflow(msk, subflow) {
+			ssk = mptcp_subflow_tcp_sock(subflow);
+			remote_address((struct sock_common *)ssk, &remote);
+			if (!lookup_address_in_vec(addrs, i, &remote) &&
+			    msk->pm.subflows < subflows_max) {
+				msk->pm.subflows++;
+				addrs[i++] = remote;
+			}
+		}
+	}
+
+	return i;
+}
+
 static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
 {
 	struct sock *sk = (struct sock *)msk;
@@ -455,14 +504,16 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
 	    !READ_ONCE(msk->pm.remote_deny_join_id0)) {
 		local = select_local_address(pernet, msk);
 		if (local) {
-			struct mptcp_addr_info remote = { 0 };
+			bool fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
+			struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
+			int i, nr;
 
 			msk->pm.local_addr_used++;
-			msk->pm.subflows++;
 			check_work_pending(msk);
-			remote_address((struct sock_common *)sk, &remote);
+			nr = fill_remote_addresses_vec(msk, fullmesh, addrs);
 			spin_unlock_bh(&msk->pm.lock);
-			__mptcp_subflow_connect(sk, &local->addr, &remote);
+			for (i = 0; i < nr; i++)
+				__mptcp_subflow_connect(sk, &local->addr, &addrs[i]);
 			spin_lock_bh(&msk->pm.lock);
 			return;
 		}
-- 
cgit v1.2.3


From 994d2cbb08ca05e3c1af954ec63a3ae32a862ac5 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 17 Aug 2021 17:58:47 +0300
Subject: net: dsa: tag_sja1105: be dsa_loop-safe

Add support for tag_sja1105 running on non-sja1105 DSA ports, by making
sure that every time we dereference dp->priv, we check the switch's
dsa_switch_ops (otherwise we access a struct sja1105_port structure that
is in fact something else).

This adds an unconditional build-time dependency between sja1105 being
built as module => tag_sja1105 must also be built as module. This was
there only for PTP before.

Some sane defaults must also take place when not running on sja1105
hardware. These are:

- sja1105_xmit_tpid: the sja1105 driver uses different VLAN protocols
  depending on VLAN awareness and switch revision (when an encapsulated
  VLAN must be sent). Default to 0x8100.

- sja1105_rcv_meta_state_machine: this aggregates PTP frames with their
  metadata timestamp frames. When running on non-sja1105 hardware, don't
  do that and accept all frames unmodified.

- sja1105_defer_xmit: calls sja1105_port_deferred_xmit in sja1105_main.c
  which writes a management route over SPI. When not running on sja1105
  hardware, bypass the SPI write and send the frame as-is.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105_main.c |  5 ++---
 include/linux/dsa/sja1105.h            | 18 ++++++++++++++++
 net/dsa/Kconfig                        |  2 +-
 net/dsa/tag_sja1105.c                  | 39 +++++++++++++++++++++++-----------
 4 files changed, 48 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index fe894dc18335..05ba65042b5f 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -28,8 +28,6 @@
 #define SJA1105_UNKNOWN_MULTICAST	0x010000000000ull
 #define SJA1105_DEFAULT_VLAN		(VLAN_N_VID - 1)
 
-static const struct dsa_switch_ops sja1105_switch_ops;
-
 static void sja1105_hw_reset(struct gpio_desc *gpio, unsigned int pulse_len,
 			     unsigned int startup_delay)
 {
@@ -3100,7 +3098,7 @@ static void sja1105_teardown(struct dsa_switch *ds)
 	sja1105_static_config_free(&priv->static_config);
 }
 
-static const struct dsa_switch_ops sja1105_switch_ops = {
+const struct dsa_switch_ops sja1105_switch_ops = {
 	.get_tag_protocol	= sja1105_get_tag_protocol,
 	.setup			= sja1105_setup,
 	.teardown		= sja1105_teardown,
@@ -3149,6 +3147,7 @@ static const struct dsa_switch_ops sja1105_switch_ops = {
 	.port_bridge_tx_fwd_offload = dsa_tag_8021q_bridge_tx_fwd_offload,
 	.port_bridge_tx_fwd_unoffload = dsa_tag_8021q_bridge_tx_fwd_unoffload,
 };
+EXPORT_SYMBOL_GPL(sja1105_switch_ops);
 
 static const struct of_device_id sja1105_dt_ids[];
 
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 0eadc7ac44ec..6b0dc9ff92d1 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -88,4 +88,22 @@ static inline void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port,
 
 #endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */
 
+#if IS_ENABLED(CONFIG_NET_DSA_SJA1105)
+
+extern const struct dsa_switch_ops sja1105_switch_ops;
+
+static inline bool dsa_port_is_sja1105(struct dsa_port *dp)
+{
+	return dp->ds->ops == &sja1105_switch_ops;
+}
+
+#else
+
+static inline bool dsa_port_is_sja1105(struct dsa_port *dp)
+{
+	return false;
+}
+
+#endif
+
 #endif /* _NET_DSA_SJA1105_H */
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 970906eb5b2c..548285539752 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -138,7 +138,7 @@ config NET_DSA_TAG_LAN9303
 
 config NET_DSA_TAG_SJA1105
 	tristate "Tag driver for NXP SJA1105 switches"
-	depends on (NET_DSA_SJA1105 && NET_DSA_SJA1105_PTP) || !NET_DSA_SJA1105 || !NET_DSA_SJA1105_PTP
+	depends on NET_DSA_SJA1105 || !NET_DSA_SJA1105
 	select PACKING
 	help
 	  Say Y or M if you want to enable support for tagging frames with the
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 1406bc41d345..5b80a9049e2c 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -116,9 +116,14 @@ static inline bool sja1105_is_meta_frame(const struct sk_buff *skb)
 }
 
 /* Calls sja1105_port_deferred_xmit in sja1105_main.c */
-static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp,
+static struct sk_buff *sja1105_defer_xmit(struct dsa_port *dp,
 					  struct sk_buff *skb)
 {
+	struct sja1105_port *sp = dp->priv;
+
+	if (!dsa_port_is_sja1105(dp))
+		return skb;
+
 	/* Increase refcount so the kfree_skb in dsa_slave_xmit
 	 * won't really free the packet.
 	 */
@@ -128,8 +133,13 @@ static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp,
 	return NULL;
 }
 
-static u16 sja1105_xmit_tpid(struct sja1105_port *sp)
+static u16 sja1105_xmit_tpid(struct dsa_port *dp)
 {
+	struct sja1105_port *sp = dp->priv;
+
+	if (unlikely(!dsa_port_is_sja1105(dp)))
+		return ETH_P_8021Q;
+
 	return sp->xmit_tpid;
 }
 
@@ -155,7 +165,7 @@ static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb,
 	 */
 	tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(dp->bridge_num);
 
-	return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), tx_vid);
+	return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), tx_vid);
 }
 
 static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
@@ -174,9 +184,9 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
 	 * is the .port_deferred_xmit driver callback.
 	 */
 	if (unlikely(sja1105_is_link_local(skb)))
-		return sja1105_defer_xmit(dp->priv, skb);
+		return sja1105_defer_xmit(dp, skb);
 
-	return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv),
+	return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp),
 			     ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
 }
 
@@ -200,7 +210,7 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb,
 	 * tag_8021q TX VLANs.
 	 */
 	if (likely(!sja1105_is_link_local(skb)))
-		return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv),
+		return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp),
 				     ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
 
 	skb_push(skb, SJA1110_HEADER_LEN);
@@ -265,16 +275,16 @@ static struct sk_buff
 				bool is_link_local,
 				bool is_meta)
 {
-	struct sja1105_port *sp;
-	struct dsa_port *dp;
-
-	dp = dsa_slave_to_port(skb->dev);
-	sp = dp->priv;
-
 	/* Step 1: A timestampable frame was received.
 	 * Buffer it until we get its meta frame.
 	 */
 	if (is_link_local) {
+		struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+		struct sja1105_port *sp = dp->priv;
+
+		if (unlikely(!dsa_port_is_sja1105(dp)))
+			return skb;
+
 		if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state))
 			/* Do normal processing. */
 			return skb;
@@ -307,8 +317,13 @@ static struct sk_buff
 	 * frame, which serves no further purpose).
 	 */
 	} else if (is_meta) {
+		struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+		struct sja1105_port *sp = dp->priv;
 		struct sk_buff *stampable_skb;
 
+		if (unlikely(!dsa_port_is_sja1105(dp)))
+			return skb;
+
 		/* Drop the meta frame if we're not in the right state
 		 * to process it.
 		 */
-- 
cgit v1.2.3


From 4b1327be9fe57443295ae86fe0fcf24a18469e9f Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 17 Aug 2021 12:40:03 -0700
Subject: net-memcg: pass in gfp_t mask to mem_cgroup_charge_skmem()

Add gfp_t mask as an input parameter to mem_cgroup_charge_skmem(),
to give more control to the networking stack and enable it to change
memcg charging behavior. In the future, the networking stack may decide
to avoid oom-kills when fallbacks are more appropriate.

One behavior change in mem_cgroup_charge_skmem() by this patch is to
avoid force charging by default and let the caller decide when and if
force charging is needed through the presence or absence of
__GFP_NOFAIL.

Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/memcontrol.h      |  3 ++-
 include/net/sock.h              |  5 +++++
 mm/memcontrol.c                 | 24 +++++++++++-------------
 net/core/sock.c                 | 16 ++++++++++++----
 net/ipv4/inet_connection_sock.c |  3 ++-
 net/ipv4/tcp_output.c           |  3 ++-
 6 files changed, 34 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bfe5c486f4ad..f0ee30881ca9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1581,7 +1581,8 @@ static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 struct sock;
-bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
+bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
+			     gfp_t gfp_mask);
 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
 #ifdef CONFIG_MEMCG
 extern struct static_key_false memcg_sockets_enabled_key;
diff --git a/include/net/sock.h b/include/net/sock.h
index 6e761451c927..95b25777b53e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2400,6 +2400,11 @@ static inline gfp_t gfp_any(void)
 	return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
 }
 
+static inline gfp_t gfp_memcg_charge(void)
+{
+	return in_softirq() ? GFP_NOWAIT : GFP_KERNEL;
+}
+
 static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
 {
 	return noblock ? 0 : sk->sk_rcvtimeo;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8ef06f9e0db1..be585ceaba98 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -7048,14 +7048,14 @@ void mem_cgroup_sk_free(struct sock *sk)
  * mem_cgroup_charge_skmem - charge socket memory
  * @memcg: memcg to charge
  * @nr_pages: number of pages to charge
+ * @gfp_mask: reclaim mode
  *
  * Charges @nr_pages to @memcg. Returns %true if the charge fit within
- * @memcg's configured limit, %false if the charge had to be forced.
+ * @memcg's configured limit, %false if it doesn't.
  */
-bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
+			     gfp_t gfp_mask)
 {
-	gfp_t gfp_mask = GFP_KERNEL;
-
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
 		struct page_counter *fail;
 
@@ -7063,21 +7063,19 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 			memcg->tcpmem_pressure = 0;
 			return true;
 		}
-		page_counter_charge(&memcg->tcpmem, nr_pages);
 		memcg->tcpmem_pressure = 1;
+		if (gfp_mask & __GFP_NOFAIL) {
+			page_counter_charge(&memcg->tcpmem, nr_pages);
+			return true;
+		}
 		return false;
 	}
 
-	/* Don't block in the packet receive path */
-	if (in_softirq())
-		gfp_mask = GFP_NOWAIT;
-
-	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
-
-	if (try_charge(memcg, gfp_mask, nr_pages) == 0)
+	if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
+		mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
 		return true;
+	}
 
-	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
 	return false;
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index aada649e07e8..950f1e70dbf5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2728,10 +2728,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 {
 	struct proto *prot = sk->sk_prot;
 	long allocated = sk_memory_allocated_add(sk, amt);
+	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
 	bool charged = true;
 
-	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
-	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
+	if (memcg_charge &&
+	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+						gfp_memcg_charge())))
 		goto suppress_allocation;
 
 	/* Under limit. */
@@ -2785,8 +2787,14 @@ suppress_allocation:
 		/* Fail only if socket is _under_ its sndbuf.
 		 * In this case we cannot block, so that we have to fail.
 		 */
-		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
+			/* Force charge with __GFP_NOFAIL */
+			if (memcg_charge && !charged) {
+				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+					gfp_memcg_charge() | __GFP_NOFAIL);
+			}
 			return 1;
+		}
 	}
 
 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
@@ -2794,7 +2802,7 @@ suppress_allocation:
 
 	sk_memory_allocated_sub(sk, amt);
 
-	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+	if (memcg_charge && charged)
 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
 
 	return 0;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 754013fa393b..f25d02ad4a8a 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -534,7 +534,8 @@ out:
 				   atomic_read(&newsk->sk_rmem_alloc));
 		mem_cgroup_sk_alloc(newsk);
 		if (newsk->sk_memcg && amt)
-			mem_cgroup_charge_skmem(newsk->sk_memcg, amt);
+			mem_cgroup_charge_skmem(newsk->sk_memcg, amt,
+						GFP_KERNEL | __GFP_NOFAIL);
 
 		release_sock(newsk);
 	}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 29553fce8502..6d72f3ea48c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3373,7 +3373,8 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
 	sk_memory_allocated_add(sk, amt);
 
 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
-		mem_cgroup_charge_skmem(sk->sk_memcg, amt);
+		mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+					gfp_memcg_charge() | __GFP_NOFAIL);
 }
 
 /* Send a FIN. The caller locks the socket for us.
-- 
cgit v1.2.3


From 2274af1d60fee3fe35f341fc5d4dbf99ab78fb2f Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin@gmail.com>
Date: Wed, 18 Aug 2021 18:07:09 +0300
Subject: net: mii: make mii_ethtool_gset() return void

mii_ethtool_gset() does not return any errors. Since there are no users
of this function that rely on its return value, it can be
made void.

Signed-off-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mii.c   | 6 +-----
 include/linux/mii.h | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/mii.c b/drivers/net/mii.c
index 779c3a96dba7..22680f47385d 100644
--- a/drivers/net/mii.c
+++ b/drivers/net/mii.c
@@ -49,10 +49,8 @@ static u32 mii_get_an(struct mii_if_info *mii, u16 addr)
  *
  * The @ecmd parameter is expected to have been cleared before calling
  * mii_ethtool_gset().
- *
- * Returns 0 for success, negative on error.
  */
-int mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd)
+void mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd)
 {
 	struct net_device *dev = mii->dev;
 	u16 bmcr, bmsr, ctrl1000 = 0, stat1000 = 0;
@@ -131,8 +129,6 @@ int mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd)
 	mii->full_duplex = ecmd->duplex;
 
 	/* ignore maxtxpkt, maxrxpkt for now */
-
-	return 0;
 }
 
 /**
diff --git a/include/linux/mii.h b/include/linux/mii.h
index 219b93cad1dd..12ea29e04293 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -32,7 +32,7 @@ struct mii_if_info {
 
 extern int mii_link_ok (struct mii_if_info *mii);
 extern int mii_nway_restart (struct mii_if_info *mii);
-extern int mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd);
+extern void mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd);
 extern void mii_ethtool_get_link_ksettings(
 	struct mii_if_info *mii, struct ethtool_link_ksettings *cmd);
 extern int mii_ethtool_sset(struct mii_if_info *mii, struct ethtool_cmd *ecmd);
-- 
cgit v1.2.3


From 74fc4f828769cca1c3be89ea92cb88feaa27ef52 Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Tue, 17 Aug 2021 20:05:18 +0300
Subject: net: Fix offloading indirect devices dependency on qdisc order
 creation

Currently, when creating an ingress qdisc on an indirect device before
the driver registered for callbacks, the driver will not have a chance
to register its filter configuration callbacks.

To fix that, modify the code such that it keeps track of all the ingress
qdiscs that call flow_indr_dev_setup_offload(). When a driver calls
flow_indr_dev_register(),  go through the list of tracked ingress qdiscs
and call the driver callback entry point so as to give it a chance to
register its callback.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Eli Cohen <elic@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h            |  1 +
 net/core/flow_offload.c               | 89 ++++++++++++++++++++++++++++++++++-
 net/netfilter/nf_flow_table_offload.c |  1 +
 net/netfilter/nf_tables_offload.c     |  1 +
 net/sched/cls_api.c                   |  1 +
 5 files changed, 92 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index f3c2841566a0..5aa27acdb0b3 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -453,6 +453,7 @@ struct flow_block_offload {
 	struct list_head *driver_block_list;
 	struct netlink_ext_ack *extack;
 	struct Qdisc *sch;
+	struct list_head *cb_list_head;
 };
 
 enum tc_setup_type;
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 1da83997e86a..6beaea13564a 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -321,6 +321,7 @@ EXPORT_SYMBOL(flow_block_cb_setup_simple);
 static DEFINE_MUTEX(flow_indr_block_lock);
 static LIST_HEAD(flow_block_indr_list);
 static LIST_HEAD(flow_block_indr_dev_list);
+static LIST_HEAD(flow_indir_dev_list);
 
 struct flow_indr_dev {
 	struct list_head		list;
@@ -345,6 +346,33 @@ static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb,
 	return indr_dev;
 }
 
+struct flow_indir_dev_info {
+	void *data;
+	struct net_device *dev;
+	struct Qdisc *sch;
+	enum tc_setup_type type;
+	void (*cleanup)(struct flow_block_cb *block_cb);
+	struct list_head list;
+	enum flow_block_command command;
+	enum flow_block_binder_type binder_type;
+	struct list_head *cb_list;
+};
+
+static void existing_qdiscs_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
+{
+	struct flow_block_offload bo;
+	struct flow_indir_dev_info *cur;
+
+	list_for_each_entry(cur, &flow_indir_dev_list, list) {
+		memset(&bo, 0, sizeof(bo));
+		bo.command = cur->command;
+		bo.binder_type = cur->binder_type;
+		INIT_LIST_HEAD(&bo.cb_list);
+		cb(cur->dev, cur->sch, cb_priv, cur->type, &bo, cur->data, cur->cleanup);
+		list_splice(&bo.cb_list, cur->cb_list);
+	}
+}
+
 int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
 {
 	struct flow_indr_dev *indr_dev;
@@ -366,6 +394,7 @@ int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
 	}
 
 	list_add(&indr_dev->list, &flow_block_indr_dev_list);
+	existing_qdiscs_register(cb, cb_priv);
 	mutex_unlock(&flow_indr_block_lock);
 
 	return 0;
@@ -462,7 +491,59 @@ out:
 }
 EXPORT_SYMBOL(flow_indr_block_cb_alloc);
 
-int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch,
+static struct flow_indir_dev_info *find_indir_dev(void *data)
+{
+	struct flow_indir_dev_info *cur;
+
+	list_for_each_entry(cur, &flow_indir_dev_list, list) {
+		if (cur->data == data)
+			return cur;
+	}
+	return NULL;
+}
+
+static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch,
+			 enum tc_setup_type type, void (*cleanup)(struct flow_block_cb *block_cb),
+			 struct flow_block_offload *bo)
+{
+	struct flow_indir_dev_info *info;
+
+	info = find_indir_dev(data);
+	if (info)
+		return -EEXIST;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->data = data;
+	info->dev = dev;
+	info->sch = sch;
+	info->type = type;
+	info->cleanup = cleanup;
+	info->command = bo->command;
+	info->binder_type = bo->binder_type;
+	info->cb_list = bo->cb_list_head;
+
+	list_add(&info->list, &flow_indir_dev_list);
+	return 0;
+}
+
+static int indir_dev_remove(void *data)
+{
+	struct flow_indir_dev_info *info;
+
+	info = find_indir_dev(data);
+	if (!info)
+		return -ENOENT;
+
+	list_del(&info->list);
+
+	kfree(info);
+	return 0;
+}
+
+int flow_indr_dev_setup_offload(struct net_device *dev,	struct Qdisc *sch,
 				enum tc_setup_type type, void *data,
 				struct flow_block_offload *bo,
 				void (*cleanup)(struct flow_block_cb *block_cb))
@@ -470,6 +551,12 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch,
 	struct flow_indr_dev *this;
 
 	mutex_lock(&flow_indr_block_lock);
+
+	if (bo->command == FLOW_BLOCK_BIND)
+		indir_dev_add(data, dev, sch, type, cleanup, bo);
+	else if (bo->command == FLOW_BLOCK_UNBIND)
+		indir_dev_remove(data);
+
 	list_for_each_entry(this, &flow_block_indr_dev_list, list)
 		this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup);
 
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index 2bfd9f1b8f11..d6bf1b2cd541 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -1096,6 +1096,7 @@ static void nf_flow_table_block_offload_init(struct flow_block_offload *bo,
 	bo->command	= cmd;
 	bo->binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
 	bo->extack	= extack;
+	bo->cb_list_head = &flowtable->flow_block.cb_list;
 	INIT_LIST_HEAD(&bo->cb_list);
 }
 
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index b58d73a96523..9656c1646222 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -353,6 +353,7 @@ static void nft_flow_block_offload_init(struct flow_block_offload *bo,
 	bo->command	= cmd;
 	bo->binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
 	bo->extack	= extack;
+	bo->cb_list_head = &basechain->flow_block.cb_list;
 	INIT_LIST_HEAD(&bo->cb_list);
 }
 
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 4a7043a4e5d6..2ef8f5a6205a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -634,6 +634,7 @@ static void tcf_block_offload_init(struct flow_block_offload *bo,
 	bo->block_shared = shared;
 	bo->extack = extack;
 	bo->sch = sch;
+	bo->cb_list_head = &flow_block->cb_list;
 	INIT_LIST_HEAD(&bo->cb_list);
 }
 
-- 
cgit v1.2.3


From 6e86a1543c378f2e8837ad88f361b7bf606c80f7 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Wed, 18 Aug 2021 09:12:32 +0200
Subject: can: dev: provide optional GPIO based termination support

For CAN buses to work, a termination resistor has to be present at both
ends of the bus. This resistor is usually 120 Ohms, other values may be
required for special bus topologies.

This patch adds support for a generic GPIO based CAN termination. The
resistor value has to be specified via device tree, and it can only be
attached to or detached from the bus. By default the termination is not
active.

Link: https://lore.kernel.org/r/20210818071232.20585-4-o.rempel@pengutronix.de
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/dev.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/can/dev.h   |  8 ++++++
 2 files changed, 74 insertions(+)

(limited to 'include')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index 311d8564d611..e3d840b81357 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -15,6 +15,7 @@
 #include <linux/can/dev.h>
 #include <linux/can/skb.h>
 #include <linux/can/led.h>
+#include <linux/gpio/consumer.h>
 #include <linux/of.h>
 
 #define MOD_DESC "CAN device driver interface"
@@ -400,10 +401,69 @@ void close_candev(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(close_candev);
 
+static int can_set_termination(struct net_device *ndev, u16 term)
+{
+	struct can_priv *priv = netdev_priv(ndev);
+	int set;
+
+	if (term == priv->termination_gpio_ohms[CAN_TERMINATION_GPIO_ENABLED])
+		set = 1;
+	else
+		set = 0;
+
+	gpiod_set_value(priv->termination_gpio, set);
+
+	return 0;
+}
+
+static int can_get_termination(struct net_device *ndev)
+{
+	struct can_priv *priv = netdev_priv(ndev);
+	struct device *dev = ndev->dev.parent;
+	struct gpio_desc *gpio;
+	u32 term;
+	int ret;
+
+	/* Disabling termination by default is the safe choice: Else if many
+	 * bus participants enable it, no communication is possible at all.
+	 */
+	gpio = devm_gpiod_get_optional(dev, "termination", GPIOD_OUT_LOW);
+	if (IS_ERR(gpio))
+		return dev_err_probe(dev, PTR_ERR(gpio),
+				     "Cannot get termination-gpios\n");
+
+	if (!gpio)
+		return 0;
+
+	ret = device_property_read_u32(dev, "termination-ohms", &term);
+	if (ret) {
+		netdev_err(ndev, "Cannot get termination-ohms: %pe\n",
+			   ERR_PTR(ret));
+		return ret;
+	}
+
+	if (term > U16_MAX) {
+		netdev_err(ndev, "Invalid termination-ohms value (%u > %u)\n",
+			   term, U16_MAX);
+		return -EINVAL;
+	}
+
+	priv->termination_const_cnt = ARRAY_SIZE(priv->termination_gpio_ohms);
+	priv->termination_const = priv->termination_gpio_ohms;
+	priv->termination_gpio = gpio;
+	priv->termination_gpio_ohms[CAN_TERMINATION_GPIO_DISABLED] =
+		CAN_TERMINATION_DISABLED;
+	priv->termination_gpio_ohms[CAN_TERMINATION_GPIO_ENABLED] = term;
+	priv->do_set_termination = can_set_termination;
+
+	return 0;
+}
+
 /* Register the CAN network device */
 int register_candev(struct net_device *dev)
 {
 	struct can_priv *priv = netdev_priv(dev);
+	int err;
 
 	/* Ensure termination_const, termination_const_cnt and
 	 * do_set_termination consistency. All must be either set or
@@ -419,6 +479,12 @@ int register_candev(struct net_device *dev)
 	if (!priv->data_bitrate_const != !priv->data_bitrate_const_cnt)
 		return -EINVAL;
 
+	if (!priv->termination_const) {
+		err = can_get_termination(dev);
+		if (err)
+			return err;
+	}
+
 	dev->rtnl_link_ops = &can_link_ops;
 	netif_carrier_off(dev);
 
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 27b275e463da..2413253e54c7 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -32,6 +32,12 @@ enum can_mode {
 	CAN_MODE_SLEEP
 };
 
+enum can_termination_gpio {
+	CAN_TERMINATION_GPIO_DISABLED = 0,
+	CAN_TERMINATION_GPIO_ENABLED,
+	CAN_TERMINATION_GPIO_MAX,
+};
+
 /*
  * CAN common private data
  */
@@ -55,6 +61,8 @@ struct can_priv {
 	unsigned int termination_const_cnt;
 	const u16 *termination_const;
 	u16 termination;
+	struct gpio_desc *termination_gpio;
+	u16 termination_gpio_ohms[CAN_TERMINATION_GPIO_MAX];
 
 	enum can_state state;
 
-- 
cgit v1.2.3


From 44779a4b85abd1d1dab9e5b90bd5e6adcfc8143a Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 18 Aug 2021 16:52:16 -0700
Subject: bpf: Use kvmalloc for map keys in syscalls

Same as previous patch but for the keys. memdup_bpfptr is renamed
to kvmemdup_bpfptr (and converted to kvmalloc).

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210818235216.1159202-2-sdf@google.com
---
 include/linux/bpfptr.h | 12 ++++++++++--
 kernel/bpf/syscall.c   | 34 +++++++++++++++++-----------------
 2 files changed, 27 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
index 5cdeab497cb3..546e27fc6d46 100644
--- a/include/linux/bpfptr.h
+++ b/include/linux/bpfptr.h
@@ -62,9 +62,17 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset,
 	return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size);
 }
 
-static inline void *memdup_bpfptr(bpfptr_t src, size_t len)
+static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len)
 {
-	return memdup_sockptr((sockptr_t) src, len);
+	void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN);
+
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+	if (copy_from_bpfptr(p, src, len)) {
+		kvfree(p);
+		return ERR_PTR(-EFAULT);
+	}
+	return p;
 }
 
 static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 075f650d297a..4e50c0bfdb7d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1013,7 +1013,7 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 static void *__bpf_copy_key(void __user *ukey, u64 key_size)
 {
 	if (key_size)
-		return memdup_user(ukey, key_size);
+		return vmemdup_user(ukey, key_size);
 
 	if (ukey)
 		return ERR_PTR(-EINVAL);
@@ -1024,7 +1024,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
 static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
 {
 	if (key_size)
-		return memdup_bpfptr(ukey, key_size);
+		return kvmemdup_bpfptr(ukey, key_size);
 
 	if (!bpfptr_is_null(ukey))
 		return ERR_PTR(-EINVAL);
@@ -1093,7 +1093,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 free_value:
 	kvfree(value);
 free_key:
-	kfree(key);
+	kvfree(key);
 err_put:
 	fdput(f);
 	return err;
@@ -1153,7 +1153,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 free_value:
 	kvfree(value);
 free_key:
-	kfree(key);
+	kvfree(key);
 err_put:
 	fdput(f);
 	return err;
@@ -1205,7 +1205,7 @@ static int map_delete_elem(union bpf_attr *attr)
 	bpf_enable_instrumentation();
 	maybe_wait_bpf_programs(map);
 out:
-	kfree(key);
+	kvfree(key);
 err_put:
 	fdput(f);
 	return err;
@@ -1247,7 +1247,7 @@ static int map_get_next_key(union bpf_attr *attr)
 	}
 
 	err = -ENOMEM;
-	next_key = kmalloc(map->key_size, GFP_USER);
+	next_key = kvmalloc(map->key_size, GFP_USER);
 	if (!next_key)
 		goto free_key;
 
@@ -1270,9 +1270,9 @@ out:
 	err = 0;
 
 free_next_key:
-	kfree(next_key);
+	kvfree(next_key);
 free_key:
-	kfree(key);
+	kvfree(key);
 err_put:
 	fdput(f);
 	return err;
@@ -1299,7 +1299,7 @@ int generic_map_delete_batch(struct bpf_map *map,
 	if (!max_count)
 		return 0;
 
-	key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
 	if (!key)
 		return -ENOMEM;
 
@@ -1326,7 +1326,7 @@ int generic_map_delete_batch(struct bpf_map *map,
 	if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
 		err = -EFAULT;
 
-	kfree(key);
+	kvfree(key);
 	return err;
 }
 
@@ -1357,13 +1357,13 @@ int generic_map_update_batch(struct bpf_map *map,
 	if (!max_count)
 		return 0;
 
-	key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
 	if (!key)
 		return -ENOMEM;
 
 	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
 	if (!value) {
-		kfree(key);
+		kvfree(key);
 		return -ENOMEM;
 	}
 
@@ -1385,7 +1385,7 @@ int generic_map_update_batch(struct bpf_map *map,
 		err = -EFAULT;
 
 	kvfree(value);
-	kfree(key);
+	kvfree(key);
 	return err;
 }
 
@@ -1419,13 +1419,13 @@ int generic_map_lookup_batch(struct bpf_map *map,
 	if (put_user(0, &uattr->batch.count))
 		return -EFAULT;
 
-	buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+	buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
 	if (!buf_prevkey)
 		return -ENOMEM;
 
 	buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
 	if (!buf) {
-		kfree(buf_prevkey);
+		kvfree(buf_prevkey);
 		return -ENOMEM;
 	}
 
@@ -1485,7 +1485,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
 		err = -EFAULT;
 
 free_buf:
-	kfree(buf_prevkey);
+	kvfree(buf_prevkey);
 	kvfree(buf);
 	return err;
 }
@@ -1575,7 +1575,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 free_value:
 	kvfree(value);
 free_key:
-	kfree(key);
+	kvfree(key);
 err_put:
 	fdput(f);
 	return err;
-- 
cgit v1.2.3


From 1ae258f8b343a0c4316c5545bfaf21010e4f0c73 Mon Sep 17 00:00:00 2001
From: Dmytro Linkin <dlinkin@nvidia.com>
Date: Mon, 31 May 2021 17:08:14 +0300
Subject: net/mlx5: E-switch, Introduce rate limiting groups API

Extend eswitch API with rate limiting groups:

- Define new struct mlx5_esw_rate_group that is used to hold all
  internal group data.

- Implement functions that allow creation, destruction and cleanup of
  groups.

- Assign all vports to internal unlimited zero group by default.

This commit lays the groundwork for group rate limiting by implementing
devlink_ops->rate_node_{new|del}() callbacks to support creating and
deleting groups through devlink rate node objects. APIs that allows
setting rates and adding/removing members are implemented in following
patches.

Co-developed-by: Vlad Buslov <vladbu@nvidia.com>
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com>
Reviewed-by: Huy Nguyen <huyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/devlink.c |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 141 +++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h |   4 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h |   1 +
 include/linux/mlx5/mlx5_ifc.h                     |   3 +-
 5 files changed, 145 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index f4cd2573d4ea..ef87d0bf983b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -295,6 +295,8 @@ static const struct devlink_ops mlx5_devlink_ops = {
 	.port_function_hw_addr_set = mlx5_devlink_port_function_hw_addr_set,
 	.rate_leaf_tx_share_set = mlx5_esw_devlink_rate_leaf_tx_share_set,
 	.rate_leaf_tx_max_set = mlx5_esw_devlink_rate_leaf_tx_max_set,
+	.rate_node_new = mlx5_esw_devlink_rate_node_new,
+	.rate_node_del = mlx5_esw_devlink_rate_node_del,
 #endif
 #ifdef CONFIG_MLX5_SF_MANAGER
 	.port_new = mlx5_devlink_sf_port_new,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index fcdcddf4a710..c9081d39fa8a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -11,6 +11,13 @@
 #define MLX5_RATE_TO_BW_SHARE(rate, divider, limit) \
 	min_t(u32, max_t(u32, DIV_ROUND_UP(rate, divider), MLX5_MIN_BW_SHARE), limit)
 
+struct mlx5_esw_rate_group {
+	u32 tsar_ix;
+	u32 max_rate;
+	u32 min_rate;
+	u32 bw_share;
+};
+
 static int esw_qos_vport_config(struct mlx5_eswitch *esw,
 				struct mlx5_vport *vport,
 				u32 max_rate, u32 bw_share,
@@ -159,6 +166,54 @@ int mlx5_esw_qos_set_vport_max_rate(struct mlx5_eswitch *esw,
 	return err;
 }
 
+static struct mlx5_esw_rate_group *
+esw_qos_create_rate_group(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack)
+{
+	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
+	struct mlx5_esw_rate_group *group;
+	int err;
+
+	if (!MLX5_CAP_QOS(esw->dev, log_esw_max_sched_depth))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (!group)
+		return ERR_PTR(-ENOMEM);
+
+	MLX5_SET(scheduling_context, tsar_ctx, parent_element_id,
+		 esw->qos.root_tsar_ix);
+	err = mlx5_create_scheduling_element_cmd(esw->dev,
+						 SCHEDULING_HIERARCHY_E_SWITCH,
+						 tsar_ctx,
+						 &group->tsar_ix);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "E-Switch create TSAR for group failed");
+		goto err_sched_elem;
+	}
+
+	return group;
+
+err_sched_elem:
+	kfree(group);
+	return ERR_PTR(err);
+}
+
+static int esw_qos_destroy_rate_group(struct mlx5_eswitch *esw,
+				      struct mlx5_esw_rate_group *group,
+				      struct netlink_ext_ack *extack)
+{
+	int err;
+
+	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
+						  SCHEDULING_HIERARCHY_E_SWITCH,
+						  group->tsar_ix);
+	if (err)
+		NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR_ID failed");
+
+	kfree(group);
+	return err;
+}
+
 static bool esw_qos_element_type_supported(struct mlx5_core_dev *dev, int type)
 {
 	switch (type) {
@@ -191,8 +246,9 @@ void mlx5_esw_qos_create(struct mlx5_eswitch *esw)
 	if (!esw_qos_element_type_supported(dev, SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR))
 		return;
 
+	mutex_lock(&esw->state_lock);
 	if (esw->qos.enabled)
-		return;
+		goto unlock;
 
 	MLX5_SET(scheduling_context, tsar_ctx, element_type,
 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);
@@ -205,27 +261,54 @@ void mlx5_esw_qos_create(struct mlx5_eswitch *esw)
 						 tsar_ctx,
 						 &esw->qos.root_tsar_ix);
 	if (err) {
-		esw_warn(dev, "E-Switch create TSAR failed (%d)\n", err);
-		return;
+		esw_warn(dev, "E-Switch create root TSAR failed (%d)\n", err);
+		goto unlock;
 	}
 
+	if (MLX5_CAP_QOS(dev, log_esw_max_sched_depth)) {
+		esw->qos.group0 = esw_qos_create_rate_group(esw, NULL);
+		if (IS_ERR(esw->qos.group0)) {
+			esw_warn(dev, "E-Switch create rate group 0 failed (%ld)\n",
+				 PTR_ERR(esw->qos.group0));
+			goto err_group0;
+		}
+	}
 	esw->qos.enabled = true;
+unlock:
+	mutex_unlock(&esw->state_lock);
+	return;
+
+err_group0:
+	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
+						  SCHEDULING_HIERARCHY_E_SWITCH,
+						  esw->qos.root_tsar_ix);
+	if (err)
+		esw_warn(esw->dev, "E-Switch destroy root TSAR failed (%d)\n", err);
+	mutex_unlock(&esw->state_lock);
 }
 
 void mlx5_esw_qos_destroy(struct mlx5_eswitch *esw)
 {
+	struct devlink *devlink = priv_to_devlink(esw->dev);
 	int err;
 
+	devlink_rate_nodes_destroy(devlink);
+	mutex_lock(&esw->state_lock);
 	if (!esw->qos.enabled)
-		return;
+		goto unlock;
+
+	if (esw->qos.group0)
+		esw_qos_destroy_rate_group(esw, esw->qos.group0, NULL);
 
 	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
 						  SCHEDULING_HIERARCHY_E_SWITCH,
 						  esw->qos.root_tsar_ix);
 	if (err)
-		esw_warn(esw->dev, "E-Switch destroy TSAR failed (%d)\n", err);
+		esw_warn(esw->dev, "E-Switch destroy root TSAR failed (%d)\n", err);
 
 	esw->qos.enabled = false;
+unlock:
+	mutex_unlock(&esw->state_lock);
 }
 
 int mlx5_esw_qos_vport_enable(struct mlx5_eswitch *esw, struct mlx5_vport *vport,
@@ -386,3 +469,51 @@ int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *
 	mutex_unlock(&esw->state_lock);
 	return err;
 }
+
+int mlx5_esw_devlink_rate_node_new(struct devlink_rate *rate_node, void **priv,
+				   struct netlink_ext_ack *extack)
+{
+	struct mlx5_esw_rate_group *group;
+	struct mlx5_eswitch *esw;
+	int err = 0;
+
+	esw = mlx5_devlink_eswitch_get(rate_node->devlink);
+	if (IS_ERR(esw))
+		return PTR_ERR(esw);
+
+	mutex_lock(&esw->state_lock);
+	if (esw->mode != MLX5_ESWITCH_OFFLOADS) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Rate node creation supported only in switchdev mode");
+		err = -EOPNOTSUPP;
+		goto unlock;
+	}
+
+	group = esw_qos_create_rate_group(esw, extack);
+	if (IS_ERR(group)) {
+		err = PTR_ERR(group);
+		goto unlock;
+	}
+
+	*priv = group;
+unlock:
+	mutex_unlock(&esw->state_lock);
+	return err;
+}
+
+int mlx5_esw_devlink_rate_node_del(struct devlink_rate *rate_node, void *priv,
+				   struct netlink_ext_ack *extack)
+{
+	struct mlx5_esw_rate_group *group = priv;
+	struct mlx5_eswitch *esw;
+	int err;
+
+	esw = mlx5_devlink_eswitch_get(rate_node->devlink);
+	if (IS_ERR(esw))
+		return PTR_ERR(esw);
+
+	mutex_lock(&esw->state_lock);
+	err = esw_qos_destroy_rate_group(esw, group, extack);
+	mutex_unlock(&esw->state_lock);
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h
index 507c7e017834..ab9fd8621cca 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h
@@ -24,6 +24,10 @@ int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void
 					    u64 tx_share, struct netlink_ext_ack *extack);
 int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *priv,
 					  u64 tx_max, struct netlink_ext_ack *extack);
+int mlx5_esw_devlink_rate_node_new(struct devlink_rate *rate_node, void **priv,
+				   struct netlink_ext_ack *extack);
+int mlx5_esw_devlink_rate_node_del(struct devlink_rate *rate_node, void *priv,
+				   struct netlink_ext_ack *extack);
 #endif
 
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index ebeccee38a57..3580901ae548 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -306,6 +306,7 @@ struct mlx5_eswitch {
 	struct {
 		bool            enabled;
 		u32             root_tsar_ix;
+		struct mlx5_esw_rate_group *group0;
 	} qos;
 
 	struct mlx5_esw_bridge_offloads *br_offloads;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fce3cbae0b99..f3638d09ba77 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -865,7 +865,8 @@ struct mlx5_ifc_qos_cap_bits {
 	u8         nic_bw_share[0x1];
 	u8         nic_rate_limit[0x1];
 	u8         packet_pacing_uid[0x1];
-	u8         reserved_at_c[0x14];
+	u8         log_esw_max_sched_depth[0x4];
+	u8         reserved_at_10[0x10];
 
 	u8         reserved_at_20[0xb];
 	u8         log_max_qos_nic_queue_group[0x5];
-- 
cgit v1.2.3


From 01af940e9be623c5492ddb8a5d7f02586fea1c5b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 19 Aug 2021 20:40:06 +0300
Subject: net: mscc: ocelot: transmit the "native VLAN" error via extack

We need to reject some more configurations in future patches, convert
the existing one to netlink extack.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c         |  8 +++++---
 drivers/net/ethernet/mscc/ocelot.c     |  7 +++----
 drivers/net/ethernet/mscc/ocelot_net.c | 30 ++++++++++++++++--------------
 include/soc/mscc/ocelot.h              |  2 +-
 4 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index cbe23b20f3fa..72d746f11fb9 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -742,7 +742,8 @@ static int felix_lag_change(struct dsa_switch *ds, int port)
 }
 
 static int felix_vlan_prepare(struct dsa_switch *ds, int port,
-			      const struct switchdev_obj_port_vlan *vlan)
+			      const struct switchdev_obj_port_vlan *vlan,
+			      struct netlink_ext_ack *extack)
 {
 	struct ocelot *ocelot = ds->priv;
 	u16 flags = vlan->flags;
@@ -760,7 +761,8 @@ static int felix_vlan_prepare(struct dsa_switch *ds, int port,
 
 	return ocelot_vlan_prepare(ocelot, port, vlan->vid,
 				   flags & BRIDGE_VLAN_INFO_PVID,
-				   flags & BRIDGE_VLAN_INFO_UNTAGGED);
+				   flags & BRIDGE_VLAN_INFO_UNTAGGED,
+				   extack);
 }
 
 static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled,
@@ -779,7 +781,7 @@ static int felix_vlan_add(struct dsa_switch *ds, int port,
 	u16 flags = vlan->flags;
 	int err;
 
-	err = felix_vlan_prepare(ds, port, vlan);
+	err = felix_vlan_prepare(ds, port, vlan, extack);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 8ec194178aa2..ccb8a9863890 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -259,16 +259,15 @@ int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port,
 EXPORT_SYMBOL(ocelot_port_vlan_filtering);
 
 int ocelot_vlan_prepare(struct ocelot *ocelot, int port, u16 vid, bool pvid,
-			bool untagged)
+			bool untagged, struct netlink_ext_ack *extack)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 
 	/* Deny changing the native VLAN, but always permit deleting it */
 	if (untagged && ocelot_port->native_vlan.vid != vid &&
 	    ocelot_port->native_vlan.valid) {
-		dev_err(ocelot->dev,
-			"Port already has a native VLAN: %d\n",
-			ocelot_port->native_vlan.vid);
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Port already has a native VLAN");
 		return -EBUSY;
 	}
 
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 9044737936d2..690f39e9351b 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -386,17 +386,6 @@ static int ocelot_setup_tc(struct net_device *dev, enum tc_setup_type type,
 	return 0;
 }
 
-static int ocelot_vlan_vid_prepare(struct net_device *dev, u16 vid, bool pvid,
-				   bool untagged)
-{
-	struct ocelot_port_private *priv = netdev_priv(dev);
-	struct ocelot_port *ocelot_port = &priv->port;
-	struct ocelot *ocelot = ocelot_port->ocelot;
-	int port = priv->chip_port;
-
-	return ocelot_vlan_prepare(ocelot, port, vid, pvid, untagged);
-}
-
 static int ocelot_vlan_vid_add(struct net_device *dev, u16 vid, bool pvid,
 			       bool untagged)
 {
@@ -944,14 +933,26 @@ static int ocelot_port_attr_set(struct net_device *dev, const void *ctx,
 	return err;
 }
 
+static int ocelot_vlan_vid_prepare(struct net_device *dev, u16 vid, bool pvid,
+				   bool untagged, struct netlink_ext_ack *extack)
+{
+	struct ocelot_port_private *priv = netdev_priv(dev);
+	struct ocelot_port *ocelot_port = &priv->port;
+	struct ocelot *ocelot = ocelot_port->ocelot;
+	int port = priv->chip_port;
+
+	return ocelot_vlan_prepare(ocelot, port, vid, pvid, untagged, extack);
+}
+
 static int ocelot_port_obj_add_vlan(struct net_device *dev,
-				    const struct switchdev_obj_port_vlan *vlan)
+				    const struct switchdev_obj_port_vlan *vlan,
+				    struct netlink_ext_ack *extack)
 {
 	bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED;
 	bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID;
 	int ret;
 
-	ret = ocelot_vlan_vid_prepare(dev, vlan->vid, pvid, untagged);
+	ret = ocelot_vlan_vid_prepare(dev, vlan->vid, pvid, untagged, extack);
 	if (ret)
 		return ret;
 
@@ -1039,7 +1040,8 @@ static int ocelot_port_obj_add(struct net_device *dev, const void *ctx,
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
 		ret = ocelot_port_obj_add_vlan(dev,
-					       SWITCHDEV_OBJ_PORT_VLAN(obj));
+					       SWITCHDEV_OBJ_PORT_VLAN(obj),
+					       extack);
 		break;
 	case SWITCHDEV_OBJ_ID_PORT_MDB:
 		ret = ocelot_port_obj_add_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj));
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index fb5681f7e61b..ac072303dadf 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -825,7 +825,7 @@ int ocelot_fdb_add(struct ocelot *ocelot, int port,
 int ocelot_fdb_del(struct ocelot *ocelot, int port,
 		   const unsigned char *addr, u16 vid);
 int ocelot_vlan_prepare(struct ocelot *ocelot, int port, u16 vid, bool pvid,
-			bool untagged);
+			bool untagged, struct netlink_ext_ack *extack);
 int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid,
 		    bool untagged);
 int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid);
-- 
cgit v1.2.3


From 3b95d1b29386dd4328e471fda2eee74b7fd4a4b3 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 19 Aug 2021 20:40:07 +0300
Subject: net: mscc: ocelot: transmit the VLAN filtering restrictions via
 extack

We need to transmit more restrictions in future patches, convert this
one to netlink extack.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c         | 2 +-
 drivers/net/ethernet/mscc/ocelot.c     | 6 +++---
 drivers/net/ethernet/mscc/ocelot_net.c | 8 +++++---
 include/soc/mscc/ocelot.h              | 3 ++-
 4 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 72d746f11fb9..3656e67af789 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -770,7 +770,7 @@ static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled,
 {
 	struct ocelot *ocelot = ds->priv;
 
-	return ocelot_port_vlan_filtering(ocelot, port, enabled);
+	return ocelot_port_vlan_filtering(ocelot, port, enabled, extack);
 }
 
 static int felix_vlan_add(struct dsa_switch *ds, int port,
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index ccb8a9863890..e848e0379b5a 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -223,7 +223,7 @@ static void ocelot_port_set_pvid(struct ocelot *ocelot, int port,
 }
 
 int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port,
-			       bool vlan_aware)
+			       bool vlan_aware, struct netlink_ext_ack *extack)
 {
 	struct ocelot_vcap_block *block = &ocelot->block[VCAP_IS1];
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
@@ -233,8 +233,8 @@ int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port,
 	list_for_each_entry(filter, &block->rules, list) {
 		if (filter->ingress_port_mask & BIT(port) &&
 		    filter->action.vid_replace_ena) {
-			dev_err(ocelot->dev,
-				"Cannot change VLAN state with vlan modify rules active\n");
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Cannot change VLAN state with vlan modify rules active");
 			return -EBUSY;
 		}
 	}
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 690f39e9351b..c0c465a4a981 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -913,7 +913,8 @@ static int ocelot_port_attr_set(struct net_device *dev, const void *ctx,
 		ocelot_port_attr_ageing_set(ocelot, port, attr->u.ageing_time);
 		break;
 	case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
-		ocelot_port_vlan_filtering(ocelot, port, attr->u.vlan_filtering);
+		ocelot_port_vlan_filtering(ocelot, port, attr->u.vlan_filtering,
+					   extack);
 		break;
 	case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED:
 		ocelot_port_attr_mc_set(ocelot, port, !attr->u.mc_disabled);
@@ -1133,14 +1134,15 @@ static int ocelot_switchdev_sync(struct ocelot *ocelot, int port,
 	ocelot_port_attr_ageing_set(ocelot, port, ageing_time);
 
 	return ocelot_port_vlan_filtering(ocelot, port,
-					  br_vlan_enabled(bridge_dev));
+					  br_vlan_enabled(bridge_dev),
+					  extack);
 }
 
 static int ocelot_switchdev_unsync(struct ocelot *ocelot, int port)
 {
 	int err;
 
-	err = ocelot_port_vlan_filtering(ocelot, port, false);
+	err = ocelot_port_vlan_filtering(ocelot, port, false, NULL);
 	if (err)
 		return err;
 
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index ac072303dadf..06706a9fd5b1 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -807,7 +807,8 @@ int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset);
 int ocelot_get_ts_info(struct ocelot *ocelot, int port,
 		       struct ethtool_ts_info *info);
 void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs);
-int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, bool enabled);
+int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, bool enabled,
+			       struct netlink_ext_ack *extack);
 void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state);
 void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot);
 int ocelot_port_pre_bridge_flags(struct ocelot *ocelot, int port,
-- 
cgit v1.2.3


From 2796d846d74a18cc6563e96eff8bf28c5e06f912 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Fri, 20 Aug 2021 15:42:55 +0300
Subject: net: bridge: vlan: convert mcast router global option to per-vlan
 entry

The per-vlan router option controls the port/vlan and host vlan entries'
mcast router config. The global option controlled only the host vlan
config, but that is unnecessary and incosistent as it's not really a
global vlan option, but rather bridge option to control host router
config, so convert BRIDGE_VLANDB_GOPTS_MCAST_ROUTER to
BRIDGE_VLANDB_ENTRY_MCAST_ROUTER which can be used to control both host
vlan and port vlan mcast router config.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  2 +-
 net/bridge/br_multicast.c      | 13 +++++++++++
 net/bridge/br_private.h        | 15 +++++++++++++
 net/bridge/br_vlan.c           |  1 +
 net/bridge/br_vlan_options.c   | 51 +++++++++++++++++++++++++++---------------
 5 files changed, 63 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index f71a81fdbbc6..2711c3522010 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -506,6 +506,7 @@ enum {
 	BRIDGE_VLANDB_ENTRY_STATE,
 	BRIDGE_VLANDB_ENTRY_TUNNEL_INFO,
 	BRIDGE_VLANDB_ENTRY_STATS,
+	BRIDGE_VLANDB_ENTRY_MCAST_ROUTER,
 	__BRIDGE_VLANDB_ENTRY_MAX,
 };
 #define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1)
@@ -561,7 +562,6 @@ enum {
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL,
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERIER,
-	BRIDGE_VLANDB_GOPTS_MCAST_ROUTER,
 	BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS,
 	BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE,
 	__BRIDGE_VLANDB_GOPTS_MAX
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index be9d1376e249..2c437d4bf632 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -4310,6 +4310,19 @@ unlock:
 	return err;
 }
 
+int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router)
+{
+	int err;
+
+	if (br_vlan_is_master(v))
+		err = br_multicast_set_router(&v->br_mcast_ctx, mcast_router);
+	else
+		err = br_multicast_set_port_router(&v->port_mcast_ctx,
+						   mcast_router);
+
+	return err;
+}
+
 static void br_multicast_start_querier(struct net_bridge_mcast *brmctx,
 				       struct bridge_mcast_own_query *query)
 {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index fcc0fcf44a95..b4cef3a97f12 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -676,6 +676,20 @@ static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur,
 	return true;
 }
 
+static inline u8 br_vlan_multicast_router(const struct net_bridge_vlan *v)
+{
+	u8 mcast_router = MDB_RTR_TYPE_DISABLED;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+	if (!br_vlan_is_master(v))
+		mcast_router = v->port_mcast_ctx.multicast_router;
+	else
+		mcast_router = v->br_mcast_ctx.multicast_router;
+#endif
+
+	return mcast_router;
+}
+
 static inline int br_afspec_cmd_to_rtm(int cmd)
 {
 	switch (cmd) {
@@ -881,6 +895,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb,
 int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val);
 int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx,
 				 unsigned long val);
+int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router);
 int br_multicast_toggle(struct net_bridge *br, unsigned long val,
 			struct netlink_ext_ack *extack);
 int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val);
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index e25e288e7a85..19f65ab91a02 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -2136,6 +2136,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] =
 	[BRIDGE_VLANDB_ENTRY_RANGE]	= { .type = NLA_U16 },
 	[BRIDGE_VLANDB_ENTRY_STATE]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED },
+	[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]	= { .type = NLA_U8 },
 };
 
 static int br_vlan_rtm_process_one(struct net_device *dev,
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index a3b8a086284b..8ffd4ed2563c 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -40,22 +40,38 @@ static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr,
 bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
 			   const struct net_bridge_vlan *range_end)
 {
+	u8 range_mc_rtr = br_vlan_multicast_router(range_end);
+	u8 curr_mc_rtr = br_vlan_multicast_router(v_curr);
+
 	return v_curr->state == range_end->state &&
-	       __vlan_tun_can_enter_range(v_curr, range_end);
+	       __vlan_tun_can_enter_range(v_curr, range_end) &&
+	       curr_mc_rtr == range_mc_rtr;
 }
 
 bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v)
 {
-	return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE,
-			   br_vlan_get_state(v)) &&
-	       __vlan_tun_put(skb, v);
+	if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) ||
+	    !__vlan_tun_put(skb, v))
+		return false;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+	if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_MCAST_ROUTER,
+		       br_vlan_multicast_router(v)))
+		return false;
+#endif
+
+	return true;
 }
 
 size_t br_vlan_opts_nl_size(void)
 {
 	return nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */
 	       + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY_TUNNEL_INFO */
-	       + nla_total_size(sizeof(u32)); /* BRIDGE_VLANDB_TINFO_ID */
+	       + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_TINFO_ID */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+	       + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_MCAST_ROUTER */
+#endif
+	       + 0;
 }
 
 static int br_vlan_modify_state(struct net_bridge_vlan_group *vg,
@@ -181,6 +197,18 @@ static int br_vlan_process_one_opts(const struct net_bridge *br,
 			return err;
 	}
 
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+	if (tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]) {
+		u8 val;
+
+		val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]);
+		err = br_multicast_set_vlan_router(v, val);
+		if (err)
+			return err;
+		*changed = true;
+	}
+#endif
+
 	return 0;
 }
 
@@ -298,8 +326,6 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
 			v_opts->br_mcast_ctx.multicast_startup_query_count) ||
 	    nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER,
 		       v_opts->br_mcast_ctx.multicast_querier) ||
-	    nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER,
-		       v_opts->br_mcast_ctx.multicast_router) ||
 	    br_multicast_dump_querier_state(skb, &v_opts->br_mcast_ctx,
 					    BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE))
 		goto out_err;
@@ -380,7 +406,6 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(const struct net_bridge_vlan *v)
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */
 		+ nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */
 		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */
-		+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER */
 		+ br_multicast_querier_state_size() /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE */
 		+ nla_total_size(0) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */
 		+ br_rports_size(&v->br_mcast_ctx) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */
@@ -522,15 +547,6 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 			return err;
 		*changed = true;
 	}
-	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_ROUTER]) {
-		u8 val;
-
-		val = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_ROUTER]);
-		err = br_multicast_set_router(&v->br_mcast_ctx, val);
-		if (err)
-			return err;
-		*changed = true;
-	}
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
 		u8 ver;
@@ -554,7 +570,6 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]	= { .type = NLA_U64 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]	= { .type = NLA_U8 },
-	[BRIDGE_VLANDB_GOPTS_MCAST_ROUTER]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]	= { .type = NLA_U32 },
 	[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]	= { .type = NLA_U32 },
-- 
cgit v1.2.3


From d2587c57ffd8dcad04171dfd203dcc4ff98e4782 Mon Sep 17 00:00:00 2001
From: Angus Ainslie <angus@akkea.ca>
Date: Thu, 12 Aug 2021 09:52:17 -0700
Subject: brcmfmac: add 43752 SDIO ids and initialization

Add HW and SDIO ids for use with the SparkLan AP6275S
Add the firmware mapping structures for the BRCM43752 chipset.
The 43752 needs some things setup similar to the 43012 chipset.
The WATERMARK shows better performance when initialized to the 4373 value.

Signed-off-by: Angus Ainslie <angus@akkea.ca>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Link: https://lore.kernel.org/r/20210812165218.2508258-2-angus@akkea.ca
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c     | 1 +
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c       | 3 +++
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c       | 8 ++++++--
 drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h | 1 +
 include/linux/mmc/sdio_ids.h                                  | 1 +
 5 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
index 720c7fec22ac..ac02244a6fdf 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
@@ -991,6 +991,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = {
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4359),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43012),
+	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_89359),
 	{ /* end: all zeroes */ }
 };
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
index 4faab0170ffa..1ee49f9e325d 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
@@ -729,6 +729,8 @@ static u32 brcmf_chip_tcm_rambase(struct brcmf_chip_priv *ci)
 	case BRCM_CC_4364_CHIP_ID:
 	case CY_CC_4373_CHIP_ID:
 		return 0x160000;
+	case CY_CC_43752_CHIP_ID:
+		return 0x170000;
 	default:
 		brcmf_err("unknown chip: %s\n", ci->pub.name);
 		break;
@@ -1421,6 +1423,7 @@ bool brcmf_chip_sr_capable(struct brcmf_chip *pub)
 		reg = chip->ops->read32(chip->ctx, addr);
 		return (reg & CC_SR_CTL0_ENABLE_MASK) != 0;
 	case BRCM_CC_4359_CHIP_ID:
+	case CY_CC_43752_CHIP_ID:
 	case CY_CC_43012_CHIP_ID:
 		addr = CORE_CC_REG(pmu->base, retention_ctl);
 		reg = chip->ops->read32(chip->ctx, addr);
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index baf36d79ed43..8effeb7a7269 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -625,6 +625,7 @@ BRCMF_FW_CLM_DEF(4356, "brcmfmac4356-sdio");
 BRCMF_FW_DEF(4359, "brcmfmac4359-sdio");
 BRCMF_FW_CLM_DEF(4373, "brcmfmac4373-sdio");
 BRCMF_FW_CLM_DEF(43012, "brcmfmac43012-sdio");
+BRCMF_FW_CLM_DEF(43752, "brcmfmac43752-sdio");
 
 /* firmware config files */
 MODULE_FIRMWARE(BRCMF_FW_DEFAULT_PATH "brcmfmac*-sdio.*.txt");
@@ -655,7 +656,8 @@ static const struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = {
 	BRCMF_FW_ENTRY(BRCM_CC_4356_CHIP_ID, 0xFFFFFFFF, 4356),
 	BRCMF_FW_ENTRY(BRCM_CC_4359_CHIP_ID, 0xFFFFFFFF, 4359),
 	BRCMF_FW_ENTRY(CY_CC_4373_CHIP_ID, 0xFFFFFFFF, 4373),
-	BRCMF_FW_ENTRY(CY_CC_43012_CHIP_ID, 0xFFFFFFFF, 43012)
+	BRCMF_FW_ENTRY(CY_CC_43012_CHIP_ID, 0xFFFFFFFF, 43012),
+	BRCMF_FW_ENTRY(CY_CC_43752_CHIP_ID, 0xFFFFFFFF, 43752)
 };
 
 #define TXCTL_CREDITS	2
@@ -3421,7 +3423,8 @@ err:
 
 static bool brcmf_sdio_aos_no_decode(struct brcmf_sdio *bus)
 {
-	if (bus->ci->chip == CY_CC_43012_CHIP_ID)
+	if (bus->ci->chip == CY_CC_43012_CHIP_ID ||
+	    bus->ci->chip == CY_CC_43752_CHIP_ID)
 		return true;
 	else
 		return false;
@@ -4266,6 +4269,7 @@ static void brcmf_sdio_firmware_callback(struct device *dev, int err,
 
 		switch (sdiod->func1->device) {
 		case SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373:
+		case SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752:
 			brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes\n",
 				  CY_4373_F2_WATERMARK);
 			brcmf_sdiod_writeb(sdiod, SBSDIO_WATERMARK,
diff --git a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h
index 00309b272a0e..9d81320164ce 100644
--- a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h
+++ b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h
@@ -52,6 +52,7 @@
 #define BRCM_CC_4371_CHIP_ID		0x4371
 #define CY_CC_4373_CHIP_ID		0x4373
 #define CY_CC_43012_CHIP_ID		43012
+#define CY_CC_43752_CHIP_ID		43752
 
 /* USB Device IDs */
 #define BRCM_USB_43143_DEVICE_ID	0xbd1e
diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index 12036619346c..a85c9f0bd470 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -75,6 +75,7 @@
 #define SDIO_DEVICE_ID_BROADCOM_43364		0xa9a4
 #define SDIO_DEVICE_ID_BROADCOM_43430		0xa9a6
 #define SDIO_DEVICE_ID_BROADCOM_43455		0xa9bf
+#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752	0xaae8
 
 #define SDIO_VENDOR_ID_MARVELL			0x02df
 #define SDIO_DEVICE_ID_MARVELL_LIBERTAS		0x9103
-- 
cgit v1.2.3


From f5e165e72b29d908214e554ef57f67790ba95934 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 19 Aug 2021 20:55:00 +0300
Subject: net: dsa: track unique bridge numbers across all DSA switch trees

Right now, cross-tree bridging setups work somewhat by mistake.

In the case of cross-tree bridging with sja1105, all switch instances
need to agree upon a common VLAN ID for forwarding a packet that belongs
to a certain bridging domain.

With TX forwarding offload, the VLAN ID is the bridge VLAN for
VLAN-aware bridging, and the tag_8021q TX forwarding offload VID
(a VLAN which has non-zero VBID bits) for VLAN-unaware bridging.

The VBID for VLAN-unaware bridging is derived from the dp->bridge_num
value calculated by DSA independently for each switch tree.

If ports from one tree join one bridge, and ports from another tree join
another bridge, DSA will assign them the same bridge_num, even though
the bridges are different. If cross-tree bridging is supported, this
is an issue.

Modify DSA to calculate the bridge_num globally across all switch trees.
This has the implication for a driver that the dp->bridge_num value that
DSA will assign to its ports might not be contiguous, if there are
boards with multiple DSA drivers instantiated. Additionally, all
bridge_num values eat up towards each switch's
ds->num_fwd_offloading_bridges maximum, which is potentially unfortunate,
and can be seen as a limitation introduced by this patch. However, that
is the lesser evil for now.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h  |  8 +++-----
 net/dsa/dsa2.c     | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/dsa/dsa_priv.h |  2 ++
 net/dsa/port.c     | 39 +++++----------------------------------
 4 files changed, 58 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 0c2cba45fa79..c7ea0f61056f 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -155,9 +155,6 @@ struct dsa_switch_tree {
 
 	/* Track the largest switch index within a tree */
 	unsigned int last_switch;
-
-	/* Track the bridges with forwarding offload enabled */
-	unsigned long fwd_offloading_bridges;
 };
 
 #define dsa_lags_foreach_id(_id, _dst)				\
@@ -411,8 +408,9 @@ struct dsa_switch {
 	unsigned int		num_lag_ids;
 
 	/* Drivers that support bridge forwarding offload should set this to
-	 * the maximum number of bridges spanning the same switch tree that can
-	 * be offloaded.
+	 * the maximum number of bridges spanning the same switch tree (or all
+	 * trees, in the case of cross-tree bridging support) that can be
+	 * offloaded.
 	 */
 	unsigned int		num_fwd_offloading_bridges;
 
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index dcd67801eca4..1b2b25d7bd02 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -21,6 +21,9 @@
 static DEFINE_MUTEX(dsa2_mutex);
 LIST_HEAD(dsa_tree_list);
 
+/* Track the bridges with forwarding offload enabled */
+static unsigned long dsa_fwd_offloading_bridges;
+
 /**
  * dsa_tree_notify - Execute code for all switches in a DSA switch tree.
  * @dst: collection of struct dsa_switch devices to notify.
@@ -126,6 +129,51 @@ void dsa_lag_unmap(struct dsa_switch_tree *dst, struct net_device *lag)
 	}
 }
 
+static int dsa_bridge_num_find(const struct net_device *bridge_dev)
+{
+	struct dsa_switch_tree *dst;
+	struct dsa_port *dp;
+
+	/* When preparing the offload for a port, it will have a valid
+	 * dp->bridge_dev pointer but a not yet valid dp->bridge_num.
+	 * However there might be other ports having the same dp->bridge_dev
+	 * and a valid dp->bridge_num, so just ignore this port.
+	 */
+	list_for_each_entry(dst, &dsa_tree_list, list)
+		list_for_each_entry(dp, &dst->ports, list)
+			if (dp->bridge_dev == bridge_dev &&
+			    dp->bridge_num != -1)
+				return dp->bridge_num;
+
+	return -1;
+}
+
+int dsa_bridge_num_get(const struct net_device *bridge_dev, int max)
+{
+	int bridge_num = dsa_bridge_num_find(bridge_dev);
+
+	if (bridge_num < 0) {
+		/* First port that offloads TX forwarding for this bridge */
+		bridge_num = find_first_zero_bit(&dsa_fwd_offloading_bridges,
+						 DSA_MAX_NUM_OFFLOADING_BRIDGES);
+		if (bridge_num >= max)
+			return -1;
+
+		set_bit(bridge_num, &dsa_fwd_offloading_bridges);
+	}
+
+	return bridge_num;
+}
+
+void dsa_bridge_num_put(const struct net_device *bridge_dev, int bridge_num)
+{
+	/* Check if the bridge is still in use, otherwise it is time
+	 * to clean it up so we can reuse this bridge_num later.
+	 */
+	if (!dsa_bridge_num_find(bridge_dev))
+		clear_bit(bridge_num, &dsa_fwd_offloading_bridges);
+}
+
 struct dsa_switch *dsa_switch_find(int tree_index, int sw_index)
 {
 	struct dsa_switch_tree *dst;
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index b7a269e0513f..88aaf43b2da4 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -543,6 +543,8 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
 			      struct net_device *master,
 			      const struct dsa_device_ops *tag_ops,
 			      const struct dsa_device_ops *old_tag_ops);
+int dsa_bridge_num_get(const struct net_device *bridge_dev, int max);
+void dsa_bridge_num_put(const struct net_device *bridge_dev, int bridge_num);
 
 /* tag_8021q.c */
 int dsa_tag_8021q_bridge_join(struct dsa_switch *ds,
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 979042a64d1a..4fbe81ffb1ce 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -270,27 +270,9 @@ static void dsa_port_switchdev_unsync_attrs(struct dsa_port *dp)
 	 */
 }
 
-static int dsa_tree_find_bridge_num(struct dsa_switch_tree *dst,
-				    struct net_device *bridge_dev)
-{
-	struct dsa_port *dp;
-
-	/* When preparing the offload for a port, it will have a valid
-	 * dp->bridge_dev pointer but a not yet valid dp->bridge_num.
-	 * However there might be other ports having the same dp->bridge_dev
-	 * and a valid dp->bridge_num, so just ignore this port.
-	 */
-	list_for_each_entry(dp, &dst->ports, list)
-		if (dp->bridge_dev == bridge_dev && dp->bridge_num != -1)
-			return dp->bridge_num;
-
-	return -1;
-}
-
 static void dsa_port_bridge_tx_fwd_unoffload(struct dsa_port *dp,
 					     struct net_device *bridge_dev)
 {
-	struct dsa_switch_tree *dst = dp->ds->dst;
 	int bridge_num = dp->bridge_num;
 	struct dsa_switch *ds = dp->ds;
 
@@ -300,11 +282,7 @@ static void dsa_port_bridge_tx_fwd_unoffload(struct dsa_port *dp,
 
 	dp->bridge_num = -1;
 
-	/* Check if the bridge is still in use, otherwise it is time
-	 * to clean it up so we can reuse this bridge_num later.
-	 */
-	if (!dsa_tree_find_bridge_num(dst, bridge_dev))
-		clear_bit(bridge_num, &dst->fwd_offloading_bridges);
+	dsa_bridge_num_put(bridge_dev, bridge_num);
 
 	/* Notify the chips only once the offload has been deactivated, so
 	 * that they can update their configuration accordingly.
@@ -316,23 +294,16 @@ static void dsa_port_bridge_tx_fwd_unoffload(struct dsa_port *dp,
 static bool dsa_port_bridge_tx_fwd_offload(struct dsa_port *dp,
 					   struct net_device *bridge_dev)
 {
-	struct dsa_switch_tree *dst = dp->ds->dst;
 	struct dsa_switch *ds = dp->ds;
 	int bridge_num, err;
 
 	if (!ds->ops->port_bridge_tx_fwd_offload)
 		return false;
 
-	bridge_num = dsa_tree_find_bridge_num(dst, bridge_dev);
-	if (bridge_num < 0) {
-		/* First port that offloads TX forwarding for this bridge */
-		bridge_num = find_first_zero_bit(&dst->fwd_offloading_bridges,
-						 DSA_MAX_NUM_OFFLOADING_BRIDGES);
-		if (bridge_num >= ds->num_fwd_offloading_bridges)
-			return false;
-
-		set_bit(bridge_num, &dst->fwd_offloading_bridges);
-	}
+	bridge_num = dsa_bridge_num_get(bridge_dev,
+					ds->num_fwd_offloading_bridges);
+	if (bridge_num < 0)
+		return false;
 
 	dp->bridge_num = bridge_num;
 
-- 
cgit v1.2.3


From 6fc88c354f3af83ffa2c285b86e76c759755693f Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Thu, 19 Aug 2021 02:24:20 -0700
Subject: bpf: Migrate cgroup_bpf to internal cgroup_bpf_attach_type enum

Add an enum (cgroup_bpf_attach_type) containing only valid cgroup_bpf
attach types and a function to map bpf_attach_type values to the new
enum. Inspired by netns_bpf_attach_type.

Then, migrate cgroup_bpf to use cgroup_bpf_attach_type wherever
possible.  Functionality is unchanged as attach_type_to_prog_type
switches in bpf/syscall.c were preventing non-cgroup programs from
making use of the invalid cgroup_bpf array slots.

As a result struct cgroup_bpf uses 504 fewer bytes relative to when its
arrays were sized using MAX_BPF_ATTACH_TYPE.

bpf_cgroup_storage is notably not migrated as struct
bpf_cgroup_storage_key is part of uapi and contains a bpf_attach_type
member which is not meant to be opaque. Similarly, bpf_cgroup_link
continues to report its bpf_attach_type member to userspace via fdinfo
and bpf_link_info.

To ease disambiguation, bpf_attach_type variables are renamed from
'type' to 'atype' when changed to cgroup_bpf_attach_type.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210819092420.1984861-2-davemarchevsky@fb.com
---
 include/linux/bpf-cgroup.h     | 182 ++++++++++++++++++++++++++++-------------
 include/uapi/linux/bpf.h       |   2 +-
 kernel/bpf/cgroup.c            | 156 +++++++++++++++++++++--------------
 net/ipv4/af_inet.c             |   6 +-
 net/ipv4/udp.c                 |   2 +-
 net/ipv6/af_inet6.c            |   6 +-
 net/ipv6/udp.c                 |   2 +-
 tools/include/uapi/linux/bpf.h |   2 +-
 8 files changed, 226 insertions(+), 132 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index a74cd1c3bd87..2746fd804216 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -23,9 +23,73 @@ struct ctl_table_header;
 struct task_struct;
 
 #ifdef CONFIG_CGROUP_BPF
+enum cgroup_bpf_attach_type {
+	CGROUP_BPF_ATTACH_TYPE_INVALID = -1,
+	CGROUP_INET_INGRESS = 0,
+	CGROUP_INET_EGRESS,
+	CGROUP_INET_SOCK_CREATE,
+	CGROUP_SOCK_OPS,
+	CGROUP_DEVICE,
+	CGROUP_INET4_BIND,
+	CGROUP_INET6_BIND,
+	CGROUP_INET4_CONNECT,
+	CGROUP_INET6_CONNECT,
+	CGROUP_INET4_POST_BIND,
+	CGROUP_INET6_POST_BIND,
+	CGROUP_UDP4_SENDMSG,
+	CGROUP_UDP6_SENDMSG,
+	CGROUP_SYSCTL,
+	CGROUP_UDP4_RECVMSG,
+	CGROUP_UDP6_RECVMSG,
+	CGROUP_GETSOCKOPT,
+	CGROUP_SETSOCKOPT,
+	CGROUP_INET4_GETPEERNAME,
+	CGROUP_INET6_GETPEERNAME,
+	CGROUP_INET4_GETSOCKNAME,
+	CGROUP_INET6_GETSOCKNAME,
+	CGROUP_INET_SOCK_RELEASE,
+	MAX_CGROUP_BPF_ATTACH_TYPE
+};
+
+#define CGROUP_ATYPE(type) \
+	case BPF_##type: return type
+
+static inline enum cgroup_bpf_attach_type
+to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type)
+{
+	switch (attach_type) {
+	CGROUP_ATYPE(CGROUP_INET_INGRESS);
+	CGROUP_ATYPE(CGROUP_INET_EGRESS);
+	CGROUP_ATYPE(CGROUP_INET_SOCK_CREATE);
+	CGROUP_ATYPE(CGROUP_SOCK_OPS);
+	CGROUP_ATYPE(CGROUP_DEVICE);
+	CGROUP_ATYPE(CGROUP_INET4_BIND);
+	CGROUP_ATYPE(CGROUP_INET6_BIND);
+	CGROUP_ATYPE(CGROUP_INET4_CONNECT);
+	CGROUP_ATYPE(CGROUP_INET6_CONNECT);
+	CGROUP_ATYPE(CGROUP_INET4_POST_BIND);
+	CGROUP_ATYPE(CGROUP_INET6_POST_BIND);
+	CGROUP_ATYPE(CGROUP_UDP4_SENDMSG);
+	CGROUP_ATYPE(CGROUP_UDP6_SENDMSG);
+	CGROUP_ATYPE(CGROUP_SYSCTL);
+	CGROUP_ATYPE(CGROUP_UDP4_RECVMSG);
+	CGROUP_ATYPE(CGROUP_UDP6_RECVMSG);
+	CGROUP_ATYPE(CGROUP_GETSOCKOPT);
+	CGROUP_ATYPE(CGROUP_SETSOCKOPT);
+	CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME);
+	CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME);
+	CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME);
+	CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME);
+	CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE);
+	default:
+		return CGROUP_BPF_ATTACH_TYPE_INVALID;
+	}
+}
+
+#undef CGROUP_ATYPE
 
-extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE];
-#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type])
+extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE];
+#define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype])
 
 #define for_each_cgroup_storage_type(stype) \
 	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
@@ -67,15 +131,15 @@ struct bpf_prog_array;
 
 struct cgroup_bpf {
 	/* array of effective progs in this cgroup */
-	struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE];
+	struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
 
 	/* attached progs to this cgroup and attach flags
 	 * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will
 	 * have either zero or one element
 	 * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
 	 */
-	struct list_head progs[MAX_BPF_ATTACH_TYPE];
-	u32 flags[MAX_BPF_ATTACH_TYPE];
+	struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
+	u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
 
 	/* list of cgroup shared storages */
 	struct list_head storages;
@@ -115,28 +179,28 @@ int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
-				enum bpf_attach_type type);
+				enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_sk(struct sock *sk,
-			       enum bpf_attach_type type);
+			       enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
-				      enum bpf_attach_type type,
+				      enum cgroup_bpf_attach_type atype,
 				      void *t_ctx,
 				      u32 *flags);
 
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     struct bpf_sock_ops_kern *sock_ops,
-				     enum bpf_attach_type type);
+				     enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
-				      short access, enum bpf_attach_type type);
+				      short access, enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   struct ctl_table *table, int write,
 				   char **buf, size_t *pcount, loff_t *ppos,
-				   enum bpf_attach_type type);
+				   enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
 				       int *optname, char __user *optval,
@@ -179,9 +243,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
 	int __ret = 0;							      \
-	if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS))		      \
+	if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))		      \
 		__ret = __cgroup_bpf_run_filter_skb(sk, skb,		      \
-						    BPF_CGROUP_INET_INGRESS); \
+						    CGROUP_INET_INGRESS); \
 									      \
 	__ret;								      \
 })
@@ -189,54 +253,54 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb)			       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \
+	if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \
 		typeof(sk) __sk = sk_to_full_sk(sk);			       \
 		if (sk_fullsock(__sk))					       \
 			__ret = __cgroup_bpf_run_filter_skb(__sk, skb,	       \
-						      BPF_CGROUP_INET_EGRESS); \
+						      CGROUP_INET_EGRESS); \
 	}								       \
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_SK_PROG(sk, type)				       \
+#define BPF_CGROUP_RUN_SK_PROG(sk, atype)				       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled(type)) {					       \
-		__ret = __cgroup_bpf_run_filter_sk(sk, type);		       \
+	if (cgroup_bpf_enabled(atype)) {					       \
+		__ret = __cgroup_bpf_run_filter_sk(sk, atype);		       \
 	}								       \
 	__ret;								       \
 })
 
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)				       \
-	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE)
+	BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE)
 
 #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk)			       \
-	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE)
+	BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_RELEASE)
 
 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk)				       \
-	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND)
+	BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET4_POST_BIND)
 
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk)				       \
-	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND)
+	BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND)
 
-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type)				       \
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype)				       \
 ({									       \
 	u32 __unused_flags;						       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled(type))					       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+	if (cgroup_bpf_enabled(atype))					       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
 							  NULL,		       \
 							  &__unused_flags);    \
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx)		       \
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx)		       \
 ({									       \
 	u32 __unused_flags;						       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled(type))	{				       \
+	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
 							  t_ctx,	       \
 							  &__unused_flags);    \
 		release_sock(sk);					       \
@@ -249,13 +313,13 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
  * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check
  * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE).
  */
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, bind_flags)	       \
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags)	       \
 ({									       \
 	u32 __flags = 0;						       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled(type))	{				       \
+	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
 							  NULL, &__flags);     \
 		release_sock(sk);					       \
 		if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE)	       \
@@ -265,33 +329,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 })
 
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk)				       \
-	((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) ||		       \
-	  cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) &&		       \
+	((cgroup_bpf_enabled(CGROUP_INET4_CONNECT) ||		       \
+	  cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) &&		       \
 	 (sk)->sk_prot->pre_connect)
 
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT)
 
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT)
 
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL)
 
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL)
 
 #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx)
 
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr)			\
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr)			\
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL)
 
 /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
  * fullsock and its parent fullsock cannot be traced by
@@ -311,33 +375,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk)			\
 ({									\
 	int __ret = 0;							\
-	if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS))			\
+	if (cgroup_bpf_enabled(CGROUP_SOCK_OPS))			\
 		__ret = __cgroup_bpf_run_filter_sock_ops(sk,		\
 							 sock_ops,	\
-							 BPF_CGROUP_SOCK_OPS); \
+							 CGROUP_SOCK_OPS); \
 	__ret;								\
 })
 
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) {       \
+	if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && (sock_ops)->sk) {       \
 		typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk);	       \
 		if (__sk && sk_fullsock(__sk))				       \
 			__ret = __cgroup_bpf_run_filter_sock_ops(__sk,	       \
 								 sock_ops,     \
-							 BPF_CGROUP_SOCK_OPS); \
+							 CGROUP_SOCK_OPS); \
 	}								       \
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access)	      \
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access)	      \
 ({									      \
 	int __ret = 0;							      \
-	if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE))			      \
-		__ret = __cgroup_bpf_check_dev_permission(type, major, minor, \
+	if (cgroup_bpf_enabled(CGROUP_DEVICE))			      \
+		__ret = __cgroup_bpf_check_dev_permission(atype, major, minor, \
 							  access,	      \
-							  BPF_CGROUP_DEVICE); \
+							  CGROUP_DEVICE); \
 									      \
 	__ret;								      \
 })
@@ -346,10 +410,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos)  \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL))			       \
+	if (cgroup_bpf_enabled(CGROUP_SYSCTL))			       \
 		__ret = __cgroup_bpf_run_filter_sysctl(head, table, write,     \
 						       buf, count, pos,        \
-						       BPF_CGROUP_SYSCTL);     \
+						       CGROUP_SYSCTL);     \
 	__ret;								       \
 })
 
@@ -357,7 +421,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 				       kernel_optval)			       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT))			       \
+	if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT))			       \
 		__ret = __cgroup_bpf_run_filter_setsockopt(sock, level,	       \
 							   optname, optval,    \
 							   optlen,	       \
@@ -368,7 +432,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen)			       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))			       \
+	if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT))			       \
 		get_user(__ret, optlen);				       \
 	__ret;								       \
 })
@@ -377,7 +441,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 				       max_optlen, retval)		       \
 ({									       \
 	int __ret = retval;						       \
-	if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))			       \
+	if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT))			       \
 		if (!(sock)->sk_prot->bpf_bypass_getsockopt ||		       \
 		    !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \
 					tcp_bpf_bypass_getsockopt,	       \
@@ -392,7 +456,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 					    optlen, retval)		       \
 ({									       \
 	int __ret = retval;						       \
-	if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))			       \
+	if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT))			       \
 		__ret = __cgroup_bpf_run_filter_getsockopt_kern(	       \
 			sock, level, optname, optval, optlen, retval);	       \
 	__ret;								       \
@@ -451,14 +515,14 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 	return 0;
 }
 
-#define cgroup_bpf_enabled(type) (0)
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; })
+#define cgroup_bpf_enabled(atype) (0)
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, flags) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
@@ -470,7 +534,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
 #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c4f7892edb2b..191f0b286ee3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -84,7 +84,7 @@ struct bpf_lpm_trie_key {
 
 struct bpf_cgroup_storage_key {
 	__u64	cgroup_inode_id;	/* cgroup inode id */
-	__u32	attach_type;		/* program attach type */
+	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
 };
 
 union bpf_iter_link_info {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 8e9d99e2ade4..03145d45e3d5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -19,7 +19,7 @@
 
 #include "../cgroup/cgroup-internal.h"
 
-DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE);
+DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 
 void cgroup_bpf_offline(struct cgroup *cgrp)
@@ -113,12 +113,12 @@ static void cgroup_bpf_release(struct work_struct *work)
 	struct list_head *storages = &cgrp->bpf.storages;
 	struct bpf_cgroup_storage *storage, *stmp;
 
-	unsigned int type;
+	unsigned int atype;
 
 	mutex_lock(&cgroup_mutex);
 
-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
-		struct list_head *progs = &cgrp->bpf.progs[type];
+	for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
+		struct list_head *progs = &cgrp->bpf.progs[atype];
 		struct bpf_prog_list *pl, *pltmp;
 
 		list_for_each_entry_safe(pl, pltmp, progs, node) {
@@ -128,10 +128,10 @@ static void cgroup_bpf_release(struct work_struct *work)
 			if (pl->link)
 				bpf_cgroup_link_auto_detach(pl->link);
 			kfree(pl);
-			static_branch_dec(&cgroup_bpf_enabled_key[type]);
+			static_branch_dec(&cgroup_bpf_enabled_key[atype]);
 		}
 		old_array = rcu_dereference_protected(
-				cgrp->bpf.effective[type],
+				cgrp->bpf.effective[atype],
 				lockdep_is_held(&cgroup_mutex));
 		bpf_prog_array_free(old_array);
 	}
@@ -196,7 +196,7 @@ static u32 prog_list_length(struct list_head *head)
  * if parent has overridable or multi-prog, allow attaching
  */
 static bool hierarchy_allows_attach(struct cgroup *cgrp,
-				    enum bpf_attach_type type)
+				    enum cgroup_bpf_attach_type atype)
 {
 	struct cgroup *p;
 
@@ -204,12 +204,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
 	if (!p)
 		return true;
 	do {
-		u32 flags = p->bpf.flags[type];
+		u32 flags = p->bpf.flags[atype];
 		u32 cnt;
 
 		if (flags & BPF_F_ALLOW_MULTI)
 			return true;
-		cnt = prog_list_length(&p->bpf.progs[type]);
+		cnt = prog_list_length(&p->bpf.progs[atype]);
 		WARN_ON_ONCE(cnt > 1);
 		if (cnt == 1)
 			return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -225,7 +225,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
  * to programs in this cgroup
  */
 static int compute_effective_progs(struct cgroup *cgrp,
-				   enum bpf_attach_type type,
+				   enum cgroup_bpf_attach_type atype,
 				   struct bpf_prog_array **array)
 {
 	struct bpf_prog_array_item *item;
@@ -236,8 +236,8 @@ static int compute_effective_progs(struct cgroup *cgrp,
 
 	/* count number of effective programs by walking parents */
 	do {
-		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
-			cnt += prog_list_length(&p->bpf.progs[type]);
+		if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+			cnt += prog_list_length(&p->bpf.progs[atype]);
 		p = cgroup_parent(p);
 	} while (p);
 
@@ -249,10 +249,10 @@ static int compute_effective_progs(struct cgroup *cgrp,
 	cnt = 0;
 	p = cgrp;
 	do {
-		if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+		if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 			continue;
 
-		list_for_each_entry(pl, &p->bpf.progs[type], node) {
+		list_for_each_entry(pl, &p->bpf.progs[atype], node) {
 			if (!prog_list_prog(pl))
 				continue;
 
@@ -269,10 +269,10 @@ static int compute_effective_progs(struct cgroup *cgrp,
 }
 
 static void activate_effective_progs(struct cgroup *cgrp,
-				     enum bpf_attach_type type,
+				     enum cgroup_bpf_attach_type atype,
 				     struct bpf_prog_array *old_array)
 {
-	old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
+	old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
 					lockdep_is_held(&cgroup_mutex));
 	/* free prog array after grace period, since __cgroup_bpf_run_*()
 	 * might be still walking the array
@@ -328,7 +328,7 @@ cleanup:
 }
 
 static int update_effective_progs(struct cgroup *cgrp,
-				  enum bpf_attach_type type)
+				  enum cgroup_bpf_attach_type atype)
 {
 	struct cgroup_subsys_state *css;
 	int err;
@@ -340,7 +340,7 @@ static int update_effective_progs(struct cgroup *cgrp,
 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
 			continue;
 
-		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+		err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
 		if (err)
 			goto cleanup;
 	}
@@ -357,7 +357,7 @@ static int update_effective_progs(struct cgroup *cgrp,
 			continue;
 		}
 
-		activate_effective_progs(desc, type, desc->bpf.inactive);
+		activate_effective_progs(desc, atype, desc->bpf.inactive);
 		desc->bpf.inactive = NULL;
 	}
 
@@ -436,11 +436,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
 			enum bpf_attach_type type, u32 flags)
 {
 	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
-	struct list_head *progs = &cgrp->bpf.progs[type];
 	struct bpf_prog *old_prog = NULL;
 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
 	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog_list *pl;
+	struct list_head *progs;
 	int err;
 
 	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
@@ -454,10 +455,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
 		/* replace_prog implies BPF_F_REPLACE, and vice versa */
 		return -EINVAL;
 
-	if (!hierarchy_allows_attach(cgrp, type))
+	atype = to_cgroup_bpf_attach_type(type);
+	if (atype < 0)
+		return -EINVAL;
+
+	progs = &cgrp->bpf.progs[atype];
+
+	if (!hierarchy_allows_attach(cgrp, atype))
 		return -EPERM;
 
-	if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
+	if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
 		/* Disallow attaching non-overridable on top
 		 * of existing overridable in this cgroup.
 		 * Disallow attaching multi-prog if overridable or none
@@ -490,16 +497,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
 	pl->prog = prog;
 	pl->link = link;
 	bpf_cgroup_storages_assign(pl->storage, storage);
-	cgrp->bpf.flags[type] = saved_flags;
+	cgrp->bpf.flags[atype] = saved_flags;
 
-	err = update_effective_progs(cgrp, type);
+	err = update_effective_progs(cgrp, atype);
 	if (err)
 		goto cleanup;
 
 	if (old_prog)
 		bpf_prog_put(old_prog);
 	else
-		static_branch_inc(&cgroup_bpf_enabled_key[type]);
+		static_branch_inc(&cgroup_bpf_enabled_key[atype]);
 	bpf_cgroup_storages_link(new_storage, cgrp, type);
 	return 0;
 
@@ -520,7 +527,7 @@ cleanup:
  * all descendant cgroups. This function is guaranteed to succeed.
  */
 static void replace_effective_prog(struct cgroup *cgrp,
-				   enum bpf_attach_type type,
+				   enum cgroup_bpf_attach_type atype,
 				   struct bpf_cgroup_link *link)
 {
 	struct bpf_prog_array_item *item;
@@ -539,10 +546,10 @@ static void replace_effective_prog(struct cgroup *cgrp,
 
 		/* find position of link in effective progs array */
 		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
-			if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 				continue;
 
-			head = &cg->bpf.progs[type];
+			head = &cg->bpf.progs[atype];
 			list_for_each_entry(pl, head, node) {
 				if (!prog_list_prog(pl))
 					continue;
@@ -554,7 +561,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
 found:
 		BUG_ON(!cg);
 		progs = rcu_dereference_protected(
-				desc->bpf.effective[type],
+				desc->bpf.effective[atype],
 				lockdep_is_held(&cgroup_mutex));
 		item = &progs->items[pos];
 		WRITE_ONCE(item->prog, link->link.prog);
@@ -574,11 +581,18 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
 				struct bpf_cgroup_link *link,
 				struct bpf_prog *new_prog)
 {
-	struct list_head *progs = &cgrp->bpf.progs[link->type];
+	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog *old_prog;
 	struct bpf_prog_list *pl;
+	struct list_head *progs;
 	bool found = false;
 
+	atype = to_cgroup_bpf_attach_type(link->type);
+	if (atype < 0)
+		return -EINVAL;
+
+	progs = &cgrp->bpf.progs[atype];
+
 	if (link->link.prog->type != new_prog->type)
 		return -EINVAL;
 
@@ -592,7 +606,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
 		return -ENOENT;
 
 	old_prog = xchg(&link->link.prog, new_prog);
-	replace_effective_prog(cgrp, link->type, link);
+	replace_effective_prog(cgrp, atype, link);
 	bpf_prog_put(old_prog);
 	return 0;
 }
@@ -667,12 +681,20 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			struct bpf_cgroup_link *link, enum bpf_attach_type type)
 {
-	struct list_head *progs = &cgrp->bpf.progs[type];
-	u32 flags = cgrp->bpf.flags[type];
-	struct bpf_prog_list *pl;
+	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog *old_prog;
+	struct bpf_prog_list *pl;
+	struct list_head *progs;
+	u32 flags;
 	int err;
 
+	atype = to_cgroup_bpf_attach_type(type);
+	if (atype < 0)
+		return -EINVAL;
+
+	progs = &cgrp->bpf.progs[atype];
+	flags = cgrp->bpf.flags[atype];
+
 	if (prog && link)
 		/* only one of prog or link can be specified */
 		return -EINVAL;
@@ -686,7 +708,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 	pl->prog = NULL;
 	pl->link = NULL;
 
-	err = update_effective_progs(cgrp, type);
+	err = update_effective_progs(cgrp, atype);
 	if (err)
 		goto cleanup;
 
@@ -695,10 +717,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 	kfree(pl);
 	if (list_empty(progs))
 		/* last program was detached, reset flags to zero */
-		cgrp->bpf.flags[type] = 0;
+		cgrp->bpf.flags[atype] = 0;
 	if (old_prog)
 		bpf_prog_put(old_prog);
-	static_branch_dec(&cgroup_bpf_enabled_key[type]);
+	static_branch_dec(&cgroup_bpf_enabled_key[atype]);
 	return 0;
 
 cleanup:
@@ -714,13 +736,21 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 {
 	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
 	enum bpf_attach_type type = attr->query.attach_type;
-	struct list_head *progs = &cgrp->bpf.progs[type];
-	u32 flags = cgrp->bpf.flags[type];
+	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog_array *effective;
+	struct list_head *progs;
 	struct bpf_prog *prog;
 	int cnt, ret = 0, i;
+	u32 flags;
+
+	atype = to_cgroup_bpf_attach_type(type);
+	if (atype < 0)
+		return -EINVAL;
+
+	progs = &cgrp->bpf.progs[atype];
+	flags = cgrp->bpf.flags[atype];
 
-	effective = rcu_dereference_protected(cgrp->bpf.effective[type],
+	effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
 					      lockdep_is_held(&cgroup_mutex));
 
 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
@@ -925,14 +955,14 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 	link->cgroup = cgrp;
 	link->type = attr->link_create.attach_type;
 
-	err  = bpf_link_prime(&link->link, &link_primer);
+	err = bpf_link_prime(&link->link, &link_primer);
 	if (err) {
 		kfree(link);
 		goto out_put_cgroup;
 	}
 
-	err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
-				BPF_F_ALLOW_MULTI);
+	err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
+				link->type, BPF_F_ALLOW_MULTI);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
 		goto out_put_cgroup;
@@ -986,7 +1016,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
  */
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
-				enum bpf_attach_type type)
+				enum cgroup_bpf_attach_type atype)
 {
 	unsigned int offset = skb->data - skb_network_header(skb);
 	struct sock *save_sk;
@@ -1008,11 +1038,11 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	/* compute pointers for the bpf prog */
 	bpf_compute_and_save_data_end(skb, &saved_data_end);
 
-	if (type == BPF_CGROUP_INET_EGRESS) {
+	if (atype == CGROUP_INET_EGRESS) {
 		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
-			cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+			cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
 	} else {
-		ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], skb,
+		ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
 					    __bpf_prog_run_save_cb);
 		ret = (ret == 1 ? 0 : -EPERM);
 	}
@@ -1038,12 +1068,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
  * and if it returned != 1 during execution. In all other cases, 0 is returned.
  */
 int __cgroup_bpf_run_filter_sk(struct sock *sk,
-			       enum bpf_attach_type type)
+			       enum cgroup_bpf_attach_type atype)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	int ret;
 
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sk, bpf_prog_run);
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run);
 	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1065,7 +1095,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  */
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
-				      enum bpf_attach_type type,
+				      enum cgroup_bpf_attach_type atype,
 				      void *t_ctx,
 				      u32 *flags)
 {
@@ -1090,7 +1120,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	}
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[type], &ctx,
+	ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
 				          bpf_prog_run, flags);
 
 	return ret == 1 ? 0 : -EPERM;
@@ -1115,19 +1145,19 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
  */
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     struct bpf_sock_ops_kern *sock_ops,
-				     enum bpf_attach_type type)
+				     enum cgroup_bpf_attach_type atype)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	int ret;
 
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sock_ops,
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
 				    bpf_prog_run);
 	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
 
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
-				      short access, enum bpf_attach_type type)
+				      short access, enum cgroup_bpf_attach_type atype)
 {
 	struct cgroup *cgrp;
 	struct bpf_cgroup_dev_ctx ctx = {
@@ -1139,7 +1169,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx,
+	allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
 				      bpf_prog_run);
 	rcu_read_unlock();
 
@@ -1231,7 +1261,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   struct ctl_table *table, int write,
 				   char **buf, size_t *pcount, loff_t *ppos,
-				   enum bpf_attach_type type)
+				   enum cgroup_bpf_attach_type atype)
 {
 	struct bpf_sysctl_kern ctx = {
 		.head = head,
@@ -1271,7 +1301,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, bpf_prog_run);
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run);
 	rcu_read_unlock();
 
 	kfree(ctx.cur_val);
@@ -1289,7 +1319,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
 #ifdef CONFIG_NET
 static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
-					     enum bpf_attach_type attach_type)
+					     enum cgroup_bpf_attach_type attach_type)
 {
 	struct bpf_prog_array *prog_array;
 	bool empty;
@@ -1364,7 +1394,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	 * attached to the hook so we don't waste time allocating
 	 * memory and locking the socket.
 	 */
-	if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+	if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
 		return 0;
 
 	/* Allocate a bit more than the initial user buffer for
@@ -1385,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	}
 
 	lock_sock(sk);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
 				    &ctx, bpf_prog_run);
 	release_sock(sk);
 
@@ -1460,7 +1490,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	 * attached to the hook so we don't waste time allocating
 	 * memory and locking the socket.
 	 */
-	if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+	if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
 		return retval;
 
 	ctx.optlen = max_optlen;
@@ -1495,7 +1525,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	}
 
 	lock_sock(sk);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
 				    &ctx, bpf_prog_run);
 	release_sock(sk);
 
@@ -1556,7 +1586,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 	 * be called if that data shouldn't be "exported".
 	 */
 
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
 				    &ctx, bpf_prog_run);
 	if (!ret)
 		return -EPERM;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0e4d758c2585..1d816a5fd3eb 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -452,7 +452,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	 * changes context in a wrong way it will be caught.
 	 */
 	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
-						 BPF_CGROUP_INET4_BIND, &flags);
+						 CGROUP_INET4_BIND, &flags);
 	if (err)
 		return err;
 
@@ -781,7 +781,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 		sin->sin_port = inet->inet_dport;
 		sin->sin_addr.s_addr = inet->inet_daddr;
 		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-					    BPF_CGROUP_INET4_GETPEERNAME,
+					    CGROUP_INET4_GETPEERNAME,
 					    NULL);
 	} else {
 		__be32 addr = inet->inet_rcv_saddr;
@@ -790,7 +790,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 		sin->sin_port = inet->inet_sport;
 		sin->sin_addr.s_addr = addr;
 		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-					    BPF_CGROUP_INET4_GETSOCKNAME,
+					    CGROUP_INET4_GETSOCKNAME,
 					    NULL);
 	}
 	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1a742b710e54..8851c9463b4b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1143,7 +1143,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		rcu_read_unlock();
 	}
 
-	if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) {
+	if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
 					    (struct sockaddr *)usin, &ipc.addr);
 		if (err)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d92c90d97763..b5878bb8e419 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -455,7 +455,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	 * changes context in a wrong way it will be caught.
 	 */
 	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
-						 BPF_CGROUP_INET6_BIND, &flags);
+						 CGROUP_INET6_BIND, &flags);
 	if (err)
 		return err;
 
@@ -532,7 +532,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		if (np->sndflow)
 			sin->sin6_flowinfo = np->flow_label;
 		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-					    BPF_CGROUP_INET6_GETPEERNAME,
+					    CGROUP_INET6_GETPEERNAME,
 					    NULL);
 	} else {
 		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
@@ -541,7 +541,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 			sin->sin6_addr = sk->sk_v6_rcv_saddr;
 		sin->sin6_port = inet->inet_sport;
 		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-					    BPF_CGROUP_INET6_GETSOCKNAME,
+					    CGROUP_INET6_GETSOCKNAME,
 					    NULL);
 	}
 	sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c5e15e94bb00..ea53847b5b7e 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1475,7 +1475,7 @@ do_udp_sendmsg:
 		fl6.saddr = np->saddr;
 	fl6.fl6_sport = inet->inet_sport;
 
-	if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) {
+	if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
 					   (struct sockaddr *)sin6, &fl6.saddr);
 		if (err)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c4f7892edb2b..191f0b286ee3 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -84,7 +84,7 @@ struct bpf_lpm_trie_key {
 
 struct bpf_cgroup_storage_key {
 	__u64	cgroup_inode_id;	/* cgroup inode id */
-	__u32	attach_type;		/* program attach type */
+	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
 };
 
 union bpf_iter_link_info {
-- 
cgit v1.2.3


From dab2ea6c680f87add6d2f7007ce46b6b9e3857f7 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 23 Aug 2021 20:02:38 +0200
Subject: ieee80211: add TWT element definitions

Introduce TWT definitions and TWT Information element structure
in ieee80211.h

Tested-by: Peter Chiu <chui-hao.chiu@mediatek.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/71d8b581fe4b5abc5b92f8d77ac2de3e2f7591b6.1629741512.git.lorenzo@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index a6730072d13a..2e8953d80d4b 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1088,6 +1088,48 @@ struct ieee80211_ext {
 	} u;
 } __packed __aligned(2);
 
+#define IEEE80211_TWT_CONTROL_NDP			BIT(0)
+#define IEEE80211_TWT_CONTROL_RESP_MODE			BIT(1)
+#define IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST	BIT(3)
+#define IEEE80211_TWT_CONTROL_RX_DISABLED		BIT(4)
+#define IEEE80211_TWT_CONTROL_WAKE_DUR_UNIT		BIT(5)
+
+#define IEEE80211_TWT_REQTYPE_REQUEST			BIT(0)
+#define IEEE80211_TWT_REQTYPE_SETUP_CMD			GENMASK(3, 1)
+#define IEEE80211_TWT_REQTYPE_TRIGGER			BIT(4)
+#define IEEE80211_TWT_REQTYPE_IMPLICIT			BIT(5)
+#define IEEE80211_TWT_REQTYPE_FLOWTYPE			BIT(6)
+#define IEEE80211_TWT_REQTYPE_FLOWID			GENMASK(9, 7)
+#define IEEE80211_TWT_REQTYPE_WAKE_INT_EXP		GENMASK(14, 10)
+#define IEEE80211_TWT_REQTYPE_PROTECTION		BIT(15)
+
+enum ieee80211_twt_setup_cmd {
+	TWT_SETUP_CMD_REQUEST,
+	TWT_SETUP_CMD_SUGGEST,
+	TWT_SETUP_CMD_DEMAND,
+	TWT_SETUP_CMD_GROUPING,
+	TWT_SETUP_CMD_ACCEPT,
+	TWT_SETUP_CMD_ALTERNATE,
+	TWT_SETUP_CMD_DICTATE,
+	TWT_SETUP_CMD_REJECT,
+};
+
+struct ieee80211_twt_params {
+	__le16 req_type;
+	__le64 twt;
+	u8 min_twt_dur;
+	__le16 mantissa;
+	u8 channel;
+} __packed;
+
+struct ieee80211_twt_setup {
+	u8 dialog_token;
+	u8 element_id;
+	u8 length;
+	u8 control;
+	u8 params[];
+} __packed;
+
 struct ieee80211_mgmt {
 	__le16 frame_control;
 	__le16 duration;
@@ -1252,6 +1294,10 @@ struct ieee80211_mgmt {
 					__le16 toa_error;
 					u8 variable[0];
 				} __packed ftm;
+				struct {
+					u8 action_code;
+					u8 variable[];
+				} __packed s1g;
 			} u;
 		} __packed action;
 	} u;
@@ -2881,6 +2927,7 @@ enum ieee80211_eid {
 	WLAN_EID_AID_RESPONSE = 211,
 	WLAN_EID_S1G_BCN_COMPAT = 213,
 	WLAN_EID_S1G_SHORT_BCN_INTERVAL = 214,
+	WLAN_EID_S1G_TWT = 216,
 	WLAN_EID_S1G_CAPABILITIES = 217,
 	WLAN_EID_VENDOR_SPECIFIC = 221,
 	WLAN_EID_QOS_PARAMETER = 222,
@@ -2950,6 +2997,7 @@ enum ieee80211_category {
 	WLAN_CATEGORY_FST = 18,
 	WLAN_CATEGORY_UNPROT_DMG = 20,
 	WLAN_CATEGORY_VHT = 21,
+	WLAN_CATEGORY_S1G = 22,
 	WLAN_CATEGORY_VENDOR_SPECIFIC_PROTECTED = 126,
 	WLAN_CATEGORY_VENDOR_SPECIFIC = 127,
 };
@@ -3023,6 +3071,20 @@ enum ieee80211_key_len {
 	WLAN_KEY_LEN_BIP_GMAC_256 = 32,
 };
 
+enum ieee80211_s1g_actioncode {
+	WLAN_S1G_AID_SWITCH_REQUEST,
+	WLAN_S1G_AID_SWITCH_RESPONSE,
+	WLAN_S1G_SYNC_CONTROL,
+	WLAN_S1G_STA_INFO_ANNOUNCE,
+	WLAN_S1G_EDCA_PARAM_SET,
+	WLAN_S1G_EL_OPERATION,
+	WLAN_S1G_TWT_SETUP,
+	WLAN_S1G_TWT_TEARDOWN,
+	WLAN_S1G_SECT_GROUP_ID_LIST,
+	WLAN_S1G_SECT_ID_FEEDBACK,
+	WLAN_S1G_TWT_INFORMATION = 11,
+};
+
 #define IEEE80211_WEP_IV_LEN		4
 #define IEEE80211_WEP_ICV_LEN		4
 #define IEEE80211_CCMP_HDR_LEN		8
-- 
cgit v1.2.3


From f5a4c24e689f54e66201f04d343bdd2e8a1d7923 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 23 Aug 2021 20:02:39 +0200
Subject: mac80211: introduce individual TWT support in AP mode

Introduce TWT action frames parsing support to mac80211.
Currently just individual TWT agreement are support in AP mode.
Whenever the AP receives a TWT action frame from an associated client,
after performing sanity checks, it will notify the underlay driver with
requested parameters in order to check if they are supported and if there
is enough room for a new agreement. The driver is expected to set the
agreement result and report it to mac80211.

Drivers supporting this have two new callbacks:
 - add_twt_setup (mandatory)
 - twt_teardown_request (optional)

mac80211 will send an action frame reply according to the result
reported by the driver.

Tested-by: Peter Chiu <chui-hao.chiu@mediatek.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/257512f2e22ba42b9f2624942a128dd8f141de4b.1629741512.git.lorenzo@kernel.org
[use le16p_replace_bits(), minor cleanups, use (void *) casts,
 fix to use ieee80211_get_he_iftype_cap() correctly]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  12 +++
 net/mac80211/driver-ops.h  |  36 +++++++++
 net/mac80211/ieee80211_i.h |   6 ++
 net/mac80211/iface.c       |  41 +++++++++++
 net/mac80211/rx.c          |  73 ++++++++++++++++++
 net/mac80211/s1g.c         | 180 +++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/status.c      |  17 ++++-
 net/mac80211/trace.h       |  67 +++++++++++++++++
 8 files changed, 430 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a23e6734d26b..af0fc13cea34 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3926,6 +3926,13 @@ struct ieee80211_prep_tx_info {
  * @set_sar_specs: Update the SAR (TX power) settings.
  * @sta_set_decap_offload: Called to notify the driver when a station is allowed
  *	to use rx decapsulation offload
+ * @add_twt_setup: Update hw with TWT agreement parameters received from the peer.
+ *	This callback allows the hw to check if requested parameters
+ *	are supported and if there is enough room for a new agreement.
+ *	The hw is expected to set agreement result in the req_type field of
+ *	twt structure.
+ * @twt_teardown_request: Update the hw with TWT teardown request received
+ *	from the peer.
  */
 struct ieee80211_ops {
 	void (*tx)(struct ieee80211_hw *hw,
@@ -4249,6 +4256,11 @@ struct ieee80211_ops {
 	void (*sta_set_decap_offload)(struct ieee80211_hw *hw,
 				      struct ieee80211_vif *vif,
 				      struct ieee80211_sta *sta, bool enabled);
+	void (*add_twt_setup)(struct ieee80211_hw *hw,
+			      struct ieee80211_sta *sta,
+			      struct ieee80211_twt_setup *twt);
+	void (*twt_teardown_request)(struct ieee80211_hw *hw,
+				     struct ieee80211_sta *sta, u8 flowid);
 };
 
 /**
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index bcb7cc06db3d..cd3731cbf6c6 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1447,4 +1447,40 @@ static inline void drv_sta_set_decap_offload(struct ieee80211_local *local,
 	trace_drv_return_void(local);
 }
 
+static inline void drv_add_twt_setup(struct ieee80211_local *local,
+				     struct ieee80211_sub_if_data *sdata,
+				     struct ieee80211_sta *sta,
+				     struct ieee80211_twt_setup *twt)
+{
+	struct ieee80211_twt_params *twt_agrt;
+
+	might_sleep();
+
+	if (!check_sdata_in_driver(sdata))
+		return;
+
+	twt_agrt = (void *)twt->params;
+
+	trace_drv_add_twt_setup(local, sta, twt, twt_agrt);
+	local->ops->add_twt_setup(&local->hw, sta, twt);
+	trace_drv_return_void(local);
+}
+
+static inline void drv_twt_teardown_request(struct ieee80211_local *local,
+					    struct ieee80211_sub_if_data *sdata,
+					    struct ieee80211_sta *sta,
+					    u8 flowid)
+{
+	might_sleep();
+	if (!check_sdata_in_driver(sdata))
+		return;
+
+	if (!local->ops->twt_teardown_request)
+		return;
+
+	trace_drv_twt_teardown_request(local, sta, flowid);
+	local->ops->twt_teardown_request(&local->hw, sta, flowid);
+	trace_drv_return_void(local);
+}
+
 #endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index e8945c20688a..4d830a9c7b18 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -946,6 +946,7 @@ struct ieee80211_sub_if_data {
 
 	struct work_struct work;
 	struct sk_buff_head skb_queue;
+	struct sk_buff_head status_queue;
 
 	u8 needed_rx_chains;
 	enum ieee80211_smps_mode smps_mode;
@@ -2080,6 +2081,11 @@ ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif,
 
 /* S1G */
 void ieee80211_s1g_sta_rate_init(struct sta_info *sta);
+bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb);
+void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata,
+				 struct sk_buff *skb);
+void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata,
+				     struct sk_buff *skb);
 
 /* Spectrum management */
 void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 769f8f585c06..62c95597704b 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -552,6 +552,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 		 */
 		ieee80211_free_keys(sdata, true);
 		skb_queue_purge(&sdata->skb_queue);
+		skb_queue_purge(&sdata->status_queue);
 	}
 
 	spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
@@ -984,6 +985,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
 	}
 
 	skb_queue_head_init(&sdata->skb_queue);
+	skb_queue_head_init(&sdata->status_queue);
 	INIT_WORK(&sdata->work, ieee80211_iface_work);
 
 	return 0;
@@ -1382,6 +1384,16 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local,
 			WARN_ON(1);
 			break;
 		}
+	} else if (ieee80211_is_action(mgmt->frame_control) &&
+		   mgmt->u.action.category == WLAN_CATEGORY_S1G) {
+		switch (mgmt->u.action.u.s1g.action_code) {
+		case WLAN_S1G_TWT_TEARDOWN:
+		case WLAN_S1G_TWT_SETUP:
+			ieee80211_s1g_rx_twt_action(sdata, skb);
+			break;
+		default:
+			break;
+		}
 	} else if (ieee80211_is_ext(mgmt->frame_control)) {
 		if (sdata->vif.type == NL80211_IFTYPE_STATION)
 			ieee80211_sta_rx_queued_ext(sdata, skb);
@@ -1437,6 +1449,24 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local,
 	}
 }
 
+static void ieee80211_iface_process_status(struct ieee80211_sub_if_data *sdata,
+					   struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (void *)skb->data;
+
+	if (ieee80211_is_action(mgmt->frame_control) &&
+	    mgmt->u.action.category == WLAN_CATEGORY_S1G) {
+		switch (mgmt->u.action.u.s1g.action_code) {
+		case WLAN_S1G_TWT_TEARDOWN:
+		case WLAN_S1G_TWT_SETUP:
+			ieee80211_s1g_status_twt_action(sdata, skb);
+			break;
+		default:
+			break;
+		}
+	}
+}
+
 static void ieee80211_iface_work(struct work_struct *work)
 {
 	struct ieee80211_sub_if_data *sdata =
@@ -1466,6 +1496,16 @@ static void ieee80211_iface_work(struct work_struct *work)
 		kcov_remote_stop();
 	}
 
+	/* process status queue */
+	while ((skb = skb_dequeue(&sdata->status_queue))) {
+		kcov_remote_start_common(skb_get_kcov_handle(skb));
+
+		ieee80211_iface_process_status(sdata, skb);
+		kfree_skb(skb);
+
+		kcov_remote_stop();
+	}
+
 	/* then other type-dependent work */
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_STATION:
@@ -1529,6 +1569,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 	}
 
 	skb_queue_head_init(&sdata->skb_queue);
+	skb_queue_head_init(&sdata->status_queue);
 	INIT_WORK(&sdata->work, ieee80211_iface_work);
 	INIT_WORK(&sdata->recalc_smps, ieee80211_recalc_smps_work);
 	INIT_WORK(&sdata->csa_finalize_work, ieee80211_csa_finalize_work);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 33c56eab07fc..99ed68f7dc36 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3212,6 +3212,68 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
 	return RX_CONTINUE;
 }
 
+static bool
+ieee80211_process_rx_twt_action(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)rx->skb->data;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	const struct ieee80211_sta_he_cap *hecap;
+	struct ieee80211_supported_band *sband;
+
+	/* TWT actions are only supported in AP for the moment */
+	if (sdata->vif.type != NL80211_IFTYPE_AP)
+		return false;
+
+	if (!rx->local->ops->add_twt_setup)
+		return false;
+
+	sband = rx->local->hw.wiphy->bands[status->band];
+	hecap = ieee80211_get_he_iftype_cap(sband,
+					    ieee80211_vif_type_p2p(&sdata->vif));
+	if (!hecap)
+		return false;
+
+	if (!(hecap->he_cap_elem.mac_cap_info[0] &
+	      IEEE80211_HE_MAC_CAP0_TWT_RES))
+		return false;
+
+	if (!rx->sta)
+		return false;
+
+	switch (mgmt->u.action.u.s1g.action_code) {
+	case WLAN_S1G_TWT_SETUP: {
+		struct ieee80211_twt_setup *twt;
+
+		if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE +
+				   1 + /* action code */
+				   sizeof(struct ieee80211_twt_setup) +
+				   2 /* TWT req_type agrt */)
+			break;
+
+		twt = (void *)mgmt->u.action.u.s1g.variable;
+		if (twt->element_id != WLAN_EID_S1G_TWT)
+			break;
+
+		if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE +
+				   4 + /* action code + token + tlv */
+				   twt->length)
+			break;
+
+		return true; /* queue the frame */
+	}
+	case WLAN_S1G_TWT_TEARDOWN:
+		if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + 2)
+			break;
+
+		return true; /* queue the frame */
+	default:
+		break;
+	}
+
+	return false;
+}
+
 static ieee80211_rx_result debug_noinline
 ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 {
@@ -3491,6 +3553,17 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 		    !mesh_path_sel_is_hwmp(sdata))
 			break;
 		goto queue;
+	case WLAN_CATEGORY_S1G:
+		switch (mgmt->u.action.u.s1g.action_code) {
+		case WLAN_S1G_TWT_SETUP:
+		case WLAN_S1G_TWT_TEARDOWN:
+			if (ieee80211_process_rx_twt_action(rx))
+				goto queue;
+			break;
+		default:
+			break;
+		}
+		break;
 	}
 
 	return RX_CONTINUE;
diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c
index c33f332b049a..7e35ab5b6166 100644
--- a/net/mac80211/s1g.c
+++ b/net/mac80211/s1g.c
@@ -6,6 +6,7 @@
 #include <linux/ieee80211.h>
 #include <net/mac80211.h>
 #include "ieee80211_i.h"
+#include "driver-ops.h"
 
 void ieee80211_s1g_sta_rate_init(struct sta_info *sta)
 {
@@ -14,3 +15,182 @@ void ieee80211_s1g_sta_rate_init(struct sta_info *sta)
 	sta->rx_stats.last_rate =
 			STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_S1G);
 }
+
+bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data;
+
+	if (likely(!ieee80211_is_action(mgmt->frame_control)))
+		return false;
+
+	if (likely(mgmt->u.action.category != WLAN_CATEGORY_S1G))
+		return false;
+
+	return mgmt->u.action.u.s1g.action_code == WLAN_S1G_TWT_SETUP;
+}
+
+static void
+ieee80211_s1g_send_twt_setup(struct ieee80211_sub_if_data *sdata, const u8 *da,
+			     const u8 *bssid, struct ieee80211_twt_setup *twt)
+{
+	int len = IEEE80211_MIN_ACTION_SIZE + 4 + twt->length;
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_mgmt *mgmt;
+	struct sk_buff *skb;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + len);
+	if (!skb)
+		return;
+
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	mgmt = skb_put_zero(skb, len);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+	memcpy(mgmt->da, da, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	memcpy(mgmt->bssid, bssid, ETH_ALEN);
+
+	mgmt->u.action.category = WLAN_CATEGORY_S1G;
+	mgmt->u.action.u.s1g.action_code = WLAN_S1G_TWT_SETUP;
+	memcpy(mgmt->u.action.u.s1g.variable, twt, 3 + twt->length);
+
+	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
+					IEEE80211_TX_INTFL_MLME_CONN_TX |
+					IEEE80211_TX_CTL_REQ_TX_STATUS;
+	ieee80211_tx_skb(sdata, skb);
+}
+
+static void
+ieee80211_s1g_send_twt_teardown(struct ieee80211_sub_if_data *sdata,
+				const u8 *da, const u8 *bssid, u8 flowid)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_mgmt *mgmt;
+	struct sk_buff *skb;
+	u8 *id;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
+			    IEEE80211_MIN_ACTION_SIZE + 2);
+	if (!skb)
+		return;
+
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	mgmt = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE + 2);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+	memcpy(mgmt->da, da, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	memcpy(mgmt->bssid, bssid, ETH_ALEN);
+
+	mgmt->u.action.category = WLAN_CATEGORY_S1G;
+	mgmt->u.action.u.s1g.action_code = WLAN_S1G_TWT_TEARDOWN;
+	id = (u8 *)mgmt->u.action.u.s1g.variable;
+	*id = flowid;
+
+	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
+					IEEE80211_TX_CTL_REQ_TX_STATUS;
+	ieee80211_tx_skb(sdata, skb);
+}
+
+static void
+ieee80211_s1g_rx_twt_setup(struct ieee80211_sub_if_data *sdata,
+			   struct sta_info *sta, struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (void *)skb->data;
+	struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.u.s1g.variable;
+	struct ieee80211_twt_params *twt_agrt = (void *)twt->params;
+
+	twt_agrt->req_type &= cpu_to_le16(~IEEE80211_TWT_REQTYPE_REQUEST);
+
+	/* broadcast TWT not supported yet */
+	if (twt->control & IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST) {
+		le16p_replace_bits(&twt_agrt->req_type,
+				   TWT_SETUP_CMD_REJECT,
+				   IEEE80211_TWT_REQTYPE_SETUP_CMD);
+		goto out;
+	}
+
+	drv_add_twt_setup(sdata->local, sdata, &sta->sta, twt);
+out:
+	ieee80211_s1g_send_twt_setup(sdata, mgmt->sa, sdata->vif.addr, twt);
+}
+
+static void
+ieee80211_s1g_rx_twt_teardown(struct ieee80211_sub_if_data *sdata,
+			      struct sta_info *sta, struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data;
+
+	drv_twt_teardown_request(sdata->local, sdata, &sta->sta,
+				 mgmt->u.action.u.s1g.variable[0]);
+}
+
+static void
+ieee80211_s1g_tx_twt_setup_fail(struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta, struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data;
+	struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.u.s1g.variable;
+	struct ieee80211_twt_params *twt_agrt = (void *)twt->params;
+	u8 flowid = le16_get_bits(twt_agrt->req_type,
+				  IEEE80211_TWT_REQTYPE_FLOWID);
+
+	drv_twt_teardown_request(sdata->local, sdata, &sta->sta, flowid);
+
+	ieee80211_s1g_send_twt_teardown(sdata, mgmt->sa, sdata->vif.addr,
+					flowid);
+}
+
+void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata,
+				 struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data;
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+
+	mutex_lock(&local->sta_mtx);
+
+	sta = sta_info_get_bss(sdata, mgmt->sa);
+	if (!sta)
+		goto out;
+
+	switch (mgmt->u.action.u.s1g.action_code) {
+	case WLAN_S1G_TWT_SETUP:
+		ieee80211_s1g_rx_twt_setup(sdata, sta, skb);
+		break;
+	case WLAN_S1G_TWT_TEARDOWN:
+		ieee80211_s1g_rx_twt_teardown(sdata, sta, skb);
+		break;
+	default:
+		break;
+	}
+
+out:
+	mutex_unlock(&local->sta_mtx);
+}
+
+void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata,
+				     struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data;
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+
+	mutex_lock(&local->sta_mtx);
+
+	sta = sta_info_get_bss(sdata, mgmt->da);
+	if (!sta)
+		goto out;
+
+	switch (mgmt->u.action.u.s1g.action_code) {
+	case WLAN_S1G_TWT_SETUP:
+		/* process failed twt setup frames */
+		ieee80211_s1g_tx_twt_setup_fail(sdata, sta, skb);
+		break;
+	default:
+		break;
+	}
+
+out:
+	mutex_unlock(&local->sta_mtx);
+}
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 1f295e5721ef..f6f63a0b1b72 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -705,13 +705,26 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
 			/* Check to see if packet is a TDLS teardown packet */
 			if (ieee80211_is_data(hdr->frame_control) &&
 			    (ieee80211_get_tdls_action(skb, hdr_size) ==
-			     WLAN_TDLS_TEARDOWN))
+			     WLAN_TDLS_TEARDOWN)) {
 				ieee80211_tdls_td_tx_handle(local, sdata, skb,
 							    info->flags);
-			else
+			} else if (ieee80211_s1g_is_twt_setup(skb)) {
+				if (!acked) {
+					struct sk_buff *qskb;
+
+					qskb = skb_clone(skb, GFP_ATOMIC);
+					if (qskb) {
+						skb_queue_tail(&sdata->status_queue,
+							       qskb);
+						ieee80211_queue_work(&local->hw,
+								     &sdata->work);
+					}
+				}
+			} else {
 				ieee80211_mgd_conn_tx_status(sdata,
 							     hdr->frame_control,
 							     acked);
+			}
 		}
 
 		rcu_read_unlock();
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index f6ef15366938..9e8381bef7ed 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2825,6 +2825,73 @@ DEFINE_EVENT(sta_flag_evt, drv_sta_set_decap_offload,
 	TP_ARGS(local, sdata, sta, enabled)
 );
 
+TRACE_EVENT(drv_add_twt_setup,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sta *sta,
+		 struct ieee80211_twt_setup *twt,
+		 struct ieee80211_twt_params *twt_agrt),
+
+	TP_ARGS(local, sta, twt, twt_agrt),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		STA_ENTRY
+		__field(u8, dialog_token)
+		__field(u8, control)
+		__field(__le16, req_type)
+		__field(__le64, twt)
+		__field(u8, duration)
+		__field(__le16, mantissa)
+		__field(u8, channel)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		STA_ASSIGN;
+		__entry->dialog_token = twt->dialog_token;
+		__entry->control = twt->control;
+		__entry->req_type = twt_agrt->req_type;
+		__entry->twt = twt_agrt->twt;
+		__entry->duration = twt_agrt->min_twt_dur;
+		__entry->mantissa = twt_agrt->mantissa;
+		__entry->channel = twt_agrt->channel;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT STA_PR_FMT
+		" token:%d control:0x%02x req_type:0x%04x"
+		" twt:%llu duration:%d mantissa:%d channel:%d",
+		LOCAL_PR_ARG, STA_PR_ARG, __entry->dialog_token,
+		__entry->control, le16_to_cpu(__entry->req_type),
+		le64_to_cpu(__entry->twt), __entry->duration,
+		le16_to_cpu(__entry->mantissa), __entry->channel
+	)
+);
+
+TRACE_EVENT(drv_twt_teardown_request,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sta *sta, u8 flowid),
+
+	TP_ARGS(local, sta, flowid),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		STA_ENTRY
+		__field(u8, flowid)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		STA_ASSIGN;
+		__entry->flowid = flowid;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT STA_PR_FMT " flowid:%d",
+		LOCAL_PR_ARG, STA_PR_ARG, __entry->flowid
+	)
+);
+
 #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit v1.2.3


From 58adf9dcb15b99f047e80e10c85fb51ed3c88215 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 24 Aug 2021 00:22:58 +0300
Subject: net: dsa: let drivers state that they need VLAN filtering while
 standalone

As explained in commit e358bef7c392 ("net: dsa: Give drivers the chance
to veto certain upper devices"), the hellcreek driver uses some tricks
to comply with the network stack expectations: it enforces port
separation in standalone mode using VLANs. For untagged traffic,
bridging between ports is prevented by using different PVIDs, and for
VLAN-tagged traffic, it never accepts 8021q uppers with the same VID on
two ports, so packets with one VLAN cannot leak from one port to another.

That is almost fine*, and has worked because hellcreek relied on an
implicit behavior of the DSA core that was changed by the previous
patch: the standalone ports declare the 'rx-vlan-filter' feature as 'on
[fixed]'. Since most of the DSA drivers are actually VLAN-unaware in
standalone mode, that feature was actually incorrectly reflecting the
hardware/driver state, so there was a desire to fix it. This leaves the
hellcreek driver in a situation where it has to explicitly request this
behavior from the DSA framework.

We configure the ports as follows:

- Standalone: 'rx-vlan-filter' is on. An 8021q upper on top of a
  standalone hellcreek port will go through dsa_slave_vlan_rx_add_vid
  and will add a VLAN to the hardware tables, giving the driver the
  opportunity to refuse it through .port_prechangeupper.

- Bridged with vlan_filtering=0: 'rx-vlan-filter' is off. An 8021q upper
  on top of a bridged hellcreek port will not go through
  dsa_slave_vlan_rx_add_vid, because there will not be any attempt to
  offload this VLAN. The driver already disables VLAN awareness, so that
  upper should receive the traffic it needs.

- Bridged with vlan_filtering=1: 'rx-vlan-filter' is on. An 8021q upper
  on top of a bridged hellcreek port will call dsa_slave_vlan_rx_add_vid,
  and can again be vetoed through .port_prechangeupper.

*It is not actually completely fine, because if I follow through
correctly, we can have the following situation:

ip link add br0 type bridge vlan_filtering 0
ip link set lan0 master br0 # lan0 now becomes VLAN-unaware
ip link set lan0 nomaster # lan0 fails to become VLAN-aware again, therefore breaking isolation

This patch fixes that corner case by extending the DSA core logic, based
on this requested attribute, to change the VLAN awareness state of the
switch (port) when it leaves the bridge.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Acked-by: Kurt Kanzenbach <kurt@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/hirschmann/hellcreek.c |  1 +
 include/net/dsa.h                      |  3 +++
 net/dsa/slave.c                        | 12 ++++++++----
 net/dsa/switch.c                       | 21 ++++++++++++++++-----
 4 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c
index 5c54ae1be62c..3faff95fd49f 100644
--- a/drivers/net/dsa/hirschmann/hellcreek.c
+++ b/drivers/net/dsa/hirschmann/hellcreek.c
@@ -1345,6 +1345,7 @@ static int hellcreek_setup(struct dsa_switch *ds)
 	 * filtering setups are not supported.
 	 */
 	ds->vlan_filtering_is_global = true;
+	ds->needs_standalone_vlan_filtering = true;
 
 	/* Intercept _all_ PTP multicast traffic */
 	ret = hellcreek_setup_fdb(hellcreek);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index c7ea0f61056f..f9a17145255a 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -363,6 +363,9 @@ struct dsa_switch {
 	 */
 	bool			vlan_filtering_is_global;
 
+	/* Keep VLAN filtering enabled on ports not offloading any upper. */
+	bool			needs_standalone_vlan_filtering;
+
 	/* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges
 	 * that have vlan_filtering=0. All drivers should ideally set this (and
 	 * then the option would get removed), but it is unknown whether this
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index f71d31d3aab4..662ff531d4e2 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1435,11 +1435,12 @@ static int dsa_slave_clear_vlan(struct net_device *vdev, int vid, void *arg)
  * To summarize, a DSA switch port offloads:
  *
  * - If standalone (this includes software bridge, software LAG):
- *     - if ds->vlan_filtering_is_global = true AND there are bridges spanning
- *       this switch chip which have vlan_filtering=1:
+ *     - if ds->needs_standalone_vlan_filtering = true, OR if
+ *       (ds->vlan_filtering_is_global = true AND there are bridges spanning
+ *       this switch chip which have vlan_filtering=1)
  *         - the 8021q upper VLANs
- *     - else (VLAN filtering is not global, or it is, but no port is under a
- *       VLAN-aware bridge):
+ *     - else (standalone VLAN filtering is not needed, VLAN filtering is not
+ *       global, or it is, but no port is under a VLAN-aware bridge):
  *         - no VLAN (any 8021q upper is a software VLAN)
  *
  * - If under a vlan_filtering=0 bridge which it offload:
@@ -1871,6 +1872,7 @@ void dsa_slave_setup_tagger(struct net_device *slave)
 	struct dsa_slave_priv *p = netdev_priv(slave);
 	const struct dsa_port *cpu_dp = dp->cpu_dp;
 	struct net_device *master = cpu_dp->master;
+	const struct dsa_switch *ds = dp->ds;
 
 	slave->needed_headroom = cpu_dp->tag_ops->needed_headroom;
 	slave->needed_tailroom = cpu_dp->tag_ops->needed_tailroom;
@@ -1888,6 +1890,8 @@ void dsa_slave_setup_tagger(struct net_device *slave)
 	slave->features |= NETIF_F_LLTX;
 	if (slave->needed_tailroom)
 		slave->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST);
+	if (ds->needs_standalone_vlan_filtering)
+		slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
 }
 
 static struct lock_class_key dsa_slave_netdev_xmit_lock_key;
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index dd042fd7f800..1c797ec8e2c2 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -116,9 +116,10 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds,
 static int dsa_switch_bridge_leave(struct dsa_switch *ds,
 				   struct dsa_notifier_bridge_info *info)
 {
-	bool unset_vlan_filtering = br_vlan_enabled(info->br);
 	struct dsa_switch_tree *dst = ds->dst;
 	struct netlink_ext_ack extack = {0};
+	bool change_vlan_filtering = false;
+	bool vlan_filtering;
 	int err, port;
 
 	if (dst->index == info->tree_index && ds->index == info->sw_index &&
@@ -131,6 +132,15 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
 						info->sw_index, info->port,
 						info->br);
 
+	if (ds->needs_standalone_vlan_filtering && !br_vlan_enabled(info->br)) {
+		change_vlan_filtering = true;
+		vlan_filtering = true;
+	} else if (!ds->needs_standalone_vlan_filtering &&
+		   br_vlan_enabled(info->br)) {
+		change_vlan_filtering = true;
+		vlan_filtering = false;
+	}
+
 	/* If the bridge was vlan_filtering, the bridge core doesn't trigger an
 	 * event for changing vlan_filtering setting upon slave ports leaving
 	 * it. That is a good thing, because that lets us handle it and also
@@ -139,21 +149,22 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
 	 * vlan_filtering callback is only when the last port leaves the last
 	 * VLAN-aware bridge.
 	 */
-	if (unset_vlan_filtering && ds->vlan_filtering_is_global) {
+	if (change_vlan_filtering && ds->vlan_filtering_is_global) {
 		for (port = 0; port < ds->num_ports; port++) {
 			struct net_device *bridge_dev;
 
 			bridge_dev = dsa_to_port(ds, port)->bridge_dev;
 
 			if (bridge_dev && br_vlan_enabled(bridge_dev)) {
-				unset_vlan_filtering = false;
+				change_vlan_filtering = false;
 				break;
 			}
 		}
 	}
-	if (unset_vlan_filtering) {
+
+	if (change_vlan_filtering) {
 		err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port),
-					      false, &extack);
+					      vlan_filtering, &extack);
 		if (extack._msg)
 			dev_err(ds->dev, "port %d: %s\n", info->port,
 				extack._msg);
-- 
cgit v1.2.3


From 95d1d2490c278ea316a4350f4c24818275fb989c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 23 Aug 2021 11:01:35 -0700
Subject: netdevice: move xdp_rxq within netdev_rx_queue

Both struct netdev_rx_queue and struct xdp_rxq_info are cacheline
aligned. This causes extra padding before and after the xdp_rxq
member. Move the member upfront, so that it's naturally aligned.

Before:
	/* size: 256, cachelines: 4, members: 6 */
	/* sum members: 160, holes: 1, sum holes: 40 */
	/* padding: 56 */
	/* paddings: 1, sum paddings: 36 */
	/* forced alignments: 1, forced holes: 1, sum forced holes: 40 */

After:
	/* size: 192, cachelines: 3, members: 6 */
	/* padding: 32 */
	/* paddings: 1, sum paddings: 36 */
	/* forced alignments: 1 */

Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/r/20210823180135.1153608-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2f03cd9e371a..b88ad5aef7fe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -722,13 +722,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
 
 /* This structure contains an instance of an RX queue. */
 struct netdev_rx_queue {
+	struct xdp_rxq_info		xdp_rxq;
 #ifdef CONFIG_RPS
 	struct rps_map __rcu		*rps_map;
 	struct rps_dev_flow_table __rcu	*rps_flow_table;
 #endif
 	struct kobject			kobj;
 	struct net_device		*dev;
-	struct xdp_rxq_info		xdp_rxq;
 #ifdef CONFIG_XDP_SOCKETS
 	struct xsk_buff_pool            *pool;
 #endif
-- 
cgit v1.2.3


From 029ee6b14356b94120bedb852dcdaefc0a282cf1 Mon Sep 17 00:00:00 2001
From: Yufeng Mo <moyufeng@huawei.com>
Date: Fri, 20 Aug 2021 15:35:17 +0800
Subject: ethtool: add two coalesce attributes for CQE mode

Currently, there are many drivers who support CQE mode configuration,
some configure it as a fixed when initialized, some provide an
interface to change it by ethtool private flags. In order to make it
more generic, add two new 'ETHTOOL_A_COALESCE_USE_CQE_TX' and
'ETHTOOL_A_COALESCE_USE_CQE_RX' coalesce attributes, then these
parameters can be accessed by ethtool netlink coalesce uAPI.

Also add an new structure kernel_ethtool_coalesce, then the
new parameter can be added into this struct.

Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ethtool-netlink.rst | 15 +++++++++++++++
 include/linux/ethtool.h                      | 11 ++++++++++-
 include/uapi/linux/ethtool_netlink.h         |  2 ++
 net/ethtool/coalesce.c                       | 19 +++++++++++++++++--
 net/ethtool/netlink.h                        |  2 +-
 5 files changed, 45 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index c690bb37430d..d9b55b7a1a4d 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -947,12 +947,25 @@ Kernel response contents:
   ``ETHTOOL_A_COALESCE_TX_USECS_HIGH``         u32     delay (us), high Tx
   ``ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH``    u32     max packets, high Tx
   ``ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL``  u32     rate sampling interval
+  ``ETHTOOL_A_COALESCE_USE_CQE_TX``            bool    timer reset mode, Tx
+  ``ETHTOOL_A_COALESCE_USE_CQE_RX``            bool    timer reset mode, Rx
   ===========================================  ======  =======================
 
 Attributes are only included in reply if their value is not zero or the
 corresponding bit in ``ethtool_ops::supported_coalesce_params`` is set (i.e.
 they are declared as supported by driver).
 
+Timer reset mode (``ETHTOOL_A_COALESCE_USE_CQE_TX`` and
+``ETHTOOL_A_COALESCE_USE_CQE_RX``) controls the interaction between packet
+arrival and the various time based delay parameters. By default timers are
+expected to limit the max delay between any packet arrival/departure and a
+corresponding interrupt. In this mode timer should be started by packet
+arrival (sometimes delivery of previous interrupt) and reset when interrupt
+is delivered.
+Setting the appropriate attribute to 1 will enable ``CQE`` mode, where
+each packet event resets the timer. In this mode timer is used to force
+the interrupt if queue goes idle, while busy queues depend on the packet
+limit to trigger interrupts.
 
 COALESCE_SET
 ============
@@ -985,6 +998,8 @@ Request contents:
   ``ETHTOOL_A_COALESCE_TX_USECS_HIGH``         u32     delay (us), high Tx
   ``ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH``    u32     max packets, high Tx
   ``ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL``  u32     rate sampling interval
+  ``ETHTOOL_A_COALESCE_USE_CQE_TX``            bool    timer reset mode, Tx
+  ``ETHTOOL_A_COALESCE_USE_CQE_RX``            bool    timer reset mode, Rx
   ===========================================  ======  =======================
 
 Request is rejected if it attributes declared as unsupported by driver (i.e.
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 4711b96dae0c..a9d77a6a3e00 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -172,6 +172,11 @@ extern int
 __ethtool_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *link_ksettings);
 
+struct kernel_ethtool_coalesce {
+	u8 use_cqe_mode_tx;
+	u8 use_cqe_mode_rx;
+};
+
 /**
  * ethtool_intersect_link_masks - Given two link masks, AND them together
  * @dst: first mask and where result is stored
@@ -211,7 +216,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
 #define ETHTOOL_COALESCE_TX_USECS_HIGH		BIT(19)
 #define ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH	BIT(20)
 #define ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL	BIT(21)
-#define ETHTOOL_COALESCE_ALL_PARAMS		GENMASK(21, 0)
+#define ETHTOOL_COALESCE_USE_CQE_RX		BIT(22)
+#define ETHTOOL_COALESCE_USE_CQE_TX		BIT(23)
+#define ETHTOOL_COALESCE_ALL_PARAMS		GENMASK(23, 0)
 
 #define ETHTOOL_COALESCE_USECS						\
 	(ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS)
@@ -237,6 +244,8 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
 	 ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_RX_USECS_HIGH | \
 	 ETHTOOL_COALESCE_PKT_RATE_LOW | ETHTOOL_COALESCE_PKT_RATE_HIGH | \
 	 ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL)
+#define ETHTOOL_COALESCE_USE_CQE					\
+	(ETHTOOL_COALESCE_USE_CQE_RX | ETHTOOL_COALESCE_USE_CQE_TX)
 
 #define ETHTOOL_STAT_NOT_SET	(~0ULL)
 
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index b3b93710eff7..5545f1ca9237 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -377,6 +377,8 @@ enum {
 	ETHTOOL_A_COALESCE_TX_USECS_HIGH,		/* u32 */
 	ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH,		/* u32 */
 	ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL,	/* u32 */
+	ETHTOOL_A_COALESCE_USE_CQE_MODE_TX,		/* u8 */
+	ETHTOOL_A_COALESCE_USE_CQE_MODE_RX,		/* u8 */
 
 	/* add new constants above here */
 	__ETHTOOL_A_COALESCE_CNT,
diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c
index 1d6bc132aa4d..e6bc53634e68 100644
--- a/net/ethtool/coalesce.c
+++ b/net/ethtool/coalesce.c
@@ -10,6 +10,7 @@ struct coalesce_req_info {
 struct coalesce_reply_data {
 	struct ethnl_reply_data		base;
 	struct ethtool_coalesce		coalesce;
+	struct kernel_ethtool_coalesce	kernel_coalesce;
 	u32				supported_params;
 };
 
@@ -100,7 +101,9 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base,
 	       nla_total_size(sizeof(u32)) +	/* _RX_MAX_FRAMES_HIGH */
 	       nla_total_size(sizeof(u32)) +	/* _TX_USECS_HIGH */
 	       nla_total_size(sizeof(u32)) +	/* _TX_MAX_FRAMES_HIGH */
-	       nla_total_size(sizeof(u32));	/* _RATE_SAMPLE_INTERVAL */
+	       nla_total_size(sizeof(u32)) +	/* _RATE_SAMPLE_INTERVAL */
+	       nla_total_size(sizeof(u8)) +	/* _USE_CQE_MODE_TX */
+	       nla_total_size(sizeof(u8));	/* _USE_CQE_MODE_RX */
 }
 
 static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val,
@@ -124,6 +127,7 @@ static int coalesce_fill_reply(struct sk_buff *skb,
 			       const struct ethnl_reply_data *reply_base)
 {
 	const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base);
+	const struct kernel_ethtool_coalesce *kcoal = &data->kernel_coalesce;
 	const struct ethtool_coalesce *coal = &data->coalesce;
 	u32 supported = data->supported_params;
 
@@ -170,7 +174,11 @@ static int coalesce_fill_reply(struct sk_buff *skb,
 	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH,
 			     coal->tx_max_coalesced_frames_high, supported) ||
 	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL,
-			     coal->rate_sample_interval, supported))
+			     coal->rate_sample_interval, supported) ||
+	    coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX,
+			      kcoal->use_cqe_mode_tx, supported) ||
+	    coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX,
+			      kcoal->use_cqe_mode_rx, supported))
 		return -EMSGSIZE;
 
 	return 0;
@@ -215,10 +223,13 @@ const struct nla_policy ethnl_coalesce_set_policy[] = {
 	[ETHTOOL_A_COALESCE_TX_USECS_HIGH]	= { .type = NLA_U32 },
 	[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH]	= { .type = NLA_U32 },
 	[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 },
+	[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX]	= NLA_POLICY_MAX(NLA_U8, 1),
+	[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX]	= NLA_POLICY_MAX(NLA_U8, 1),
 };
 
 int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info)
 {
+	struct kernel_ethtool_coalesce kernel_coalesce = {};
 	struct ethtool_coalesce coalesce = {};
 	struct ethnl_req_info req_info = {};
 	struct nlattr **tb = info->attrs;
@@ -303,6 +314,10 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info)
 			 tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod);
 	ethnl_update_u32(&coalesce.rate_sample_interval,
 			 tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod);
+	ethnl_update_u8(&kernel_coalesce.use_cqe_mode_tx,
+			tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod);
+	ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx,
+			tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod);
 	ret = 0;
 	if (!mod)
 		goto out_ops;
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 077aac3929a8..e8987e28036f 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -359,7 +359,7 @@ extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX + 1];
 extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1];
 extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1];
 extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1];
-extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL + 1];
+extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1];
 extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_HEADER + 1];
 extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1];
 extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1];
-- 
cgit v1.2.3


From f3ccfda1931977b80267ba54070a1aeafa18f6ca Mon Sep 17 00:00:00 2001
From: Yufeng Mo <moyufeng@huawei.com>
Date: Fri, 20 Aug 2021 15:35:18 +0800
Subject: ethtool: extend coalesce setting uAPI with CQE mode

In order to support more coalesce parameters through netlink,
add two new parameter kernel_coal and extack for .set_coalesce
and .get_coalesce, then some extra info can return to user with
the netlink API.

Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/infiniband/ulp/ipoib/ipoib_ethtool.c           |  8 ++++++--
 drivers/net/ethernet/amazon/ena/ena_ethtool.c          |  8 ++++++--
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c           |  8 ++++++--
 drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c    |  8 ++++++--
 drivers/net/ethernet/broadcom/bcmsysport.c             |  8 ++++++--
 drivers/net/ethernet/broadcom/bnx2.c                   | 12 ++++++++----
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c    |  8 ++++++--
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c      |  8 ++++++--
 drivers/net/ethernet/broadcom/genet/bcmgenet.c         |  8 ++++++--
 drivers/net/ethernet/broadcom/tg3.c                    | 10 ++++++++--
 drivers/net/ethernet/brocade/bna/bnad_ethtool.c        | 12 ++++++++----
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c     |  8 ++++++--
 drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c    |  4 +++-
 drivers/net/ethernet/chelsio/cxgb/cxgb2.c              |  8 ++++++--
 drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c        |  8 ++++++--
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c     |  8 ++++++--
 drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c    |  8 ++++++--
 drivers/net/ethernet/cisco/enic/enic_ethtool.c         |  8 ++++++--
 drivers/net/ethernet/cortina/gemini.c                  |  8 ++++++--
 drivers/net/ethernet/emulex/benet/be_ethtool.c         |  8 ++++++--
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c     |  8 ++++++--
 drivers/net/ethernet/freescale/enetc/enetc_ethtool.c   |  8 ++++++--
 drivers/net/ethernet/freescale/fec_main.c              | 14 +++++++++-----
 drivers/net/ethernet/freescale/gianfar_ethtool.c       |  8 ++++++--
 drivers/net/ethernet/hisilicon/hip04_eth.c             |  8 ++++++--
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c       | 12 ++++++++++--
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c     |  8 ++++++--
 drivers/net/ethernet/huawei/hinic/hinic_ethtool.c      |  8 ++++++--
 drivers/net/ethernet/intel/e1000/e1000_ethtool.c       |  8 ++++++--
 drivers/net/ethernet/intel/e1000e/ethtool.c            |  8 ++++++--
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c       |  8 ++++++--
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c         | 12 ++++++++++--
 drivers/net/ethernet/intel/iavf/iavf_ethtool.c         | 12 ++++++++++--
 drivers/net/ethernet/intel/ice/ice_ethtool.c           | 12 ++++++++----
 drivers/net/ethernet/intel/igb/igb_ethtool.c           |  8 ++++++--
 drivers/net/ethernet/intel/igbvf/ethtool.c             |  8 ++++++--
 drivers/net/ethernet/intel/igc/igc_ethtool.c           |  8 ++++++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c       |  8 ++++++--
 drivers/net/ethernet/intel/ixgbevf/ethtool.c           |  8 ++++++--
 drivers/net/ethernet/jme.c                             | 12 ++++++++----
 drivers/net/ethernet/marvell/mv643xx_eth.c             | 12 ++++++++----
 drivers/net/ethernet/marvell/mvneta.c                  | 14 ++++++++++----
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c        | 14 ++++++++++----
 .../net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c  |  8 ++++++--
 drivers/net/ethernet/marvell/skge.c                    |  8 ++++++--
 drivers/net/ethernet/marvell/sky2.c                    |  8 ++++++--
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c        |  8 ++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  8 ++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c       |  8 ++++++--
 .../net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c    |  8 ++++++--
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c       | 12 ++++++++----
 drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  8 ++++++--
 drivers/net/ethernet/ni/nixge.c                        | 14 ++++++++++----
 drivers/net/ethernet/pensando/ionic/ionic_ethtool.c    |  8 ++++++--
 .../net/ethernet/qlogic/netxen/netxen_nic_ethtool.c    |  8 ++++++--
 drivers/net/ethernet/qlogic/qede/qede.h                |  4 +++-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c        |  8 ++++++--
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c    |  8 ++++++--
 drivers/net/ethernet/realtek/r8169_main.c              | 10 ++++++++--
 drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c     |  8 ++++++--
 drivers/net/ethernet/sfc/ethtool.c                     |  8 ++++++--
 drivers/net/ethernet/sfc/falcon/ethtool.c              |  8 ++++++--
 drivers/net/ethernet/socionext/netsec.c                | 10 +++++++---
 drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  8 ++++++--
 drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c     | 14 ++++++++++----
 drivers/net/ethernet/tehuti/tehuti.c                   | 12 ++++++++----
 drivers/net/ethernet/ti/cpsw.c                         |  2 +-
 drivers/net/ethernet/ti/cpsw_ethtool.c                 |  8 ++++++--
 drivers/net/ethernet/ti/cpsw_new.c                     |  2 +-
 drivers/net/ethernet/ti/cpsw_priv.h                    |  8 ++++++--
 drivers/net/ethernet/ti/davinci_emac.c                 | 14 +++++++++++---
 drivers/net/ethernet/via/via-velocity.c                |  8 ++++++--
 drivers/net/ethernet/xilinx/ll_temac_main.c            | 14 ++++++++++----
 drivers/net/ethernet/xilinx/xilinx_axienet_main.c      | 18 ++++++++++++++----
 drivers/net/netdevsim/ethtool.c                        |  8 ++++++--
 drivers/net/tun.c                                      |  8 ++++++--
 drivers/net/usb/r8152.c                                |  8 ++++++--
 drivers/net/virtio_net.c                               |  8 ++++++--
 drivers/net/vmxnet3/vmxnet3_ethtool.c                  | 12 ++++++++----
 drivers/net/wireless/ath/wil6210/ethtool.c             | 14 ++++++++++----
 drivers/s390/net/qeth_ethtool.c                        |  4 +++-
 drivers/staging/qlge/qlge_ethtool.c                    | 10 ++++++++--
 include/linux/ethtool.h                                | 11 +++++++++--
 net/ethtool/coalesce.c                                 | 10 +++++++---
 net/ethtool/ioctl.c                                    | 15 ++++++++++++---
 85 files changed, 576 insertions(+), 202 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index 823f6831e7ea..a09ca21f7dff 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -72,7 +72,9 @@ static void ipoib_get_drvinfo(struct net_device *netdev,
 }
 
 static int ipoib_get_coalesce(struct net_device *dev,
-			      struct ethtool_coalesce *coal)
+			      struct ethtool_coalesce *coal,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
 
@@ -83,7 +85,9 @@ static int ipoib_get_coalesce(struct net_device *dev,
 }
 
 static int ipoib_set_coalesce(struct net_device *dev,
-			      struct ethtool_coalesce *coal)
+			      struct ethtool_coalesce *coal,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
 	int ret;
diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c
index 27dae632efcb..13e745cf3781 100644
--- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c
+++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c
@@ -357,7 +357,9 @@ static int ena_get_link_ksettings(struct net_device *netdev,
 }
 
 static int ena_get_coalesce(struct net_device *net_dev,
-			    struct ethtool_coalesce *coalesce)
+			    struct ethtool_coalesce *coalesce,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct ena_adapter *adapter = netdev_priv(net_dev);
 	struct ena_com_dev *ena_dev = adapter->ena_dev;
@@ -402,7 +404,9 @@ static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter *
 }
 
 static int ena_set_coalesce(struct net_device *net_dev,
-			    struct ethtool_coalesce *coalesce)
+			    struct ethtool_coalesce *coalesce,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct ena_adapter *adapter = netdev_priv(net_dev);
 	struct ena_com_dev *ena_dev = adapter->ena_dev;
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index 61f39a0e04f9..bafc51c34e0b 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -428,7 +428,9 @@ static void xgbe_set_msglevel(struct net_device *netdev, u32 msglevel)
 }
 
 static int xgbe_get_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *ec)
+			     struct ethtool_coalesce *ec,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 
@@ -443,7 +445,9 @@ static int xgbe_get_coalesce(struct net_device *netdev,
 }
 
 static int xgbe_set_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *ec)
+			     struct ethtool_coalesce *ec,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 	struct xgbe_hw_if *hw_if = &pdata->hw_if;
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
index de2a9348bc3f..a9ef0544e30f 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
@@ -547,7 +547,9 @@ static int aq_ethtool_set_rxnfc(struct net_device *ndev,
 }
 
 static int aq_ethtool_get_coalesce(struct net_device *ndev,
-				   struct ethtool_coalesce *coal)
+				   struct ethtool_coalesce *coal,
+				   struct kernel_ethtool_coalesce *kernel_coal,
+				   struct netlink_ext_ack *extack)
 {
 	struct aq_nic_s *aq_nic = netdev_priv(ndev);
 	struct aq_nic_cfg_s *cfg;
@@ -571,7 +573,9 @@ static int aq_ethtool_get_coalesce(struct net_device *ndev,
 }
 
 static int aq_ethtool_set_coalesce(struct net_device *ndev,
-				   struct ethtool_coalesce *coal)
+				   struct ethtool_coalesce *coal,
+				   struct kernel_ethtool_coalesce *kernel_coal,
+				   struct netlink_ext_ack *extack)
 {
 	struct aq_nic_s *aq_nic = netdev_priv(ndev);
 	struct aq_nic_cfg_s *cfg;
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index d9f0f0df8f7b..7fa1b695400d 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -607,7 +607,9 @@ static void bcm_sysport_set_tx_coalesce(struct bcm_sysport_tx_ring *ring,
 }
 
 static int bcm_sysport_get_coalesce(struct net_device *dev,
-				    struct ethtool_coalesce *ec)
+				    struct ethtool_coalesce *ec,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
 	u32 reg;
@@ -627,7 +629,9 @@ static int bcm_sysport_get_coalesce(struct net_device *dev,
 }
 
 static int bcm_sysport_set_coalesce(struct net_device *dev,
-				    struct ethtool_coalesce *ec)
+				    struct ethtool_coalesce *ec,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
 	struct dim_cq_moder moder;
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index dc3ee576360b..a705e2615307 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -7242,8 +7242,10 @@ bnx2_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
 	return rc;
 }
 
-static int
-bnx2_get_coalesce(struct net_device *dev, struct ethtool_coalesce *coal)
+static int bnx2_get_coalesce(struct net_device *dev,
+			     struct ethtool_coalesce *coal,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct bnx2 *bp = netdev_priv(dev);
 
@@ -7264,8 +7266,10 @@ bnx2_get_coalesce(struct net_device *dev, struct ethtool_coalesce *coal)
 	return 0;
 }
 
-static int
-bnx2_set_coalesce(struct net_device *dev, struct ethtool_coalesce *coal)
+static int bnx2_set_coalesce(struct net_device *dev,
+			     struct ethtool_coalesce *coal,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct bnx2 *bp = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 32245bbe88a8..472a3a478038 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -1878,7 +1878,9 @@ static int bnx2x_set_eeprom(struct net_device *dev,
 }
 
 static int bnx2x_get_coalesce(struct net_device *dev,
-			      struct ethtool_coalesce *coal)
+			      struct ethtool_coalesce *coal,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
@@ -1891,7 +1893,9 @@ static int bnx2x_get_coalesce(struct net_device *dev,
 }
 
 static int bnx2x_set_coalesce(struct net_device *dev,
-			      struct ethtool_coalesce *coal)
+			      struct ethtool_coalesce *coal,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 364460ef8db7..9f8c72d95228 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -49,7 +49,9 @@ static void bnxt_set_msglevel(struct net_device *dev, u32 value)
 }
 
 static int bnxt_get_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *coal)
+			     struct ethtool_coalesce *coal,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct bnxt *bp = netdev_priv(dev);
 	struct bnxt_coal *hw_coal;
@@ -79,7 +81,9 @@ static int bnxt_get_coalesce(struct net_device *dev,
 }
 
 static int bnxt_set_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *coal)
+			     struct ethtool_coalesce *coal,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct bnxt *bp = netdev_priv(dev);
 	bool update_stats = false;
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 8507198992df..23c7595d2a1d 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -828,7 +828,9 @@ static void bcmgenet_set_msglevel(struct net_device *dev, u32 level)
 }
 
 static int bcmgenet_get_coalesce(struct net_device *dev,
-				 struct ethtool_coalesce *ec)
+				 struct ethtool_coalesce *ec,
+				 struct kernel_ethtool_coalesce *kernel_coal,
+				 struct netlink_ext_ack *extack)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
 	struct bcmgenet_rx_ring *ring;
@@ -890,7 +892,9 @@ static void bcmgenet_set_ring_rx_coalesce(struct bcmgenet_rx_ring *ring,
 }
 
 static int bcmgenet_set_coalesce(struct net_device *dev,
-				 struct ethtool_coalesce *ec)
+				 struct ethtool_coalesce *ec,
+				 struct kernel_ethtool_coalesce *kernel_coal,
+				 struct netlink_ext_ack *extack)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
 	unsigned int i;
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index eefb070a68d2..8a238e349e02 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -14037,7 +14037,10 @@ static int tg3_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return -EOPNOTSUPP;
 }
 
-static int tg3_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+static int tg3_get_coalesce(struct net_device *dev,
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct tg3 *tp = netdev_priv(dev);
 
@@ -14045,7 +14048,10 @@ static int tg3_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
 	return 0;
 }
 
-static int tg3_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+static int tg3_set_coalesce(struct net_device *dev,
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct tg3 *tp = netdev_priv(dev);
 	u32 max_rxcoal_tick_int = 0, max_txcoal_tick_int = 0;
diff --git a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c
index 265c2fa6bbe0..391b85f25141 100644
--- a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c
+++ b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c
@@ -307,8 +307,10 @@ bnad_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wolinfo)
 	wolinfo->wolopts = 0;
 }
 
-static int
-bnad_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *coalesce)
+static int bnad_get_coalesce(struct net_device *netdev,
+			     struct ethtool_coalesce *coalesce,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct bnad *bnad = netdev_priv(netdev);
 	unsigned long flags;
@@ -328,8 +330,10 @@ bnad_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *coalesce)
 	return 0;
 }
 
-static int
-bnad_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *coalesce)
+static int bnad_set_coalesce(struct net_device *netdev,
+			     struct ethtool_coalesce *coalesce,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct bnad *bnad = netdev_priv(netdev);
 	unsigned long flags;
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
index 66f2c553370c..2b9747867d4c 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
@@ -2108,7 +2108,9 @@ static int octnet_set_intrmod_cfg(struct lio *lio,
 }
 
 static int lio_get_intr_coalesce(struct net_device *netdev,
-				 struct ethtool_coalesce *intr_coal)
+				 struct ethtool_coalesce *intr_coal,
+				 struct kernel_ethtool_coalesce *kernel_coal,
+				 struct netlink_ext_ack *extack)
 {
 	struct lio *lio = GET_LIO(netdev);
 	struct octeon_device *oct = lio->oct_dev;
@@ -2412,7 +2414,9 @@ oct_cfg_tx_intrcnt(struct lio *lio,
 }
 
 static int lio_set_intr_coalesce(struct net_device *netdev,
-				 struct ethtool_coalesce *intr_coal)
+				 struct ethtool_coalesce *intr_coal,
+				 struct kernel_ethtool_coalesce *kernel_coal,
+				 struct netlink_ext_ack *extack)
 {
 	struct lio *lio = GET_LIO(netdev);
 	int ret;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
index 2f218fbfed06..7f2882109b16 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
@@ -456,7 +456,9 @@ static void nicvf_get_regs(struct net_device *dev,
 }
 
 static int nicvf_get_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *cmd)
+			      struct ethtool_coalesce *cmd,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct nicvf *nic = netdev_priv(netdev);
 
diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c
index e7575d41f4f5..73c016166f06 100644
--- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c
+++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c
@@ -748,7 +748,9 @@ static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e)
 	return 0;
 }
 
-static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
+static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c,
+			struct kernel_ethtool_coalesce *kernel_coal,
+			struct netlink_ext_ack *extack)
 {
 	struct adapter *adapter = dev->ml_priv;
 
@@ -759,7 +761,9 @@ static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
 	return 0;
 }
 
-static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
+static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c,
+			struct kernel_ethtool_coalesce *kernel_coal,
+			struct netlink_ext_ack *extack)
 {
 	struct adapter *adapter = dev->ml_priv;
 
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
index 538f737af4fa..38e47703f9ab 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
@@ -1996,7 +1996,9 @@ static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e)
 	return 0;
 }
 
-static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
+static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c,
+			struct kernel_ethtool_coalesce *kernel_coal,
+			struct netlink_ext_ack *extack)
 {
 	struct port_info *pi = netdev_priv(dev);
 	struct adapter *adapter = pi->adapter;
@@ -2017,7 +2019,9 @@ static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
 	return 0;
 }
 
-static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
+static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c,
+			struct kernel_ethtool_coalesce *kernel_coal,
+			struct netlink_ext_ack *extack)
 {
 	struct port_info *pi = netdev_priv(dev);
 	struct adapter *adapter = pi->adapter;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
index 83ed10ac8660..5903bdb78916 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
@@ -1147,7 +1147,9 @@ static int set_dbqtimer_tickval(struct net_device *dev,
 }
 
 static int set_coalesce(struct net_device *dev,
-			struct ethtool_coalesce *coalesce)
+			struct ethtool_coalesce *coalesce,
+			struct kernel_ethtool_coalesce *kernel_coal,
+			struct netlink_ext_ack *extack)
 {
 	int ret;
 
@@ -1163,7 +1165,9 @@ static int set_coalesce(struct net_device *dev,
 				    coalesce->tx_coalesce_usecs);
 }
 
-static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
+static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c,
+			struct kernel_ethtool_coalesce *kernel_coal,
+			struct netlink_ext_ack *extack)
 {
 	const struct port_info *pi = netdev_priv(dev);
 	const struct adapter *adap = pi->adapter;
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index e5882df551ec..49b76fd47daa 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -1647,7 +1647,9 @@ static int cxgb4vf_set_ringparam(struct net_device *dev,
  * interrupt holdoff timer to be read on all of the device's Queue Sets.
  */
 static int cxgb4vf_get_coalesce(struct net_device *dev,
-				struct ethtool_coalesce *coalesce)
+				struct ethtool_coalesce *coalesce,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	const struct port_info *pi = netdev_priv(dev);
 	const struct adapter *adapter = pi->adapter;
@@ -1667,7 +1669,9 @@ static int cxgb4vf_get_coalesce(struct net_device *dev,
  * the interrupt holdoff timer on any of the device's Queue Sets.
  */
 static int cxgb4vf_set_coalesce(struct net_device *dev,
-				struct ethtool_coalesce *coalesce)
+				struct ethtool_coalesce *coalesce,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	const struct port_info *pi = netdev_priv(dev);
 	struct adapter *adapter = pi->adapter;
diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
index 1a9803f2073e..12ffc14fbecd 100644
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -298,7 +298,9 @@ static void enic_set_msglevel(struct net_device *netdev, u32 value)
 }
 
 static int enic_get_coalesce(struct net_device *netdev,
-	struct ethtool_coalesce *ecmd)
+			     struct ethtool_coalesce *ecmd,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct enic *enic = netdev_priv(netdev);
 	struct enic_rx_coal *rxcoal = &enic->rx_coalesce_setting;
@@ -343,7 +345,9 @@ static int enic_coalesce_valid(struct enic *enic,
 }
 
 static int enic_set_coalesce(struct net_device *netdev,
-	struct ethtool_coalesce *ecmd)
+			     struct ethtool_coalesce *ecmd,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct enic *enic = netdev_priv(netdev);
 	u32 tx_coalesce_usecs;
diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c
index c2ebb3388789..6e745ca4c433 100644
--- a/drivers/net/ethernet/cortina/gemini.c
+++ b/drivers/net/ethernet/cortina/gemini.c
@@ -2144,7 +2144,9 @@ static int gmac_set_ringparam(struct net_device *netdev,
 }
 
 static int gmac_get_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *ecmd)
+			     struct ethtool_coalesce *ecmd,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct gemini_ethernet_port *port = netdev_priv(netdev);
 
@@ -2156,7 +2158,9 @@ static int gmac_get_coalesce(struct net_device *netdev,
 }
 
 static int gmac_set_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *ecmd)
+			     struct ethtool_coalesce *ecmd,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct gemini_ethernet_port *port = netdev_priv(netdev);
 
diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c
index 99cc1c46fb30..f9955308b93d 100644
--- a/drivers/net/ethernet/emulex/benet/be_ethtool.c
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -315,7 +315,9 @@ static int be_read_dump_data(struct be_adapter *adapter, u32 dump_len,
 }
 
 static int be_get_coalesce(struct net_device *netdev,
-			   struct ethtool_coalesce *et)
+			   struct ethtool_coalesce *et,
+			   struct kernel_ethtool_coalesce *kernel_coal,
+			   struct netlink_ext_ack *extack)
 {
 	struct be_adapter *adapter = netdev_priv(netdev);
 	struct be_aic_obj *aic = &adapter->aic_obj[0];
@@ -338,7 +340,9 @@ static int be_get_coalesce(struct net_device *netdev,
  * eqd cmd is issued in the worker thread.
  */
 static int be_set_coalesce(struct net_device *netdev,
-			   struct ethtool_coalesce *et)
+			   struct ethtool_coalesce *et,
+			   struct kernel_ethtool_coalesce *kernel_coal,
+			   struct netlink_ext_ack *extack)
 {
 	struct be_adapter *adapter = netdev_priv(netdev);
 	struct be_aic_obj *aic = &adapter->aic_obj[0];
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
index 1268996b7030..763d2c7b5fb1 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
@@ -513,7 +513,9 @@ static int dpaa_get_ts_info(struct net_device *net_dev,
 }
 
 static int dpaa_get_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *c)
+			     struct ethtool_coalesce *c,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct qman_portal *portal;
 	u32 period;
@@ -530,7 +532,9 @@ static int dpaa_get_coalesce(struct net_device *dev,
 }
 
 static int dpaa_set_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *c)
+			     struct ethtool_coalesce *c,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	const cpumask_t *cpus = qman_affine_cpus();
 	bool needs_revert[NR_CPUS] = {false};
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c
index ebccaf02411c..9690e36e9e85 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c
@@ -585,7 +585,9 @@ static void enetc_get_ringparam(struct net_device *ndev,
 }
 
 static int enetc_get_coalesce(struct net_device *ndev,
-			      struct ethtool_coalesce *ic)
+			      struct ethtool_coalesce *ic,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct enetc_ndev_priv *priv = netdev_priv(ndev);
 	struct enetc_int_vector *v = priv->int_vector[0];
@@ -602,7 +604,9 @@ static int enetc_get_coalesce(struct net_device *ndev,
 }
 
 static int enetc_set_coalesce(struct net_device *ndev,
-			      struct ethtool_coalesce *ic)
+			      struct ethtool_coalesce *ic,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct enetc_ndev_priv *priv = netdev_priv(ndev);
 	u32 rx_ictt, tx_ictt;
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 83ab34b1d735..80bd5c629fa0 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2654,8 +2654,10 @@ static void fec_enet_itr_coal_set(struct net_device *ndev)
 	}
 }
 
-static int
-fec_enet_get_coalesce(struct net_device *ndev, struct ethtool_coalesce *ec)
+static int fec_enet_get_coalesce(struct net_device *ndev,
+				 struct ethtool_coalesce *ec,
+				 struct kernel_ethtool_coalesce *kernel_coal,
+				 struct netlink_ext_ack *extack)
 {
 	struct fec_enet_private *fep = netdev_priv(ndev);
 
@@ -2671,8 +2673,10 @@ fec_enet_get_coalesce(struct net_device *ndev, struct ethtool_coalesce *ec)
 	return 0;
 }
 
-static int
-fec_enet_set_coalesce(struct net_device *ndev, struct ethtool_coalesce *ec)
+static int fec_enet_set_coalesce(struct net_device *ndev,
+				 struct ethtool_coalesce *ec,
+				 struct kernel_ethtool_coalesce *kernel_coal,
+				 struct netlink_ext_ack *extack)
 {
 	struct fec_enet_private *fep = netdev_priv(ndev);
 	struct device *dev = &fep->pdev->dev;
@@ -2724,7 +2728,7 @@ static void fec_enet_itr_coal_init(struct net_device *ndev)
 	ec.tx_coalesce_usecs = FEC_ITR_ICTT_DEFAULT;
 	ec.tx_max_coalesced_frames = FEC_ITR_ICFT_DEFAULT;
 
-	fec_enet_set_coalesce(ndev, &ec);
+	fec_enet_set_coalesce(ndev, &ec, NULL, NULL);
 }
 
 static int fec_enet_get_tunable(struct net_device *netdev,
diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c
index cc7d4f93da54..7b32ed29bf4c 100644
--- a/drivers/net/ethernet/freescale/gianfar_ethtool.c
+++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c
@@ -243,7 +243,9 @@ static unsigned int gfar_ticks2usecs(struct gfar_private *priv,
 /* Get the coalescing parameters, and put them in the cvals
  * structure.  */
 static int gfar_gcoalesce(struct net_device *dev,
-			  struct ethtool_coalesce *cvals)
+			  struct ethtool_coalesce *cvals,
+			  struct kernel_ethtool_coalesce *kernel_coal,
+			  struct netlink_ext_ack *extack)
 {
 	struct gfar_private *priv = netdev_priv(dev);
 	struct gfar_priv_rx_q *rx_queue = NULL;
@@ -280,7 +282,9 @@ static int gfar_gcoalesce(struct net_device *dev,
  * in order for coalescing to be active
  */
 static int gfar_scoalesce(struct net_device *dev,
-			  struct ethtool_coalesce *cvals)
+			  struct ethtool_coalesce *cvals,
+			  struct kernel_ethtool_coalesce *kernel_coal,
+			  struct netlink_ext_ack *extack)
 {
 	struct gfar_private *priv = netdev_priv(dev);
 	int i, err = 0;
diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
index e53512f6878a..37b605fed32c 100644
--- a/drivers/net/ethernet/hisilicon/hip04_eth.c
+++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
@@ -796,7 +796,9 @@ static void hip04_tx_timeout_task(struct work_struct *work)
 }
 
 static int hip04_get_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct hip04_priv *priv = netdev_priv(netdev);
 
@@ -807,7 +809,9 @@ static int hip04_get_coalesce(struct net_device *netdev,
 }
 
 static int hip04_set_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct hip04_priv *priv = netdev_priv(netdev);
 
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
index 7e62dcff2426..ab7390225942 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
@@ -730,11 +730,15 @@ static int hns_set_pauseparam(struct net_device *net_dev,
  * hns_get_coalesce - get coalesce info.
  * @net_dev: net device
  * @ec: coalesce info.
+ * @kernel_coal: ethtool CQE mode setting structure
+ * @extack: extack for reporting error messages
  *
  * Return 0 on success, negative on failure.
  */
 static int hns_get_coalesce(struct net_device *net_dev,
-			    struct ethtool_coalesce *ec)
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct hns_nic_priv *priv = netdev_priv(net_dev);
 	struct hnae_ae_ops *ops;
@@ -774,11 +778,15 @@ static int hns_get_coalesce(struct net_device *net_dev,
  * hns_set_coalesce - set coalesce info.
  * @net_dev: net device
  * @ec: coalesce info.
+ * @kernel_coal: ethtool CQE mode setting structure
+ * @extack: extack for reporting error messages
  *
  * Return 0 on success, negative on failure.
  */
 static int hns_set_coalesce(struct net_device *net_dev,
-			    struct ethtool_coalesce *ec)
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct hns_nic_priv *priv = netdev_priv(net_dev);
 	struct hnae_ae_ops *ops;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 835105015763..049be076fd18 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -1179,7 +1179,9 @@ static void hns3_get_channels(struct net_device *netdev,
 }
 
 static int hns3_get_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *cmd)
+			     struct ethtool_coalesce *cmd,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct hns3_nic_priv *priv = netdev_priv(netdev);
 	struct hns3_enet_coalesce *tx_coal = &priv->tx_coal;
@@ -1361,7 +1363,9 @@ static void hns3_set_coalesce_per_queue(struct net_device *netdev,
 }
 
 static int hns3_set_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *cmd)
+			     struct ethtool_coalesce *cmd,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct hnae3_handle *h = hns3_get_handle(netdev);
 	struct hns3_nic_priv *priv = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
index 162d3c330dec..b431c300ef1b 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
@@ -795,13 +795,17 @@ static int __hinic_set_coalesce(struct net_device *netdev,
 }
 
 static int hinic_get_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *coal)
+			      struct ethtool_coalesce *coal,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	return __hinic_get_coalesce(netdev, coal, COALESCE_ALL_QUEUE);
 }
 
 static int hinic_set_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *coal)
+			      struct ethtool_coalesce *coal,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	return __hinic_set_coalesce(netdev, coal, COALESCE_ALL_QUEUE);
 }
diff --git a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c
index 3c51ee94fa00..0a57172dfcbc 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c
@@ -1739,7 +1739,9 @@ static int e1000_set_phys_id(struct net_device *netdev,
 }
 
 static int e1000_get_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 
@@ -1755,7 +1757,9 @@ static int e1000_get_coalesce(struct net_device *netdev,
 }
 
 static int e1000_set_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c b/drivers/net/ethernet/intel/e1000e/ethtool.c
index 7256b43b7a65..8515e00d1b40 100644
--- a/drivers/net/ethernet/intel/e1000e/ethtool.c
+++ b/drivers/net/ethernet/intel/e1000e/ethtool.c
@@ -1993,7 +1993,9 @@ static int e1000_set_phys_id(struct net_device *netdev,
 }
 
 static int e1000_get_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 
@@ -2006,7 +2008,9 @@ static int e1000_get_coalesce(struct net_device *netdev,
 }
 
 static int e1000_set_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index 66776ba7bfb6..0d37f011d0ce 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -632,7 +632,9 @@ clear_reset:
 }
 
 static int fm10k_get_coalesce(struct net_device *dev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct fm10k_intfc *interface = netdev_priv(dev);
 
@@ -646,7 +648,9 @@ static int fm10k_get_coalesce(struct net_device *dev,
 }
 
 static int fm10k_set_coalesce(struct net_device *dev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct fm10k_intfc *interface = netdev_priv(dev);
 	u16 tx_itr, rx_itr;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 2c9e4eeb7270..513ba6974355 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -2812,13 +2812,17 @@ static int __i40e_get_coalesce(struct net_device *netdev,
  * i40e_get_coalesce - get a netdev's coalesce settings
  * @netdev: the netdev to check
  * @ec: ethtool coalesce data structure
+ * @kernel_coal: ethtool CQE mode setting structure
+ * @extack: extack for reporting error messages
  *
  * Gets the coalesce settings for a particular netdev. Note that if user has
  * modified per-queue settings, this only guarantees to represent queue 0. See
  * __i40e_get_coalesce for more details.
  **/
 static int i40e_get_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *ec)
+			     struct ethtool_coalesce *ec,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	return __i40e_get_coalesce(netdev, ec, -1);
 }
@@ -2986,11 +2990,15 @@ static int __i40e_set_coalesce(struct net_device *netdev,
  * i40e_set_coalesce - set coalesce settings for every queue on the netdev
  * @netdev: the netdev to change
  * @ec: ethtool coalesce settings
+ * @kernel_coal: ethtool CQE mode setting structure
+ * @extack: extack for reporting error messages
  *
  * This will set each queue to the same coalesce settings.
  **/
 static int i40e_set_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *ec)
+			     struct ethtool_coalesce *ec,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	return __i40e_set_coalesce(netdev, ec, -1);
 }
diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
index edbeb27213f8..5a359a0a20ec 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
@@ -685,6 +685,8 @@ static int __iavf_get_coalesce(struct net_device *netdev,
  * iavf_get_coalesce - Get interrupt coalescing settings
  * @netdev: network interface device structure
  * @ec: ethtool coalesce structure
+ * @kernel_coal: ethtool CQE mode setting structure
+ * @extack: extack for reporting error messages
  *
  * Returns current coalescing settings. This is referred to elsewhere in the
  * driver as Interrupt Throttle Rate, as this is how the hardware describes
@@ -692,7 +694,9 @@ static int __iavf_get_coalesce(struct net_device *netdev,
  * only represents the settings of queue 0.
  **/
 static int iavf_get_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *ec)
+			     struct ethtool_coalesce *ec,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	return __iavf_get_coalesce(netdev, ec, -1);
 }
@@ -804,11 +808,15 @@ static int __iavf_set_coalesce(struct net_device *netdev,
  * iavf_set_coalesce - Set interrupt coalescing settings
  * @netdev: network interface device structure
  * @ec: ethtool coalesce structure
+ * @kernel_coal: ethtool CQE mode setting structure
+ * @extack: extack for reporting error messages
  *
  * Change current coalescing settings for every queue.
  **/
 static int iavf_set_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *ec)
+			     struct ethtool_coalesce *ec,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	return __iavf_set_coalesce(netdev, ec, -1);
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index d95a5daca114..c451cf401e63 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -3568,8 +3568,10 @@ __ice_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec,
 	return 0;
 }
 
-static int
-ice_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec)
+static int ice_get_coalesce(struct net_device *netdev,
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	return __ice_get_coalesce(netdev, ec, -1);
 }
@@ -3787,8 +3789,10 @@ set_complete:
 	return 0;
 }
 
-static int
-ice_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec)
+static int ice_set_coalesce(struct net_device *netdev,
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	return __ice_set_coalesce(netdev, ec, -1);
 }
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 17f5c003c3df..fb1029352c3e 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2182,7 +2182,9 @@ static int igb_set_phys_id(struct net_device *netdev,
 }
 
 static int igb_set_coalesce(struct net_device *netdev,
-			    struct ethtool_coalesce *ec)
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	int i;
@@ -2238,7 +2240,9 @@ static int igb_set_coalesce(struct net_device *netdev,
 }
 
 static int igb_get_coalesce(struct net_device *netdev,
-			    struct ethtool_coalesce *ec)
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 
diff --git a/drivers/net/ethernet/intel/igbvf/ethtool.c b/drivers/net/ethernet/intel/igbvf/ethtool.c
index f4835eb62fee..06e5bd646a0e 100644
--- a/drivers/net/ethernet/intel/igbvf/ethtool.c
+++ b/drivers/net/ethernet/intel/igbvf/ethtool.c
@@ -314,7 +314,9 @@ static int igbvf_set_wol(struct net_device *netdev,
 }
 
 static int igbvf_get_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct igbvf_adapter *adapter = netdev_priv(netdev);
 
@@ -327,7 +329,9 @@ static int igbvf_get_coalesce(struct net_device *netdev,
 }
 
 static int igbvf_set_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct igbvf_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
index d3e84416248e..e0a76ac1bbbc 100644
--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
+++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
@@ -862,7 +862,9 @@ static void igc_ethtool_get_stats(struct net_device *netdev,
 }
 
 static int igc_ethtool_get_coalesce(struct net_device *netdev,
-				    struct ethtool_coalesce *ec)
+				    struct ethtool_coalesce *ec,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct igc_adapter *adapter = netdev_priv(netdev);
 
@@ -882,7 +884,9 @@ static int igc_ethtool_get_coalesce(struct net_device *netdev,
 }
 
 static int igc_ethtool_set_coalesce(struct net_device *netdev,
-				    struct ethtool_coalesce *ec)
+				    struct ethtool_coalesce *ec,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct igc_adapter *adapter = netdev_priv(netdev);
 	int i;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 4ceaca0f6ce3..fc26e4ddeb0d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -2358,7 +2358,9 @@ static int ixgbe_set_phys_id(struct net_device *netdev,
 }
 
 static int ixgbe_get_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 
@@ -2412,7 +2414,9 @@ static bool ixgbe_update_rsc(struct ixgbe_adapter *adapter)
 }
 
 static int ixgbe_set_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	struct ixgbe_q_vector *q_vector;
diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
index e49fb1cd9a99..8380f905e708 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
@@ -787,7 +787,9 @@ static int ixgbevf_nway_reset(struct net_device *netdev)
 }
 
 static int ixgbevf_get_coalesce(struct net_device *netdev,
-				struct ethtool_coalesce *ec)
+				struct ethtool_coalesce *ec,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct ixgbevf_adapter *adapter = netdev_priv(netdev);
 
@@ -811,7 +813,9 @@ static int ixgbevf_get_coalesce(struct net_device *netdev,
 }
 
 static int ixgbevf_set_coalesce(struct net_device *netdev,
-				struct ethtool_coalesce *ec)
+				struct ethtool_coalesce *ec,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct ixgbevf_adapter *adapter = netdev_priv(netdev);
 	struct ixgbevf_q_vector *q_vector;
diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c
index 438c5602fbc5..1bdc4f23e1e5 100644
--- a/drivers/net/ethernet/jme.c
+++ b/drivers/net/ethernet/jme.c
@@ -2385,8 +2385,10 @@ jme_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void *p)
 	mdio_memcpy(jme, p32, JME_PHY_REG_NR);
 }
 
-static int
-jme_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ecmd)
+static int jme_get_coalesce(struct net_device *netdev,
+			    struct ethtool_coalesce *ecmd,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct jme_adapter *jme = netdev_priv(netdev);
 
@@ -2422,8 +2424,10 @@ jme_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ecmd)
 	return 0;
 }
 
-static int
-jme_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ecmd)
+static int jme_set_coalesce(struct net_device *netdev,
+			    struct ethtool_coalesce *ecmd,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct jme_adapter *jme = netdev_priv(netdev);
 	struct dynpcc_info *dpi = &(jme->dpi);
diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 6502c5c2ebca..28d5ad296646 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -1611,8 +1611,10 @@ static void mv643xx_eth_get_drvinfo(struct net_device *dev,
 	strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
 }
 
-static int
-mv643xx_eth_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+static int mv643xx_eth_get_coalesce(struct net_device *dev,
+				    struct ethtool_coalesce *ec,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct mv643xx_eth_private *mp = netdev_priv(dev);
 
@@ -1622,8 +1624,10 @@ mv643xx_eth_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
 	return 0;
 }
 
-static int
-mv643xx_eth_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+static int mv643xx_eth_set_coalesce(struct net_device *dev,
+				    struct ethtool_coalesce *ec,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct mv643xx_eth_private *mp = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 5d1007e1b5c9..0e6d40701862 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -4500,8 +4500,11 @@ static int mvneta_ethtool_nway_reset(struct net_device *dev)
 }
 
 /* Set interrupt coalescing for ethtools */
-static int mvneta_ethtool_set_coalesce(struct net_device *dev,
-				       struct ethtool_coalesce *c)
+static int
+mvneta_ethtool_set_coalesce(struct net_device *dev,
+			    struct ethtool_coalesce *c,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct mvneta_port *pp = netdev_priv(dev);
 	int queue;
@@ -4524,8 +4527,11 @@ static int mvneta_ethtool_set_coalesce(struct net_device *dev,
 }
 
 /* get coalescing for ethtools */
-static int mvneta_ethtool_get_coalesce(struct net_device *dev,
-				       struct ethtool_coalesce *c)
+static int
+mvneta_ethtool_get_coalesce(struct net_device *dev,
+			    struct ethtool_coalesce *c,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct mvneta_port *pp = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 744f58f41ecc..d5c92e43f89e 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -5367,8 +5367,11 @@ static int mvpp2_ethtool_nway_reset(struct net_device *dev)
 }
 
 /* Set interrupt coalescing for ethtools */
-static int mvpp2_ethtool_set_coalesce(struct net_device *dev,
-				      struct ethtool_coalesce *c)
+static int
+mvpp2_ethtool_set_coalesce(struct net_device *dev,
+			   struct ethtool_coalesce *c,
+			   struct kernel_ethtool_coalesce *kernel_coal,
+			   struct netlink_ext_ack *extack)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
 	int queue;
@@ -5400,8 +5403,11 @@ static int mvpp2_ethtool_set_coalesce(struct net_device *dev,
 }
 
 /* get coalescing for ethtools */
-static int mvpp2_ethtool_get_coalesce(struct net_device *dev,
-				      struct ethtool_coalesce *c)
+static int
+mvpp2_ethtool_get_coalesce(struct net_device *dev,
+			   struct ethtool_coalesce *c,
+			   struct kernel_ethtool_coalesce *kernel_coal,
+			   struct netlink_ext_ack *extack)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
index 0151d6d939d4..5ce087686a1f 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
@@ -412,7 +412,9 @@ static int otx2_set_ringparam(struct net_device *netdev,
 }
 
 static int otx2_get_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *cmd)
+			     struct ethtool_coalesce *cmd,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct otx2_nic *pfvf = netdev_priv(netdev);
 	struct otx2_hw *hw = &pfvf->hw;
@@ -426,7 +428,9 @@ static int otx2_get_coalesce(struct net_device *netdev,
 }
 
 static int otx2_set_coalesce(struct net_device *netdev,
-			     struct ethtool_coalesce *ec)
+			     struct ethtool_coalesce *ec,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct otx2_nic *pfvf = netdev_priv(netdev);
 	struct otx2_hw *hw = &pfvf->hw;
diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c
index 150c06ee3627..051dd3fb5b03 100644
--- a/drivers/net/ethernet/marvell/skge.c
+++ b/drivers/net/ethernet/marvell/skge.c
@@ -615,7 +615,9 @@ static inline u32 skge_usecs2clk(const struct skge_hw *hw, u32 usec)
 }
 
 static int skge_get_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *ecmd)
+			     struct ethtool_coalesce *ecmd,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct skge_port *skge = netdev_priv(dev);
 	struct skge_hw *hw = skge->hw;
@@ -639,7 +641,9 @@ static int skge_get_coalesce(struct net_device *dev,
 
 /* Note: interrupt timer is per board, but can turn on/off per port */
 static int skge_set_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *ecmd)
+			     struct ethtool_coalesce *ecmd,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct skge_port *skge = netdev_priv(dev);
 	struct skge_hw *hw = skge->hw;
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index dc9dd77d1ea0..e9fc74e54b22 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -4052,7 +4052,9 @@ static int sky2_set_pauseparam(struct net_device *dev,
 }
 
 static int sky2_get_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *ecmd)
+			     struct ethtool_coalesce *ecmd,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct sky2_port *sky2 = netdev_priv(dev);
 	struct sky2_hw *hw = sky2->hw;
@@ -4087,7 +4089,9 @@ static int sky2_get_coalesce(struct net_device *dev,
 
 /* Note: this affect both ports */
 static int sky2_set_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *ecmd)
+			     struct ethtool_coalesce *ecmd,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct sky2_port *sky2 = netdev_priv(dev);
 	struct sky2_hw *hw = sky2->hw;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 3616b77caa0a..ef518b1040f7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -998,7 +998,9 @@ mlx4_en_set_link_ksettings(struct net_device *dev,
 }
 
 static int mlx4_en_get_coalesce(struct net_device *dev,
-			      struct ethtool_coalesce *coal)
+				struct ethtool_coalesce *coal,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 
@@ -1020,7 +1022,9 @@ static int mlx4_en_get_coalesce(struct net_device *dev,
 }
 
 static int mlx4_en_set_coalesce(struct net_device *dev,
-			      struct ethtool_coalesce *coal)
+				struct ethtool_coalesce *coal,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 5696d3f1baaf..2cfd12953909 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -534,7 +534,9 @@ int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv,
 }
 
 static int mlx5e_get_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *coal)
+			      struct ethtool_coalesce *coal,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 
@@ -652,7 +654,9 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
 }
 
 static int mlx5e_set_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *coal)
+			      struct ethtool_coalesce *coal,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv    = netdev_priv(netdev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index eb83f27850c7..ae71a17fdb27 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -251,7 +251,9 @@ static int mlx5e_rep_set_channels(struct net_device *dev,
 }
 
 static int mlx5e_rep_get_coalesce(struct net_device *netdev,
-				  struct ethtool_coalesce *coal)
+				  struct ethtool_coalesce *coal,
+				  struct kernel_ethtool_coalesce *kernel_coal,
+				  struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 
@@ -259,7 +261,9 @@ static int mlx5e_rep_get_coalesce(struct net_device *netdev,
 }
 
 static int mlx5e_rep_set_coalesce(struct net_device *netdev,
-				  struct ethtool_coalesce *coal)
+				  struct ethtool_coalesce *coal,
+				  struct kernel_ethtool_coalesce *kernel_coal,
+				  struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c
index 0e487ec57d5c..0c8594c7df21 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c
@@ -99,7 +99,9 @@ static void mlx5i_get_channels(struct net_device *dev,
 }
 
 static int mlx5i_set_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *coal)
+			      struct ethtool_coalesce *coal,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
 
@@ -107,7 +109,9 @@ static int mlx5i_set_coalesce(struct net_device *netdev,
 }
 
 static int mlx5i_get_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *coal)
+			      struct ethtool_coalesce *coal,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
 
diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index 7359a8b768e9..c1a75b08ced7 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -1652,8 +1652,10 @@ myri10ge_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *info)
 	strlcpy(info->bus_info, pci_name(mgp->pdev), sizeof(info->bus_info));
 }
 
-static int
-myri10ge_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *coal)
+static int myri10ge_get_coalesce(struct net_device *netdev,
+				 struct ethtool_coalesce *coal,
+				 struct kernel_ethtool_coalesce *kernel_coal,
+				 struct netlink_ext_ack *extack)
 {
 	struct myri10ge_priv *mgp = netdev_priv(netdev);
 
@@ -1661,8 +1663,10 @@ myri10ge_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *coal)
 	return 0;
 }
 
-static int
-myri10ge_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *coal)
+static int myri10ge_set_coalesce(struct net_device *netdev,
+				 struct ethtool_coalesce *coal,
+				 struct kernel_ethtool_coalesce *kernel_coal,
+				 struct netlink_ext_ack *extack)
 {
 	struct myri10ge_priv *mgp = netdev_priv(netdev);
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index 0bf2ff5717bc..0685ece1f155 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -1078,7 +1078,9 @@ static void nfp_net_get_regs(struct net_device *netdev,
 }
 
 static int nfp_net_get_coalesce(struct net_device *netdev,
-				struct ethtool_coalesce *ec)
+				struct ethtool_coalesce *ec,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct nfp_net *nn = netdev_priv(netdev);
 
@@ -1330,7 +1332,9 @@ exit_close_nsp:
 }
 
 static int nfp_net_set_coalesce(struct net_device *netdev,
-				struct ethtool_coalesce *ec)
+				struct ethtool_coalesce *ec,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct nfp_net *nn = netdev_priv(netdev);
 	unsigned int factor;
diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c
index 2d097dcb7bda..36fe2c0f31ff 100644
--- a/drivers/net/ethernet/ni/nixge.c
+++ b/drivers/net/ethernet/ni/nixge.c
@@ -993,8 +993,11 @@ static void nixge_ethtools_get_drvinfo(struct net_device *ndev,
 	strlcpy(ed->bus_info, "platform", sizeof(ed->bus_info));
 }
 
-static int nixge_ethtools_get_coalesce(struct net_device *ndev,
-				       struct ethtool_coalesce *ecoalesce)
+static int
+nixge_ethtools_get_coalesce(struct net_device *ndev,
+			    struct ethtool_coalesce *ecoalesce,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct nixge_priv *priv = netdev_priv(ndev);
 	u32 regval = 0;
@@ -1008,8 +1011,11 @@ static int nixge_ethtools_get_coalesce(struct net_device *ndev,
 	return 0;
 }
 
-static int nixge_ethtools_set_coalesce(struct net_device *ndev,
-				       struct ethtool_coalesce *ecoalesce)
+static int
+nixge_ethtools_set_coalesce(struct net_device *ndev,
+			    struct ethtool_coalesce *ecoalesce,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct nixge_priv *priv = netdev_priv(ndev);
 
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
index adc9fdb03e86..e91b4874a57f 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
@@ -420,7 +420,9 @@ static int ionic_set_fecparam(struct net_device *netdev,
 }
 
 static int ionic_get_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *coalesce)
+			      struct ethtool_coalesce *coalesce,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct ionic_lif *lif = netdev_priv(netdev);
 
@@ -438,7 +440,9 @@ static int ionic_get_coalesce(struct net_device *netdev,
 }
 
 static int ionic_set_coalesce(struct net_device *netdev,
-			      struct ethtool_coalesce *coalesce)
+			      struct ethtool_coalesce *coalesce,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct ionic_lif *lif = netdev_priv(netdev);
 	struct ionic_identity *ident;
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c
index dd22cb056d03..a075643f5826 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c
@@ -731,7 +731,9 @@ netxen_nic_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
  * firmware coalescing to default.
  */
 static int netxen_set_intr_coalesce(struct net_device *netdev,
-			struct ethtool_coalesce *ethcoal)
+				    struct ethtool_coalesce *ethcoal,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct netxen_adapter *adapter = netdev_priv(netdev);
 
@@ -775,7 +777,9 @@ static int netxen_set_intr_coalesce(struct net_device *netdev,
 }
 
 static int netxen_get_intr_coalesce(struct net_device *netdev,
-			struct ethtool_coalesce *ethcoal)
+				    struct ethtool_coalesce *ethcoal,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct netxen_adapter *adapter = netdev_priv(netdev);
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 66c69f0f9af1..f90dcfe9ee68 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -580,7 +580,9 @@ int qede_add_tc_flower_fltr(struct qede_dev *edev, __be16 proto,
 			    struct flow_cls_offload *f);
 
 void qede_forced_speed_maps_init(void);
-int qede_set_coalesce(struct net_device *dev, struct ethtool_coalesce *coal);
+int qede_set_coalesce(struct net_device *dev, struct ethtool_coalesce *coal,
+		      struct kernel_ethtool_coalesce *kernel_coal,
+		      struct netlink_ext_ack *extack);
 int qede_set_per_coalesce(struct net_device *dev, u32 queue,
 			  struct ethtool_coalesce *coal);
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 9c6aa6859646..8284c4c1528f 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -760,7 +760,9 @@ static int qede_flash_device(struct net_device *dev,
 }
 
 static int qede_get_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *coal)
+			     struct ethtool_coalesce *coal,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	void *rx_handle = NULL, *tx_handle = NULL;
 	struct qede_dev *edev = netdev_priv(dev);
@@ -819,7 +821,9 @@ out:
 	return rc;
 }
 
-int qede_set_coalesce(struct net_device *dev, struct ethtool_coalesce *coal)
+int qede_set_coalesce(struct net_device *dev, struct ethtool_coalesce *coal,
+		      struct kernel_ethtool_coalesce *kernel_coal,
+		      struct netlink_ext_ack *extack)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qede_fastpath *fp;
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c
index f6b6651decf3..fc364b4ab6eb 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c
@@ -1527,7 +1527,9 @@ qlcnic_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
  * firmware coalescing to default.
  */
 static int qlcnic_set_intr_coalesce(struct net_device *netdev,
-			struct ethtool_coalesce *ethcoal)
+				    struct ethtool_coalesce *ethcoal,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
 	int err;
@@ -1551,7 +1553,9 @@ static int qlcnic_set_intr_coalesce(struct net_device *netdev,
 }
 
 static int qlcnic_get_intr_coalesce(struct net_device *netdev,
-			struct ethtool_coalesce *ethcoal)
+				    struct ethtool_coalesce *ethcoal,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
 
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 9ea59efd0fd6..1225d27330f8 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -1749,7 +1749,10 @@ rtl_coalesce_info(struct rtl8169_private *tp)
 	return ERR_PTR(-ELNRNG);
 }
 
-static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+static int rtl_get_coalesce(struct net_device *dev,
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
 	const struct rtl_coalesce_info *ci;
@@ -1807,7 +1810,10 @@ static int rtl_coalesce_choose_scale(struct rtl8169_private *tp, u32 usec,
 	return -ERANGE;
 }
 
-static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+static int rtl_set_coalesce(struct net_device *dev,
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
 	u32 tx_fr = ec->tx_max_coalesced_frames;
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c
index 7f8b10c49660..98edb01024f0 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c
@@ -274,7 +274,9 @@ static u32 sxgbe_usec2riwt(u32 usec, struct sxgbe_priv_data *priv)
 }
 
 static int sxgbe_get_coalesce(struct net_device *dev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct sxgbe_priv_data *priv = netdev_priv(dev);
 
@@ -285,7 +287,9 @@ static int sxgbe_get_coalesce(struct net_device *dev,
 }
 
 static int sxgbe_set_coalesce(struct net_device *dev,
-			      struct ethtool_coalesce *ec)
+			      struct ethtool_coalesce *ec,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct sxgbe_priv_data *priv = netdev_priv(dev);
 	unsigned int rx_riwt;
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 058d9fe41d99..e002ce21788d 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -97,7 +97,9 @@ static void efx_ethtool_get_regs(struct net_device *net_dev,
  */
 
 static int efx_ethtool_get_coalesce(struct net_device *net_dev,
-				    struct ethtool_coalesce *coalesce)
+				    struct ethtool_coalesce *coalesce,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	unsigned int tx_usecs, rx_usecs;
@@ -115,7 +117,9 @@ static int efx_ethtool_get_coalesce(struct net_device *net_dev,
 }
 
 static int efx_ethtool_set_coalesce(struct net_device *net_dev,
-				    struct ethtool_coalesce *coalesce)
+				    struct ethtool_coalesce *coalesce,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_channel *channel;
diff --git a/drivers/net/ethernet/sfc/falcon/ethtool.c b/drivers/net/ethernet/sfc/falcon/ethtool.c
index a6bae6a234ba..137e8a7aeaa1 100644
--- a/drivers/net/ethernet/sfc/falcon/ethtool.c
+++ b/drivers/net/ethernet/sfc/falcon/ethtool.c
@@ -577,7 +577,9 @@ static int ef4_ethtool_nway_reset(struct net_device *net_dev)
  */
 
 static int ef4_ethtool_get_coalesce(struct net_device *net_dev,
-				    struct ethtool_coalesce *coalesce)
+				    struct ethtool_coalesce *coalesce,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct ef4_nic *efx = netdev_priv(net_dev);
 	unsigned int tx_usecs, rx_usecs;
@@ -595,7 +597,9 @@ static int ef4_ethtool_get_coalesce(struct net_device *net_dev,
 }
 
 static int ef4_ethtool_set_coalesce(struct net_device *net_dev,
-				    struct ethtool_coalesce *coalesce)
+				    struct ethtool_coalesce *coalesce,
+				    struct kernel_ethtool_coalesce *kernel_coal,
+				    struct netlink_ext_ack *extack)
 {
 	struct ef4_nic *efx = netdev_priv(net_dev);
 	struct ef4_channel *channel;
diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index d15f7b3a3f10..1f46af136aa8 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -532,7 +532,9 @@ static void netsec_et_get_drvinfo(struct net_device *net_device,
 }
 
 static int netsec_et_get_coalesce(struct net_device *net_device,
-				  struct ethtool_coalesce *et_coalesce)
+				  struct ethtool_coalesce *et_coalesce,
+				  struct kernel_ethtool_coalesce *kernel_coal,
+				  struct netlink_ext_ack *extack)
 {
 	struct netsec_priv *priv = netdev_priv(net_device);
 
@@ -542,7 +544,9 @@ static int netsec_et_get_coalesce(struct net_device *net_device,
 }
 
 static int netsec_et_set_coalesce(struct net_device *net_device,
-				  struct ethtool_coalesce *et_coalesce)
+				  struct ethtool_coalesce *et_coalesce,
+				  struct kernel_ethtool_coalesce *kernel_coal,
+				  struct netlink_ext_ack *extack)
 {
 	struct netsec_priv *priv = netdev_priv(net_device);
 
@@ -1544,7 +1548,7 @@ static int netsec_start_gmac(struct netsec_priv *priv)
 	netsec_write(priv, NETSEC_REG_NRM_RX_INTEN_CLR, ~0);
 	netsec_write(priv, NETSEC_REG_NRM_TX_INTEN_CLR, ~0);
 
-	netsec_et_set_coalesce(priv->ndev, &priv->et_coalesce);
+	netsec_et_set_coalesce(priv->ndev, &priv->et_coalesce, NULL, NULL);
 
 	if (netsec_mac_write(priv, GMAC_REG_OMR, value))
 		return -ETIMEDOUT;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 595c3ccdcbb7..d89455803bed 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -874,7 +874,9 @@ static int __stmmac_get_coalesce(struct net_device *dev,
 }
 
 static int stmmac_get_coalesce(struct net_device *dev,
-			       struct ethtool_coalesce *ec)
+			       struct ethtool_coalesce *ec,
+			       struct kernel_ethtool_coalesce *kernel_coal,
+			       struct netlink_ext_ack *extack)
 {
 	return __stmmac_get_coalesce(dev, ec, -1);
 }
@@ -958,7 +960,9 @@ static int __stmmac_set_coalesce(struct net_device *dev,
 }
 
 static int stmmac_set_coalesce(struct net_device *dev,
-			       struct ethtool_coalesce *ec)
+			       struct ethtool_coalesce *ec,
+			       struct kernel_ethtool_coalesce *kernel_coal,
+			       struct netlink_ext_ack *extack)
 {
 	return __stmmac_set_coalesce(dev, ec, -1);
 }
diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c
index bc198eadfcab..49f8c6be9459 100644
--- a/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c
+++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c
@@ -146,8 +146,11 @@ static void xlgmac_ethtool_get_channels(struct net_device *netdev,
 	channel->tx_count = pdata->tx_q_count;
 }
 
-static int xlgmac_ethtool_get_coalesce(struct net_device *netdev,
-				       struct ethtool_coalesce *ec)
+static int
+xlgmac_ethtool_get_coalesce(struct net_device *netdev,
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct xlgmac_pdata *pdata = netdev_priv(netdev);
 
@@ -158,8 +161,11 @@ static int xlgmac_ethtool_get_coalesce(struct net_device *netdev,
 	return 0;
 }
 
-static int xlgmac_ethtool_set_coalesce(struct net_device *netdev,
-				       struct ethtool_coalesce *ec)
+static int
+xlgmac_ethtool_set_coalesce(struct net_device *netdev,
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct xlgmac_pdata *pdata = netdev_priv(netdev);
 	struct xlgmac_hw_ops *hw_ops = &pdata->hw_ops;
diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c
index 8f6abaec41d1..6b409f9c5863 100644
--- a/drivers/net/ethernet/tehuti/tehuti.c
+++ b/drivers/net/ethernet/tehuti/tehuti.c
@@ -2151,8 +2151,10 @@ bdx_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo)
  * @netdev
  * @ecoal
  */
-static int
-bdx_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ecoal)
+static int bdx_get_coalesce(struct net_device *netdev,
+			    struct ethtool_coalesce *ecoal,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	u32 rdintcm;
 	u32 tdintcm;
@@ -2180,8 +2182,10 @@ bdx_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ecoal)
  * @netdev
  * @ecoal
  */
-static int
-bdx_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ecoal)
+static int bdx_set_coalesce(struct net_device *netdev,
+			    struct ethtool_coalesce *ecoal,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	u32 rdintcm;
 	u32 tdintcm;
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 9f70e40779f6..66f7ddd9b1f9 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -845,7 +845,7 @@ static int cpsw_ndo_open(struct net_device *ndev)
 		struct ethtool_coalesce coal;
 
 		coal.rx_coalesce_usecs = cpsw->coal_intvl;
-		cpsw_set_coalesce(ndev, &coal);
+		cpsw_set_coalesce(ndev, &coal, NULL, NULL);
 	}
 
 	cpdma_ctlr_start(cpsw->dma);
diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c
index 4619c3a950b0..158c8d3793f4 100644
--- a/drivers/net/ethernet/ti/cpsw_ethtool.c
+++ b/drivers/net/ethernet/ti/cpsw_ethtool.c
@@ -152,7 +152,9 @@ void cpsw_set_msglevel(struct net_device *ndev, u32 value)
 	priv->msg_enable = value;
 }
 
-int cpsw_get_coalesce(struct net_device *ndev, struct ethtool_coalesce *coal)
+int cpsw_get_coalesce(struct net_device *ndev, struct ethtool_coalesce *coal,
+		      struct kernel_ethtool_coalesce *kernel_coal,
+		      struct netlink_ext_ack *extack)
 {
 	struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
 
@@ -160,7 +162,9 @@ int cpsw_get_coalesce(struct net_device *ndev, struct ethtool_coalesce *coal)
 	return 0;
 }
 
-int cpsw_set_coalesce(struct net_device *ndev, struct ethtool_coalesce *coal)
+int cpsw_set_coalesce(struct net_device *ndev, struct ethtool_coalesce *coal,
+		      struct kernel_ethtool_coalesce *kernel_coal,
+		      struct netlink_ext_ack *extack)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	u32 int_ctrl;
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 534d39f729e2..7968f24d99c8 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -894,7 +894,7 @@ static int cpsw_ndo_open(struct net_device *ndev)
 		struct ethtool_coalesce coal;
 
 		coal.rx_coalesce_usecs = cpsw->coal_intvl;
-		cpsw_set_coalesce(ndev, &coal);
+		cpsw_set_coalesce(ndev, &coal, NULL, NULL);
 	}
 
 	cpdma_ctlr_start(cpsw->dma);
diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h
index 2951fb7b9dae..435668ee542d 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.h
+++ b/drivers/net/ethernet/ti/cpsw_priv.h
@@ -464,8 +464,12 @@ void cpsw_mqprio_resume(struct cpsw_slave *slave, struct cpsw_priv *priv);
 /* ethtool */
 u32 cpsw_get_msglevel(struct net_device *ndev);
 void cpsw_set_msglevel(struct net_device *ndev, u32 value);
-int cpsw_get_coalesce(struct net_device *ndev, struct ethtool_coalesce *coal);
-int cpsw_set_coalesce(struct net_device *ndev, struct ethtool_coalesce *coal);
+int cpsw_get_coalesce(struct net_device *ndev, struct ethtool_coalesce *coal,
+		      struct kernel_ethtool_coalesce *kernel_coal,
+		      struct netlink_ext_ack *extack);
+int cpsw_set_coalesce(struct net_device *ndev, struct ethtool_coalesce *coal,
+		      struct kernel_ethtool_coalesce *kernel_coal,
+		      struct netlink_ext_ack *extack);
 int cpsw_get_sset_count(struct net_device *ndev, int sset);
 void cpsw_get_strings(struct net_device *ndev, u32 stringset, u8 *data);
 void cpsw_get_ethtool_stats(struct net_device *ndev,
diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index b1c5cbe7478b..e8291d848839 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -383,12 +383,16 @@ static void emac_get_drvinfo(struct net_device *ndev,
  * emac_get_coalesce - Get interrupt coalesce settings for this device
  * @ndev : The DaVinci EMAC network adapter
  * @coal : ethtool coalesce settings structure
+ * @kernel_coal: ethtool CQE mode setting structure
+ * @extack: extack for reporting error messages
  *
  * Fetch the current interrupt coalesce settings
  *
  */
 static int emac_get_coalesce(struct net_device *ndev,
-				struct ethtool_coalesce *coal)
+			     struct ethtool_coalesce *coal,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct emac_priv *priv = netdev_priv(ndev);
 
@@ -401,12 +405,16 @@ static int emac_get_coalesce(struct net_device *ndev,
  * emac_set_coalesce - Set interrupt coalesce settings for this device
  * @ndev : The DaVinci EMAC network adapter
  * @coal : ethtool coalesce settings structure
+ * @kernel_coal: ethtool CQE mode setting structure
+ * @extack: extack for reporting error messages
  *
  * Set interrupt coalesce parameters
  *
  */
 static int emac_set_coalesce(struct net_device *ndev,
-				struct ethtool_coalesce *coal)
+			     struct ethtool_coalesce *coal,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct emac_priv *priv = netdev_priv(ndev);
 	u32 int_ctrl, num_interrupts = 0;
@@ -1462,7 +1470,7 @@ static int emac_dev_open(struct net_device *ndev)
 		struct ethtool_coalesce coal;
 
 		coal.rx_coalesce_usecs = (priv->coal_intvl << 4);
-		emac_set_coalesce(ndev, &coal);
+		emac_set_coalesce(ndev, &coal, NULL, NULL);
 	}
 
 	cpdma_ctlr_start(priv->dma);
diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c
index 6a08ea658816..4b9c30f735b5 100644
--- a/drivers/net/ethernet/via/via-velocity.c
+++ b/drivers/net/ethernet/via/via-velocity.c
@@ -3518,7 +3518,9 @@ static void set_pending_timer_val(int *val, u32 us)
 
 
 static int velocity_get_coalesce(struct net_device *dev,
-		struct ethtool_coalesce *ecmd)
+				 struct ethtool_coalesce *ecmd,
+				 struct kernel_ethtool_coalesce *kernel_coal,
+				 struct netlink_ext_ack *extack)
 {
 	struct velocity_info *vptr = netdev_priv(dev);
 
@@ -3532,7 +3534,9 @@ static int velocity_get_coalesce(struct net_device *dev,
 }
 
 static int velocity_set_coalesce(struct net_device *dev,
-		struct ethtool_coalesce *ecmd)
+				 struct ethtool_coalesce *ecmd,
+				 struct kernel_ethtool_coalesce *kernel_coal,
+				 struct netlink_ext_ack *extack)
 {
 	struct velocity_info *vptr = netdev_priv(dev);
 	int max_us = 0x3f * 64;
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index db1994fb51c5..463094ced104 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -1310,8 +1310,11 @@ static int ll_temac_ethtools_set_ringparam(struct net_device *ndev,
 	return 0;
 }
 
-static int ll_temac_ethtools_get_coalesce(struct net_device *ndev,
-					  struct ethtool_coalesce *ec)
+static int
+ll_temac_ethtools_get_coalesce(struct net_device *ndev,
+			       struct ethtool_coalesce *ec,
+			       struct kernel_ethtool_coalesce *kernel_coal,
+			       struct netlink_ext_ack *extack)
 {
 	struct temac_local *lp = netdev_priv(ndev);
 
@@ -1322,8 +1325,11 @@ static int ll_temac_ethtools_get_coalesce(struct net_device *ndev,
 	return 0;
 }
 
-static int ll_temac_ethtools_set_coalesce(struct net_device *ndev,
-					  struct ethtool_coalesce *ec)
+static int
+ll_temac_ethtools_set_coalesce(struct net_device *ndev,
+			       struct ethtool_coalesce *ec,
+			       struct kernel_ethtool_coalesce *kernel_coal,
+			       struct netlink_ext_ack *extack)
 {
 	struct temac_local *lp = netdev_priv(ndev);
 
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 348c0ba5edcf..871b5ec3183d 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -1400,6 +1400,8 @@ axienet_ethtools_set_pauseparam(struct net_device *ndev,
  * axienet_ethtools_get_coalesce - Get DMA interrupt coalescing count.
  * @ndev:	Pointer to net_device structure
  * @ecoalesce:	Pointer to ethtool_coalesce structure
+ * @kernel_coal: ethtool CQE mode setting structure
+ * @extack:	extack for reporting error messages
  *
  * This implements ethtool command for getting the DMA interrupt coalescing
  * count on Tx and Rx paths. Issue "ethtool -c ethX" under linux prompt to
@@ -1407,8 +1409,11 @@ axienet_ethtools_set_pauseparam(struct net_device *ndev,
  *
  * Return: 0 always
  */
-static int axienet_ethtools_get_coalesce(struct net_device *ndev,
-					 struct ethtool_coalesce *ecoalesce)
+static int
+axienet_ethtools_get_coalesce(struct net_device *ndev,
+			      struct ethtool_coalesce *ecoalesce,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	u32 regval = 0;
 	struct axienet_local *lp = netdev_priv(ndev);
@@ -1425,6 +1430,8 @@ static int axienet_ethtools_get_coalesce(struct net_device *ndev,
  * axienet_ethtools_set_coalesce - Set DMA interrupt coalescing count.
  * @ndev:	Pointer to net_device structure
  * @ecoalesce:	Pointer to ethtool_coalesce structure
+ * @kernel_coal: ethtool CQE mode setting structure
+ * @extack:	extack for reporting error messages
  *
  * This implements ethtool command for setting the DMA interrupt coalescing
  * count on Tx and Rx paths. Issue "ethtool -C ethX rx-frames 5" under linux
@@ -1432,8 +1439,11 @@ static int axienet_ethtools_get_coalesce(struct net_device *ndev,
  *
  * Return: 0, on success, Non-zero error value on failure.
  */
-static int axienet_ethtools_set_coalesce(struct net_device *ndev,
-					 struct ethtool_coalesce *ecoalesce)
+static int
+axienet_ethtools_set_coalesce(struct net_device *ndev,
+			      struct ethtool_coalesce *ecoalesce,
+			      struct kernel_ethtool_coalesce *kernel_coal,
+			      struct netlink_ext_ack *extack)
 {
 	struct axienet_local *lp = netdev_priv(ndev);
 
diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c
index c9ae52595a8f..b03a0513eb7e 100644
--- a/drivers/net/netdevsim/ethtool.c
+++ b/drivers/net/netdevsim/ethtool.c
@@ -43,7 +43,9 @@ nsim_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause)
 }
 
 static int nsim_get_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *coal)
+			     struct ethtool_coalesce *coal,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct netdevsim *ns = netdev_priv(dev);
 
@@ -52,7 +54,9 @@ static int nsim_get_coalesce(struct net_device *dev,
 }
 
 static int nsim_set_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *coal)
+			     struct ethtool_coalesce *coal,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct netdevsim *ns = netdev_priv(dev);
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2ced021f4faf..fecc9a1d293a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3510,7 +3510,9 @@ static void tun_set_msglevel(struct net_device *dev, u32 value)
 }
 
 static int tun_get_coalesce(struct net_device *dev,
-			    struct ethtool_coalesce *ec)
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 
@@ -3520,7 +3522,9 @@ static int tun_get_coalesce(struct net_device *dev,
 }
 
 static int tun_set_coalesce(struct net_device *dev,
-			    struct ethtool_coalesce *ec)
+			    struct ethtool_coalesce *ec,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index aa66671c484d..60ba9b734055 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -8848,7 +8848,9 @@ out:
 }
 
 static int rtl8152_get_coalesce(struct net_device *netdev,
-				struct ethtool_coalesce *coalesce)
+				struct ethtool_coalesce *coalesce,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct r8152 *tp = netdev_priv(netdev);
 
@@ -8867,7 +8869,9 @@ static int rtl8152_get_coalesce(struct net_device *netdev,
 }
 
 static int rtl8152_set_coalesce(struct net_device *netdev,
-				struct ethtool_coalesce *coalesce)
+				struct ethtool_coalesce *coalesce,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct r8152 *tp = netdev_priv(netdev);
 	int ret;
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c8c9ad7ca2b5..b4ae2ac8a249 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2320,7 +2320,9 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
 }
 
 static int virtnet_set_coalesce(struct net_device *dev,
-				struct ethtool_coalesce *ec)
+				struct ethtool_coalesce *ec,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	int i, napi_weight;
@@ -2341,7 +2343,9 @@ static int virtnet_set_coalesce(struct net_device *dev,
 }
 
 static int virtnet_get_coalesce(struct net_device *dev,
-				struct ethtool_coalesce *ec)
+				struct ethtool_coalesce *ec,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct ethtool_coalesce ec_default = {
 		.cmd = ETHTOOL_GCOALESCE,
diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
index a3e2f2ba68b5..5dd8360b21a0 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethtool.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -1053,8 +1053,10 @@ vmxnet3_set_rss(struct net_device *netdev, const u32 *p, const u8 *key,
 }
 #endif
 
-static int
-vmxnet3_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec)
+static int vmxnet3_get_coalesce(struct net_device *netdev,
+				struct ethtool_coalesce *ec,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 
@@ -1088,8 +1090,10 @@ vmxnet3_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec)
 	return 0;
 }
 
-static int
-vmxnet3_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec)
+static int vmxnet3_set_coalesce(struct net_device *netdev,
+				struct ethtool_coalesce *ec,
+				struct kernel_ethtool_coalesce *kernel_coal,
+				struct netlink_ext_ack *extack)
 {
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	struct Vmxnet3_DriverShared *shared = adapter->shared;
diff --git a/drivers/net/wireless/ath/wil6210/ethtool.c b/drivers/net/wireless/ath/wil6210/ethtool.c
index e481674485c2..29a9f17c2df0 100644
--- a/drivers/net/wireless/ath/wil6210/ethtool.c
+++ b/drivers/net/wireless/ath/wil6210/ethtool.c
@@ -11,8 +11,11 @@
 
 #include "wil6210.h"
 
-static int wil_ethtoolops_get_coalesce(struct net_device *ndev,
-				       struct ethtool_coalesce *cp)
+static int
+wil_ethtoolops_get_coalesce(struct net_device *ndev,
+			    struct ethtool_coalesce *cp,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct wil6210_priv *wil = ndev_to_wil(ndev);
 	u32 tx_itr_en, tx_itr_val = 0;
@@ -45,8 +48,11 @@ out:
 	return ret;
 }
 
-static int wil_ethtoolops_set_coalesce(struct net_device *ndev,
-				       struct ethtool_coalesce *cp)
+static int
+wil_ethtoolops_set_coalesce(struct net_device *ndev,
+			    struct ethtool_coalesce *cp,
+			    struct kernel_ethtool_coalesce *kernel_coal,
+			    struct netlink_ext_ack *extack)
 {
 	struct wil6210_priv *wil = ndev_to_wil(ndev);
 	struct wireless_dev *wdev = ndev->ieee80211_ptr;
diff --git a/drivers/s390/net/qeth_ethtool.c b/drivers/s390/net/qeth_ethtool.c
index 3937986f159a..46d0fe0d0e8a 100644
--- a/drivers/s390/net/qeth_ethtool.c
+++ b/drivers/s390/net/qeth_ethtool.c
@@ -123,7 +123,9 @@ static void __qeth_set_coalesce(struct net_device *dev,
 }
 
 static int qeth_set_coalesce(struct net_device *dev,
-			     struct ethtool_coalesce *coal)
+			     struct ethtool_coalesce *coal,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct qeth_card *card = dev->ml_priv;
 	struct qeth_qdio_out_q *queue;
diff --git a/drivers/staging/qlge/qlge_ethtool.c b/drivers/staging/qlge/qlge_ethtool.c
index 87d60115ac67..12efcd1057ba 100644
--- a/drivers/staging/qlge/qlge_ethtool.c
+++ b/drivers/staging/qlge/qlge_ethtool.c
@@ -621,7 +621,10 @@ static void qlge_get_regs(struct net_device *ndev,
 		regs->len = sizeof(struct qlge_reg_dump);
 }
 
-static int qlge_get_coalesce(struct net_device *ndev, struct ethtool_coalesce *c)
+static int qlge_get_coalesce(struct net_device *ndev,
+			     struct ethtool_coalesce *c,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct qlge_adapter *qdev = netdev_to_qdev(ndev);
 
@@ -644,7 +647,10 @@ static int qlge_get_coalesce(struct net_device *ndev, struct ethtool_coalesce *c
 	return 0;
 }
 
-static int qlge_set_coalesce(struct net_device *ndev, struct ethtool_coalesce *c)
+static int qlge_set_coalesce(struct net_device *ndev,
+			     struct ethtool_coalesce *c,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
 {
 	struct qlge_adapter *qdev = netdev_to_qdev(ndev);
 
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index a9d77a6a3e00..849524b55d89 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -15,6 +15,7 @@
 
 #include <linux/bitmap.h>
 #include <linux/compat.h>
+#include <linux/netlink.h>
 #include <uapi/linux/ethtool.h>
 
 struct compat_ethtool_rx_flow_spec {
@@ -611,8 +612,14 @@ struct ethtool_ops {
 			      struct ethtool_eeprom *, u8 *);
 	int	(*set_eeprom)(struct net_device *,
 			      struct ethtool_eeprom *, u8 *);
-	int	(*get_coalesce)(struct net_device *, struct ethtool_coalesce *);
-	int	(*set_coalesce)(struct net_device *, struct ethtool_coalesce *);
+	int	(*get_coalesce)(struct net_device *,
+				struct ethtool_coalesce *,
+				struct kernel_ethtool_coalesce *,
+				struct netlink_ext_ack *);
+	int	(*set_coalesce)(struct net_device *,
+				struct ethtool_coalesce *,
+				struct kernel_ethtool_coalesce *,
+				struct netlink_ext_ack *);
 	void	(*get_ringparam)(struct net_device *,
 				 struct ethtool_ringparam *);
 	int	(*set_ringparam)(struct net_device *,
diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c
index e6bc53634e68..46776ea42a92 100644
--- a/net/ethtool/coalesce.c
+++ b/net/ethtool/coalesce.c
@@ -62,6 +62,7 @@ static int coalesce_prepare_data(const struct ethnl_req_info *req_base,
 				 struct genl_info *info)
 {
 	struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base);
+	struct netlink_ext_ack *extack = info ? info->extack : NULL;
 	struct net_device *dev = reply_base->dev;
 	int ret;
 
@@ -71,7 +72,8 @@ static int coalesce_prepare_data(const struct ethnl_req_info *req_base,
 	ret = ethnl_ops_begin(dev);
 	if (ret < 0)
 		return ret;
-	ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce);
+	ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce,
+					     &data->kernel_coalesce, extack);
 	ethnl_ops_complete(dev);
 
 	return ret;
@@ -266,7 +268,8 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info)
 	ret = ethnl_ops_begin(dev);
 	if (ret < 0)
 		goto out_rtnl;
-	ret = ops->get_coalesce(dev, &coalesce);
+	ret = ops->get_coalesce(dev, &coalesce, &kernel_coalesce,
+				info->extack);
 	if (ret < 0)
 		goto out_ops;
 
@@ -322,7 +325,8 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info)
 	if (!mod)
 		goto out_ops;
 
-	ret = dev->ethtool_ops->set_coalesce(dev, &coalesce);
+	ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce,
+					     info->extack);
 	if (ret < 0)
 		goto out_ops;
 	ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL);
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 81fa36a4c9c4..f2abc3152888 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1619,12 +1619,14 @@ static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev,
 						   void __user *useraddr)
 {
 	struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+	struct kernel_ethtool_coalesce kernel_coalesce = {};
 	int ret;
 
 	if (!dev->ethtool_ops->get_coalesce)
 		return -EOPNOTSUPP;
 
-	ret = dev->ethtool_ops->get_coalesce(dev, &coalesce);
+	ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce,
+					     NULL);
 	if (ret)
 		return ret;
 
@@ -1691,19 +1693,26 @@ ethtool_set_coalesce_supported(struct net_device *dev,
 static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
 						   void __user *useraddr)
 {
+	struct kernel_ethtool_coalesce kernel_coalesce = {};
 	struct ethtool_coalesce coalesce;
 	int ret;
 
-	if (!dev->ethtool_ops->set_coalesce)
+	if (!dev->ethtool_ops->set_coalesce && !dev->ethtool_ops->get_coalesce)
 		return -EOPNOTSUPP;
 
+	ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce,
+					     NULL);
+	if (ret)
+		return ret;
+
 	if (copy_from_user(&coalesce, useraddr, sizeof(coalesce)))
 		return -EFAULT;
 
 	if (!ethtool_set_coalesce_supported(dev, &coalesce))
 		return -EOPNOTSUPP;
 
-	ret = dev->ethtool_ops->set_coalesce(dev, &coalesce);
+	ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce,
+					     NULL);
 	if (!ret)
 		ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL);
 	return ret;
-- 
cgit v1.2.3


From 1d71eb53e45187f58089d32b51e27784c791d90e Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Mon, 26 Jul 2021 20:36:54 -0700
Subject: Revert "PCI: Make pci_enable_ptm() private"

Make pci_enable_ptm() accessible from the drivers.

Exposing this to the driver enables the driver to use the
'ptm_enabled' field of 'pci_dev' to check if PTM is enabled or not.

This reverts commit ac6c26da29c1 ("PCI: Make pci_enable_ptm() private").

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/pci/pci.h   | 3 ---
 include/linux/pci.h | 7 +++++++
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 93dcdd431072..2f52110cac97 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -597,11 +597,8 @@ static inline void pcie_ecrc_get_policy(char *str) { }
 
 #ifdef CONFIG_PCIE_PTM
 void pci_ptm_init(struct pci_dev *dev);
-int pci_enable_ptm(struct pci_dev *dev, u8 *granularity);
 #else
 static inline void pci_ptm_init(struct pci_dev *dev) { }
-static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity)
-{ return -EINVAL; }
 #endif
 
 struct pci_dev_reset_methods {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 540b377ca8f6..21a9d244e4e4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1620,6 +1620,13 @@ static inline bool pci_aer_available(void) { return false; }
 
 bool pci_ats_disabled(void);
 
+#ifdef CONFIG_PCIE_PTM
+int pci_enable_ptm(struct pci_dev *dev, u8 *granularity);
+#else
+static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity)
+{ return -EINVAL; }
+#endif
+
 void pci_cfg_access_lock(struct pci_dev *dev);
 bool pci_cfg_access_trylock(struct pci_dev *dev);
 void pci_cfg_access_unlock(struct pci_dev *dev);
-- 
cgit v1.2.3


From 014408cd624e9fd2820f4a593b710325ee05fec9 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Mon, 26 Jul 2021 20:36:55 -0700
Subject: PCI: Add pcie_ptm_enabled()

Add a predicate that returns if PCIe PTM (Precision Time Measurement)
is enabled.

It will only return true if it's enabled in all the ports in the path
from the device to the root.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/pci/pcie/ptm.c | 9 +++++++++
 include/linux/pci.h    | 3 +++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c
index 95d4eef2c9e8..8a4ad974c5ac 100644
--- a/drivers/pci/pcie/ptm.c
+++ b/drivers/pci/pcie/ptm.c
@@ -204,3 +204,12 @@ int pci_enable_ptm(struct pci_dev *dev, u8 *granularity)
 	return 0;
 }
 EXPORT_SYMBOL(pci_enable_ptm);
+
+bool pcie_ptm_enabled(struct pci_dev *dev)
+{
+	if (!dev)
+		return false;
+
+	return dev->ptm_enabled;
+}
+EXPORT_SYMBOL(pcie_ptm_enabled);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 21a9d244e4e4..947430637cac 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1622,9 +1622,12 @@ bool pci_ats_disabled(void);
 
 #ifdef CONFIG_PCIE_PTM
 int pci_enable_ptm(struct pci_dev *dev, u8 *granularity);
+bool pcie_ptm_enabled(struct pci_dev *dev);
 #else
 static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity)
 { return -EINVAL; }
+static inline bool pcie_ptm_enabled(struct pci_dev *dev)
+{ return false; }
 #endif
 
 void pci_cfg_access_lock(struct pci_dev *dev);
-- 
cgit v1.2.3


From 406f42fa0d3cbcea3766c3111d79ac5afe711c5b Mon Sep 17 00:00:00 2001
From: Gilad Naaman <gnaaman@drivenets.com>
Date: Thu, 19 Aug 2021 10:17:27 +0300
Subject: net-next: When a bond have a massive amount of VLANs with IPv6
 addresses, performance of changing link state, attaching a VRF, changing an
 IPv6 address, etc. go down dramtically.

The source of most of the slow down is the `dev_addr_lists.c` module,
which mainatins a linked list of HW addresses.
When using IPv6, this list grows for each IPv6 address added on a
VLAN, since each IPv6 address has a multicast HW address associated with
it.

When performing any modification to the involved links, this list is
traversed many times, often for nothing, all while holding the RTNL
lock.

Instead, this patch adds an auxilliary rbtree which cuts down
traversal time significantly.

Performance can be seen with the following script:

	#!/bin/bash
	ip netns del test || true 2>/dev/null
	ip netns add test

	echo 1 | ip netns exec test tee /proc/sys/net/ipv6/conf/all/keep_addr_on_down > /dev/null

	set -e

	ip -n test link add foo type veth peer name bar
	ip -n test link add b1 type bond
	ip -n test link add florp type vrf table 10

	ip -n test link set bar master b1
	ip -n test link set foo up
	ip -n test link set bar up
	ip -n test link set b1 up
	ip -n test link set florp up

	VLAN_COUNT=1500
	BASE_DEV=b1

	echo Creating vlans
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test link add link $BASE_DEV name foo.\$i type vlan id \$i; done"

	echo Bringing them up
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test link set foo.\$i up; done"

	echo Assiging IPv6 Addresses
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test address add dev foo.\$i 2000::\$i/64; done"

	echo Attaching to VRF
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test link set foo.\$i master florp; done"

On an Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz machine, the performance
before the patch is (truncated):

	Creating vlans
	real 108.35
	Bringing them up
	real 4.96
	Assiging IPv6 Addresses
	real 19.22
	Attaching to VRF
	real 458.84

After the patch:

	Creating vlans
	real 5.59
	Bringing them up
	real 5.07
	Assiging IPv6 Addresses
	real 5.64
	Attaching to VRF
	real 25.37

Cc: David S. Miller <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Lu Wei <luwei32@huawei.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   5 ++
 net/core/dev_addr_lists.c | 144 +++++++++++++++++++++++++++++++---------------
 2 files changed, 103 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b88ad5aef7fe..6fd3a4d42668 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -47,6 +47,7 @@
 #include <uapi/linux/if_bonding.h>
 #include <uapi/linux/pkt_cls.h>
 #include <linux/hashtable.h>
+#include <linux/rbtree.h>
 
 struct netpoll_info;
 struct device;
@@ -208,6 +209,7 @@ struct sk_buff;
 
 struct netdev_hw_addr {
 	struct list_head	list;
+	struct rb_node		node;
 	unsigned char		addr[MAX_ADDR_LEN];
 	unsigned char		type;
 #define NETDEV_HW_ADDR_T_LAN		1
@@ -224,6 +226,9 @@ struct netdev_hw_addr {
 struct netdev_hw_addr_list {
 	struct list_head	list;
 	int			count;
+
+	/* Auxiliary tree for faster lookup on addition and deletion */
+	struct rb_root		tree;
 };
 
 #define netdev_hw_addr_list_count(l) ((l)->count)
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 45ae6eeb2964..8c39283c26ae 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -16,10 +16,9 @@
  * General list handling functions
  */
 
-static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
-			       const unsigned char *addr, int addr_len,
-			       unsigned char addr_type, bool global,
-			       bool sync)
+static struct netdev_hw_addr*
+__hw_addr_create(const unsigned char *addr, int addr_len,
+		 unsigned char addr_type, bool global, bool sync)
 {
 	struct netdev_hw_addr *ha;
 	int alloc_size;
@@ -29,32 +28,44 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
 		alloc_size = L1_CACHE_BYTES;
 	ha = kmalloc(alloc_size, GFP_ATOMIC);
 	if (!ha)
-		return -ENOMEM;
+		return NULL;
 	memcpy(ha->addr, addr, addr_len);
 	ha->type = addr_type;
 	ha->refcount = 1;
 	ha->global_use = global;
 	ha->synced = sync ? 1 : 0;
 	ha->sync_cnt = 0;
-	list_add_tail_rcu(&ha->list, &list->list);
-	list->count++;
 
-	return 0;
+	return ha;
 }
 
 static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
 			    const unsigned char *addr, int addr_len,
 			    unsigned char addr_type, bool global, bool sync,
-			    int sync_count)
+			    int sync_count, bool exclusive)
 {
+	struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL;
 	struct netdev_hw_addr *ha;
 
 	if (addr_len > MAX_ADDR_LEN)
 		return -EINVAL;
 
-	list_for_each_entry(ha, &list->list, list) {
-		if (ha->type == addr_type &&
-		    !memcmp(ha->addr, addr, addr_len)) {
+	while (*ins_point) {
+		int diff;
+
+		ha = rb_entry(*ins_point, struct netdev_hw_addr, node);
+		diff = memcmp(addr, ha->addr, addr_len);
+		if (diff == 0)
+			diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+		parent = *ins_point;
+		if (diff < 0) {
+			ins_point = &parent->rb_left;
+		} else if (diff > 0) {
+			ins_point = &parent->rb_right;
+		} else {
+			if (exclusive)
+				return -EEXIST;
 			if (global) {
 				/* check if addr is already used as global */
 				if (ha->global_use)
@@ -73,8 +84,25 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
 		}
 	}
 
-	return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
-				   sync);
+	ha = __hw_addr_create(addr, addr_len, addr_type, global, sync);
+	if (!ha)
+		return -ENOMEM;
+
+	/* The first address in dev->dev_addrs is pointed to by dev->dev_addr
+	 * and mutated freely by device drivers and netdev ops, so if we insert
+	 * it into the tree we'll end up with an invalid rbtree.
+	 */
+	if (list->count > 0) {
+		rb_link_node(&ha->node, parent, ins_point);
+		rb_insert_color(&ha->node, &list->tree);
+	} else {
+		RB_CLEAR_NODE(&ha->node);
+	}
+
+	list_add_tail_rcu(&ha->list, &list->list);
+	list->count++;
+
+	return 0;
 }
 
 static int __hw_addr_add(struct netdev_hw_addr_list *list,
@@ -82,7 +110,7 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list,
 			 unsigned char addr_type)
 {
 	return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
-				0);
+				0, false);
 }
 
 static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
@@ -103,24 +131,61 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
 
 	if (--ha->refcount)
 		return 0;
+
+	if (!RB_EMPTY_NODE(&ha->node))
+		rb_erase(&ha->node, &list->tree);
+
 	list_del_rcu(&ha->list);
 	kfree_rcu(ha, rcu_head);
 	list->count--;
 	return 0;
 }
 
+static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list,
+					       const unsigned char *addr, int addr_len,
+					       unsigned char addr_type)
+{
+	struct netdev_hw_addr *ha;
+	struct rb_node *node;
+
+	/* The first address isn't inserted into the tree because in the dev->dev_addrs
+	 * list it's the address pointed to by dev->dev_addr which is freely mutated
+	 * in place, so we need to check it separately.
+	 */
+	ha = list_first_entry(&list->list, struct netdev_hw_addr, list);
+	if (ha && !memcmp(addr, ha->addr, addr_len) &&
+	    (!addr_type || addr_type == ha->type))
+		return ha;
+
+	node = list->tree.rb_node;
+
+	while (node) {
+		struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node);
+		int diff = memcmp(addr, ha->addr, addr_len);
+
+		if (diff == 0 && addr_type)
+			diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+		if (diff < 0)
+			node = node->rb_left;
+		else if (diff > 0)
+			node = node->rb_right;
+		else
+			return ha;
+	}
+
+	return NULL;
+}
+
 static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
 			    const unsigned char *addr, int addr_len,
 			    unsigned char addr_type, bool global, bool sync)
 {
-	struct netdev_hw_addr *ha;
+	struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type);
 
-	list_for_each_entry(ha, &list->list, list) {
-		if (!memcmp(ha->addr, addr, addr_len) &&
-		    (ha->type == addr_type || !addr_type))
-			return __hw_addr_del_entry(list, ha, global, sync);
-	}
-	return -ENOENT;
+	if (!ha)
+		return -ENOENT;
+	return __hw_addr_del_entry(list, ha, global, sync);
 }
 
 static int __hw_addr_del(struct netdev_hw_addr_list *list,
@@ -137,7 +202,7 @@ static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
 	int err;
 
 	err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
-			       false, true, ha->sync_cnt);
+			       false, true, ha->sync_cnt, false);
 	if (err && err != -EEXIST)
 		return err;
 
@@ -407,6 +472,7 @@ static void __hw_addr_flush(struct netdev_hw_addr_list *list)
 {
 	struct netdev_hw_addr *ha, *tmp;
 
+	list->tree = RB_ROOT;
 	list_for_each_entry_safe(ha, tmp, &list->list, list) {
 		list_del_rcu(&ha->list);
 		kfree_rcu(ha, rcu_head);
@@ -418,6 +484,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list)
 {
 	INIT_LIST_HEAD(&list->list);
 	list->count = 0;
+	list->tree = RB_ROOT;
 }
 EXPORT_SYMBOL(__hw_addr_init);
 
@@ -552,22 +619,14 @@ EXPORT_SYMBOL(dev_addr_del);
  */
 int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
 {
-	struct netdev_hw_addr *ha;
 	int err;
 
 	netif_addr_lock_bh(dev);
-	list_for_each_entry(ha, &dev->uc.list, list) {
-		if (!memcmp(ha->addr, addr, dev->addr_len) &&
-		    ha->type == NETDEV_HW_ADDR_T_UNICAST) {
-			err = -EEXIST;
-			goto out;
-		}
-	}
-	err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
-				  NETDEV_HW_ADDR_T_UNICAST, true, false);
+	err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len,
+			       NETDEV_HW_ADDR_T_UNICAST, true, false,
+			       0, true);
 	if (!err)
 		__dev_set_rx_mode(dev);
-out:
 	netif_addr_unlock_bh(dev);
 	return err;
 }
@@ -745,22 +804,14 @@ EXPORT_SYMBOL(dev_uc_init);
  */
 int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
 {
-	struct netdev_hw_addr *ha;
 	int err;
 
 	netif_addr_lock_bh(dev);
-	list_for_each_entry(ha, &dev->mc.list, list) {
-		if (!memcmp(ha->addr, addr, dev->addr_len) &&
-		    ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
-			err = -EEXIST;
-			goto out;
-		}
-	}
-	err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
-				  NETDEV_HW_ADDR_T_MULTICAST, true, false);
+	err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
+			       NETDEV_HW_ADDR_T_MULTICAST, true, false,
+			       0, true);
 	if (!err)
 		__dev_set_rx_mode(dev);
-out:
 	netif_addr_unlock_bh(dev);
 	return err;
 }
@@ -773,7 +824,8 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
 
 	netif_addr_lock_bh(dev);
 	err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
-			       NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);
+			       NETDEV_HW_ADDR_T_MULTICAST, global, false,
+			       0, false);
 	if (!err)
 		__dev_set_rx_mode(dev);
 	netif_addr_unlock_bh(dev);
-- 
cgit v1.2.3


From d7b269083786dca3b1b0141bde6cea834062b691 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 24 Aug 2021 16:26:14 -0700
Subject: mptcp: shrink mptcp_out_options struct

After the previous patch we can alias with a union several
fields in mptcp_out_options. Such struct is stack allocated and
memset() for each plain TCP out packet. Every saved byted counts.

Before:
pahole -EC mptcp_out_options
 # ...
/* size: 136, cachelines: 3, members: 17 */

After:
pahole -EC mptcp_out_options
 # ...
/* size: 56, cachelines: 1, members: 9 */

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mptcp.h | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 8b5af683a818..3236010afa29 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -58,10 +58,6 @@ struct mptcp_addr_info {
 struct mptcp_out_options {
 #if IS_ENABLED(CONFIG_MPTCP)
 	u16 suboptions;
-	u64 sndr_key;
-	u64 rcvr_key;
-	u64 ahmac;
-	struct mptcp_addr_info addr;
 	struct mptcp_rm_list rm_list;
 	u8 join_id;
 	u8 backup;
@@ -69,11 +65,23 @@ struct mptcp_out_options {
 	   reset_transient:1,
 	   csum_reqd:1,
 	   allow_join_id0:1;
-	u32 nonce;
-	u64 thmac;
-	u32 token;
-	u8 hmac[20];
-	struct mptcp_ext ext_copy;
+	union {
+		struct {
+			u64 sndr_key;
+			u64 rcvr_key;
+		};
+		struct {
+			struct mptcp_addr_info addr;
+			u64 ahmac;
+		};
+		struct mptcp_ext ext_copy;
+		struct {
+			u32 nonce;
+			u32 token;
+			u64 thmac;
+			u8 hmac[20];
+		};
+	};
 #endif
 };
 
-- 
cgit v1.2.3


From c25aeb4e095355eec3beb6a2b2b30322bd6d0dd4 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@xiaomi.com>
Date: Tue, 24 Aug 2021 16:26:15 -0700
Subject: mptcp: MP_FAIL suboption sending

This patch added the MP_FAIL suboption sending support.

Add a new flag named send_mp_fail in struct mptcp_subflow_context. If
this flag is set, send out MP_FAIL suboption.

Add a new member fail_seq in struct mptcp_out_options to save the data
sequence number to put into the MP_FAIL suboption.

An MP_FAIL option could be included in a RST or on the subflow-level
ACK.

Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Geliang Tang <geliangtang@xiaomi.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mptcp.h  |  5 ++++-
 net/mptcp/options.c  | 59 ++++++++++++++++++++++++++++++++++++++++++++++++----
 net/mptcp/protocol.h |  3 +++
 3 files changed, 62 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 3236010afa29..6026bbefbffd 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -74,7 +74,10 @@ struct mptcp_out_options {
 			struct mptcp_addr_info addr;
 			u64 ahmac;
 		};
-		struct mptcp_ext ext_copy;
+		struct {
+			struct mptcp_ext ext_copy;
+			u64 fail_seq;
+		};
 		struct {
 			u32 nonce;
 			u32 token;
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 1a59b3045a33..f2ebdd55d3cc 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -767,7 +767,7 @@ static bool mptcp_established_options_mp_prio(struct sock *sk,
 	return true;
 }
 
-static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb,
+static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb,
 						   unsigned int *size,
 						   unsigned int remaining,
 						   struct mptcp_out_options *opts)
@@ -775,12 +775,36 @@ static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_bu
 	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 
 	if (remaining < TCPOLEN_MPTCP_RST)
-		return;
+		return false;
 
 	*size = TCPOLEN_MPTCP_RST;
 	opts->suboptions |= OPTION_MPTCP_RST;
 	opts->reset_transient = subflow->reset_transient;
 	opts->reset_reason = subflow->reset_reason;
+
+	return true;
+}
+
+static bool mptcp_established_options_mp_fail(struct sock *sk,
+					      unsigned int *size,
+					      unsigned int remaining,
+					      struct mptcp_out_options *opts)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+	if (likely(!subflow->send_mp_fail))
+		return false;
+
+	if (remaining < TCPOLEN_MPTCP_FAIL)
+		return false;
+
+	*size = TCPOLEN_MPTCP_FAIL;
+	opts->suboptions |= OPTION_MPTCP_FAIL;
+	opts->fail_seq = subflow->map_seq;
+
+	pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq);
+
+	return true;
 }
 
 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
@@ -799,15 +823,28 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 		return false;
 
 	if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
-		mptcp_established_options_rst(sk, skb, size, remaining, opts);
+		if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
+			*size += opt_size;
+			remaining -= opt_size;
+		}
+		if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) {
+			*size += opt_size;
+			remaining -= opt_size;
+		}
 		return true;
 	}
 
 	snd_data_fin = mptcp_data_fin_enabled(msk);
 	if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts))
 		ret = true;
-	else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts))
+	else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) {
 		ret = true;
+		if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
+			*size += opt_size;
+			remaining -= opt_size;
+			return true;
+		}
+	}
 
 	/* we reserved enough space for the above options, and exceeding the
 	 * TCP option space would be fatal
@@ -1210,6 +1247,20 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
 void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
 			 struct mptcp_out_options *opts)
 {
+	if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
+		const struct sock *ssk = (const struct sock *)tp;
+		struct mptcp_subflow_context *subflow;
+
+		subflow = mptcp_subflow_ctx(ssk);
+		subflow->send_mp_fail = 0;
+
+		*ptr++ = mptcp_option(MPTCPOPT_MP_FAIL,
+				      TCPOLEN_MPTCP_FAIL,
+				      0, 0);
+		put_unaligned_be64(opts->fail_seq, ptr);
+		ptr += 2;
+	}
+
 	/* RST is mutually exclusive with everything else */
 	if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) {
 		*ptr++ = mptcp_option(MPTCPOPT_RST,
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index d276ce16f126..3e4a79cf520a 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -27,6 +27,7 @@
 #define OPTION_MPTCP_PRIO	BIT(9)
 #define OPTION_MPTCP_RST	BIT(10)
 #define OPTION_MPTCP_DSS	BIT(11)
+#define OPTION_MPTCP_FAIL	BIT(12)
 
 /* MPTCP option subtypes */
 #define MPTCPOPT_MP_CAPABLE	0
@@ -68,6 +69,7 @@
 #define TCPOLEN_MPTCP_PRIO_ALIGN	4
 #define TCPOLEN_MPTCP_FASTCLOSE		12
 #define TCPOLEN_MPTCP_RST		4
+#define TCPOLEN_MPTCP_FAIL		12
 
 #define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM	(TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA)
 
@@ -429,6 +431,7 @@ struct mptcp_subflow_context {
 		mpc_map : 1,
 		backup : 1,
 		send_mp_prio : 1,
+		send_mp_fail : 1,
 		rx_eof : 1,
 		can_ack : 1,        /* only after processing the remote a key */
 		disposable : 1,	    /* ctx can be free at ulp release time */
-- 
cgit v1.2.3


From b0b8c67eaa5c65f8426017e78fcce12dc7d85110 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 24 Aug 2021 20:15:01 +0300
Subject: net: dsa: sja1105: drop untagged packets on the CPU and DSA ports

The sja1105 driver is a bit special in its use of VLAN headers as DSA
tags. This is because in VLAN-aware mode, the VLAN headers use an actual
TPID of 0x8100, which is understood even by the DSA master as an actual
VLAN header.

Furthermore, control packets such as PTP and STP are transmitted with no
VLAN header as a DSA tag, because, depending on switch generation, there
are ways to steer these control packets towards a precise egress port
other than VLAN tags. Transmitting control packets as untagged means
leaving a door open for traffic in general to be transmitted as untagged
from the DSA master, and for it to traverse the switch and exit a random
switch port according to the FDB lookup.

This behavior is a bit out of line with other DSA drivers which have
native support for DSA tagging. There, it is to be expected that the
switch only accepts DSA-tagged packets on its CPU port, dropping
everything that does not match this pattern.

We perhaps rely a bit too much on the switches' hardware dropping on the
CPU port, and place no other restrictions in the kernel data path to
avoid that. For example, sja1105 is also a bit special in that STP/PTP
packets are transmitted using "management routes"
(sja1105_port_deferred_xmit): when sending a link-local packet from the
CPU, we must first write a SPI message to the switch to tell it to
expect a packet towards multicast MAC DA 01-80-c2-00-00-0e, and to route
it towards port 3 when it gets it. This entry expires as soon as it
matches a packet received by the switch, and it needs to be reinstalled
for the next packet etc. All in all quite a ghetto mechanism, but it is
all that the sja1105 switches offer for injecting a control packet.
The driver takes a mutex for serializing control packets and making the
pairs of SPI writes of a management route and its associated skb atomic,
but to be honest, a mutex is only relevant as long as all parties agree
to take it. With the DSA design, it is possible to open an AF_PACKET
socket on the DSA master net device, and blast packets towards
01-80-c2-00-00-0e, and whatever locking the DSA switch driver might use,
it all goes kaput because management routes installed by the driver will
match skbs sent by the DSA master, and not skbs generated by the driver
itself. So they will end up being routed on the wrong port.

So through the lens of that, maybe it would make sense to avoid that
from happening by doing something in the network stack, like: introduce
a new bit in struct sk_buff, like xmit_from_dsa. Then, somewhere around
dev_hard_start_xmit(), introduce the following check:

	if (netdev_uses_dsa(dev) && !skb->xmit_from_dsa)
		kfree_skb(skb);

Ok, maybe that is a bit drastic, but that would at least prevent a bunch
of problems. For example, right now, even though the majority of DSA
switches drop packets without DSA tags sent by the DSA master (and
therefore the majority of garbage that user space daemons like avahi and
udhcpcd and friends create), it is still conceivable that an aggressive
user space program can open an AF_PACKET socket and inject a spoofed DSA
tag directly on the DSA master. We have no protection against that; the
packet will be understood by the switch and be routed wherever user
space says. Furthermore: there are some DSA switches where we even have
register access over Ethernet, using DSA tags. So even user space
drivers are possible in this way. This is a huge hole.

However, the biggest thing that bothers me is that udhcpcd attempts to
ask for an IP address on all interfaces by default, and with sja1105, it
will attempt to get a valid IP address on both the DSA master as well as
on sja1105 switch ports themselves. So with IP addresses in the same
subnet on multiple interfaces, the routing table will be messed up and
the system will be unusable for traffic until it is configured manually
to not ask for an IP address on the DSA master itself.

It turns out that it is possible to avoid that in the sja1105 driver, at
least very superficially, by requesting the switch to drop VLAN-untagged
packets on the CPU port. With the exception of control packets, all
traffic originated from tag_sja1105.c is already VLAN-tagged, so only
STP and PTP packets need to be converted. For that, we need to uphold
the equivalence between an untagged and a pvid-tagged packet, and to
remember that the CPU port of sja1105 uses a pvid of 4095.

Now that we drop untagged traffic on the CPU port, non-aggressive user
space applications like udhcpcd stop bothering us, and sja1105 effectively
becomes just as vulnerable to the aggressive kind of user space programs
as other DSA switches are (ok, users can also create 8021q uppers on top
of the DSA master in the case of sja1105, but in future patches we can
easily deny that, but it still doesn't change the fact that VLAN-tagged
packets can still be injected over raw sockets).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105_main.c | 10 ++++++++-
 include/linux/dsa/sja1105.h            |  2 ++
 net/dsa/tag_sja1105.c                  | 41 +++++++++++++++++++++++++++++++++-
 3 files changed, 51 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 6be9fed50ed5..976f06462223 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -26,7 +26,6 @@
 #include "sja1105_tas.h"
 
 #define SJA1105_UNKNOWN_MULTICAST	0x010000000000ull
-#define SJA1105_DEFAULT_VLAN		(VLAN_N_VID - 1)
 
 static void sja1105_hw_reset(struct gpio_desc *gpio, unsigned int pulse_len,
 			     unsigned int startup_delay)
@@ -136,6 +135,9 @@ static int sja1105_commit_pvid(struct dsa_switch *ds, int port)
 			drop_untagged = true;
 	}
 
+	if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
+		drop_untagged = true;
+
 	return sja1105_drop_untagged(ds, port, drop_untagged);
 }
 
@@ -217,6 +219,12 @@ static int sja1105_init_mac_settings(struct sja1105_private *priv)
 		 */
 		if (dsa_port_is_dsa(dp))
 			dp->learning = true;
+
+		/* Disallow untagged packets from being received on the
+		 * CPU and DSA ports.
+		 */
+		if (dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))
+			mac[dp->index].drpuntag = true;
 	}
 
 	return 0;
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 6b0dc9ff92d1..8c5601f1c979 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -16,6 +16,8 @@
 #define ETH_P_SJA1105_META			0x0008
 #define ETH_P_SJA1110				0xdadc
 
+#define SJA1105_DEFAULT_VLAN			(VLAN_N_VID - 1)
+
 /* IEEE 802.3 Annex 57A: Slow Protocols PDUs (01:80:C2:xx:xx:xx) */
 #define SJA1105_LINKLOCAL_FILTER_A		0x0180C2000000ull
 #define SJA1105_LINKLOCAL_FILTER_A_MASK		0xFFFFFF000000ull
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 5b80a9049e2c..a49308fbd19f 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -168,6 +168,36 @@ static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb,
 	return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), tx_vid);
 }
 
+/* Transform untagged control packets into pvid-tagged control packets so that
+ * all packets sent by this tagger are VLAN-tagged and we can configure the
+ * switch to drop untagged packets coming from the DSA master.
+ */
+static struct sk_buff *sja1105_pvid_tag_control_pkt(struct dsa_port *dp,
+						    struct sk_buff *skb, u8 pcp)
+{
+	__be16 xmit_tpid = htons(sja1105_xmit_tpid(dp));
+	struct vlan_ethhdr *hdr;
+
+	/* If VLAN tag is in hwaccel area, move it to the payload
+	 * to deal with both cases uniformly and to ensure that
+	 * the VLANs are added in the right order.
+	 */
+	if (unlikely(skb_vlan_tag_present(skb))) {
+		skb = __vlan_hwaccel_push_inside(skb);
+		if (!skb)
+			return NULL;
+	}
+
+	hdr = (struct vlan_ethhdr *)skb_mac_header(skb);
+
+	/* If skb is already VLAN-tagged, leave that VLAN ID in place */
+	if (hdr->h_vlan_proto == xmit_tpid)
+		return skb;
+
+	return vlan_insert_tag(skb, xmit_tpid, (pcp << VLAN_PRIO_SHIFT) |
+			       SJA1105_DEFAULT_VLAN);
+}
+
 static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
 				    struct net_device *netdev)
 {
@@ -183,8 +213,13 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
 	 * but instead SPI-installed management routes. Part 2 of this
 	 * is the .port_deferred_xmit driver callback.
 	 */
-	if (unlikely(sja1105_is_link_local(skb)))
+	if (unlikely(sja1105_is_link_local(skb))) {
+		skb = sja1105_pvid_tag_control_pkt(dp, skb, pcp);
+		if (!skb)
+			return NULL;
+
 		return sja1105_defer_xmit(dp, skb);
+	}
 
 	return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp),
 			     ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
@@ -213,6 +248,10 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb,
 		return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp),
 				     ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
 
+	skb = sja1105_pvid_tag_control_pkt(dp, skb, pcp);
+	if (!skb)
+		return NULL;
+
 	skb_push(skb, SJA1110_HEADER_LEN);
 
 	dsa_alloc_etype_header(skb, SJA1110_HEADER_LEN);
-- 
cgit v1.2.3


From 8ded9160928e545c2e694b77a87263fa078ff4c6 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 24 Aug 2021 20:15:02 +0300
Subject: net: dsa: tag_sja1105: stop asking the sja1105 driver in
 sja1105_xmit_tpid

Introduced in commit 38b5beeae7a4 ("net: dsa: sja1105: prepare tagger
for handling DSA tags and VLAN simultaneously"), the sja1105_xmit_tpid
function solved quite a different problem than our needs are now.

Then, we used best-effort VLAN filtering and we were using the xmit_tpid
to tunnel packets coming from an 8021q upper through the TX VLAN allocated
by tag_8021q to that egress port. The need for a different VLAN protocol
depending on switch revision came from the fact that this in itself was
more of a hack to trick the hardware into accepting tunneled VLANs in
the first place.

Right now, we deny 8021q uppers (see sja1105_prechangeupper). Even if we
supported them again, we would not do that using the same method of
{tunneling the VLAN on egress, retagging the VLAN on ingress} that we
had in the best-effort VLAN filtering mode. It seems rather simpler that
we just allocate a VLAN in the VLAN table that is simply not used by the
bridge at all, or by any other port.

Anyway, I have 2 gripes with the current sja1105_xmit_tpid:

1. When sending packets on behalf of a VLAN-aware bridge (with the new
   TX forwarding offload framework) plus untagged (with the tag_8021q
   VLAN added by the tagger) packets, we can see that on SJA1105P/Q/R/S
   and later (which have a qinq_tpid of ETH_P_8021AD), some packets sent
   through the DSA master have a VLAN protocol of 0x8100 and others of
   0x88a8. This is strange and there is no reason for it now. If we have
   a bridge and are therefore forced to send using that bridge's TPID,
   we can as well blend with that bridge's VLAN protocol for all packets.

2. The sja1105_xmit_tpid introduces a dependency on the sja1105 driver,
   because it looks inside dp->priv. It is desirable to keep as much
   separation between taggers and switch drivers as possible. Now it
   doesn't do that anymore.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105.h      |  6 ------
 drivers/net/dsa/sja1105/sja1105_main.c | 10 ---------
 drivers/net/dsa/sja1105/sja1105_spi.c  | 10 ---------
 include/linux/dsa/sja1105.h            |  1 -
 net/dsa/tag_sja1105.c                  | 38 ++++++++++++++++++++++++++++++----
 5 files changed, 34 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index 2e899c9f036d..5e5d24e7c02b 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -115,12 +115,6 @@ struct sja1105_info {
 	const struct sja1105_dynamic_table_ops *dyn_ops;
 	const struct sja1105_table_ops *static_ops;
 	const struct sja1105_regs *regs;
-	/* Both E/T and P/Q/R/S have quirks when it comes to popping the S-Tag
-	 * from double-tagged frames. E/T will pop it only when it's equal to
-	 * TPID from the General Parameters Table, while P/Q/R/S will only
-	 * pop it when it's equal to TPID2.
-	 */
-	u16 qinq_tpid;
 	bool can_limit_mcast_flood;
 	int (*reset_cmd)(struct dsa_switch *ds);
 	int (*setup_rgmii_delay)(const void *ctx, int port);
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 976f06462223..2f8cc6686c38 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -2295,15 +2295,6 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled,
 		tpid2 = ETH_P_SJA1105;
 	}
 
-	for (port = 0; port < ds->num_ports; port++) {
-		struct sja1105_port *sp = &priv->ports[port];
-
-		if (enabled)
-			sp->xmit_tpid = priv->info->qinq_tpid;
-		else
-			sp->xmit_tpid = ETH_P_SJA1105;
-	}
-
 	if (priv->vlan_aware == enabled)
 		return 0;
 
@@ -2988,7 +2979,6 @@ static int sja1105_setup_ports(struct sja1105_private *priv)
 		}
 		sp->xmit_worker = worker;
 		skb_queue_head_init(&sp->xmit_queue);
-		sp->xmit_tpid = ETH_P_SJA1105;
 	}
 
 	return 0;
diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c
index 08cc5dbf2fa6..d60a530d0272 100644
--- a/drivers/net/dsa/sja1105/sja1105_spi.c
+++ b/drivers/net/dsa/sja1105/sja1105_spi.c
@@ -575,7 +575,6 @@ const struct sja1105_info sja1105e_info = {
 	.part_no		= SJA1105ET_PART_NO,
 	.static_ops		= sja1105e_table_ops,
 	.dyn_ops		= sja1105et_dyn_ops,
-	.qinq_tpid		= ETH_P_8021Q,
 	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= false,
 	.ptp_ts_bits		= 24,
@@ -608,7 +607,6 @@ const struct sja1105_info sja1105t_info = {
 	.part_no		= SJA1105ET_PART_NO,
 	.static_ops		= sja1105t_table_ops,
 	.dyn_ops		= sja1105et_dyn_ops,
-	.qinq_tpid		= ETH_P_8021Q,
 	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= false,
 	.ptp_ts_bits		= 24,
@@ -641,7 +639,6 @@ const struct sja1105_info sja1105p_info = {
 	.part_no		= SJA1105P_PART_NO,
 	.static_ops		= sja1105p_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
-	.qinq_tpid		= ETH_P_8021AD,
 	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= true,
 	.ptp_ts_bits		= 32,
@@ -675,7 +672,6 @@ const struct sja1105_info sja1105q_info = {
 	.part_no		= SJA1105Q_PART_NO,
 	.static_ops		= sja1105q_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
-	.qinq_tpid		= ETH_P_8021AD,
 	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= true,
 	.ptp_ts_bits		= 32,
@@ -709,7 +705,6 @@ const struct sja1105_info sja1105r_info = {
 	.part_no		= SJA1105R_PART_NO,
 	.static_ops		= sja1105r_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
-	.qinq_tpid		= ETH_P_8021AD,
 	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= true,
 	.ptp_ts_bits		= 32,
@@ -747,7 +742,6 @@ const struct sja1105_info sja1105s_info = {
 	.static_ops		= sja1105s_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
 	.regs			= &sja1105pqrs_regs,
-	.qinq_tpid		= ETH_P_8021AD,
 	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= true,
 	.ptp_ts_bits		= 32,
@@ -784,7 +778,6 @@ const struct sja1105_info sja1110a_info = {
 	.static_ops		= sja1110_table_ops,
 	.dyn_ops		= sja1110_dyn_ops,
 	.regs			= &sja1110_regs,
-	.qinq_tpid		= ETH_P_8021AD,
 	.tag_proto		= DSA_TAG_PROTO_SJA1110,
 	.can_limit_mcast_flood	= true,
 	.multiple_cascade_ports	= true,
@@ -835,7 +828,6 @@ const struct sja1105_info sja1110b_info = {
 	.static_ops		= sja1110_table_ops,
 	.dyn_ops		= sja1110_dyn_ops,
 	.regs			= &sja1110_regs,
-	.qinq_tpid		= ETH_P_8021AD,
 	.tag_proto		= DSA_TAG_PROTO_SJA1110,
 	.can_limit_mcast_flood	= true,
 	.multiple_cascade_ports	= true,
@@ -886,7 +878,6 @@ const struct sja1105_info sja1110c_info = {
 	.static_ops		= sja1110_table_ops,
 	.dyn_ops		= sja1110_dyn_ops,
 	.regs			= &sja1110_regs,
-	.qinq_tpid		= ETH_P_8021AD,
 	.tag_proto		= DSA_TAG_PROTO_SJA1110,
 	.can_limit_mcast_flood	= true,
 	.multiple_cascade_ports	= true,
@@ -937,7 +928,6 @@ const struct sja1105_info sja1110d_info = {
 	.static_ops		= sja1110_table_ops,
 	.dyn_ops		= sja1110_dyn_ops,
 	.regs			= &sja1110_regs,
-	.qinq_tpid		= ETH_P_8021AD,
 	.tag_proto		= DSA_TAG_PROTO_SJA1110,
 	.can_limit_mcast_flood	= true,
 	.multiple_cascade_ports	= true,
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 8c5601f1c979..171106202fe5 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -67,7 +67,6 @@ struct sja1105_port {
 	struct sja1105_tagger_data *data;
 	struct dsa_port *dp;
 	bool hwts_tx_en;
-	u16 xmit_tpid;
 };
 
 enum sja1110_meta_tstamp {
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index a49308fbd19f..c054f48541c8 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -133,14 +133,44 @@ static struct sk_buff *sja1105_defer_xmit(struct dsa_port *dp,
 	return NULL;
 }
 
+/* Send VLAN tags with a TPID that blends in with whatever VLAN protocol a
+ * bridge spanning ports of this switch might have.
+ */
 static u16 sja1105_xmit_tpid(struct dsa_port *dp)
 {
-	struct sja1105_port *sp = dp->priv;
+	struct dsa_switch *ds = dp->ds;
+	struct dsa_port *other_dp;
+	u16 proto;
+
+	/* Since VLAN awareness is global, then if this port is VLAN-unaware,
+	 * all ports are. Use the VLAN-unaware TPID used for tag_8021q.
+	 */
+	if (!dsa_port_is_vlan_filtering(dp))
+		return ETH_P_SJA1105;
+
+	/* Port is VLAN-aware, so there is a bridge somewhere (a single one,
+	 * we're sure about that). It may not be on this port though, so we
+	 * need to find it.
+	 */
+	list_for_each_entry(other_dp, &ds->dst->ports, list) {
+		if (other_dp->ds != ds)
+			continue;
+
+		if (!other_dp->bridge_dev)
+			continue;
+
+		/* Error is returned only if CONFIG_BRIDGE_VLAN_FILTERING,
+		 * which seems pointless to handle, as our port cannot become
+		 * VLAN-aware in that case.
+		 */
+		br_vlan_get_proto(other_dp->bridge_dev, &proto);
+
+		return proto;
+	}
 
-	if (unlikely(!dsa_port_is_sja1105(dp)))
-		return ETH_P_8021Q;
+	WARN_ONCE(1, "Port is VLAN-aware but cannot find associated bridge!\n");
 
-	return sp->xmit_tpid;
+	return ETH_P_SJA1105;
 }
 
 static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb,
-- 
cgit v1.2.3


From 87e5ef4b19cec86c861e3ebab3a5d840ecc2f4a4 Mon Sep 17 00:00:00 2001
From: Shaokun Zhang <zhangshaokun@hisilicon.com>
Date: Wed, 25 Aug 2021 14:34:31 +0800
Subject: mctp: Remove the repeated declaration

Function 'mctp_dev_get_rtnl' is declared twice, so remove the
repeated declaration.

Cc: Jeremy Kerr <jk@codeconstruct.com.au>
Cc: Matt Johnston <matt@codeconstruct.com.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mctpdevice.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h
index 57e773ff08bb..71a11012fac7 100644
--- a/include/net/mctpdevice.h
+++ b/include/net/mctpdevice.h
@@ -31,6 +31,5 @@ struct mctp_dev {
 
 struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev);
 struct mctp_dev *__mctp_dev_get(const struct net_device *dev);
-struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev);
 
 #endif /* __NET_MCTPDEVICE_H */
-- 
cgit v1.2.3


From 478374a3c15f369e57fdd79d64d7a1d2eb307e16 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 16 Aug 2021 17:16:22 +0200
Subject: netfilter: ecache: remove one indent level

nf_conntrack_eventmask_report and nf_ct_deliver_cached_events shared
most of their code.  This unifies the layout by changing

 if (nf_ct_is_confirmed(ct)) {
   foo
 }

 to
 if (!nf_ct_is_confirmed(ct)))
   return
 foo

This removes one level of indentation.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_ecache.h |  2 +-
 net/netfilter/nf_conntrack_ecache.c         | 64 +++++++++++++++--------------
 net/netfilter/nf_conntrack_netlink.c        |  2 +-
 3 files changed, 36 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index d00ba6048e44..3734bacf9763 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -73,7 +73,7 @@ struct nf_ct_event {
 };
 
 struct nf_ct_event_notifier {
-	int (*fcn)(unsigned int events, struct nf_ct_event *item);
+	int (*fcn)(unsigned int events, const struct nf_ct_event *item);
 };
 
 int nf_conntrack_register_notifier(struct net *net,
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 296e4a171bd1..3f1e0add58bc 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -133,10 +133,15 @@ static void ecache_work(struct work_struct *work)
 int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
 				  u32 portid, int report)
 {
-	int ret = 0;
 	struct net *net = nf_ct_net(ct);
 	struct nf_ct_event_notifier *notify;
 	struct nf_conntrack_ecache *e;
+	struct nf_ct_event item;
+	unsigned long missed;
+	int ret = 0;
+
+	if (!nf_ct_is_confirmed(ct))
+		return ret;
 
 	rcu_read_lock();
 	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
@@ -147,38 +152,37 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
 	if (!e)
 		goto out_unlock;
 
-	if (nf_ct_is_confirmed(ct)) {
-		struct nf_ct_event item = {
-			.ct	= ct,
-			.portid	= e->portid ? e->portid : portid,
-			.report = report
-		};
-		/* This is a resent of a destroy event? If so, skip missed */
-		unsigned long missed = e->portid ? 0 : e->missed;
-
-		if (!((eventmask | missed) & e->ctmask))
-			goto out_unlock;
-
-		ret = notify->fcn(eventmask | missed, &item);
-		if (unlikely(ret < 0 || missed)) {
-			spin_lock_bh(&ct->lock);
-			if (ret < 0) {
-				/* This is a destroy event that has been
-				 * triggered by a process, we store the PORTID
-				 * to include it in the retransmission.
-				 */
-				if (eventmask & (1 << IPCT_DESTROY)) {
-					if (e->portid == 0 && portid != 0)
-						e->portid = portid;
-					e->state = NFCT_ECACHE_DESTROY_FAIL;
-				} else {
-					e->missed |= eventmask;
-				}
+	memset(&item, 0, sizeof(item));
+
+	item.ct = ct;
+	item.portid = e->portid ? e->portid : portid;
+	item.report = report;
+
+	/* This is a resent of a destroy event? If so, skip missed */
+	missed = e->portid ? 0 : e->missed;
+
+	if (!((eventmask | missed) & e->ctmask))
+		goto out_unlock;
+
+	ret = notify->fcn(eventmask | missed, &item);
+	if (unlikely(ret < 0 || missed)) {
+		spin_lock_bh(&ct->lock);
+		if (ret < 0) {
+			/* This is a destroy event that has been
+			 * triggered by a process, we store the PORTID
+			 * to include it in the retransmission.
+			 */
+			if (eventmask & (1 << IPCT_DESTROY)) {
+				if (e->portid == 0 && portid != 0)
+					e->portid = portid;
+				e->state = NFCT_ECACHE_DESTROY_FAIL;
 			} else {
-				e->missed &= ~missed;
+				e->missed |= eventmask;
 			}
-			spin_unlock_bh(&ct->lock);
+		} else {
+			e->missed &= ~missed;
 		}
+		spin_unlock_bh(&ct->lock);
 	}
 out_unlock:
 	rcu_read_unlock();
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index eb35c6151fb0..43b891a902de 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -706,7 +706,7 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
 }
 
 static int
-ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
+ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
 {
 	const struct nf_conntrack_zone *zone;
 	struct net *net;
-- 
cgit v1.2.3


From b86c0e6429dac2458694495aeebf15f4fe6b269d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 16 Aug 2021 17:16:25 +0200
Subject: netfilter: ecache: prepare for event notifier merge

This prepares for merge for ct and exp notifier structs.

The 'fcn' member is renamed to something unique.
Second, the register/unregister api is simplified.  There is only
one implementation so there is no need to do any error checking.

Replace the EBUSY logic with WARN_ON_ONCE.  This allows to remove
error unwinding.

The exp notifier register/unregister function is removed in
a followup patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_ecache.h | 11 +++++------
 net/netfilter/nf_conntrack_ecache.c         | 26 ++++++--------------------
 net/netfilter/nf_conntrack_netlink.c        | 22 ++++++----------------
 3 files changed, 17 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 3734bacf9763..061a93a03b82 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -73,13 +73,12 @@ struct nf_ct_event {
 };
 
 struct nf_ct_event_notifier {
-	int (*fcn)(unsigned int events, const struct nf_ct_event *item);
+	int (*ct_event)(unsigned int events, const struct nf_ct_event *item);
 };
 
-int nf_conntrack_register_notifier(struct net *net,
-				   struct nf_ct_event_notifier *nb);
-void nf_conntrack_unregister_notifier(struct net *net,
-				      struct nf_ct_event_notifier *nb);
+void nf_conntrack_register_notifier(struct net *net,
+				   const struct nf_ct_event_notifier *nb);
+void nf_conntrack_unregister_notifier(struct net *net);
 
 void nf_ct_deliver_cached_events(struct nf_conn *ct);
 int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
@@ -159,7 +158,7 @@ struct nf_exp_event {
 };
 
 struct nf_exp_event_notifier {
-	int (*fcn)(unsigned int events, struct nf_exp_event *item);
+	int (*exp_event)(unsigned int events, struct nf_exp_event *item);
 };
 
 int nf_ct_expect_register_notifier(struct net *net,
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index fbe04e16280a..d92f78e4bc7c 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -151,7 +151,7 @@ static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
 		return 0;
 	}
 
-	ret = notify->fcn(events | missed, item);
+	ret = notify->ct_event(events | missed, item);
 	rcu_read_unlock();
 
 	if (likely(ret >= 0 && missed == 0))
@@ -258,43 +258,29 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
 			.portid	= portid,
 			.report = report
 		};
-		notify->fcn(1 << event, &item);
+		notify->exp_event(1 << event, &item);
 	}
 out_unlock:
 	rcu_read_unlock();
 }
 
-int nf_conntrack_register_notifier(struct net *net,
-				   struct nf_ct_event_notifier *new)
+void nf_conntrack_register_notifier(struct net *net,
+				    const struct nf_ct_event_notifier *new)
 {
-	int ret;
 	struct nf_ct_event_notifier *notify;
 
 	mutex_lock(&nf_ct_ecache_mutex);
 	notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
 					   lockdep_is_held(&nf_ct_ecache_mutex));
-	if (notify != NULL) {
-		ret = -EBUSY;
-		goto out_unlock;
-	}
+	WARN_ON_ONCE(notify);
 	rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
-	ret = 0;
-
-out_unlock:
 	mutex_unlock(&nf_ct_ecache_mutex);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
 
-void nf_conntrack_unregister_notifier(struct net *net,
-				      struct nf_ct_event_notifier *new)
+void nf_conntrack_unregister_notifier(struct net *net)
 {
-	struct nf_ct_event_notifier *notify;
-
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
-					   lockdep_is_held(&nf_ct_ecache_mutex));
-	BUG_ON(notify != new);
 	RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
 	/* synchronize_rcu() is called from ctnetlink_exit. */
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 43b891a902de..6d6f7cd70753 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -3755,11 +3755,11 @@ static int ctnetlink_stat_exp_cpu(struct sk_buff *skb,
 
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 static struct nf_ct_event_notifier ctnl_notifier = {
-	.fcn = ctnetlink_conntrack_event,
+	.ct_event = ctnetlink_conntrack_event,
 };
 
 static struct nf_exp_event_notifier ctnl_notifier_exp = {
-	.fcn = ctnetlink_expect_event,
+	.exp_event = ctnetlink_expect_event,
 };
 #endif
 
@@ -3854,33 +3854,23 @@ static int __net_init ctnetlink_net_init(struct net *net)
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	int ret;
 
-	ret = nf_conntrack_register_notifier(net, &ctnl_notifier);
-	if (ret < 0) {
-		pr_err("ctnetlink_init: cannot register notifier.\n");
-		goto err_out;
-	}
+	nf_conntrack_register_notifier(net, &ctnl_notifier);
 
 	ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp);
 	if (ret < 0) {
 		pr_err("ctnetlink_init: cannot expect register notifier.\n");
-		goto err_unreg_notifier;
+		nf_conntrack_unregister_notifier(net);
+		return ret;
 	}
 #endif
 	return 0;
-
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-err_unreg_notifier:
-	nf_conntrack_unregister_notifier(net, &ctnl_notifier);
-err_out:
-	return ret;
-#endif
 }
 
 static void ctnetlink_net_exit(struct net *net)
 {
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp);
-	nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+	nf_conntrack_unregister_notifier(net);
 #endif
 }
 
-- 
cgit v1.2.3


From bd1431db0b8131098a285c8cc6a357629b4362e5 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 16 Aug 2021 17:16:26 +0200
Subject: netfilter: ecache: remove nf_exp_event_notifier structure

Reuse the conntrack event notofier struct, this allows to remove the
extra register/unregister functions and avoids a pointer in struct net.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_ecache.h | 23 +++++----------
 include/net/netns/conntrack.h               |  1 -
 net/netfilter/nf_conntrack_ecache.c         | 43 ++---------------------------
 net/netfilter/nf_conntrack_netlink.c        | 30 ++------------------
 4 files changed, 13 insertions(+), 84 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 061a93a03b82..d932e22edcb4 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -72,8 +72,15 @@ struct nf_ct_event {
 	int report;
 };
 
+struct nf_exp_event {
+	struct nf_conntrack_expect *exp;
+	u32 portid;
+	int report;
+};
+
 struct nf_ct_event_notifier {
 	int (*ct_event)(unsigned int events, const struct nf_ct_event *item);
+	int (*exp_event)(unsigned int events, const struct nf_exp_event *item);
 };
 
 void nf_conntrack_register_notifier(struct net *net,
@@ -150,22 +157,6 @@ nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct)
 }
 
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
-
-struct nf_exp_event {
-	struct nf_conntrack_expect *exp;
-	u32 portid;
-	int report;
-};
-
-struct nf_exp_event_notifier {
-	int (*exp_event)(unsigned int events, struct nf_exp_event *item);
-};
-
-int nf_ct_expect_register_notifier(struct net *net,
-				   struct nf_exp_event_notifier *nb);
-void nf_ct_expect_unregister_notifier(struct net *net,
-				      struct nf_exp_event_notifier *nb);
-
 void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
 			       struct nf_conntrack_expect *exp,
 			       u32 portid, int report);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index fefd38db95b3..0294f3d473af 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -113,7 +113,6 @@ struct netns_ct {
 	struct ct_pcpu __percpu *pcpu_lists;
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
-	struct nf_exp_event_notifier __rcu *nf_expect_event_cb;
 	struct nf_ip_net	nf_ct_proto;
 #if defined(CONFIG_NF_CONNTRACK_LABELS)
 	unsigned int		labels_used;
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index d92f78e4bc7c..41768ff19464 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -240,11 +240,11 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
 
 {
 	struct net *net = nf_ct_exp_net(exp);
-	struct nf_exp_event_notifier *notify;
+	struct nf_ct_event_notifier *notify;
 	struct nf_conntrack_ecache *e;
 
 	rcu_read_lock();
-	notify = rcu_dereference(net->ct.nf_expect_event_cb);
+	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
 	if (!notify)
 		goto out_unlock;
 
@@ -283,47 +283,10 @@ void nf_conntrack_unregister_notifier(struct net *net)
 	mutex_lock(&nf_ct_ecache_mutex);
 	RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
-	/* synchronize_rcu() is called from ctnetlink_exit. */
+	/* synchronize_rcu() is called after netns pre_exit */
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
 
-int nf_ct_expect_register_notifier(struct net *net,
-				   struct nf_exp_event_notifier *new)
-{
-	int ret;
-	struct nf_exp_event_notifier *notify;
-
-	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
-					   lockdep_is_held(&nf_ct_ecache_mutex));
-	if (notify != NULL) {
-		ret = -EBUSY;
-		goto out_unlock;
-	}
-	rcu_assign_pointer(net->ct.nf_expect_event_cb, new);
-	ret = 0;
-
-out_unlock:
-	mutex_unlock(&nf_ct_ecache_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
-
-void nf_ct_expect_unregister_notifier(struct net *net,
-				      struct nf_exp_event_notifier *new)
-{
-	struct nf_exp_event_notifier *notify;
-
-	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
-					   lockdep_is_held(&nf_ct_ecache_mutex));
-	BUG_ON(notify != new);
-	RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
-	mutex_unlock(&nf_ct_ecache_mutex);
-	/* synchronize_rcu() is called from ctnetlink_exit. */
-}
-EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
-
 void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
 {
 	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 6d6f7cd70753..5008fa0891b3 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -3104,7 +3104,7 @@ nla_put_failure:
 
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 static int
-ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
+ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item)
 {
 	struct nf_conntrack_expect *exp = item->exp;
 	struct net *net = nf_ct_exp_net(exp);
@@ -3756,9 +3756,6 @@ static int ctnetlink_stat_exp_cpu(struct sk_buff *skb,
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 static struct nf_ct_event_notifier ctnl_notifier = {
 	.ct_event = ctnetlink_conntrack_event,
-};
-
-static struct nf_exp_event_notifier ctnl_notifier_exp = {
 	.exp_event = ctnetlink_expect_event,
 };
 #endif
@@ -3852,42 +3849,21 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
 static int __net_init ctnetlink_net_init(struct net *net)
 {
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
-	int ret;
-
 	nf_conntrack_register_notifier(net, &ctnl_notifier);
-
-	ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp);
-	if (ret < 0) {
-		pr_err("ctnetlink_init: cannot expect register notifier.\n");
-		nf_conntrack_unregister_notifier(net);
-		return ret;
-	}
 #endif
 	return 0;
 }
 
-static void ctnetlink_net_exit(struct net *net)
+static void ctnetlink_net_pre_exit(struct net *net)
 {
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
-	nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp);
 	nf_conntrack_unregister_notifier(net);
 #endif
 }
 
-static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list)
-{
-	struct net *net;
-
-	list_for_each_entry(net, net_exit_list, exit_list)
-		ctnetlink_net_exit(net);
-
-	/* wait for other cpus until they are done with ctnl_notifiers */
-	synchronize_rcu();
-}
-
 static struct pernet_operations ctnetlink_net_ops = {
 	.init		= ctnetlink_net_init,
-	.exit_batch	= ctnetlink_net_exit_batch,
+	.pre_exit	= ctnetlink_net_pre_exit,
 };
 
 static int __init ctnetlink_init(void)
-- 
cgit v1.2.3


From 1b07d00a15d6a96d1a36b6a284c4fd5f2e2fa383 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Mon, 23 Aug 2021 19:43:46 -0700
Subject: bpf: Add BTF_ID_LIST_GLOBAL_SINGLE macro

Same as BTF_ID_LIST_SINGLE macro except defines a global ID.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/a867a97517df42fd3953eeb5454402b57e74538f.1629772842.git.dxu@dxuuu.xyz
---
 include/linux/btf_ids.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index bed4b9964581..6d1395030616 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -82,6 +82,9 @@ __BTF_ID_LIST(name, globl)
 #define BTF_ID_LIST_SINGLE(name, prefix, typename)	\
 	BTF_ID_LIST(name) \
 	BTF_ID(prefix, typename)
+#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) \
+	BTF_ID_LIST_GLOBAL(name) \
+	BTF_ID(prefix, typename)
 
 /*
  * The BTF_ID_UNUSED macro defines 4 zero bytes.
-- 
cgit v1.2.3


From 33c5cb36015ac1034b50b823fae367e908d05147 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Mon, 23 Aug 2021 19:43:47 -0700
Subject: bpf: Consolidate task_struct BTF_ID declarations

No need to have it defined 5 times. Once is enough.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/6dcefa5bed26fe1226f26683f36819bb53ec19a2.1629772842.git.dxu@dxuuu.xyz
---
 include/linux/btf_ids.h       |  2 ++
 kernel/bpf/bpf_task_storage.c |  6 ++----
 kernel/bpf/stackmap.c         |  4 +---
 kernel/bpf/task_iter.c        | 11 +++++------
 kernel/trace/bpf_trace.c      |  4 ++--
 5 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 6d1395030616..93d881ab0d48 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -188,4 +188,6 @@ MAX_BTF_SOCK_TYPE,
 extern u32 btf_sock_ids[];
 #endif
 
+extern u32 btf_task_struct_ids[];
+
 #endif
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 3ce75758d394..ebfa8bc90892 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -317,15 +317,13 @@ const struct bpf_map_ops task_storage_map_ops = {
 	.map_owner_storage_ptr = task_storage_ptr,
 };
 
-BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct)
-
 const struct bpf_func_proto bpf_task_storage_get_proto = {
 	.func = bpf_task_storage_get,
 	.gpl_only = false,
 	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg1_type = ARG_CONST_MAP_PTR,
 	.arg2_type = ARG_PTR_TO_BTF_ID,
-	.arg2_btf_id = &bpf_task_storage_btf_ids[0],
+	.arg2_btf_id = &btf_task_struct_ids[0],
 	.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg4_type = ARG_ANYTHING,
 };
@@ -336,5 +334,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
 	.ret_type = RET_INTEGER,
 	.arg1_type = ARG_CONST_MAP_PTR,
 	.arg2_type = ARG_PTR_TO_BTF_ID,
-	.arg2_btf_id = &bpf_task_storage_btf_ids[0],
+	.arg2_btf_id = &btf_task_struct_ids[0],
 };
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 6fbc2abe9c91..e8eefdf8cf3e 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -530,14 +530,12 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
 	return res;
 }
 
-BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)
-
 const struct bpf_func_proto bpf_get_task_stack_proto = {
 	.func		= bpf_get_task_stack,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_BTF_ID,
-	.arg1_btf_id	= &bpf_get_task_stack_btf_ids[0],
+	.arg1_btf_id	= &btf_task_struct_ids[0],
 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
 	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg4_type	= ARG_ANYTHING,
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index b68cb5d6d6eb..b48750bfba5a 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -525,7 +525,6 @@ static const struct seq_operations task_vma_seq_ops = {
 };
 
 BTF_ID_LIST(btf_task_file_ids)
-BTF_ID(struct, task_struct)
 BTF_ID(struct, file)
 BTF_ID(struct, vm_area_struct)
 
@@ -591,19 +590,19 @@ static int __init task_iter_init(void)
 {
 	int ret;
 
-	task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
+	task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
 	ret = bpf_iter_reg_target(&task_reg_info);
 	if (ret)
 		return ret;
 
-	task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
-	task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
+	task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
+	task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0];
 	ret =  bpf_iter_reg_target(&task_file_reg_info);
 	if (ret)
 		return ret;
 
-	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
-	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2];
+	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
+	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
 	return bpf_iter_reg_target(&task_vma_reg_info);
 }
 late_initcall(task_iter_init);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cbc73c08c4a4..50d055fc2327 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -714,13 +714,13 @@ BPF_CALL_0(bpf_get_current_task_btf)
 	return (unsigned long) current;
 }
 
-BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct)
+BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
 
 static const struct bpf_func_proto bpf_get_current_task_btf_proto = {
 	.func		= bpf_get_current_task_btf,
 	.gpl_only	= true,
 	.ret_type	= RET_PTR_TO_BTF_ID,
-	.ret_btf_id	= &bpf_get_current_btf_ids[0],
+	.ret_btf_id	= &btf_task_struct_ids[0],
 };
 
 BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
-- 
cgit v1.2.3


From dd6e10fbd9fb86a571d925602c8a24bb4d09a2a7 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Mon, 23 Aug 2021 19:43:49 -0700
Subject: bpf: Add bpf_task_pt_regs() helper

The motivation behind this helper is to access userspace pt_regs in a
kprobe handler.

uprobe's ctx is the userspace pt_regs. kprobe's ctx is the kernelspace
pt_regs. bpf_task_pt_regs() allows accessing userspace pt_regs in a
kprobe handler. The final case (kernelspace pt_regs in uprobe) is
pretty rare (usermode helper) so I think that can be solved later if
necessary.

More concretely, this helper is useful in doing BPF-based DWARF stack
unwinding. Currently the kernel can only do framepointer based stack
unwinds for userspace code. This is because the DWARF state machines are
too fragile to be computed in kernelspace [0]. The idea behind
DWARF-based stack unwinds w/ BPF is to copy a chunk of the userspace
stack (while in prog context) and send it up to userspace for unwinding
(probably with libunwind) [1]. This would effectively enable profiling
applications with -fomit-frame-pointer using kprobes and uprobes.

[0]: https://lkml.org/lkml/2012/2/10/356
[1]: https://github.com/danobi/bpf-dwarf-walk

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/e2718ced2d51ef4268590ab8562962438ab82815.1629772842.git.dxu@dxuuu.xyz
---
 include/uapi/linux/bpf.h       |  7 +++++++
 kernel/bpf/helpers.c           |  3 +++
 kernel/trace/bpf_trace.c       | 19 +++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  7 +++++++
 4 files changed, 36 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 191f0b286ee3..791f31dd0abe 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4871,6 +4871,12 @@ union bpf_attr {
  * 	Return
  *		Value specified by user at BPF link creation/attachment time
  *		or 0, if it was not specified.
+ *
+ * long bpf_task_pt_regs(struct task_struct *task)
+ *	Description
+ *		Get the struct pt_regs associated with **task**.
+ *	Return
+ *		A pointer to struct pt_regs.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5048,6 +5054,7 @@ union bpf_attr {
 	FN(timer_cancel),		\
 	FN(get_func_ip),		\
 	FN(get_attach_cookie),		\
+	FN(task_pt_regs),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 609674f409ed..c227b7d4f56c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1327,6 +1327,7 @@ const struct bpf_func_proto bpf_probe_read_user_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
 const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
 const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
+const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
 
 const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
@@ -1424,6 +1425,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_snprintf_btf_proto;
 	case BPF_FUNC_snprintf:
 		return &bpf_snprintf_proto;
+	case BPF_FUNC_task_pt_regs:
+		return &bpf_task_pt_regs_proto;
 	default:
 		return NULL;
 	}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4e54f3dc209f..580e14ee7ff9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -723,6 +723,23 @@ const struct bpf_func_proto bpf_get_current_task_btf_proto = {
 	.ret_btf_id	= &btf_task_struct_ids[0],
 };
 
+BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
+{
+	return (unsigned long) task_pt_regs(task);
+}
+
+BTF_ID_LIST(bpf_task_pt_regs_ids)
+BTF_ID(struct, pt_regs)
+
+const struct bpf_func_proto bpf_task_pt_regs_proto = {
+	.func		= bpf_task_pt_regs,
+	.gpl_only	= true,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id	= &btf_task_struct_ids[0],
+	.ret_type	= RET_PTR_TO_BTF_ID,
+	.ret_btf_id	= &bpf_task_pt_regs_ids[0],
+};
+
 BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -1032,6 +1049,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_current_task_proto;
 	case BPF_FUNC_get_current_task_btf:
 		return &bpf_get_current_task_btf_proto;
+	case BPF_FUNC_task_pt_regs:
+		return &bpf_task_pt_regs_proto;
 	case BPF_FUNC_get_current_uid_gid:
 		return &bpf_get_current_uid_gid_proto;
 	case BPF_FUNC_get_current_comm:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 191f0b286ee3..791f31dd0abe 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4871,6 +4871,12 @@ union bpf_attr {
  * 	Return
  *		Value specified by user at BPF link creation/attachment time
  *		or 0, if it was not specified.
+ *
+ * long bpf_task_pt_regs(struct task_struct *task)
+ *	Description
+ *		Get the struct pt_regs associated with **task**.
+ *	Return
+ *		A pointer to struct pt_regs.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5048,6 +5054,7 @@ union bpf_attr {
 	FN(timer_cancel),		\
 	FN(get_func_ip),		\
 	FN(get_attach_cookie),		\
+	FN(task_pt_regs),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From eb529c5b10b9401a0f2d1f469e82c6a0ba98082c Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Wed, 25 Aug 2021 18:48:31 -0700
Subject: bpf: Fix bpf-next builds without CONFIG_BPF_EVENTS

This commit fixes linker errors along the lines of:

    s390-linux-ld: task_iter.c:(.init.text+0xa4): undefined reference to `btf_task_struct_ids'`

Fix by defining btf_task_struct_ids unconditionally in kernel/bpf/btf.c
since there exists code that unconditionally uses btf_task_struct_ids.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/05d94748d9f4b3eecedc4fddd6875418a396e23c.1629942444.git.dxu@dxuuu.xyz
---
 include/linux/btf_ids.h  | 1 +
 kernel/bpf/btf.c         | 2 ++
 kernel/trace/bpf_trace.c | 2 --
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 93d881ab0d48..47d9abfbdb55 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -151,6 +151,7 @@ extern struct btf_id_set name;
 #define BTF_ID_UNUSED
 #define BTF_ID_LIST_GLOBAL(name) u32 name[1];
 #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1];
+#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1];
 #define BTF_SET_START(name) static struct btf_id_set name = { 0 };
 #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 };
 #define BTF_SET_END(name)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c395024610ed..dfe61df4f974 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6213,3 +6213,5 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
 	.arg3_type	= ARG_ANYTHING,
 	.arg4_type	= ARG_ANYTHING,
 };
+
+BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 580e14ee7ff9..8e2eb950aa82 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -714,8 +714,6 @@ BPF_CALL_0(bpf_get_current_task_btf)
 	return (unsigned long) current;
 }
 
-BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
-
 const struct bpf_func_proto bpf_get_current_task_btf_proto = {
 	.func		= bpf_get_current_task_btf,
 	.gpl_only	= true,
-- 
cgit v1.2.3


From a1ef61825469b874920f4afb889e1a92353680ff Mon Sep 17 00:00:00 2001
From: Wen Gong <wgong@codeaurora.org>
Date: Fri, 20 Aug 2021 08:20:35 -0400
Subject: ieee80211: add definition of regulatory info in 6 GHz operation
 information
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

IEEE Std 802.11ax™-2021 added regulatory info subfield in HE operation
element, add it to the header file.

Signed-off-by: Wen Gong <wgong@codeaurora.org>
Link: https://lore.kernel.org/r/20210820122041.12157-3-wgong@codeaurora.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 2e8953d80d4b..f91cb15a74e7 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2312,6 +2312,9 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info)
 #define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR		0x40000000
 #define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED		0x80000000
 
+#define IEEE80211_6GHZ_CTRL_REG_LPI_AP	0
+#define IEEE80211_6GHZ_CTRL_REG_SP_AP	1
+
 /**
  * ieee80211_he_6ghz_oper - HE 6 GHz operation Information field
  * @primary: primary channel
@@ -2328,6 +2331,7 @@ struct ieee80211_he_6ghz_oper {
 #define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ	2
 #define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ	3
 #define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON	0x4
+#define IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO	0x38
 	u8 control;
 	u8 ccfs0;
 	u8 ccfs1;
-- 
cgit v1.2.3


From ad31393b98e4addbc5f1ccc484bfbb8d07c92056 Mon Sep 17 00:00:00 2001
From: Wen Gong <wgong@codeaurora.org>
Date: Fri, 20 Aug 2021 08:20:39 -0400
Subject: ieee80211: add definition for transmit power envelope element
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

IEEE Std 802.11ax™-2021 makes changes to the transmit power envelope
element, adjust the code accordingly.

Signed-off-by: Wen Gong <wgong@codeaurora.org>
Link: https://lore.kernel.org/r/20210820122041.12157-7-wgong@codeaurora.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index f91cb15a74e7..694264503119 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2338,6 +2338,44 @@ struct ieee80211_he_6ghz_oper {
 	u8 minrate;
 } __packed;
 
+/*
+ * In "9.4.2.161 Transmit Power Envelope element" of "IEEE Std 802.11ax-2021",
+ * it show four types in "Table 9-275a-Maximum Transmit Power Interpretation
+ * subfield encoding", and two category for each type in "Table E-12-Regulatory
+ * Info subfield encoding in the United States".
+ * So it it totally max 8 Transmit Power Envelope element.
+ */
+#define IEEE80211_TPE_MAX_IE_COUNT	8
+/*
+ * In "Table 9-277—Meaning of Maximum Transmit Power Count subfield"
+ * of "IEEE Std 802.11ax™‐2021", the max power level is 8.
+ */
+#define IEEE80211_MAX_NUM_PWR_LEVEL	8
+
+#define IEEE80211_TPE_MAX_POWER_COUNT	8
+
+/* transmit power interpretation type of transmit power envelope element */
+enum ieee80211_tx_power_intrpt_type {
+	IEEE80211_TPE_LOCAL_EIRP,
+	IEEE80211_TPE_LOCAL_EIRP_PSD,
+	IEEE80211_TPE_REG_CLIENT_EIRP,
+	IEEE80211_TPE_REG_CLIENT_EIRP_PSD,
+};
+
+/**
+ * struct ieee80211_tx_pwr_env
+ *
+ * This structure represents the "Transmit Power Envelope element"
+ */
+struct ieee80211_tx_pwr_env {
+	u8 tx_power_info;
+	s8 tx_power[IEEE80211_TPE_MAX_POWER_COUNT];
+} __packed;
+
+#define IEEE80211_TX_PWR_ENV_INFO_COUNT 0x7
+#define IEEE80211_TX_PWR_ENV_INFO_INTERPRET 0x38
+#define IEEE80211_TX_PWR_ENV_INFO_CATEGORY 0xC0
+
 /*
  * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size
  * @he_oper_ie: byte data of the He Operations IE, stating from the byte
@@ -2919,7 +2957,7 @@ enum ieee80211_eid {
 	WLAN_EID_VHT_OPERATION = 192,
 	WLAN_EID_EXTENDED_BSS_LOAD = 193,
 	WLAN_EID_WIDE_BW_CHANNEL_SWITCH = 194,
-	WLAN_EID_VHT_TX_POWER_ENVELOPE = 195,
+	WLAN_EID_TX_POWER_ENVELOPE = 195,
 	WLAN_EID_CHANNEL_SWITCH_WRAPPER = 196,
 	WLAN_EID_AID = 197,
 	WLAN_EID_QUIET_CHANNEL = 198,
-- 
cgit v1.2.3


From 723783d077e39c256a1fafebbd97cbb14207c28f Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Thu, 26 Aug 2021 10:49:47 +0800
Subject: sock: remove one redundant SKB_FRAG_PAGE_ORDER macro

Both SKB_FRAG_PAGE_ORDER are defined to the same value in
net/core/sock.c and drivers/vhost/net.c.

Move the SKB_FRAG_PAGE_ORDER definition to net/core/sock.h,
as both net/core/sock.c and drivers/vhost/net.c include it,
and it seems a reasonable file to put the macro.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/net.c | 2 --
 include/net/sock.h  | 1 +
 net/core/sock.c     | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6414bd5741b8..3a249ee7e144 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -643,8 +643,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
 	       !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-#define SKB_FRAG_PAGE_ORDER     get_order(32768)
-
 static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
 				       struct page_frag *pfrag, gfp_t gfp)
 {
diff --git a/include/net/sock.h b/include/net/sock.h
index 95b25777b53e..66a9a90f9558 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2717,6 +2717,7 @@ extern int sysctl_optmem_max;
 extern __u32 sysctl_wmem_default;
 extern __u32 sysctl_rmem_default;
 
+#define SKB_FRAG_PAGE_ORDER	get_order(32768)
 DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
 
 static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
diff --git a/net/core/sock.c b/net/core/sock.c
index 950f1e70dbf5..62627e868e03 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2574,7 +2574,6 @@ static void sk_leave_memory_pressure(struct sock *sk)
 	}
 }
 
-#define SKB_FRAG_PAGE_ORDER	get_order(32768)
 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
 
 /**
-- 
cgit v1.2.3


From d0efb16294d145d157432feda83877ae9d7cdf37 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Thu, 26 Aug 2021 12:46:01 -0700
Subject: net: don't unconditionally copy_from_user a struct ifreq for socket
 ioctls

A common implementation of isatty(3) involves calling a ioctl passing
a dummy struct argument and checking whether the syscall failed --
bionic and glibc use TCGETS (passing a struct termios), and musl uses
TIOCGWINSZ (passing a struct winsize). If the FD is a socket, we will
copy sizeof(struct ifreq) bytes of data from the argument and return
-EFAULT if that fails. The result is that the isatty implementations
may return a non-POSIX-compliant value in errno in the case where part
of the dummy struct argument is inaccessible, as both struct termios
and struct winsize are smaller than struct ifreq (at least on arm64).

Although there is usually enough stack space following the argument
on the stack that this did not present a practical problem up to now,
with MTE stack instrumentation it's more likely for the copy to fail,
as the memory following the struct may have a different tag.

Fix the problem by adding an early check for whether the ioctl is a
valid socket ioctl, and return -ENOTTY if it isn't.

Fixes: 44c02a2c3dc5 ("dev_ioctl(): move copyin/copyout to callers")
Link: https://linux-review.googlesource.com/id/I869da6cf6daabc3e4b7b82ac979683ba05e27d4d
Signed-off-by: Peter Collingbourne <pcc@google.com>
Cc: <stable@vger.kernel.org> # 4.19
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++++
 net/socket.c              | 6 +++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eaf5bb008aa9..d65ce093e5a7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4012,6 +4012,10 @@ int netdev_rx_handler_register(struct net_device *dev,
 void netdev_rx_handler_unregister(struct net_device *dev);
 
 bool dev_valid_name(const char *name);
+static inline bool is_socket_ioctl_cmd(unsigned int cmd)
+{
+	return _IOC_TYPE(cmd) == SOCK_IOC_TYPE;
+}
 int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
 		bool *need_copyout);
 int dev_ifconf(struct net *net, struct ifconf *, int);
diff --git a/net/socket.c b/net/socket.c
index 0b2dad3bdf7f..8808b3617dac 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1109,7 +1109,7 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
 		rtnl_unlock();
 		if (!err && copy_to_user(argp, &ifc, sizeof(struct ifconf)))
 			err = -EFAULT;
-	} else {
+	} else if (is_socket_ioctl_cmd(cmd)) {
 		struct ifreq ifr;
 		bool need_copyout;
 		if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
@@ -1118,6 +1118,8 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
 		if (!err && need_copyout)
 			if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
 				return -EFAULT;
+	} else {
+		err = -ENOTTY;
 	}
 	return err;
 }
@@ -3306,6 +3308,8 @@ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
 	struct ifreq ifreq;
 	u32 data32;
 
+	if (!is_socket_ioctl_cmd(cmd))
+		return -ENOTTY;
 	if (copy_from_user(ifreq.ifr_name, u_ifreq32->ifr_name, IFNAMSIZ))
 		return -EFAULT;
 	if (get_user(data32, &u_ifreq32->ifr_data))
-- 
cgit v1.2.3


From 49b99da2c9ce13ffcd93fe3a0f5670791c1d76f7 Mon Sep 17 00:00:00 2001
From: Rocco Yue <rocco.yue@mediatek.com>
Date: Fri, 27 Aug 2021 23:04:12 +0800
Subject: ipv6: add IFLA_INET6_RA_MTU to expose mtu value

The kernel provides a "/proc/sys/net/ipv6/conf/<iface>/mtu"
file, which can temporarily record the mtu value of the last
received RA message when the RA mtu value is lower than the
interface mtu, but this proc has following limitations:

(1) when the interface mtu (/sys/class/net/<iface>/mtu) is
updeated, mtu6 (/proc/sys/net/ipv6/conf/<iface>/mtu) will
be updated to the value of interface mtu;
(2) mtu6 (/proc/sys/net/ipv6/conf/<iface>/mtu) only affect
ipv6 connection, and not affect ipv4.

Therefore, when the mtu option is carried in the RA message,
there will be a problem that the user sometimes cannot obtain
RA mtu value correctly by reading mtu6.

After this patch set, if a RA message carries the mtu option,
you can send a netlink msg which nlmsg_type is RTM_GETLINK,
and then by parsing the attribute of IFLA_INET6_RA_MTU to
get the mtu value carried in the RA message received on the
inet6 device. In addition, you can also get a link notification
when ra_mtu is updated so it doesn't have to poll.

In this way, if the MTU values that the device receives from
the network in the PCO IPv4 and the RA IPv6 procedures are
different, the user can obtain the correct ipv6 ra_mtu value
and compare the value of ra_mtu and ipv4 mtu, then the device
can use the lower MTU value for both IPv4 and IPv6.

Signed-off-by: Rocco Yue <rocco.yue@mediatek.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20210827150412.9267-1-rocco.yue@mediatek.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/if_inet6.h             |  2 ++
 include/uapi/linux/if_link.h       |  1 +
 net/ipv6/addrconf.c                | 10 ++++++++++
 net/ipv6/ndisc.c                   | 17 +++++++++++------
 tools/include/uapi/linux/if_link.h |  1 +
 5 files changed, 25 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 42235c178b06..653e7d0f65cb 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -210,6 +210,8 @@ struct inet6_dev {
 
 	unsigned long		tstamp; /* ipv6InterfaceTable update timestamp */
 	struct rcu_head		rcu;
+
+	unsigned int		ra_mtu;
 };
 
 static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf)
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8aad65b69054..eebd3894fe89 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -417,6 +417,7 @@ enum {
 	IFLA_INET6_ICMP6STATS,	/* statistics (icmpv6)		*/
 	IFLA_INET6_TOKEN,	/* device token			*/
 	IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */
+	IFLA_INET6_RA_MTU,	/* mtu carried in the RA message */
 	__IFLA_INET6_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8381288a0d6e..17756f3ed33b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -394,6 +394,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 		ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
 
 	ndev->cnf.mtu6 = dev->mtu;
+	ndev->ra_mtu = 0;
 	ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
 	if (!ndev->nd_parms) {
 		kfree(ndev);
@@ -3849,6 +3850,7 @@ restart:
 	}
 
 	idev->tstamp = jiffies;
+	idev->ra_mtu = 0;
 
 	/* Last: Shot the device (if unregistered) */
 	if (unregister) {
@@ -5543,6 +5545,7 @@ static inline size_t inet6_ifla6_size(void)
 	     + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */
 	     + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */
 	     + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */
+	     + nla_total_size(4) /* IFLA_INET6_RA_MTU */
 	     + 0;
 }
 
@@ -5651,6 +5654,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
 	if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode))
 		goto nla_put_failure;
 
+	if (idev->ra_mtu &&
+	    nla_put_u32(skb, IFLA_INET6_RA_MTU, idev->ra_mtu))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
@@ -5767,6 +5774,9 @@ update_lft:
 static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = {
 	[IFLA_INET6_ADDR_GEN_MODE]	= { .type = NLA_U8 },
 	[IFLA_INET6_TOKEN]		= { .len = sizeof(struct in6_addr) },
+	[IFLA_INET6_RA_MTU]		= { .type = NLA_REJECT,
+					    .reject_message =
+						"IFLA_INET6_RA_MTU can not be set" },
 };
 
 static int check_addr_gen_mode(int mode)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index c467c6419893..4b098521a44c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1391,12 +1391,6 @@ skip_defrtr:
 		}
 	}
 
-	/*
-	 *	Send a notify if RA changed managed/otherconf flags or timer settings
-	 */
-	if (send_ifinfo_notify)
-		inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
-
 skip_linkparms:
 
 	/*
@@ -1496,6 +1490,11 @@ skip_routeinfo:
 		memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu));
 		mtu = ntohl(n);
 
+		if (in6_dev->ra_mtu != mtu) {
+			in6_dev->ra_mtu = mtu;
+			send_ifinfo_notify = true;
+		}
+
 		if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
 			ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
 		} else if (in6_dev->cnf.mtu6 != mtu) {
@@ -1519,6 +1518,12 @@ skip_routeinfo:
 		ND_PRINTK(2, warn, "RA: invalid RA options\n");
 	}
 out:
+	/* Send a notify if RA changed managed/otherconf flags or
+	 * timer settings or ra_mtu value
+	 */
+	if (send_ifinfo_notify)
+		inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+
 	fib6_info_release(rt);
 	if (neigh)
 		neigh_release(neigh);
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index eb15f319aa57..b3610fdd1fee 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -230,6 +230,7 @@ enum {
 	IFLA_INET6_ICMP6STATS,	/* statistics (icmpv6)		*/
 	IFLA_INET6_TOKEN,	/* device token			*/
 	IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */
+	IFLA_INET6_RA_MTU,	/* mtu carried in the RA message */
 	__IFLA_INET6_MAX
 };
 
-- 
cgit v1.2.3


From 81f9ebd43659320a88cae8ed5124c50b4d47ab66 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sun, 22 Aug 2021 01:58:00 +0200
Subject: ssb: Drop legacy header include

The SSB header only uses the legacy <linux/gpio.h> header to get
struct gpio_chip so inluce <linux/gpio/driver.h> which is the right
include to deal with gpio_chip.

Cc: Michael Buesch <m@bues.ch>
Cc: Kalle Valo <kvalo@codeaurora.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Link: https://lore.kernel.org/r/20210821235800.138817-1-linus.walleij@linaro.org
---
 include/linux/ssb/ssb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h
index 0d5a2691e7e9..f9b53acb4e02 100644
--- a/include/linux/ssb/ssb.h
+++ b/include/linux/ssb/ssb.h
@@ -7,7 +7,7 @@
 #include <linux/types.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
-#include <linux/gpio.h>
+#include <linux/gpio/driver.h>
 #include <linux/mod_devicetable.h>
 #include <linux/dma-mapping.h>
 #include <linux/platform_device.h>
-- 
cgit v1.2.3


From 8d4be124062bddbb2bcb887702a0601b790b9a83 Mon Sep 17 00:00:00 2001
From: Jing Yangyang <jing.yangyang@zte.com.cn>
Date: Mon, 23 Aug 2021 23:13:41 -0700
Subject: ssb: fix boolreturn.cocci warning

./include/linux/ssb/ssb_driver_extif.h:200:8-9:WARNING: return of 0/1 in
function 'ssb_extif_available' with return type bool

Return statements in functions returning bool should use true/false
instead of 1/0.

Generated by: scripts/coccinelle/misc/boolreturn.cocci

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Jing Yangyang <jing.yangyang@zte.com.cn>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Link: https://lore.kernel.org/r/20210824061341.59255-1-deng.changcheng@zte.com.cn
---
 include/linux/ssb/ssb_driver_extif.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ssb/ssb_driver_extif.h b/include/linux/ssb/ssb_driver_extif.h
index 3f8bc973d67d..19253bfacd1a 100644
--- a/include/linux/ssb/ssb_driver_extif.h
+++ b/include/linux/ssb/ssb_driver_extif.h
@@ -197,7 +197,7 @@ struct ssb_extif {
 
 static inline bool ssb_extif_available(struct ssb_extif *extif)
 {
-	return 0;
+	return false;
 }
 
 static inline
-- 
cgit v1.2.3


From 7a3f5b0de3647c854e34269c3332d7a1e902901a Mon Sep 17 00:00:00 2001
From: Ryoga Saito <contact@proelbtn.com>
Date: Tue, 17 Aug 2021 08:39:37 +0000
Subject: netfilter: add netfilter hooks to SRv6 data plane

This patch introduces netfilter hooks for solving the problem that
conntrack couldn't record both inner flows and outer flows.

This patch also introduces a new sysctl toggle for enabling lightweight
tunnel netfilter hooks.

Signed-off-by: Ryoga Saito <contact@proelbtn.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Documentation/networking/nf_conntrack-sysctl.rst |   7 ++
 include/net/lwtunnel.h                           |   3 +
 include/net/netfilter/nf_hooks_lwtunnel.h        |   7 ++
 net/core/lwtunnel.c                              |   3 +
 net/ipv6/seg6_iptunnel.c                         |  75 ++++++++++++++-
 net/ipv6/seg6_local.c                            | 111 ++++++++++++++++-------
 net/netfilter/Makefile                           |   3 +
 net/netfilter/nf_conntrack_standalone.c          |  15 +++
 net/netfilter/nf_hooks_lwtunnel.c                |  53 +++++++++++
 9 files changed, 241 insertions(+), 36 deletions(-)
 create mode 100644 include/net/netfilter/nf_hooks_lwtunnel.h
 create mode 100644 net/netfilter/nf_hooks_lwtunnel.c

(limited to 'include')

diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 024d784157c8..34ca762ea56f 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -184,6 +184,13 @@ nf_conntrack_gre_timeout_stream - INTEGER (seconds)
 	This extended timeout will be used in case there is an GRE stream
 	detected.
 
+nf_hooks_lwtunnel - BOOLEAN
+	- 0 - disabled (default)
+	- not 0 - enabled
+
+	If this option is enabled, the lightweight tunnel netfilter hooks are
+	enabled. This option cannot be disabled once it is enabled.
+
 nf_flowtable_tcp_timeout - INTEGER (seconds)
         default 30
 
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 05cfd6ff6528..6f15e6fa154e 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -51,6 +51,9 @@ struct lwtunnel_encap_ops {
 };
 
 #ifdef CONFIG_LWTUNNEL
+
+DECLARE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
+
 void lwtstate_free(struct lwtunnel_state *lws);
 
 static inline struct lwtunnel_state *
diff --git a/include/net/netfilter/nf_hooks_lwtunnel.h b/include/net/netfilter/nf_hooks_lwtunnel.h
new file mode 100644
index 000000000000..52e27920f829
--- /dev/null
+++ b/include/net/netfilter/nf_hooks_lwtunnel.h
@@ -0,0 +1,7 @@
+#include <linux/sysctl.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_SYSCTL
+int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write,
+				     void *buffer, size_t *lenp, loff_t *ppos);
+#endif
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index d0ae987d2de9..2820aca2173a 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -23,6 +23,9 @@
 #include <net/ip6_fib.h>
 #include <net/rtnh.h>
 
+DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
+EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);
+
 #ifdef CONFIG_MODULES
 
 static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 897fa59c47de..6ebc7aa24466 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -26,6 +26,8 @@
 #ifdef CONFIG_IPV6_SEG6_HMAC
 #include <net/seg6_hmac.h>
 #endif
+#include <net/lwtunnel.h>
+#include <linux/netfilter.h>
 
 static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo)
 {
@@ -295,11 +297,19 @@ static int seg6_do_srh(struct sk_buff *skb)
 
 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+	nf_reset_ct(skb);
 
 	return 0;
 }
 
-static int seg6_input(struct sk_buff *skb)
+static int seg6_input_finish(struct net *net, struct sock *sk,
+			     struct sk_buff *skb)
+{
+	return dst_input(skb);
+}
+
+static int seg6_input_core(struct net *net, struct sock *sk,
+			   struct sk_buff *skb)
 {
 	struct dst_entry *orig_dst = skb_dst(skb);
 	struct dst_entry *dst = NULL;
@@ -337,10 +347,41 @@ static int seg6_input(struct sk_buff *skb)
 	if (unlikely(err))
 		return err;
 
-	return dst_input(skb);
+	if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+			       dev_net(skb->dev), NULL, skb, NULL,
+			       skb_dst(skb)->dev, seg6_input_finish);
+
+	return seg6_input_finish(dev_net(skb->dev), NULL, skb);
 }
 
-static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int seg6_input_nf(struct sk_buff *skb)
+{
+	struct net_device *dev = skb_dst(skb)->dev;
+	struct net *net = dev_net(skb->dev);
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, NULL,
+			       skb, NULL, dev, seg6_input_core);
+	case htons(ETH_P_IPV6):
+		return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, NULL,
+			       skb, NULL, dev, seg6_input_core);
+	}
+
+	return -EINVAL;
+}
+
+static int seg6_input(struct sk_buff *skb)
+{
+	if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+		return seg6_input_nf(skb);
+
+	return seg6_input_core(dev_net(skb->dev), NULL, skb);
+}
+
+static int seg6_output_core(struct net *net, struct sock *sk,
+			    struct sk_buff *skb)
 {
 	struct dst_entry *orig_dst = skb_dst(skb);
 	struct dst_entry *dst = NULL;
@@ -387,12 +428,40 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	if (unlikely(err))
 		goto drop;
 
+	if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
+			       NULL, skb_dst(skb)->dev, dst_output);
+
 	return dst_output(net, sk, skb);
 drop:
 	kfree_skb(skb);
 	return err;
 }
 
+static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct net_device *dev = skb_dst(skb)->dev;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb,
+			       NULL, dev, seg6_output_core);
+	case htons(ETH_P_IPV6):
+		return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb,
+			       NULL, dev, seg6_output_core);
+	}
+
+	return -EINVAL;
+}
+
+static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+		return seg6_output_nf(net, sk, skb);
+
+	return seg6_output_core(net, sk, skb);
+}
+
 static int seg6_build_state(struct net *net, struct nlattr *nla,
 			    unsigned int family, const void *cfg,
 			    struct lwtunnel_state **ts,
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 60bf3b877957..ddc8dfcd4e2b 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -30,6 +30,8 @@
 #include <net/seg6_local.h>
 #include <linux/etherdevice.h>
 #include <linux/bpf.h>
+#include <net/lwtunnel.h>
+#include <linux/netfilter.h>
 
 #define SEG6_F_ATTR(i)		BIT(i)
 
@@ -413,12 +415,33 @@ drop:
 	return -EINVAL;
 }
 
+static int input_action_end_dx6_finish(struct net *net, struct sock *sk,
+				       struct sk_buff *skb)
+{
+	struct dst_entry *orig_dst = skb_dst(skb);
+	struct in6_addr *nhaddr = NULL;
+	struct seg6_local_lwt *slwt;
+
+	slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
+
+	/* The inner packet is not associated to any local interface,
+	 * so we do not call netif_rx().
+	 *
+	 * If slwt->nh6 is set to ::, then lookup the nexthop for the
+	 * inner packet's DA. Otherwise, use the specified nexthop.
+	 */
+	if (!ipv6_addr_any(&slwt->nh6))
+		nhaddr = &slwt->nh6;
+
+	seg6_lookup_nexthop(skb, nhaddr, 0);
+
+	return dst_input(skb);
+}
+
 /* decapsulate and forward to specified nexthop */
 static int input_action_end_dx6(struct sk_buff *skb,
 				struct seg6_local_lwt *slwt)
 {
-	struct in6_addr *nhaddr = NULL;
-
 	/* this function accepts IPv6 encapsulated packets, with either
 	 * an SRH with SL=0, or no SRH.
 	 */
@@ -429,40 +452,30 @@ static int input_action_end_dx6(struct sk_buff *skb,
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		goto drop;
 
-	/* The inner packet is not associated to any local interface,
-	 * so we do not call netif_rx().
-	 *
-	 * If slwt->nh6 is set to ::, then lookup the nexthop for the
-	 * inner packet's DA. Otherwise, use the specified nexthop.
-	 */
-
-	if (!ipv6_addr_any(&slwt->nh6))
-		nhaddr = &slwt->nh6;
-
 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+	nf_reset_ct(skb);
 
-	seg6_lookup_nexthop(skb, nhaddr, 0);
+	if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+		return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
+			       dev_net(skb->dev), NULL, skb, NULL,
+			       skb_dst(skb)->dev, input_action_end_dx6_finish);
 
-	return dst_input(skb);
+	return input_action_end_dx6_finish(dev_net(skb->dev), NULL, skb);
 drop:
 	kfree_skb(skb);
 	return -EINVAL;
 }
 
-static int input_action_end_dx4(struct sk_buff *skb,
-				struct seg6_local_lwt *slwt)
+static int input_action_end_dx4_finish(struct net *net, struct sock *sk,
+				       struct sk_buff *skb)
 {
+	struct dst_entry *orig_dst = skb_dst(skb);
+	struct seg6_local_lwt *slwt;
 	struct iphdr *iph;
 	__be32 nhaddr;
 	int err;
 
-	if (!decap_and_validate(skb, IPPROTO_IPIP))
-		goto drop;
-
-	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-		goto drop;
-
-	skb->protocol = htons(ETH_P_IP);
+	slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
 
 	iph = ip_hdr(skb);
 
@@ -470,14 +483,34 @@ static int input_action_end_dx4(struct sk_buff *skb,
 
 	skb_dst_drop(skb);
 
-	skb_set_transport_header(skb, sizeof(struct iphdr));
-
 	err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev);
-	if (err)
-		goto drop;
+	if (err) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
 
 	return dst_input(skb);
+}
+
+static int input_action_end_dx4(struct sk_buff *skb,
+				struct seg6_local_lwt *slwt)
+{
+	if (!decap_and_validate(skb, IPPROTO_IPIP))
+		goto drop;
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto drop;
+
+	skb->protocol = htons(ETH_P_IP);
+	skb_set_transport_header(skb, sizeof(struct iphdr));
+	nf_reset_ct(skb);
+
+	if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+		return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+			       dev_net(skb->dev), NULL, skb, NULL,
+			       skb_dst(skb)->dev, input_action_end_dx4_finish);
 
+	return input_action_end_dx4_finish(dev_net(skb->dev), NULL, skb);
 drop:
 	kfree_skb(skb);
 	return -EINVAL;
@@ -645,6 +678,7 @@ static struct sk_buff *end_dt_vrf_core(struct sk_buff *skb,
 	skb_dst_drop(skb);
 
 	skb_set_transport_header(skb, hdrlen);
+	nf_reset_ct(skb);
 
 	return end_dt_vrf_rcv(skb, family, vrf);
 
@@ -1078,7 +1112,8 @@ static void seg6_local_update_counters(struct seg6_local_lwt *slwt,
 	u64_stats_update_end(&pcounters->syncp);
 }
 
-static int seg6_local_input(struct sk_buff *skb)
+static int seg6_local_input_core(struct net *net, struct sock *sk,
+				 struct sk_buff *skb)
 {
 	struct dst_entry *orig_dst = skb_dst(skb);
 	struct seg6_action_desc *desc;
@@ -1086,11 +1121,6 @@ static int seg6_local_input(struct sk_buff *skb)
 	unsigned int len = skb->len;
 	int rc;
 
-	if (skb->protocol != htons(ETH_P_IPV6)) {
-		kfree_skb(skb);
-		return -EINVAL;
-	}
-
 	slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
 	desc = slwt->desc;
 
@@ -1104,6 +1134,21 @@ static int seg6_local_input(struct sk_buff *skb)
 	return rc;
 }
 
+static int seg6_local_input(struct sk_buff *skb)
+{
+	if (skb->protocol != htons(ETH_P_IPV6)) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
+			       dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+			       seg6_local_input_core);
+
+	return seg6_local_input_core(dev_net(skb->dev), NULL, skb);
+}
+
 static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
 	[SEG6_LOCAL_ACTION]	= { .type = NLA_U32 },
 	[SEG6_LOCAL_SRH]	= { .type = NLA_BINARY },
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 049890e00a3d..aab20e575ecd 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -212,3 +212,6 @@ obj-$(CONFIG_IP_SET) += ipset/
 
 # IPVS
 obj-$(CONFIG_IP_VS) += ipvs/
+
+# lwtunnel
+obj-$(CONFIG_LWTUNNEL) += nf_hooks_lwtunnel.o
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index e84b499b7bfa..7e0d956da51d 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -22,6 +22,9 @@
 #include <net/netfilter/nf_conntrack_acct.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/nf_conntrack_timestamp.h>
+#ifdef CONFIG_LWTUNNEL
+#include <net/netfilter/nf_hooks_lwtunnel.h>
+#endif
 #include <linux/rculist_nulls.h>
 
 static bool enable_hooks __read_mostly;
@@ -612,6 +615,9 @@ enum nf_ct_sysctl_index {
 	NF_SYSCTL_CT_PROTO_TIMEOUT_GRE,
 	NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM,
 #endif
+#ifdef CONFIG_LWTUNNEL
+	NF_SYSCTL_CT_LWTUNNEL,
+#endif
 
 	__NF_SYSCTL_CT_LAST_SYSCTL,
 };
@@ -958,6 +964,15 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec_jiffies,
 	},
+#endif
+#ifdef CONFIG_LWTUNNEL
+	[NF_SYSCTL_CT_LWTUNNEL] = {
+		.procname	= "nf_hooks_lwtunnel",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= nf_hooks_lwtunnel_sysctl_handler,
+	},
 #endif
 	{}
 };
diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c
new file mode 100644
index 000000000000..00e89ffd78f6
--- /dev/null
+++ b/net/netfilter/nf_hooks_lwtunnel.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sysctl.h>
+#include <net/lwtunnel.h>
+#include <net/netfilter/nf_hooks_lwtunnel.h>
+
+static inline int nf_hooks_lwtunnel_get(void)
+{
+	if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+		return 1;
+	else
+		return 0;
+}
+
+static inline int nf_hooks_lwtunnel_set(int enable)
+{
+	if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) {
+		if (!enable)
+			return -EBUSY;
+	} else if (enable) {
+		static_branch_enable(&nf_hooks_lwtunnel_enabled);
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write,
+				     void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int proc_nf_hooks_lwtunnel_enabled = 0;
+	struct ctl_table tmp = {
+		.procname = table->procname,
+		.data = &proc_nf_hooks_lwtunnel_enabled,
+		.maxlen = sizeof(int),
+		.mode = table->mode,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_ONE,
+	};
+	int ret;
+
+	if (!write)
+		proc_nf_hooks_lwtunnel_enabled = nf_hooks_lwtunnel_get();
+
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+	if (write && ret == 0)
+		ret = nf_hooks_lwtunnel_set(proc_nf_hooks_lwtunnel_enabled);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler);
+#endif /* CONFIG_SYSCTL */
-- 
cgit v1.2.3


From ca49bfd90a9dde175d2929dc1544b54841e33804 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Thu, 26 Aug 2021 14:54:25 +0300
Subject: sch_htb: Fix inconsistency when leaf qdisc creation fails

In HTB offload mode, qdiscs of leaf classes are grafted to netdev
queues. sch_htb expects the dev_queue field of these qdiscs to point to
the corresponding queues. However, qdisc creation may fail, and in that
case noop_qdisc is used instead. Its dev_queue doesn't point to the
right queue, so sch_htb can lose track of used netdev queues, which will
cause internal inconsistencies.

This commit fixes this bug by keeping track of the netdev queue inside
struct htb_class. All reads of cl->leaf.q->dev_queue are replaced by the
new field, the two values are synced on writes, and WARNs are added to
assert equality of the two values.

The driver API has changed: when TC_HTB_LEAF_DEL needs to move a queue,
the driver used to pass the old and new queue IDs to sch_htb. Now that
there is a new field (offload_queue) in struct htb_class that needs to
be updated on this operation, the driver will pass the old class ID to
sch_htb instead (it already knows the new class ID).

Fixes: d03b195b5aa0 ("sch_htb: Hierarchical QoS hardware offload")
Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20210826115425.1744053-1-maximmi@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/qos.c  | 15 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en/qos.h  |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  3 +-
 include/net/pkt_cls.h                             |  3 +-
 net/sched/sch_htb.c                               | 97 +++++++++++++++--------
 5 files changed, 72 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
index c9ac69f62f21..e8a8d78e3e4d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
@@ -733,8 +733,8 @@ static void mlx5e_reset_qdisc(struct net_device *dev, u16 qid)
 	spin_unlock_bh(qdisc_lock(qdisc));
 }
 
-int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 classid, u16 *old_qid,
-		       u16 *new_qid, struct netlink_ext_ack *extack)
+int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 *classid,
+		       struct netlink_ext_ack *extack)
 {
 	struct mlx5e_qos_node *node;
 	struct netdev_queue *txq;
@@ -742,11 +742,9 @@ int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 classid, u16 *old_qid,
 	bool opened;
 	int err;
 
-	qos_dbg(priv->mdev, "TC_HTB_LEAF_DEL classid %04x\n", classid);
-
-	*old_qid = *new_qid = 0;
+	qos_dbg(priv->mdev, "TC_HTB_LEAF_DEL classid %04x\n", *classid);
 
-	node = mlx5e_sw_node_find(priv, classid);
+	node = mlx5e_sw_node_find(priv, *classid);
 	if (!node)
 		return -ENOENT;
 
@@ -764,7 +762,7 @@ int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 classid, u16 *old_qid,
 	err = mlx5_qos_destroy_node(priv->mdev, node->hw_id);
 	if (err) /* Not fatal. */
 		qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n",
-			 node->hw_id, classid, err);
+			 node->hw_id, *classid, err);
 
 	mlx5e_sw_node_delete(priv, node);
 
@@ -826,8 +824,7 @@ int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 classid, u16 *old_qid,
 	if (opened)
 		mlx5e_reactivate_qos_sq(priv, moved_qid, txq);
 
-	*old_qid = mlx5e_qid_from_qos(&priv->channels, moved_qid);
-	*new_qid = mlx5e_qid_from_qos(&priv->channels, qid);
+	*classid = node->classid;
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h
index 5af7991fcd19..757682b7c0e0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h
@@ -34,8 +34,8 @@ int mlx5e_htb_leaf_alloc_queue(struct mlx5e_priv *priv, u16 classid,
 			       struct netlink_ext_ack *extack);
 int mlx5e_htb_leaf_to_inner(struct mlx5e_priv *priv, u16 classid, u16 child_classid,
 			    u64 rate, u64 ceil, struct netlink_ext_ack *extack);
-int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 classid, u16 *old_qid,
-		       u16 *new_qid, struct netlink_ext_ack *extack);
+int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 *classid,
+		       struct netlink_ext_ack *extack);
 int mlx5e_htb_leaf_del_last(struct mlx5e_priv *priv, u16 classid, bool force,
 			    struct netlink_ext_ack *extack);
 int mlx5e_htb_node_modify(struct mlx5e_priv *priv, u16 classid, u64 rate, u64 ceil,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 26d2f78c7706..47efd858964d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3000,8 +3000,7 @@ static int mlx5e_setup_tc_htb(struct mlx5e_priv *priv, struct tc_htb_qopt_offloa
 		return mlx5e_htb_leaf_to_inner(priv, htb->parent_classid, htb->classid,
 					       htb->rate, htb->ceil, htb->extack);
 	case TC_HTB_LEAF_DEL:
-		return mlx5e_htb_leaf_del(priv, htb->classid, &htb->moved_qid, &htb->qid,
-					  htb->extack);
+		return mlx5e_htb_leaf_del(priv, &htb->classid, htb->extack);
 	case TC_HTB_LEAF_DEL_LAST:
 	case TC_HTB_LEAF_DEL_LAST_FORCE:
 		return mlx5e_htb_leaf_del_last(priv, htb->classid,
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 8fb47fc61097..83a6d0792180 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -816,10 +816,9 @@ enum tc_htb_command {
 struct tc_htb_qopt_offload {
 	struct netlink_ext_ack *extack;
 	enum tc_htb_command command;
-	u16 classid;
 	u32 parent_classid;
+	u16 classid;
 	u16 qid;
-	u16 moved_qid;
 	u64 rate;
 	u64 ceil;
 };
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 81ea8332547a..5067a6e5d4fd 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -125,6 +125,7 @@ struct htb_class {
 		struct htb_class_leaf {
 			int		deficit[TC_HTB_MAXDEPTH];
 			struct Qdisc	*q;
+			struct netdev_queue *offload_queue;
 		} leaf;
 		struct htb_class_inner {
 			struct htb_prio clprio[TC_HTB_NUMPRIO];
@@ -1411,24 +1412,47 @@ htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
 	return old_q;
 }
 
-static void htb_offload_move_qdisc(struct Qdisc *sch, u16 qid_old, u16 qid_new)
+static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
+{
+	struct netdev_queue *queue;
+
+	queue = cl->leaf.offload_queue;
+	if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
+		WARN_ON(cl->leaf.q->dev_queue != queue);
+
+	return queue;
+}
+
+static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
+				   struct htb_class *cl_new, bool destroying)
 {
 	struct netdev_queue *queue_old, *queue_new;
 	struct net_device *dev = qdisc_dev(sch);
-	struct Qdisc *qdisc;
 
-	queue_old = netdev_get_tx_queue(dev, qid_old);
-	queue_new = netdev_get_tx_queue(dev, qid_new);
+	queue_old = htb_offload_get_queue(cl_old);
+	queue_new = htb_offload_get_queue(cl_new);
 
-	if (dev->flags & IFF_UP)
-		dev_deactivate(dev);
-	qdisc = dev_graft_qdisc(queue_old, NULL);
-	qdisc->dev_queue = queue_new;
-	qdisc = dev_graft_qdisc(queue_new, qdisc);
-	if (dev->flags & IFF_UP)
-		dev_activate(dev);
+	if (!destroying) {
+		struct Qdisc *qdisc;
 
-	WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
+		if (dev->flags & IFF_UP)
+			dev_deactivate(dev);
+		qdisc = dev_graft_qdisc(queue_old, NULL);
+		WARN_ON(qdisc != cl_old->leaf.q);
+	}
+
+	if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
+		cl_old->leaf.q->dev_queue = queue_new;
+	cl_old->leaf.offload_queue = queue_new;
+
+	if (!destroying) {
+		struct Qdisc *qdisc;
+
+		qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
+		if (dev->flags & IFF_UP)
+			dev_activate(dev);
+		WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
+	}
 }
 
 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
@@ -1442,10 +1466,8 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (cl->level)
 		return -EINVAL;
 
-	if (q->offload) {
-		dev_queue = new->dev_queue;
-		WARN_ON(dev_queue != cl->leaf.q->dev_queue);
-	}
+	if (q->offload)
+		dev_queue = htb_offload_get_queue(cl);
 
 	if (!new) {
 		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
@@ -1514,6 +1536,8 @@ static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
 	parent->ctokens = parent->cbuffer;
 	parent->t_c = ktime_get_ns();
 	parent->cmode = HTB_CAN_SEND;
+	if (q->offload)
+		parent->leaf.offload_queue = cl->leaf.offload_queue;
 }
 
 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
@@ -1534,6 +1558,7 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
 				     struct netlink_ext_ack *extack)
 {
 	struct tc_htb_qopt_offload offload_opt;
+	struct netdev_queue *dev_queue;
 	struct Qdisc *q = cl->leaf.q;
 	struct Qdisc *old = NULL;
 	int err;
@@ -1542,16 +1567,15 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
 		return -EINVAL;
 
 	WARN_ON(!q);
-	if (!destroying) {
-		/* On destroy of HTB, two cases are possible:
-		 * 1. q is a normal qdisc, but q->dev_queue has noop qdisc.
-		 * 2. q is a noop qdisc (for nodes that were inner),
-		 *    q->dev_queue is noop_netdev_queue.
+	dev_queue = htb_offload_get_queue(cl);
+	old = htb_graft_helper(dev_queue, NULL);
+	if (destroying)
+		/* Before HTB is destroyed, the kernel grafts noop_qdisc to
+		 * all queues.
 		 */
-		old = htb_graft_helper(q->dev_queue, NULL);
-		WARN_ON(!old);
+		WARN_ON(!(old->flags & TCQ_F_BUILTIN));
+	else
 		WARN_ON(old != q);
-	}
 
 	if (cl->parent) {
 		cl->parent->bstats_bias.bytes += q->bstats.bytes;
@@ -1570,18 +1594,17 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
 	if (!err || destroying)
 		qdisc_put(old);
 	else
-		htb_graft_helper(q->dev_queue, old);
+		htb_graft_helper(dev_queue, old);
 
 	if (last_child)
 		return err;
 
-	if (!err && offload_opt.moved_qid != 0) {
-		if (destroying)
-			q->dev_queue = netdev_get_tx_queue(qdisc_dev(sch),
-							   offload_opt.qid);
-		else
-			htb_offload_move_qdisc(sch, offload_opt.moved_qid,
-					       offload_opt.qid);
+	if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
+		u32 classid = TC_H_MAJ(sch->handle) |
+			      TC_H_MIN(offload_opt.classid);
+		struct htb_class *moved_cl = htb_find(classid, sch);
+
+		htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
 	}
 
 	return err;
@@ -1704,9 +1727,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg,
 	}
 
 	if (last_child) {
-		struct netdev_queue *dev_queue;
+		struct netdev_queue *dev_queue = sch->dev_queue;
+
+		if (q->offload)
+			dev_queue = htb_offload_get_queue(cl);
 
-		dev_queue = q->offload ? cl->leaf.q->dev_queue : sch->dev_queue;
 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
 					  cl->parent->common.classid,
 					  NULL);
@@ -1878,7 +1903,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 			}
 			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
 		} else { /* First child. */
-			dev_queue = parent->leaf.q->dev_queue;
+			dev_queue = htb_offload_get_queue(parent);
 			old_q = htb_graft_helper(dev_queue, NULL);
 			WARN_ON(old_q != parent->leaf.q);
 			offload_opt = (struct tc_htb_qopt_offload) {
@@ -1935,6 +1960,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 
 		/* leaf (we) needs elementary qdisc */
 		cl->leaf.q = new_q ? new_q : &noop_qdisc;
+		if (q->offload)
+			cl->leaf.offload_queue = dev_queue;
 
 		cl->parent = parent;
 
-- 
cgit v1.2.3